def download(args):
    if os.path.exists(DOWNLOAD_DIR + FILENAME):
        os.remove(DOWNLOAD_DIR + FILENAME)
    start_time = time.time()
    if not args['--quiet']:
        print "\nDownloading ", BASE_URL + FILENAME
        print "         to ", DOWNLOAD_DIR + FILENAME
    urllib.urlretrieve(BASE_URL + FILENAME, DOWNLOAD_DIR + FILENAME)
    elapsed = time.time() - start_time
    if not args['--quiet']:
        print "Done. Elapsed time: {}".format(slmf.secs2str(elapsed))
Example #2
def download(args):
    gzfn = DOWNLOAD_DIR + FILENAME
    if os.path.exists(gzfn):
        os.remove(gzfn)
    fn = gzfn.replace('.gz', '')
    if os.path.exists(fn):
        os.remove(fn)
    start_time = time.time()
    if not args['--quiet']:
        print "\nDownloading ", BASE_URL + FILENAME
        print "         to ", gzfn
    urlretrieve(BASE_URL + FILENAME, gzfn)
    print "Uncompressing", gzfn
    ifh = gzip.open(gzfn, 'rb')
    ofh = open(fn, 'wb')
    ofh.write(ifh.read())
    ifh.close()
    ofh.close()
    if not args['--quiet']:
        elapsed = time.time() - start_time
        print "Done. Elapsed time: {}".format(slmf.secs2str(elapsed))
Example #3
                if rv:
                    dct += 1
                else:
                    dba_err_ct += 1
        pbar.update(ct)
    pbar.finish()

    if not args['--quiet']:
        print "\n{} targets processed.".format(ct)
        print "  {} non-Tclin targets have upstream Tclin target(s)".format(
            len(umark))
        print "    Inserted {} upstream kegg_nearest_tclin rows".format(uct)
        print "  {} non-Tclin targets have downstream Tclin target(s)".format(
            len(dmark))
        print "    Inserted {} upstream kegg_nearest_tclin rows".format(dct)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)


if __name__ == '__main__':
    print "\n{} (v{}) [{}]:".format(PROGRAM, __version__, time.strftime("%c"))
    args = docopt(__doc__, version=__version__)
    if args['--debug']:
        print "\n[*DEBUG*] ARGS:\n%s\n" % repr(args)
    start_time = time.time()
    calc_and_load(args)
    elapsed = time.time() - start_time
    print "\n{}: Done. Elapsed time: {}\n".format(PROGRAM,
                                                  slmf.secs2str(elapsed))
Example #4
      print "  Skipped {} PPIs involving the same protein".format(same12_ct)
    if notfnd:
      print "  No target found for {} UniProts/Syms/GeneIDs. See logfile {} for details.".format(len(notfnd), logfile) 
    if dba_err_ct > 0:
      print "WARNNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

def find_target(dba, k):
  (up, sym, geneid) = k.split("|")
  targets = False
  if up != '': # No UniProt accessions in update files
    targets = dba.find_targets({'uniprot': up})
  if not targets:
    targets = dba.find_targets({'sym': sym})
  if not targets:
    targets = dba.find_targets({'geneid': geneid})
  if targets:
    return targets[0]
  else:
    return None
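
# Hypothetical usage sketch (the identifiers below are illustrative, not from
# the source): keys are "UniProt|Symbol|GeneID" strings, and the lookup falls
# back from UniProt accession to symbol to GeneID.
t = find_target(dba, "P04637|TP53|7157")
if t:
  print "Found target {}".format(t['id'])  # assumes dict-like targets with an 'id' key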


if __name__ == '__main__':
  print "\n{} (v{}) [{}]:".format(PROGRAM, __version__, time.strftime("%c"))
  args = docopt(__doc__, version=__version__)
  if args['--debug']:
    print "\n[*DEBUG*] ARGS:\n%s\n"%repr(args)
  start_time = time.time()
  load(args)
  elapsed = time.time() - start_time
  print "\n{}: Done. Elapsed time: {}\n".format(PROGRAM, slmf.secs2str(elapsed))
Example #5
        })
        if rv:
            tiurl_ct += 1
        else:
            dba_err_ct += 1
        time.sleep(1)
        pbar.update(ct)
    pbar.finish()
    print "{} TCRD targets processed.".format(ct)
    print "  Inserted {} Ab Count tdl_info rows".format(tiab_ct)
    print "  Inserted {} MAb Count tdl_info rows".format(timab_ct)
    print "  Inserted {} Antibodypedia.com URL tdl_info rows".format(tiurl_ct)
    if net_err_ct > 0:
        print "WARNING: Network error for {} targets. See logfile {} for details.".format(
            net_err_ct, logfile)
    if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(
            dba_err_ct, logfile)


if __name__ == '__main__':
    print "\n%s (v%s) [%s]:" % (PROGRAM, __version__, time.strftime("%c"))
    args = docopt(__doc__, version=__version__)
    debug = int(args['--debug'])
    if debug:
        print "\n[*DEBUG*] ARGS:\n%s\n" % repr(args)
    start_time = time.time()
    load(args)
    elapsed = time.time() - start_time
    print "\n%s: Done. Elapsed time: %s\n" % (PROGRAM, slmf.secs2str(elapsed))
Example #6
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    print "\nWorking on OMIM..."
    download_OMIM()
    start_time = time.time()
    load_OMIM(args, dba, logger, logfile)
    elapsed = time.time() - start_time
    print "Done with OMIM. Elapsed time: {}".format(slmf.secs2str(elapsed))

    print "\nWorking on GWAS Catalog..."
    start_time = time.time()
    load_GWASCatalog(args, dba, logger, logfile)
    elapsed = time.time() - start_time
    print "Done with GWAS Catalog. Elapsed time: {}".format(
        slmf.secs2str(elapsed))

    print "\nWorking on IMPC..."
    start_time = time.time()
    load_IMPC(args, dba, logger, logfile)
    elapsed = time.time() - start_time
    print "Done with IMPC. Elapsed time: {}".format(slmf.secs2str(elapsed))

    print "\nWorking on JAX..."
Example #7
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  # DBAdaptor uses same logger as load()
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Human loaded into target, protein, etc.
  # Datasets and Provenances
  start_time = time.time()
  dataset_id = dba.ins_dataset( {'name': 'UniProt', 'source': 'XML file downloaded from UniProt query reviewed:yes AND organism:"Homo sapiens (Human) [9606]"', 'app': PROGRAM, 'app_version': __version__, 'url': 'https://www.uniprot.org/uniprot'} )
  if not dataset_id:
    print "WARNING: Error inserting dataset. See logfile %s for details." % logfile
    sys.exit(1)
  provs = [ {'dataset_id': dataset_id, 'table_name': 'target', 'column_name': 'ttype'},
            {'dataset_id': dataset_id, 'table_name': 'target', 'column_name': 'name'},
            {'dataset_id': dataset_id, 'table_name': 'protein'},
            {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'UniProt Function'"},
            {'dataset_id': dataset_id, 'table_name': 'goa'},  
            {'dataset_id': dataset_id, 'table_name': 'expression', 'where_clause': "etype = 'UniProt Tissue'"},
            {'dataset_id': dataset_id, 'table_name': 'pathway', 'where_clause': "type = 'uniprot'"},
            {'dataset_id': dataset_id, 'table_name': 'disease', 'where_clause': "dtype = 'uniprot'"},
            {'dataset_id': dataset_id, 'table_name': 'feature'},
            {'dataset_id': dataset_id, 'table_name': 'xref', 'where_clause': "dataset_id = %d"%dataset_id},
            {'dataset_id': dataset_id, 'table_name': 'alias', 'where_clause': "dataset_id = %d"%dataset_id} ]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    if not rv:
      print "WARNING: Error inserting provenance. See logfile %s for details." % logfile
      sys.exit(1)

  # UniProt uses Evidence Ontology ECO IDs, not GO evidence codes, so get a mapping of
  # ECO IDs to GO evidence codes
  eco_map = mk_eco_map()
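  # A minimal illustrative sketch (hypothetical; not the real mk_eco_map) of
  # the shape of that mapping, hard-coding a few well-known ECO-to-GO pairs:
  def _eco_map_sketch():
    return {'ECO:0000269': 'EXP',  # experimental evidence, manual assertion
            'ECO:0000314': 'IDA',  # direct assay evidence
            'ECO:0000353': 'IPI'}  # physical interaction evidence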
    
  print "\nParsing file {}".format(UP_HUMAN_FILE)
  root = objectify.parse(UP_HUMAN_FILE).getroot()
  up_ct = len(root.entry)
  print "Loading data for {} UniProt records".format(up_ct)
  logger.info("Loading data for {} UniProt records in file {}".format(up_ct, UP_HUMAN_FILE))
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=up_ct).start()
  ct = 0
  load_ct = 0
  xml_err_ct = 0
  dba_err_ct = 0
  for i in range(len(root.entry)):
    ct += 1
    entry = root.entry[i]
    logger.info("Processing entry {}".format(entry.accession))
    target = entry2target(entry, dataset_id, eco_map)
    if not target:
      xml_err_ct += 1
      logger.error("XML Error for %s" % entry.accession)
      continue
    tid = dba.ins_target(target)
    if not tid:
      dba_err_ct += 1
      continue
    logger.debug("Target insert id: %s" % tid)
    load_ct += 1
    pbar.update(ct)
  pbar.finish()
  elapsed = time.time() - start_time
  print "Processed {} UniProt records. Elapsed time: {}".format(ct, slmf.secs2str(elapsed))
  print "  Loaded {} targets/proteins".format(load_ct)
  if xml_err_ct > 0:
    print "WARNING: {} XML parsing errors occurred. See logfile {} for details.".format(xml_err_ct, logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  # Mouse and Rat loaded into nhprotein
  dataset_id = dba.ins_dataset( {'name': 'UniProt Mouse Proteins', 'source': 'XML file downloaded from UniProt query organism: "Mus musculus (Mouse) [10090]"', 'app': PROGRAM, 'app_version': __version__, 'url': 'https://www.uniprot.org/uniprot'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'nhprotein', 'where_clause': "taxid = 10090"})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)
  # Rat
  dataset_id = dba.ins_dataset( {'name': 'UniProt Rat Proteins', 'source': 'XML file downloaded from UniProt query organism: "Rattus norvegicus (Rat) [10116]"', 'app': PROGRAM, 'app_version': __version__, 'url': 'https://www.uniprot.org/uniprot'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'nhprotein', 'where_clause': "taxid = 10116"})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  for ifn in (UP_MOUSE_FILE, UP_RAT_FILE):
    start_time = time.time()
    print "\nParsing file {}".format(ifn)
    root = objectify.parse(ifn).getroot()
    up_ct = len(root.entry)
    print "Loading data for {} UniProt records".format(up_ct)
    logger.info("Loading data for {} UniProt records in file {}".format(up_ct, ifn))
    pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=up_ct).start()
    ct = 0
    nhp_ct = 0
    xml_err_ct = 0
    dba_err_ct = 0
    for i in range(len(root.entry)):
      ct += 1
      entry = root.entry[i]
      logger.info("Processing entry {}".format(entry.accession))
      nhprotein = entry2nhprotein(entry, dataset_id)
      if not nhprotein:
        xml_err_ct += 1
        logger.error("XML Error for {}".format(entry.accession))
        continue
      nhpid = dba.ins_nhprotein(nhprotein)
      if not nhpid:
        dba_err_ct += 1
        continue
      logger.debug("Nhprotein insert id: {}".format(nhpid))
      nhp_ct += 1
      pbar.update(ct)
    pbar.finish()
    elapsed = time.time() - start_time
    print "Processed {} UniProt records. Elapsed time: {}".format(ct, slmf.secs2str(elapsed))
    print "  Loaded {} nhproteins".format(nhp_ct)
    if xml_err_ct > 0:
      print "WARNING: {} XML parsing errors occurred. See logfile {} for details.".format(xml_err_ct, logfile)
    if dba_err_ct > 0:
      print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
Example #8
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    print "\nWorking on JensenLab DISEASES..."
    download_DISEASES(args)
    start_time = time.time()
    load_DISEASES(args, dba, logger, logfile)
    elapsed = time.time() - start_time
    print "Done with DISEASES. Elapsed time: {}".format(slmf.secs2str(elapsed))

    print "\nWorking on DisGeNET..."
    download_DisGeNET(args)
    start_time = time.time()
    load_DisGeNET(args, dba, logger, logfile)
    elapsed = time.time() - start_time
    print "Done with DisGeNET. Elapsed time: {}".format(slmf.secs2str(elapsed))

    # Monarch
    print "\nWorking on Monarch..."
    start_time = time.time()
    load_Monarch(args, dba, logger, logfile)
    elapsed = time.time() - start_time
    print "Done with Monarch. Elapsed time: {}".format(slmf.secs2str(elapsed))
Example #9
def load(args, dba, logger):
  assert os.path.exists(SYM2PID_P), "Error: No mapping file {}. Run with -c map first.".format(SYM2PID_P)
  print "\nLoading mapping of TCRD targets to Harmonizome genes from pickle file {}".format(SYM2PID_P)
  sym2pid = pickle.load( open(SYM2PID_P, 'rb') )
  print "  Got {} symbol to protein_id mappings".format(len(sym2pid))

  if os.path.isfile(DATASET_DONE_FILE):
    # If we are restarting, this file has the names of datasets already loaded
    with open(DATASET_DONE_FILE) as f:
      datasets_done = f.read().splitlines()
  else:
    datasets_done = []
  datasets = get_datasets(HARMO_API_BASE_URL)
  print "\nProcessing {} Harmonizome datasets".format(len(datasets))
  ct = 0
  gat_ct = 0
  total_ga_ct = 0
  err_ct = 0
  dba_err_ct = 0
  for dsname in datasets.keys():
    ct += 1
    ds_start_time = time.time()
    if dsname in datasets_done:
      print "  Skipping previously loaded dataset \"{}\"".format(dsname)
      continue
    ds_ga_ct = 0
    ds = get_dataset(HARMO_API_BASE_URL, datasets[dsname])
    if not ds:
      logger.error("Error getting dataset {} ({})".format(dsname, datasets[dsname]))
      err_ct += 1
      continue
    if not args['--quiet']:
      print "  Processing dataset \"{}\" containing {} gene sets".format(dsname, len(ds['geneSets']))
    logger.info("Processing dataset \"{}\" containing {} gene sets".format(dsname, len(ds['geneSets'])))
    rsc = get_resource(HARMO_API_BASE_URL, ds['resource']['href'])
    gat_id = dba.ins_gene_attribute_type( {'name': ds['name'], 'association': ds['association'], 'description': rsc['description'], 'resource_group': ds['datasetGroup'], 'measurement': ds['measurement'], 'attribute_group': ds['attributeGroup'], 'attribute_type': ds['attributeType'], 'pubmed_ids': "|".join([str(pmid) for pmid in rsc['pubMedIds']]), 'url': rsc['url']} )
    if gat_id:
      gat_ct += 1
    else:
      dba_err_ct += 1
    for d in ds['geneSets']:
      name = d['name'].encode('utf-8')
      gs = get_geneset(HARMO_API_BASE_URL, d['href'])
      if not gs:
        logger.error("Error getting gene set {} ({})".format(name, d['href']))
        err_ct += 1
        continue
      if 'associations' not in gs: # used to be 'features'
        logger.error("No associations in gene set {}".format(name))
        err_ct += 1
        continue
      logger.info("  Processing gene set \"{}\" containing {} associations".format(name, len(gs['associations'])))
      ga_ct = 0
      for f in gs['associations']: # used to be 'features'
        sym = f['gene']['symbol']
        if sym not in sym2pid: continue # symbol does not map to a TCRD target
        rv = dba.ins_gene_attribute( {'protein_id': sym2pid[sym], 'gat_id': gat_id,
                                      'name': name, 'value': f['thresholdValue']} )
        if not rv:
          dba_err_ct += 1
        else:
          ga_ct += 1
      ds_ga_ct += ga_ct
      time.sleep(1)
    total_ga_ct += ds_ga_ct
    ds_elapsed = time.time() - ds_start_time
    logger.info("  Inserted a total of {} new gene_attribute rows for dataset {}. Elapsed time: {}".format(ds_ga_ct, dsname, slmf.secs2str(ds_elapsed)))
    if err_ct > 0:
      logger.info("  WARNING: Error getting {} gene set(s) ".format(err_ct))
    # Save dataset names that are loaded, in case we need to restart
    with open(DATASET_DONE_FILE, "a") as dsdfile:
      dsdfile.write(dsname+'\n')
  print "\nProcessed {} Ma'ayan Lab datasets.".format(ct)
  print "Inserted {} new gene_attribute_type rows".format(gat_ct)
  print "Inserted a total of {} gene_attribute rows".format(total_ga_ct)
  if err_ct > 0:
    print "WARNING: {} errors occurred. See logfile {} for details.".format(err_ct, LOGFILE)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, LOGFILE)

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'Harmonizome', 'source': "API at %s"%HARMO_API_BASE_URL, 'app': PROGRAM, 'app_version': __version__, 'url': 'http://amp.pharm.mssm.edu/Harmonizome/'} )
  assert dataset_id, "Error inserting dataset See logfile {} for details.".format(LOGFILE)
  # Provenance
  provs = [ {'dataset_id': dataset_id, 'table_name': 'gene_attribute'},
            {'dataset_id': dataset_id, 'table_name': 'gene_attribute_type'},
            {'dataset_id': dataset_id, 'table_name': 'hgram_cdf'} ]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(LOGFILE)