def download(args):
  if os.path.exists(DOWNLOAD_DIR + FILENAME):
    os.remove(DOWNLOAD_DIR + FILENAME)
  start_time = time.time()
  if not args['--quiet']:
    print "\nDownloading ", BASE_URL + FILENAME
    print "         to ", DOWNLOAD_DIR + FILENAME
  urllib.urlretrieve(BASE_URL + FILENAME, DOWNLOAD_DIR + FILENAME)
  elapsed = time.time() - start_time
  if not args['--quiet']:
    print "Done. Elapsed time: {}".format(slmf.secs2str(elapsed))
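# slmf.secs2str() is used throughout these loaders to format elapsed seconds
# for display. A minimal sketch of a compatible helper, assuming only that it
# renders a duration as H:MM:SS (an illustration, not the actual slmf code):
def secs2str(t):
  """Format a duration in seconds as H:MM:SS."""
  m, s = divmod(int(t), 60)
  h, m = divmod(m, 60)
  return "%d:%02d:%02d" % (h, m, s)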
def download(args):
  gzfn = DOWNLOAD_DIR + FILENAME
  if os.path.exists(gzfn):
    os.remove(gzfn)
  fn = gzfn.replace('.gz', '')
  if os.path.exists(fn):
    os.remove(fn)
  start_time = time.time()
  if not args['--quiet']:
    print "\nDownloading ", BASE_URL + FILENAME
    print "         to ", gzfn
  urlretrieve(BASE_URL + FILENAME, gzfn)
  print "Uncompressing", gzfn
  ifh = gzip.open(gzfn, 'rb')
  ofh = open(fn, 'wb')
  ofh.write(ifh.read())
  ifh.close()
  ofh.close()
  if not args['--quiet']:
    elapsed = time.time() - start_time
    print "Done. Elapsed time: {}".format(slmf.secs2str(elapsed))
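# The read()/write() pair above loads the entire uncompressed file into
# memory at once. For large downloads, a chunked copy is safer. A minimal
# sketch using only the standard library (gunzip_stream is a hypothetical
# helper, not part of this loader):
import gzip
import shutil

def gunzip_stream(gzfn, fn, chunk_size=1024*1024):
  """Decompress gzfn to fn in chunks rather than all at once."""
  ifh = gzip.open(gzfn, 'rb')
  ofh = open(fn, 'wb')
  shutil.copyfileobj(ifh, ofh, chunk_size)
  ifh.close()
  ofh.close()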
    if rv:
      dct += 1
    else:
      dba_err_ct += 1
    pbar.update(ct)
  pbar.finish()
  if not args['--quiet']:
    print "\n{} targets processed.".format(ct)
    print "  {} non-Tclin targets have upstream Tclin target(s)".format(len(umark))
    print "  Inserted {} upstream kegg_nearest_tclin rows".format(uct)
    print "  {} non-Tclin targets have downstream Tclin target(s)".format(len(dmark))
    print "  Inserted {} downstream kegg_nearest_tclin rows".format(dct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

if __name__ == '__main__':
  print "\n{} (v{}) [{}]:".format(PROGRAM, __version__, time.strftime("%c"))
  args = docopt(__doc__, version=__version__)
  if args['--debug']:
    print "\n[*DEBUG*] ARGS:\n%s\n" % repr(args)
  start_time = time.time()
  calc_and_load(args)
  elapsed = time.time() - start_time
  print "\n{}: Done. Elapsed time: {}\n".format(PROGRAM, slmf.secs2str(elapsed))
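# The docopt(__doc__, version=__version__) calls in these loaders parse the
# module docstring as a usage spec. A sketch of the kind of usage string the
# option lookups above assume (illustrative only, not the actual docstring):
#
#   Usage:
#     loader.py [--dbhost=<str>] [--dbname=<str>] [--logfile=<file>] [--loglevel=<int>] [--quiet] [--debug]
#     loader.py --version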
print " Skipped {} PPIs involving the same protein".format(same12_ct) if notfnd: print " No target found for {} UniProts/Syms/GeneIDs. See logfile {} for details.".format(len(notfnd), logfile) if dba_err_ct > 0: print "WARNNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile) def find_target(dba, k): (up, sym, geneid) = k.split("|") targets = False if up != '': # No UniProt accessions in update files targets = dba.find_targets({'uniprot': up}) if not targets: targets = dba.find_targets({'sym': sym}) if not targets: targets = dba.find_targets({'geneid': geneid}) if targets: return targets[0] else: return None if __name__ == '__main__': print "\n{} (v{}) [{}]:".format(PROGRAM, __version__, time.strftime("%c")) args = docopt(__doc__, version=__version__) if args['--debug']: print "\n[*DEBUG*] ARGS:\n%s\n"%repr(args) start_time = time.time() load(args) elapsed = time.time() - start_time print "\n{}: Done. Elapsed time: {}\n".format(PROGRAM, slmf.secs2str(elapsed))
        })
    if rv:
      tiurl_ct += 1
    else:
      dba_err_ct += 1
    time.sleep(1)
    pbar.update(ct)
  pbar.finish()
  print "{} TCRD targets processed.".format(ct)
  print "  Inserted {} Ab Count tdl_info rows".format(tiab_ct)
  print "  Inserted {} MAb Count tdl_info rows".format(timab_ct)
  print "  Inserted {} Antibodypedia.com URL tdl_info rows".format(tiurl_ct)
  if net_err_ct > 0:
    print "WARNING: Network error for {} targets. See logfile {} for details.".format(net_err_ct, logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

if __name__ == '__main__':
  print "\n%s (v%s) [%s]:" % (PROGRAM, __version__, time.strftime("%c"))
  args = docopt(__doc__, version=__version__)
  debug = int(args['--debug'])
  if debug:
    print "\n[*DEBUG*] ARGS:\n%s\n" % repr(args)
  start_time = time.time()
  load(args)
  elapsed = time.time() - start_time
  print "\n%s: Done. Elapsed time: %s\n" % (PROGRAM, slmf.secs2str(elapsed))
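# The loop above sleeps 1s between requests to be polite to the server and
# tallies failures in net_err_ct. A retry-with-backoff wrapper could reduce
# those failures; a minimal sketch (urllib2 usage here is an assumption about
# how the requests are made, not this loader's actual code):
import time
import urllib2

def get_with_retries(url, tries=3, backoff=2.0):
  """GET url, retrying on network errors with exponential backoff."""
  for i in range(tries):
    try:
      return urllib2.urlopen(url, timeout=30).read()
    except (urllib2.URLError, IOError):
      if i == tries - 1:
        raise
      time.sleep(backoff ** i)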
              }
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  print "\nWorking on OMIM..."
  download_OMIM()
  start_time = time.time()
  load_OMIM(args, dba, logger, logfile)
  elapsed = time.time() - start_time
  print "Done with OMIM. Elapsed time: {}".format(slmf.secs2str(elapsed))

  print "\nWorking on GWAS Catalog..."
  start_time = time.time()
  load_GWASCatalog(args, dba, logger, logfile)
  elapsed = time.time() - start_time
  print "Done with GWAS Catalog. Elapsed time: {}".format(slmf.secs2str(elapsed))

  print "\nWorking on IMPC..."
  start_time = time.time()
  load_IMPC(args, dba, logger, logfile)
  elapsed = time.time() - start_time
  print "Done with IMPC. Elapsed time: {}".format(slmf.secs2str(elapsed))

  print "\nWorking on JAX..."
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  # DBAdaptor uses same logger as load()
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Human loaded into target, protein, etc.
  # Datasets and Provenances
  start_time = time.time()
  dataset_id = dba.ins_dataset( {'name': 'UniProt', 'source': 'XML file downloaded from UniProt query reviewed:yes AND organism:"Homo sapiens (Human) [9606]"', 'app': PROGRAM, 'app_version': __version__, 'url': 'https://www.uniprot.org/uniprot'} )
  if not dataset_id:
    print "WARNING: Error inserting dataset. See logfile %s for details." % logfile
    sys.exit(1)
  provs = [ {'dataset_id': dataset_id, 'table_name': 'target', 'column_name': 'ttype'},
            {'dataset_id': dataset_id, 'table_name': 'target', 'column_name': 'name'},
            {'dataset_id': dataset_id, 'table_name': 'protein'},
            {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'UniProt Function'"},
            {'dataset_id': dataset_id, 'table_name': 'goa'},
            {'dataset_id': dataset_id, 'table_name': 'expression', 'where_clause': "etype = 'UniProt Tissue'"},
            {'dataset_id': dataset_id, 'table_name': 'pathway', 'where_clause': "type = 'uniprot'"},
            {'dataset_id': dataset_id, 'table_name': 'disease', 'where_clause': "dtype = 'uniprot'"},
            {'dataset_id': dataset_id, 'table_name': 'feature'},
            {'dataset_id': dataset_id, 'table_name': 'xref', 'where_clause': "dataset_id = %d" % dataset_id},
            {'dataset_id': dataset_id, 'table_name': 'alias', 'where_clause': "dataset_id = %d" % dataset_id} ]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    if not rv:
      print "WARNING: Error inserting provenance. See logfile %s for details." % logfile
      sys.exit(1)

  # UniProt uses Evidence Ontology ECO IDs, not GO evidence codes, so get a
  # mapping of ECO IDs to GO evidence codes
  eco_map = mk_eco_map()

  print "\nParsing file {}".format(UP_HUMAN_FILE)
  root = objectify.parse(UP_HUMAN_FILE).getroot()
  up_ct = len(root.entry)
  print "Loading data for {} UniProt records".format(up_ct)
  logger.info("Loading data for {} UniProt records in file {}".format(up_ct, UP_HUMAN_FILE))
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=up_ct).start()
  ct = 0
  load_ct = 0
  xml_err_ct = 0
  dba_err_ct = 0
  for i in range(len(root.entry)):
    ct += 1
    entry = root.entry[i]
    logger.info("Processing entry {}".format(entry.accession))
    target = entry2target(entry, dataset_id, eco_map)
    if not target:
      xml_err_ct += 1
      logger.error("XML Error for %s" % entry.accession)
      continue
    tid = dba.ins_target(target)
    if not tid:
      dba_err_ct += 1
      continue
    logger.debug("Target insert id: %s" % tid)
    load_ct += 1
    pbar.update(ct)
  pbar.finish()
  elapsed = time.time() - start_time
  print "Processed {} UniProt records. Elapsed time: {}".format(ct, slmf.secs2str(elapsed))
Elapsed time: {}".format(ct, slmf.secs2str(elapsed)) print " Loaded {} targets/proteins".format(load_ct) if xml_err_ct > 0: print "WARNING: {} XML parsing errors occurred. See logfile {} for details.".format(xml_err_ct, logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile) # Mouse and Rat loaded into nhprotein dataset_id = dba.ins_dataset( {'name': 'UniProt Mouse Proteins', 'source': 'XML file downloaded from from UniProt query organism: "Mus musculus (Mouse) [10090]"', 'app': PROGRAM, 'app_version': __version__, 'url': 'https://www.uniprot.org/uniprot'} ) assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile) rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'nhprotein', 'where_clause': "taxid = 10090"}) assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile) # Rat dataset_id = dba.ins_dataset( {'name': 'UniProt Rat Proteins', 'source': 'XML file downloaded from from UniProt query organism: "Rattus norvegicus (Rat) [10116]"', 'app': PROGRAM, 'app_version': __version__, 'url': 'https://www.uniprot.org/uniprot'} ) assert dataset_id, "Error inserting dataset See logfile {} for details.".format(logfile) # Provenance rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'nhprotein', 'where_clause': "taxid = 10116"}) assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile) for ifn in (UP_MOUSE_FILE, UP_RAT_FILE): start_time = time.time() print "\nParsing file {}".format(ifn) root = objectify.parse(ifn).getroot() up_ct = len(root.entry) print "Loading data for {} UniProt records".format(up_ct) logger.info("Loading data for {} UniProt records in file {}".format(up_ct, ifn)) pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()] pbar = ProgressBar(widgets=pbar_widgets, maxval=up_ct).start() ct = 0 nhp_ct = 0 xml_err_ct = 0 dba_err_ct = 0 for i in range(len(root.entry)): ct += 1 entry = root.entry[i] logger.info("Processing entry {}".format(entry.accession)) nhprotein = entry2nhprotein(entry, dataset_id) if not nhprotein: xml_err_ct += 1 logger.error("XML Error for {}".format(entry.accession)) continue nhpid = dba.ins_nhprotein(nhprotein) if not nhpid: dba_err_ct += 1 continue logger.debug("Nhprotein insert id: {}".format(nhpid)) nhp_ct += 1 pbar.update(ct) pbar.finish() elapsed = time.time() - start_time print "Processed {} UniProt records. Elapsed time: {}".format(ct, slmf.secs2str(elapsed)) print " Loaded {} nhproteins".format(nhp_ct) if xml_err_ct > 0: print "WARNING: {} XML parsing errors occurred. See logfile {} for details.".format(xml_err_ct, logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
              }
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  print "\nWorking on JensenLab DISEASES..."
  download_DISEASES(args)
  start_time = time.time()
  load_DISEASES(args, dba, logger, logfile)
  elapsed = time.time() - start_time
  print "Done with DISEASES. Elapsed time: {}".format(slmf.secs2str(elapsed))

  print "\nWorking on DisGeNET..."
  download_DisGeNET(args)
  start_time = time.time()
  load_DisGeNET(args, dba, logger, logfile)
  elapsed = time.time() - start_time
  print "Done with DisGeNET. Elapsed time: {}".format(slmf.secs2str(elapsed))

  # Monarch
  print "\nWorking on Monarch..."
  start_time = time.time()
  load_Monarch(args, dba, logger, logfile)
  elapsed = time.time() - start_time
  print "Done with Monarch. Elapsed time: {}".format(slmf.secs2str(elapsed))
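# Each source above repeats the same download/load/report pattern. A small
# wrapper could factor that out; a sketch (run_phase is a hypothetical
# helper, not part of the actual loader):
def run_phase(name, load_fn, args, dba, logger, logfile, download_fn=None):
  """Run one source's optional download plus load, reporting elapsed time."""
  print "\nWorking on {}...".format(name)
  if download_fn:
    download_fn(args)
  st = time.time()
  load_fn(args, dba, logger, logfile)
  print "Done with {}. Elapsed time: {}".format(name, slmf.secs2str(time.time() - st))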
def load(args, dba, logger):
  assert os.path.exists(SYM2PID_P), "Error: No mapping file {}. Run with -c map first.".format(SYM2PID_P)
  print "\nLoading mapping of TCRD targets to Harmonizome genes from pickle file {}".format(SYM2PID_P)
  sym2pid = pickle.load( open(SYM2PID_P, 'rb') )
  print "  Got {} symbol to protein_id mappings".format(len(sym2pid))

  if os.path.isfile(DATASET_DONE_FILE):
    # If we are restarting, this file has the names of datasets already loaded
    with open(DATASET_DONE_FILE) as f:
      datasets_done = f.read().splitlines()
  else:
    datasets_done = []
  datasets = get_datasets(HARMO_API_BASE_URL)
  print "\nProcessing {} Harmonizome datasets".format(len(datasets))
  ct = 0
  gat_ct = 0
  total_ga_ct = 0
  err_ct = 0
  dba_err_ct = 0
  for dsname in datasets.keys():
    ct += 1
    ds_start_time = time.time()
    if dsname in datasets_done:
      print "  Skipping previously loaded dataset \"{}\"".format(dsname)
      continue
    ds_ga_ct = 0
    ds = get_dataset(HARMO_API_BASE_URL, datasets[dsname])
    if not ds:
      logger.error("Error getting dataset {} ({})".format(dsname, datasets[dsname]))
      err_ct += 1
      continue
    if not args['--quiet']:
      print "  Processing dataset \"{}\" containing {} gene sets".format(dsname, len(ds['geneSets']))
    logger.info("Processing dataset \"{}\" containing {} gene sets".format(dsname, len(ds['geneSets'])))
    rsc = get_resource(HARMO_API_BASE_URL, ds['resource']['href'])
    gat_id = dba.ins_gene_attribute_type( {'name': ds['name'], 'association': ds['association'], 'description': rsc['description'], 'resource_group': ds['datasetGroup'], 'measurement': ds['measurement'], 'attribute_group': ds['attributeGroup'], 'attribute_type': ds['attributeType'], 'pubmed_ids': "|".join([str(pmid) for pmid in rsc['pubMedIds']]), 'url': rsc['url']} )
    if gat_id:
      gat_ct += 1
    else:
      dba_err_ct += 1
    for d in ds['geneSets']:
      name = d['name'].encode('utf-8')
      gs = get_geneset(HARMO_API_BASE_URL, d['href'])
      if not gs:
        logger.error("Error getting gene set {} ({})".format(name, d['href']))
        err_ct += 1
        continue
      if 'associations' not in gs: # used to be 'features'
        logger.error("No associations in gene set {}".format(name))
        err_ct += 1
        continue
      logger.info("  Processing gene set \"{}\" containing {} associations".format(name, len(gs['associations'])))
      ga_ct = 0
      for f in gs['associations']: # used to be 'features'
        sym = f['gene']['symbol']
        if sym not in sym2pid:
          continue # symbol does not map to a TCRD target
        rv = dba.ins_gene_attribute( {'protein_id': sym2pid[sym], 'gat_id': gat_id, 'name': name, 'value': f['thresholdValue']} )
        if not rv:
          dba_err_ct += 1
        else:
          ga_ct += 1
      ds_ga_ct += ga_ct
      time.sleep(1)
    total_ga_ct += ds_ga_ct
    ds_elapsed = time.time() - ds_start_time
    logger.info("  Inserted a total of {} new gene_attribute rows for dataset {}. Elapsed time: {}".format(ds_ga_ct, dsname, slmf.secs2str(ds_elapsed)))
    if err_ct > 0:
      logger.info("  WARNING: Error getting {} gene set(s)".format(err_ct))
    # Save dataset names that are loaded, in case we need to restart
    with open(DATASET_DONE_FILE, "a") as dsdfile:
      dsdfile.write(dsname + '\n')
  print "\nProcessed {} Ma'ayan Lab datasets.".format(ct)
  print "Inserted {} new gene_attribute_type rows".format(gat_ct)
  print "Inserted a total of {} gene_attribute rows".format(total_ga_ct)
  if err_ct > 0:
    print "WARNING: {} errors occurred. See logfile {} for details.".format(err_ct, LOGFILE)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, LOGFILE)
  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'Harmonizome', 'source': "API at %s" % HARMO_API_BASE_URL, 'app': PROGRAM, 'app_version': __version__, 'url': 'http://amp.pharm.mssm.edu/Harmonizome/'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(LOGFILE)
  # Provenance
  provs = [ {'dataset_id': dataset_id, 'table_name': 'gene_attribute'},
            {'dataset_id': dataset_id, 'table_name': 'gene_attribute_type'},
            {'dataset_id': dataset_id, 'table_name': 'hgram_cdf'} ]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(LOGFILE)
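# The get_datasets()/get_dataset()/get_resource()/get_geneset() helpers used
# in load() are not shown. A minimal sketch of get_datasets(), assuming the
# Harmonizome API paginates dataset listings with 'entities' and 'next'
# fields (the '/api/1.0/dataset' path and response layout are assumptions):
import json
import urllib2

def get_datasets(base_url):
  """Return a dict mapping Harmonizome dataset name to its API href."""
  datasets = {}
  href = '/api/1.0/dataset'
  while href:
    rsp = json.loads(urllib2.urlopen(base_url + href).read())
    for e in rsp.get('entities', []):
      datasets[e['name']] = e['href']
    href = rsp.get('next')
  return datasets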