def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'NCBI GI Numbers', 'source': 'UniProt ID Mapping file %s' % (BASE_URL + FILENAME), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.uniprot.org/'} )
  if not dataset_id:
    print "WARNING: Error inserting dataset. See logfile %s for details." % logfile
    sys.exit(1)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'xref', 'where_clause': "dataset_id = %d" % dataset_id})
  if not rv:
    print "WARNING: Error inserting provenance. See logfile %s for details." % logfile
    sys.exit(1)

  start_time = time.time()
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  infile = (DOWNLOAD_DIR + FILENAME).replace('.gz', '')
  line_ct = slmf.wcl(infile)
  # ID Mapping fields
  # 1. UniProtKB-AC
  # 2. UniProtKB-ID
  # 3. GeneID (EntrezGene)
  # 4. RefSeq
  # 5. GI
  # 6. PDB
  # 7. GO
  # 8. UniRef100
  # 9. UniRef90
  # 10. UniRef50
  # 11. UniParc
  # 12. PIR
  # 13. NCBI-taxon
  # 14. MIM
  # 15. UniGene
  # 16. PubMed
  # 17. EMBL
  # 18. EMBL-CDS
  # 19. Ensembl
  # 20. Ensembl_TRS
  # 21. Ensembl_PRO
  # 22. Additional PubMed
  if not args['--quiet']:
    print "\nProcessing {} rows in file {}".format(line_ct, infile)
  with open(infile, 'rU') as tsv:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    tmark = {}
    xref_ct = 0
    skip_ct = 0
    dba_err_ct = 0
    for line in tsv:
      data = line.split('\t')
      ct += 1
      up = data[0]
      if not data[4]: # no GI
        skip_ct += 1
        continue
      targets = dba.find_targets({'uniprot': up})
      if not targets:
        skip_ct += 1
        continue
      target = targets[0]
      tmark[target['id']] = True
      pid = target['components']['protein'][0]['id']
      for gi in data[4].split('; '):
        rv = dba.ins_xref({'protein_id': pid, 'xtype': 'NCBI GI', 'dataset_id': dataset_id, 'value': gi})
        if rv:
          xref_ct += 1
        else:
          dba_err_ct += 1
      pbar.update(ct)
  pbar.finish()
  print "\n{} rows processed".format(ct)
  print "  Inserted {} new GI xref rows for {} targets".format(xref_ct, len(tmark))
  print "  Skipped {} rows with no GI".format(skip_ct)
  if dba_err_ct > 0:
    print "WARNING: {} database errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
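# A minimal sketch (not part of the loader above) of how one row of the UniProt
# ID-mapping file maps onto the fields used by load(): column 1 is the UniProtKB-AC
# passed to find_targets() and column 5 holds zero or more GI numbers separated by
# '; '. The sample values below are hypothetical.
def _demo_parse_idmapping_row():
  sample = "P00000\tEXAMPLE_HUMAN\t1234\tNP_000000.1\t111111; 222222\t\t\t\t\t\t\t\t9606\t\t\t\t\t\t\t\t\t\n"
  data = sample.split('\t')
  up = data[0]                                   # UniProtKB accession
  gis = data[4].split('; ') if data[4] else []   # one xref row is inserted per GI
  print "UniProt %s -> GIs %s" % (up, gis)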
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'AnimalTFDB', 'source': 'http://www.bioguo.org/AnimalTFDB/BrowseAllTF.php?spe=Homo_sapiens', 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.bioguo.org/AnimalTFDB/'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'Is Transcription Factor'"})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  TDLs = {'Tdark': 0, 'Tbio': 0, 'Tchem': 0, 'Tclin': 0}
  line_ct = slmf.wcl(INFILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in input file {}\n".format(line_ct, INFILE)
  with open(INFILE, 'rU') as tsv:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    ti_ct = 0
    notfnd = []
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      sym = row[3]
      targets = dba.find_targets({'sym': sym})
      if not targets:
        gid = row[2]
        targets = dba.find_targets({'geneid': gid})
        if not targets:
          ensg = row[1]
          targets = dba.find_targets_by_xref({'xtype': 'Ensembl', 'value': ensg})
          if not targets:
            notfnd.append(row)
            continue
      t = targets[0]
      TDLs[t['tdl']] += 1
      pid = t['components']['protein'][0]['id']
      rv = dba.ins_tdl_info({'protein_id': pid, 'itype': 'Is Transcription Factor', 'boolean_value': 1})
      if rv:
        ti_ct += 1
      else:
        dba_err_ct += 1
      pbar.update(ct)
  pbar.finish()
  print "\n{} lines processed.".format(ct)
  print "  Inserted {} new Is Transcription Factor tdl_infos".format(ti_ct)
  if notfnd:
    print "No target found for {} rows:".format(len(notfnd))
  if dba_err_ct > 0:
    print "WARNING: %d DB errors occurred. See logfile %s for details." % (dba_err_ct, logfile)
  for tdl in ['Tclin', 'Tchem', 'Tbio', 'Tdark']:
    print "{}: {}".format(tdl, TDLs[tdl])
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'PubTator Text-mining Scores', 'source': 'File %s' % BASE_URL + FILENAME, 'app': PROGRAM, 'app_version': __version__, 'url': 'https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/PubTator/', 'comments': 'PubTator data was subjected to the same counting scheme used to generate JensenLab PubMed Scores.'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  provs = [ {'dataset_id': dataset_id, 'table_name': 'ptscore'},
            {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'PubTator PubMed Score'"} ]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  ptscores = {} # protein.id => sum(all scores)
  pts_ct = 0
  dba_err_ct = 0
  infile = DOWNLOAD_DIR + FILENAME
  line_ct = slmf.wcl(infile)
  if not args['--quiet']:
    print "\nProcessing {} lines in file {}".format(line_ct, infile)
  with open(infile, 'rU') as tsv:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    geneid2pid = {}
    notfnd = set()
    for row in tsvreader:
      # NCBI Gene ID  year  score
      ct += 1
      pbar.update(ct)
      gidstr = row[0].replace(',', ';')
      geneids = gidstr.split(';')
      for geneid in geneids:
        if not geneid or '(tax:' in geneid:
          continue
        if geneid in geneid2pid:
          # we've already found it
          pids = geneid2pid[geneid]
        elif geneid in notfnd:
          # we've already not found it
          continue
        else:
          targets = dba.find_targets({'geneid': geneid})
          if not targets:
            notfnd.add(geneid)
            logger.warn("No target found for {}".format(geneid))
            continue
          pids = []
          for target in targets:
            pids.append(target['components']['protein'][0]['id'])
          geneid2pid[geneid] = pids # save this mapping so we only lookup each target once
        for pid in pids:
          rv = dba.ins_ptscore({'protein_id': pid, 'year': row[1], 'score': row[2]})
          if rv:
            pts_ct += 1
          else:
            dba_err_ct += 1
          if pid in ptscores:
            ptscores[pid] += float(row[2])
          else:
            ptscores[pid] = float(row[2])
  pbar.finish()
  print "{} lines processed.".format(ct)
  print "  Inserted {} new ptscore rows for {} targets.".format(pts_ct, len(ptscores))
  if notfnd:
    print "No target found for {} NCBI Gene IDs. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  print "\nLoading {} PubTator Score tdl_infos".format(len(ptscores))
  ct = 0
  ti_ct = 0
  dba_err_ct = 0
  for pid, score in ptscores.items():
    ct += 1
    rv = dba.ins_tdl_info({'protein_id': pid, 'itype': 'PubTator Score', 'number_value': score})
    if rv:
      ti_ct += 1
    else:
      dba_err_ct += 1
  print "{} processed".format(ct)
  print "Inserted {} new PubTator PubMed Score tdl_info rows".format(ti_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'MLP Assay Info', 'source': 'IDG-KMC generated data by Jeremy Yang at UNM.', 'app': PROGRAM, 'app_version': __version__, 'comments': "This data is generated at UNM from PubChem and EUtils data. It contains details about targets studied in assays that were part of NIH's Molecular Libraries Program."} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': 3, 'table_name': 'mlp_assay_info'})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  if os.path.isfile(T2AID_PICKLE):
    t2aid = pickle.load( open(T2AID_PICKLE, 'rb') )
    act = 0
    for tid in t2aid.keys():
      for aid in t2aid[tid]:
        act += 1
    if not args['--debug']:
      print "\n{} targets have link(s) to {} PubChem MLP assay(s)".format(len(t2aid), act)
  else:
    pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
    line_ct = slmf.wcl(AIDGI_FILE)
    t2aid = {}
    if not args['--quiet']:
      print "\nProcessing {} lines in file {}".format(line_ct, AIDGI_FILE)
    with open(AIDGI_FILE, 'rU') as csvfile:
      pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
      csvreader = csv.reader(csvfile)
      ct = 0
      skip_ct = 0
      fndgi_ct = 0
      fndpl_ct = 0
      notfnd = set()
      assay_ct = 0
      dba_err_ct = 0
      for row in csvreader:
        # aid, tgt_gi, tgt_species, tgt_name
        #print "[DEBUG]", row
        ct += 1
        if row[2] != 'Homo sapiens':
          skip_ct += 1
          continue
        gi = row[1]
        targets = dba.find_targets_by_xref({'xtype': 'NCBI GI', 'value': gi})
        if targets:
          fndgi_ct += 1
        else:
          url = EFETCH_PROTEIN_URL + gi
          r = requests.get(url)
          if r.status_code == 200:
            soup = BeautifulSoup(r.text, "xml")
            grl = soup.find('Gene-ref_locus')
            if grl:
              sym = grl.text
              targets = dba.find_targets({'sym': sym})
              if targets:
                fndpl_ct += 1
          if not targets:
            notfnd.add(gi)
            logger.warn("No target found for GI {}".format(gi))
            continue
        t = targets[0]
        tid = t['id']
        if tid in t2aid:
          t2aid[tid].append(row[0])
          assay_ct += 1
        else:
          t2aid[tid] = [row[0]]
          assay_ct += 1
        pbar.update(ct)
    pbar.finish()
    pickle.dump(t2aid, open(T2AID_PICKLE, "wb"))
    print "\n{} rows processed.".format(ct)
    print "  {} assays linked to {} TCRD targets".format(assay_ct, len(t2aid))
    print "  Skipped {} non-human assay rows".format(skip_ct)
    print "  {} linked by GI; {} linked via EUtils".format(fndgi_ct, fndpl_ct)
    print "  No target found for {} GIs. See logfile {} for details".format(len(notfnd), logfile)

  assay_info = {}
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  line_ct = slmf.wcl(ASSAYS_FILE)
  if not args['--quiet']:
    print "\nProcessing {} rows in file {}".format(line_ct, ASSAYS_FILE)
  start_time = time.time()
  with open(ASSAYS_FILE, 'rU') as csvfile:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    csvreader = csv.reader(csvfile)
    ct = 0
    for row in csvreader:
      # ID,ActivityOutcomeMethod,AssayName,SourceName,ModifyDate,DepositDate,ActiveSidCount,InactiveSidCount,InconclusiveSidCount,TotalSidCount,ActiveCidCount,TotalCidCount,ProteinTargetList
      ct += 1
      aid = row[0]
      assay_info[aid] = row[1:]
      pbar.update(ct)
  pbar.finish()
  elapsed = time.time() - start_time
  print "Got assay info for {} assays.".format(len(assay_info))

  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  tct = len(t2aid.keys())
  if not args['--quiet']:
    print "\nLoading MLP Assay Info for {} targets".format(tct)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  ct = 0
  ti_ct = 0
  mai_ct = 0
  dba_err_ct = 0
  for tid, aids in t2aid.items():
    ct += 1
    for aid in aids:
      ainfo = assay_info[aid]
      rv = dba.ins_mlp_assay_info({'protein_id': tid, 'aid': aid, 'assay_name': ainfo[1], 'method': ainfo[0], 'active_sids': ainfo[5], 'inactive_sids': ainfo[6], 'iconclusive_sids': ainfo[7], 'total_sids': ainfo[8]})
      if rv:
        mai_ct += 1
      else:
        dba_err_ct += 1
    pbar.update(ct)
  pbar.finish()
  print "\n{} targets processed.".format(ct)
  print "  Inserted {} new mlp_assay_info rows".format(mai_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'LocSigDB', 'source': 'File %s from %s' % (FILENAME, BASE_URL), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://genome.unmc.edu/LocSigDB/'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'locsig'})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  fn = DOWNLOAD_DIR + FILENAME
  line_ct = slmf.wcl(fn)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  if not args['--quiet']:
    print "\nProcessing {} lines in input file {}".format(line_ct, fn)
  with open(fn, 'rU') as f:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    up2pid = {}
    notfnd = set()
    ls_ct = 0
    skip_ct = 0
    pmark = set()
    dba_err_ct = 0
    for line in f:
      ct += 1
      data = line.split(',')
      if 'Homo sapiens' not in data[5]:
        skip_ct += 1
        continue
      fnd = False
      for up in data[4].split(';'):
        if up in up2pid:
          # we've already found it
          pid = up2pid[up]
        elif up in notfnd:
          # we've already not found it
          continue
        else:
          targets = dba.find_targets({'uniprot': up})
          if not targets:
            notfnd.add(up)
            continue
          pid = targets[0]['components']['protein'][0]['id']
          up2pid[up] = pid
        rv = dba.ins_locsig({'protein_id': pid, 'location': data[2], 'signal': data[0], 'pmids': data[3]})
        if not rv:
          dba_err_ct += 1
          continue
        ls_ct += 1
        pmark.add(pid)
      pbar.update(ct)
  pbar.finish()
  for up in notfnd:
    logger.warn("No target found for {}".format(up))
  print "{} lines processed.".format(ct)
  print "  Inserted {} new locsig rows for {} proteins".format(ls_ct, len(pmark))
  print "  Skipped {} non-human rows".format(skip_ct)
  if notfnd:
    print "No target found for {} UniProts. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'Reactome Pathways', 'source': 'File %s' % BASE_URL + PATHWAYS_FILE, 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.reactome.org/'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'pathway', 'where_clause': "pwtype = 'Reactome'"})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  infile = (DOWNLOAD_DIR + PATHWAYS_FILE).replace('.zip', '')
  line_ct = slmf.wcl(infile)
  if not args['--quiet']:
    print "\nProcessing {} input lines from Reactome Pathways file {}".format(line_ct, infile)
  with open(infile, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    # Example line:
    # Apoptosis  R-HSA-109581  Reactome Pathway  ACIN1  ADD1  AKT1  AKT2 ...
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    sym2pids = defaultdict(list)
    pmark = set()
    notfnd = set()
    pw_ct = 0
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      pwname = row[0]
      pwid = row[1]
      url = 'http://www.reactome.org/content/detail/' + pwid
      syms = row[3:]
      for sym in syms:
        if sym in sym2pids:
          pids = sym2pids[sym]
        elif sym in notfnd:
          continue
        else:
          targets = dba.find_targets({'sym': sym})
          if not targets:
            notfnd.add(sym)
            continue
          pids = []
          for t in targets:
            pids.append(t['components']['protein'][0]['id'])
          sym2pids[sym] = pids # save this mapping so we only lookup each target once
        for pid in pids:
          rv = dba.ins_pathway({'protein_id': pid, 'pwtype': 'Reactome', 'name': pwname, 'id_in_source': pwid, 'url': url})
          if rv:
            pw_ct += 1
            pmark.add(pid)
          else:
            dba_err_ct += 1
      pbar.update(ct)
  pbar.finish()
  for sym in notfnd:
    logger.warn("No target found for {}".format(sym))
  print "Processed {} Reactome Pathways.".format(ct)
  print "  Inserted {} pathway rows for {} proteins.".format(pw_ct, len(pmark))
  if notfnd:
    print "  No target found for {} symbols. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
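# A minimal sketch (standalone, truncated sample row) of how one line of the Reactome
# gmt file is unpacked above: column 1 is the pathway name, column 2 the stable ID used
# to build the content/detail URL, column 3 is the literal 'Reactome Pathway', and the
# remaining columns are gene symbols.
def _demo_parse_reactome_row(row):
  pwname = row[0]
  pwid = row[1]
  url = 'http://www.reactome.org/content/detail/' + pwid
  syms = row[3:]
  return pwname, pwid, url, syms

# e.g. _demo_parse_reactome_row(['Apoptosis', 'R-HSA-109581', 'Reactome Pathway', 'ACIN1', 'ADD1'])
# -> ('Apoptosis', 'R-HSA-109581', 'http://www.reactome.org/content/detail/R-HSA-109581', ['ACIN1', 'ADD1'])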
def main(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  if not args['--quiet']:
    print "\nLoading project info from pickle file {}".format(PROJECTS_P)
  projects = pickle.load( open(PROJECTS_P, 'rb') )

  if not args['--quiet']:
    print "\nCreating Tagger..."
  tgr = Tagger()
  tgr.load_names(ENTITIES_FILE, NAMES_FILE)
  tgr.load_global(GLOBAL_FILE)

  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  for year in [str(yr) for yr in range(2000, 2018)]: # 2000-2017
    pct = len(projects[year])
    print "\nTagging {} projects from {}".format(pct, year)
    logger.info("Tagging {} projects from {}".format(pct, year))
    pbar = ProgressBar(widgets=pbar_widgets, maxval=pct).start()
    start_time = time.time()
    ct = 0
    ttag_ct = 0
    abstag_ct = 0
    skip_ct = 0
    ttagsnotfnd = set()
    ttag2targetid = {}
    appid2targets = defaultdict(set)
    target2appids = defaultdict(set)
    for appid in projects[year].keys():
      ct += 1
      logger.debug("  Processing appid {}".format(appid))
      ginfo = projects[year][appid]
      # if there's no $$, we're not interested
      if ginfo['TOTAL_COST']:
        gcost = int(ginfo['TOTAL_COST'])
      elif ginfo['TOTAL_COST_SUB_PROJECT']:
        gcost = int(ginfo['TOTAL_COST_SUB_PROJECT'])
      else:
        continue
      # also, if there's less than $10k we're not interested
      if gcost < 10000:
        skip_ct += 1
        continue
      #
      # tag titles
      #
      matches = tgr.get_matches(projects[year][appid]['PROJECT_TITLE'], appid, [9606])
      if matches:
        ttag_ct += 1
        # the same tag can match multiple times, so get a set of ENSPs
        ensps = set()
        for m in matches:
          ensps.add(m[2][0][1])
        ensps = list(ensps)
        for ensp in ensps:
          if ensp in ttag2targetid:
            tid = ttag2targetid[ensp]
          elif ensp in ttagsnotfnd:
            continue
          else:
            targets = dba.find_targets({'stringid': ensp}, idg=False)
            if not targets:
              targets = dba.find_targets_by_xref({'xtype': 'Ensembl', 'value': ensp}, idg=False)
            if not targets:
              ttagsnotfnd.add(ensp)
              continue
            tid = targets[0]['id']
            ttag2targetid[ensp] = tid # save this so we don't look up the targets again
          appid2targets[appid].add(tid)
          target2appids[tid].add(appid)
      #
      # tag abstracts
      #
      if 'ABSTRACT' in projects[year][appid]:
        matches = tgr.get_matches(projects[year][appid]['ABSTRACT'], appid, [9606])
        if matches:
          abstag_ct += 1
          # the same tag can match multiple times, so get a set of ENSPs
          ensps = set()
          for m in matches:
            ensps.add(m[2][0][1])
          ensps = list(ensps)
          for ensp in ensps:
            if ensp in ttag2targetid:
              tid = ttag2targetid[ensp]
            elif ensp in ttagsnotfnd:
              continue
            else:
              targets = dba.find_targets({'stringid': ensp}, idg=False)
              if not targets:
                targets = dba.find_targets_by_xref({'xtype': 'Ensembl', 'value': ensp}, idg=False)
              if not targets:
                ttagsnotfnd.add(ensp)
                continue
              tid = targets[0]['id']
              ttag2targetid[ensp] = tid # save this so we don't look up the targets again
            appid2targets[appid].add(tid)
            target2appids[tid].add(appid)
      pbar.update(ct)
    pbar.finish()

    del_ct = 0
    for appid, tidset in appid2targets.items():
      if len(tidset) > 10:
        del_ct += 1
        del appid2targets[appid]

    logger.info("{} projects processed.".format(ct))
    logger.info("  Removed {} projects with > 10 targets".format(del_ct))
    logger.info("  Skipped {} projects with funds less than $10k".format(skip_ct))
    logger.info("  {} titles have tagging result(s)".format(ttag_ct))
    logger.info("  {} abstracts have tagging result(s)".format(abstag_ct))
    logger.info("{} total tags map to {}/{} distinct targets".format(len(ttag2targetid.keys()), len(set(ttag2targetid.values())), len(target2appids.keys())))
    logger.info("{} project applications map to target(s)".format(len(appid2targets.keys())))
    if ttagsnotfnd:
      logger.info("  No target found for {} tags".format(len(ttagsnotfnd)))
    pfile = "%s/AppID2Targets%s.p" % (TAGGING_RESULTS_DIR, year)
    pickle.dump(appid2targets, open(pfile, 'wb'))
    logger.info("Tagging results saved to pickle {} for {}".format(pfile, year))
    pfile = "%s/Target2AppIDs%s.p" % (TAGGING_RESULTS_DIR, year)
    pickle.dump(target2appids, open(pfile, 'wb'))
    logger.info("Tagging results saved to pickle {} for {}".format(pfile, year))
    print "{} projects processed. See logfile {} for details.".format(ct, LOGFILE)
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'WikiPathways', 'source': 'File %s' % BASE_URL + PATHWAYS_FILE, 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.wikipathways.org/'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'pathway', 'where_clause': "pwtype = 'WikiPathways'"})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  fn = DOWNLOAD_DIR + PATHWAYS_FILE
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print "\nProcessing {} input lines from WikiPathways file {}".format(line_ct, fn)
  with open(fn, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    # Example line:
    # Apoptosis Modulation and Signaling%WikiPathways_20160516%WP1772%Homo sapiens  http://www.wikipathways.org/instance/WP1772_r85184  843  3725  842 ...
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    gid2pids = defaultdict(list)
    pmark = set()
    notfnd = set()
    pw_ct = 0
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      name = row[0].split('%')[0]
      wpid = row[1].split('/')[-1]
      geneids = row[2:]
      for gid in geneids:
        if gid in gid2pids:
          pids = gid2pids[gid]
        elif gid in notfnd:
          continue
        else:
          targets = dba.find_targets({'geneid': gid})
          if not targets:
            notfnd.add(gid)
            continue
          pids = []
          for t in targets:
            pids.append(t['components']['protein'][0]['id'])
          gid2pids[gid] = pids # save this mapping so we only lookup each target once
        for pid in pids:
          rv = dba.ins_pathway({'protein_id': pid, 'pwtype': 'WikiPathways', 'name': name, 'id_in_source': wpid, 'url': row[1]})
          if rv:
            pw_ct += 1
            pmark.add(pid)
          else:
            dba_err_ct += 1
      pbar.update(ct)
  pbar.finish()
  for gid in notfnd:
    logger.warn("No target found for {}".format(gid))
  print "Processed {} WikiPathways.".format(ct)
  print "  Inserted {} pathway rows for {} proteins.".format(pw_ct, len(pmark))
  if notfnd:
    print "  No target found for {} Gene IDs. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
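# A minimal sketch (standalone, values taken from the example line above) of how one
# line of the WikiPathways GMT file is taken apart in the loader: the pathway name is
# the text before the first '%' in column 1, the WPID is the last path segment of the
# URL in column 2, and the remaining columns are NCBI Gene IDs.
def _demo_parse_wikipathways_row(row):
  name = row[0].split('%')[0]
  wpid = row[1].split('/')[-1]
  geneids = row[2:]
  return name, wpid, geneids

# e.g. _demo_parse_wikipathways_row(['Apoptosis Modulation and Signaling%WikiPathways_20160516%WP1772%Homo sapiens',
#                                    'http://www.wikipathways.org/instance/WP1772_r85184', '843', '3725'])
# -> ('Apoptosis Modulation and Signaling', 'WP1772_r85184', ['843', '3725'])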
def load():
  args = docopt(__doc__, version=__version__)
  debug = int(args['--debug'])
  if debug:
    print "\n[*DEBUG*] ARGS:\n%s\n" % repr(args)
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = "%s.log" % PROGRAM
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not debug:
    logger.propagate = False # turns off console logging when debug is 0
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database %s (schema ver %s; data ver %s)", args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  if not args['--quiet']:
    print "Connected to TCRD database %s (schema ver %s; data ver %s)" % (args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'TechDev Worklist Info', 'source': 'Files from TechDev Groups', 'app': PROGRAM, 'app_version': __version__, 'comments': 'Loading app uses data from spreadsheets submitted by the TechDev groups listing targets being investigated.'} )
  if not dataset_id:
    print "WARNING: Error inserting dataset. See logfile %s for details." % logfile
    sys.exit(1)
  # Provenance
  provs = [ {'dataset_id': dataset_id, 'table_name': 'techdev_contact', 'comment': ""},
            {'dataset_id': dataset_id, 'table_name': 'techdev_info', 'comment': ""} ]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    if not rv:
      print "WARNING: Error inserting provenance. See logfile %s for details." % logfile
      sys.exit(1)

  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  for tdid, filename in INPUTFILES.items():
    line_ct = wcl(filename)
    if not args['--quiet']:
      print '\nProcessing %d lines from input file: %s' % (line_ct, filename)
    with open(filename, 'rU') as csvfile:
      csvreader = csv.reader(csvfile)
      pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
      ct = 0
      contact = {}
      skip_ct = 0
      err_ct = 0
      info_ct = 0
      notfnd = []
      dba_err_ct = 0
      for row in csvreader:
        ct += 1
        if row[0] == 'TechDev ID:':
          techdev_id = int(row[1])
          contact['id'] = techdev_id
          continue
        if row[0] == 'Grant Number:':
          contact['grant_number'] = row[1]
          continue
        if row[0] == 'Submitter name:':
          contact['contact_name'] = row[1]
          continue
        if row[0] == 'Contact email:':
          contact['contact_email'] = row[1]
          continue
        if row[0] == 'Submission date:':
          contact['date'] = row[1]
          continue
        if row[0] == 'tcrd_target_id':
          contact['pi'] = PIS[techdev_id]
          contact_id = dba.ins_techdev_contact(contact)
          if not contact_id:
            logger.error("DBA error inserting techdev_contact.")
            print "Exiting due to DBA error inserting techdev_contact. See logfile %s for details." % logfile
            break
          continue
        if not row[6]:
          skip_ct += 1
          continue
        sym = row[1]
        targets = dba.find_targets({'sym': sym})
        if not targets:
          notfnd.append(sym)
          continue
        t = targets[0]
        pid = t['components']['protein'][0]['id']
        init = {'contact_id': contact_id, 'protein_id': pid}
        if not row[7]:
          err_ct += 1
          continue
        init['comment'] = row[7]
        if row[8]:
          init['publication_pcmid'] = row[8]
        if row[9]:
          init['publication_pmid'] = row[9]
        if row[11]:
          init['resource_url'] = row[11]
        if row[10]:
          init['data_url'] = row[10]
        rv = dba.ins_techdev_info(init)
        if rv:
          info_ct += 1
        else:
          dba_err_ct += 1
        pbar.update(ct)
    pbar.finish()
    if not args['--quiet']:
      print "%d lines processed." % ct
      print "  Skipped %d lines not under investigation" % skip_ct
      if err_ct > 0:
        print "  WARNING: %d lines did not have a comment!" % err_ct
      if notfnd:
        print "  WARNING: %d symbols did not find a target!" % len(notfnd)
        for sym in notfnd:
          print "    %s" % sym
      print "  Inserted 1 new techdev_contact row"
      print "  Inserted %d new techdev_info rows" % info_ct
    if dba_err_ct > 0:
      print "WARNING: %d DB errors occurred. See logfile %s for details." % (dba_err_ct, logfile)
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'Pathway Commons', 'source': 'File %s' % BASE_URL + PATHWAYS_FILE, 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.pathwaycommons.org/'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'pathway', 'where_clause': "pwtype LIKE 'PathwayCommons%'"})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  infile = (DOWNLOAD_DIR + PATHWAYS_FILE).replace('.gz', '')
  line_ct = slmf.wcl(infile)
  if not args['--quiet']:
    print "\nProcessing {} records from PathwayCommons file {}".format(line_ct, infile)
  with open(infile, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    # Example line:
    # http://identifiers.org/kegg.pathway/hsa00010  name: Glycolysis / Gluconeogenesis; datasource: kegg; organism: 9606; idtype: uniprot  A8K7J7 B4DDQ8 B4DNK4 E9PCR7 P04406 P06744 P07205 P07738 P09467 P09622 P09972 P10515 P11177 P14550 P30838 P35557 P51648 P60174 Q01813 Q16822 Q53Y25 Q6FHV6 Q6IRT1 Q6ZMR3 Q8IUN7 Q96C23 Q9BRR6 Q9NQR9 Q9NR19
    # However, note that the Pathway Commons URLs in the file give 404s.
    # E.g. the URL from this line:
    # http://pathwaycommons.org/pc2/Pathway_0136871cbdf9a3ecc09529f1878171df  name: VEGFR1 specific signals; datasource: pid; organism: 9606; idtype: uniprot  O14786 O15530 O60462 P05771 P07900 P15692 P16333 P17252 P17612 P17948 P19174 P20936 P22681 P27361 P27986 P28482 P29474 P31749 P42336 P49763 P49765 P62158 P98077 Q03135 Q06124 Q16665 Q9Y5K6
    # needs to be converted to:
    # http://apps.pathwaycommons.org/pathways?uri=http%3A%2F%2Fpathwaycommons.org%2Fpc2%2FPathway_0136871cbdf9a3ecc09529f1878171df
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    skip_ct = 0
    up2pid = {}
    pmark = set()
    notfnd = set()
    pw_ct = 0
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      src = re.search(r'datasource: (\w+)', row[1]).groups()[0]
      if src in ['kegg', 'wikipathways', 'reactome']:
        skip_ct += 1
        continue
      pwtype = 'PathwayCommons: ' + src
      name = re.search(r'name: (.+?);', row[1]).groups()[0]
      url = PCAPP_BASE_URL + urllib.quote(row[0], safe='')
      ups = row[2:]
      for up in ups:
        if up in up2pid:
          pid = up2pid[up]
        elif up in notfnd:
          continue
        else:
          targets = dba.find_targets({'uniprot': up})
          if not targets:
            notfnd.add(up)
            continue
          t = targets[0]
          pid = t['components']['protein'][0]['id']
          up2pid[up] = pid
        rv = dba.ins_pathway({'protein_id': pid, 'pwtype': pwtype, 'name': name, 'url': url})
        if rv:
          pw_ct += 1
          pmark.add(pid)
        else:
          dba_err_ct += 1
      pbar.update(ct)
  pbar.finish()
  for up in notfnd:
    logger.warn("No target found for {}".format(up))
  print "Processed {} Pathway Commons records.".format(ct)
  print "  Inserted {} new pathway rows for {} proteins.".format(pw_ct, len(pmark))
  print "  Skipped {} records from 'kegg', 'wikipathways', 'reactome'".format(skip_ct)
  if notfnd:
    print "  No target found for {} UniProt accessions. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
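# A minimal sketch of the URL rewrite described in the comment above: the identifier
# URI from column 1 is percent-encoded and appended to the apps.pathwaycommons.org
# pathway viewer URL. PCAPP_BASE_URL is assumed here to be
# 'http://apps.pathwaycommons.org/pathways?uri=' to match that comment.
import urllib

def _demo_pc_url(uri, pcapp_base_url='http://apps.pathwaycommons.org/pathways?uri='):
  return pcapp_base_url + urllib.quote(uri, safe='')

# e.g. _demo_pc_url('http://pathwaycommons.org/pc2/Pathway_0136871cbdf9a3ecc09529f1878171df')
# -> 'http://apps.pathwaycommons.org/pathways?uri=http%3A%2F%2Fpathwaycommons.org%2Fpc2%2FPathway_0136871cbdf9a3ecc09529f1878171df'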
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  # DBAdaptor uses same logger as main()
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'Drugable Epigenome Domains', 'source': 'Files from http://www.nature.com/nrd/journal/v11/n5/suppinfo/nrd3674.html', 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.nature.com/nrd/journal/v11/n5/suppinfo/nrd3674.html'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'Drugable Epigenome Class'"})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  total_ti_ct = 0
  notfnd = set()
  for k, d in FILE_LIST.items():
    if not args['--quiet']:
      print "\nProcessing Epigenetic {}s".format(k)
    for dom, f in d.items():
      f = INPUT_DIR + f
      line_ct = slmf.wcl(f)
      if not args['--quiet']:
        print 'Processing {} lines from {} input file {}'.format(line_ct, dom, f)
      with open(f, 'rU') as csvfile:
        csvreader = csv.reader(csvfile)
        header = csvreader.next() # skip header line
        ct = 0
        not_fnd_ct = 0
        tct = 0
        ti_ct = 0
        dba_err_ct = 0
        for row in csvreader:
          ct += 1
          targets = dba.find_targets({'sym': row[0]})
          if not targets:
            targets = dba.find_targets({'geneid': row[3]})
            if not targets:
              targets = dba.find_targets({'uniprot': row[2]})
              if not targets:
                nfk = "%s|%s|%s" % (row[0], row[3], row[2]) # distinct name so the epigenome class k is not clobbered
                notfnd.add(nfk)
                not_fnd_ct += 1
                logger.warn("No target found for: {}".format(nfk))
                continue
          tct += 1
          t = targets[0]
          p = t['components']['protein'][0]
          if len(row) == 5:
            val = "Epigenetic %s - %s" % (k, dom)
          else:
            val = "Epigenetic %s - %s %s: %s" % (k, dom, row[4], row[5])
          rv = dba.ins_tdl_info({'protein_id': p['id'], 'itype': 'Drugable Epigenome Class', 'string_value': val})
          if not rv:
            dba_err_ct += 1
            continue
          ti_ct += 1
      if not args['--quiet']:
        print "  {} lines processed. Found {}, skipped {}".format(ct, tct, not_fnd_ct)
        print "  Inserted {} new tdl_info rows".format(ti_ct)
      if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
      total_ti_ct += ti_ct
  if not args['--quiet']:
    print "\nInserted a total of {} new Drugable Epigenome Class tdl_infos".format(total_ti_ct)
    if len(notfnd) > 0:
      print "  No target found for {} sym/geneid/uniprots. See logfile {} for details.".format(len(notfnd), logfile)