def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'NCBI GI Numbers', 'source': 'UniProt ID Mapping file %s' % (BASE_URL + FILENAME), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.uniprot.org/'} )
  if not dataset_id:
    print "WARNING: Error inserting dataset. See logfile %s for details." % logfile
    sys.exit(1)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'xref', 'where_clause': "dataset_id = %d" % dataset_id})
  if not rv:
    print "WARNING: Error inserting provenance. See logfile %s for details." % logfile
    sys.exit(1)
  start_time = time.time()
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  infile = (DOWNLOAD_DIR + FILENAME).replace('.gz', '')
  line_ct = slmf.wcl(infile)
  # ID Mapping fields
  # 1. UniProtKB-AC
  # 2. UniProtKB-ID
  # 3. GeneID (EntrezGene)
  # 4. RefSeq
  # 5. GI
  # 6. PDB
  # 7. GO
  # 8. UniRef100
  # 9. UniRef90
  # 10. UniRef50
  # 11. UniParc
  # 12. PIR
  # 13. NCBI-taxon
  # 14. MIM
  # 15. UniGene
  # 16. PubMed
  # 17. EMBL
  # 18. EMBL-CDS
  # 19. Ensembl
  # 20. Ensembl_TRS
  # 21. Ensembl_PRO
  # 22. Additional PubMed
  if not args['--quiet']:
    print "\nProcessing {} rows in file {}".format(line_ct, infile)
  with open(infile, 'rU') as tsv:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    tmark = {}
    xref_ct = 0
    skip_ct = 0
    dba_err_ct = 0
    for line in tsv:
      data = line.split('\t')
      ct += 1
      up = data[0]
      if not data[4]: # no GI
        skip_ct += 1
        continue
      targets = dba.find_targets({'uniprot': up})
      if not targets:
        skip_ct += 1
        continue
      target = targets[0]
      tmark[target['id']] = True
      pid = target['components']['protein'][0]['id']
      for gi in data[4].split('; '):
        rv = dba.ins_xref({'protein_id': pid, 'xtype': 'NCBI GI', 'dataset_id': dataset_id, 'value': gi})
        if rv:
          xref_ct += 1
        else:
          dba_err_ct += 1
      pbar.update(ct)
  pbar.finish()
  print "\n{} rows processed".format(ct)
  print " Inserted {} new GI xref rows for {} targets".format(xref_ct, len(tmark))
  print " Skipped {} rows with no GI".format(skip_ct)
  if dba_err_ct > 0:
    print "WARNING: {} database errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging when debug is 0
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'Reactome Protein-Protein Interactions', 'source': "File %s" % (BASE_URL + FILENAME), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.reactome.org/'} )
  if not dataset_id:
    print "WARNING: Error inserting dataset. See logfile %s for details." % logfile
    sys.exit(1)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'ppi', 'where_clause': "ppitype = 'Reactome'"})
  if not rv:
    print "WARNING: Error inserting provenance. See logfile %s for details." % logfile
    sys.exit(1)
  infile = DOWNLOAD_DIR + FILENAME
  line_ct = slmf.wcl(infile)
  if not args['--quiet']:
    print "\nProcessing {} lines from Reactome PPI file {}".format(line_ct, infile)
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  with open(infile, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct = 1
    skip_ct = 0
    same12_ct = 0
    dup_ct = 0
    ppis = {}
    ppi_ct = 0
    up2pid = {}
    notfnd = set()
    dba_err_ct = 0
    for row in tsvreader:
      # 0: Interactor 1 uniprot id
      # 1: Interactor 1 Ensembl gene id
      # 2: Interactor 1 Entrez Gene id
      # 3: Interactor 2 uniprot id
      # 4: Interactor 2 Ensembl gene id
      # 5: Interactor 2 Entrez Gene id
      # 6: Interaction type
      # 7: Interaction context Pubmed references
      ct += 1
      pbar.update(ct)
      if not row[0].startswith('uniprotkb:'):
        continue
      if not row[3].startswith('uniprotkb:'):
        continue
      up1 = row[0].replace('uniprotkb:', '')
      up2 = row[3].replace('uniprotkb:', '')
      if not up1 or not up2:
        skip_ct += 1
        continue
      # protein1
      if up1 in up2pid:
        pid1 = up2pid[up1]
      elif up1 in notfnd:
        continue
      else:
        t1 = find_target(dba, up1)
        if not t1:
          notfnd.add(up1)
          continue
        pid1 = t1['components']['protein'][0]['id']
        up2pid[up1] = pid1
      # protein2
      if up2 in up2pid:
        pid2 = up2pid[up2]
      elif up2 in notfnd:
        continue
      else:
        t2 = find_target(dba, up2)
        if not t2:
          notfnd.add(up2)
          continue
        pid2 = t2['components']['protein'][0]['id']
        up2pid[up2] = pid2
      int_type = row[6]
      ppik = up1 + "|" + up2 + "|" + int_type # dedup key includes the interaction type value
      if ppik in ppis:
        dup_ct += 1
        continue
      if pid1 == pid2:
        same12_ct += 1
        continue
      # Insert PPI
      rv = dba.ins_ppi( {'ppitype': 'Reactome', 'interaction_type': int_type, 'protein1_id': pid1, 'protein1_str': up1, 'protein2_id': pid2, 'protein2_str': up2} )
      if rv:
        ppi_ct += 1
        ppis[ppik] = True
      else:
        dba_err_ct += 1
  pbar.finish()
  for up in notfnd:
    logger.warn("No target found for: {}".format(up))
  print "{} Reactome PPI rows processed.".format(ct)
  print " Inserted {} ({}) new ppi rows".format(ppi_ct, len(ppis))
  if skip_ct:
    print " Skipped {} rows without two UniProt interactors".format(skip_ct)
  if dup_ct:
    print " Skipped {} duplicate PPIs".format(dup_ct)
  if same12_ct:
    print " Skipped {} PPIs involving the same protein".format(same12_ct)
  if notfnd:
    print " No target found for {} UniProt accessions. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'MLP Assay Info', 'source': 'IDG-KMC generated data by Jeremy Yang at UNM.', 'app': PROGRAM, 'app_version': __version__, 'comments': "This data is generated at UNM from PubChem and EUtils data. It contains details about targets studied in assays that were part of NIH's Molecular Libraries Program."} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'mlp_assay_info'})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)
  if os.path.isfile(T2AID_PICKLE):
    t2aid = pickle.load( open(T2AID_PICKLE, 'rb') )
    act = 0
    for tid in t2aid.keys():
      for aid in t2aid[tid]:
        act += 1
    if not args['--debug']:
      print "\n{} targets have link(s) to {} PubChem MLP assay(s)".format(len(t2aid), act)
  else:
    pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
    line_ct = slmf.wcl(AIDGI_FILE)
    t2aid = {}
    if not args['--quiet']:
      print "\nProcessing {} lines in file {}".format(line_ct, AIDGI_FILE)
    with open(AIDGI_FILE, 'rU') as csvfile:
      pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
      csvreader = csv.reader(csvfile)
      ct = 0
      skip_ct = 0
      fndgi_ct = 0
      fndpl_ct = 0
      notfnd = set()
      assay_ct = 0
      dba_err_ct = 0
      for row in csvreader:
        # aid, tgt_gi, tgt_species, tgt_name
        #print "[DEBUG]", row
        ct += 1
        if row[2] != 'Homo sapiens':
          skip_ct += 1
          continue
        gi = row[1]
        targets = dba.find_targets_by_xref({'xtype': 'NCBI GI', 'value': gi})
        if targets:
          fndgi_ct += 1
        else:
          url = EFETCH_PROTEIN_URL + gi
          r = requests.get(url)
          if r.status_code == 200:
            soup = BeautifulSoup(r.text, "xml")
            grl = soup.find('Gene-ref_locus')
            if grl:
              sym = grl.text
              targets = dba.find_targets({'sym': sym})
          if targets:
            fndpl_ct += 1
          else:
            notfnd.add(gi) # notfnd is a set, so use add()
            logger.warn("No target found for GI {}".format(gi))
            continue
        t = targets[0]
        tid = t['id']
        if tid in t2aid:
          t2aid[tid].append(row[0])
          assay_ct += 1
        else:
          t2aid[tid] = [row[0]]
          assay_ct += 1
        pbar.update(ct)
    pbar.finish()
    pickle.dump(t2aid, open(T2AID_PICKLE, "wb"))
    print "\n{} rows processed.".format(ct)
    print " {} assays linked to {} TCRD targets".format(assay_ct, len(t2aid))
    print " Skipped {} non-human assay rows".format(skip_ct)
    print " {} linked by GI; {} linked via EUtils".format(fndgi_ct, fndpl_ct)
    print " No target found for {} GIs. See logfile {} for details".format(len(notfnd), logfile)
  assay_info = {}
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  line_ct = slmf.wcl(ASSAYS_FILE)
  if not args['--quiet']:
    print "\nProcessing {} rows in file {}".format(line_ct, ASSAYS_FILE)
  start_time = time.time()
  with open(ASSAYS_FILE, 'rU') as csvfile:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    csvreader = csv.reader(csvfile)
    ct = 0
    for row in csvreader:
      # ID,ActivityOutcomeMethod,AssayName,SourceName,ModifyDate,DepositDate,ActiveSidCount,InactiveSidCount,InconclusiveSidCount,TotalSidCount,ActiveCidCount,TotalCidCount,ProteinTargetList
      ct += 1
      aid = row[0]
      assay_info[aid] = row[1:]
      pbar.update(ct)
  pbar.finish()
  elapsed = time.time() - start_time
  print "Got assay info for {} assays.".format(len(assay_info))
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  tct = len(t2aid.keys())
  if not args['--quiet']:
    print "\nLoading MLP Assay Info for {} targets".format(tct)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  ct = 0
  ti_ct = 0
  mai_ct = 0
  dba_err_ct = 0
  for tid, aids in t2aid.items():
    ct += 1
    for aid in aids:
      ainfo = assay_info[aid]
      rv = dba.ins_mlp_assay_info({'protein_id': tid, 'aid': aid, 'assay_name': ainfo[1], 'method': ainfo[0], 'active_sids': ainfo[5], 'inactive_sids': ainfo[6], 'iconclusive_sids': ainfo[7], 'total_sids': ainfo[8]})
      if rv:
        mai_ct += 1
      else:
        dba_err_ct += 1
    pbar.update(ct)
  pbar.finish()
  print "\n{} targets processed.".format(ct)
  print " Inserted {} new mlp_assay_info rows".format(mai_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'Reactome Pathways', 'source': 'File %s' % (BASE_URL + PATHWAYS_FILE), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.reactome.org/'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'pathway', 'where_clause': "pwtype = 'Reactome'"})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  infile = (DOWNLOAD_DIR + PATHWAYS_FILE).replace('.zip', '')
  line_ct = slmf.wcl(infile)
  if not args['--quiet']:
    print "\nProcessing {} input lines from Reactome Pathways file {}".format(line_ct, infile)
  with open(infile, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    # Example line:
    # Apoptosis  R-HSA-109581  Reactome Pathway  ACIN1  ADD1  AKT1  AKT2 ...
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    sym2pids = defaultdict(list)
    pmark = set()
    notfnd = set()
    pw_ct = 0
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      pwname = row[0]
      pwid = row[1]
      url = 'http://www.reactome.org/content/detail/' + pwid
      syms = row[3:]
      for sym in syms:
        if sym in sym2pids:
          pids = sym2pids[sym]
        elif sym in notfnd:
          continue
        else:
          targets = dba.find_targets({'sym': sym})
          if not targets:
            notfnd.add(sym)
            continue
          pids = []
          for t in targets:
            pids.append(t['components']['protein'][0]['id'])
          sym2pids[sym] = pids # save this mapping so we only lookup each target once
        for pid in pids:
          rv = dba.ins_pathway({'protein_id': pid, 'pwtype': 'Reactome', 'name': pwname, 'id_in_source': pwid, 'url': url})
          if rv:
            pw_ct += 1
            pmark.add(pid)
          else:
            dba_err_ct += 1
      pbar.update(ct)
  pbar.finish()
  for sym in notfnd:
    logger.warn("No target found for {}".format(sym))
  print "Processed {} Reactome Pathways.".format(ct)
  print " Inserted {} pathway rows for {} proteins.".format(pw_ct, len(pmark))
  if notfnd:
    print " No target found for {} symbols. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'PubTator Text-mining Scores', 'source': 'File %s' % (BASE_URL + FILENAME), 'app': PROGRAM, 'app_version': __version__, 'url': 'https://www.ncbi.nlm.nih.gov/CBBresearch/Lu/Demo/PubTator/', 'comments': 'PubTator data was subjected to the same counting scheme used to generate JensenLab PubMed Scores.'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  provs = [ {'dataset_id': dataset_id, 'table_name': 'ptscore'},
            {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'PubTator PubMed Score'"} ]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  ptscores = {} # protein.id => sum(all scores)
  pts_ct = 0
  dba_err_ct = 0
  infile = DOWNLOAD_DIR + FILENAME
  line_ct = slmf.wcl(infile)
  if not args['--quiet']:
    print "\nProcessing {} lines in file {}".format(line_ct, infile)
  with open(infile, 'rU') as tsv:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    geneid2pid = {}
    notfnd = set()
    for row in tsvreader:
      # NCBI Gene ID  year  score
      ct += 1
      pbar.update(ct)
      gidstr = row[0].replace(',', ';')
      geneids = gidstr.split(';')
      for geneid in geneids:
        if not geneid or '(tax:' in geneid:
          continue
        if geneid in geneid2pid:
          # we've already found it
          pids = geneid2pid[geneid]
        elif geneid in notfnd:
          # we've already not found it
          continue
        else:
          targets = dba.find_targets({'geneid': geneid})
          if not targets:
            notfnd.add(geneid)
            logger.warn("No target found for {}".format(geneid))
            continue
          pids = []
          for target in targets:
            pids.append(target['components']['protein'][0]['id'])
          geneid2pid[geneid] = pids # save this mapping so we only lookup each target once
        for pid in pids:
          rv = dba.ins_ptscore({'protein_id': pid, 'year': row[1], 'score': row[2]})
          if rv:
            pts_ct += 1
          else:
            dba_err_ct += 1
          if pid in ptscores:
            ptscores[pid] += float(row[2])
          else:
            ptscores[pid] = float(row[2])
  pbar.finish()
  print "{} lines processed.".format(ct)
  print " Inserted {} new ptscore rows for {} targets.".format(pts_ct, len(ptscores))
  if notfnd:
    print "No target found for {} NCBI Gene IDs. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
  print "\nLoading {} PubTator Score tdl_infos".format(len(ptscores))
  ct = 0
  ti_ct = 0
  dba_err_ct = 0
  for pid, score in ptscores.items():
    ct += 1
    rv = dba.ins_tdl_info({'protein_id': pid, 'itype': 'PubTator Score', 'number_value': score})
    if rv:
      ti_ct += 1
    else:
      dba_err_ct += 1
  print "{} processed".format(ct)
  print "Inserted {} new PubTator PubMed Score tdl_info rows".format(ti_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'Pathway Commons', 'source': 'File %s' % (BASE_URL + PATHWAYS_FILE), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.pathwaycommons.org/'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'pathway', 'where_clause': "pwtype LIKE 'PathwayCommons: %'"})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  infile = (DOWNLOAD_DIR + PATHWAYS_FILE).replace('.gz', '')
  line_ct = slmf.wcl(infile)
  if not args['--quiet']:
    print "\nProcessing {} records from PathwayCommons file {}".format(line_ct, infile)
  with open(infile, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    # Example line:
    # http://identifiers.org/kegg.pathway/hsa00010  name: Glycolysis / Gluconeogenesis; datasource: kegg; organism: 9606; idtype: uniprot  A8K7J7 B4DDQ8 B4DNK4 E9PCR7 P04406 P06744 P07205 P07738 P09467 P09622 P09972 P10515 P11177 P14550 P30838 P35557 P51648 P60174 Q01813 Q16822 Q53Y25 Q6FHV6 Q6IRT1 Q6ZMR3 Q8IUN7 Q96C23 Q9BRR6 Q9NQR9 Q9NR19
    # However, note that the Pathway Commons URLs in the file give 404s.
    # E.g. the URL from this line:
    # http://pathwaycommons.org/pc2/Pathway_0136871cbdf9a3ecc09529f1878171df  name: VEGFR1 specific signals; datasource: pid; organism: 9606; idtype: uniprot  O14786 O15530 O60462 P05771 P07900 P15692 P16333 P17252 P17612 P17948 P19174 P20936 P22681 P27361 P27986 P28482 P29474 P31749 P42336 P49763 P49765 P62158 P98077 Q03135 Q06124 Q16665 Q9Y5K6
    # needs to be converted to:
    # http://apps.pathwaycommons.org/pathways?uri=http%3A%2F%2Fpathwaycommons.org%2Fpc2%2FPathway_0136871cbdf9a3ecc09529f1878171df
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    skip_ct = 0
    up2pid = {}
    pmark = set()
    notfnd = set()
    pw_ct = 0
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      src = re.search(r'datasource: (\w+)', row[1]).groups()[0]
      if src in ['kegg', 'wikipathways', 'reactome']:
        skip_ct += 1
        continue
      pwtype = 'PathwayCommons: ' + src
      name = re.search(r'name: (.+?);', row[1]).groups()[0]
      url = PCAPP_BASE_URL + urllib.quote(row[0], safe='')
      ups = row[2:]
      for up in ups:
        if up in up2pid:
          pid = up2pid[up]
        elif up in notfnd:
          continue
        else:
          targets = dba.find_targets({'uniprot': up})
          if not targets:
            notfnd.add(up)
            continue
          t = targets[0]
          pid = t['components']['protein'][0]['id']
          up2pid[up] = pid
        rv = dba.ins_pathway({'protein_id': pid, 'pwtype': pwtype, 'name': name, 'url': url})
        if rv:
          pw_ct += 1
          pmark.add(pid)
        else:
          dba_err_ct += 1
      pbar.update(ct)
  pbar.finish()
  for up in notfnd:
    logger.warn("No target found for {}".format(up))
  print "Processed {} Pathway Commons records.".format(ct)
  print " Inserted {} new pathway rows for {} proteins.".format(pw_ct, len(pmark))
  print " Skipped {} records from 'kegg', 'wikipathways', 'reactome'".format(skip_ct)
  if notfnd:
    print " No target found for {} UniProt accessions. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'LocSigDB', 'source': 'File %s from %s' % (FILENAME, BASE_URL), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://genome.unmc.edu/LocSigDB/'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'locsig'})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)
  fn = DOWNLOAD_DIR + FILENAME
  line_ct = slmf.wcl(fn)
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  if not args['--quiet']:
    print "\nProcessing {} lines in input file {}".format(line_ct, fn)
  with open(fn, 'rU') as f:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    up2pid = {}
    notfnd = set()
    ls_ct = 0
    skip_ct = 0
    pmark = set()
    dba_err_ct = 0
    for line in f:
      ct += 1
      data = line.split(',')
      if 'Homo sapiens' not in data[5]:
        skip_ct += 1
        continue
      fnd = False
      for up in data[4].split(';'):
        if up in up2pid:
          # we've already found it
          pid = up2pid[up]
        elif up in notfnd:
          # we've already not found it
          continue
        else:
          targets = dba.find_targets({'uniprot': up})
          if not targets:
            notfnd.add(up)
            continue
          pid = targets[0]['components']['protein'][0]['id']
          up2pid[up] = pid
        rv = dba.ins_locsig({'protein_id': pid, 'location': data[2], 'signal': data[0], 'pmids': data[3]})
        if not rv:
          dba_err_ct += 1
          continue
        ls_ct += 1
        pmark.add(pid)
      pbar.update(ct)
  pbar.finish()
  for up in notfnd:
    logger.warn("No target found for {}".format(up))
  print "{} lines processed.".format(ct)
  print " Inserted {} new locsig rows for {} proteins".format(ls_ct, len(pmark))
  print " Skipped {} non-human rows".format(skip_ct)
  if notfnd:
    print "No target found for {} UniProts. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
def main():
  argparser = argparse.ArgumentParser(description="Create a pickle file containing IDG family geneids/uniprots")
  group = argparser.add_mutually_exclusive_group()
  group.add_argument("-v", "--verbose", action='count', default=0, help="Set output verbosity level")
  group.add_argument("-q", "--quiet", action="store_true")
  argparser.add_argument('-dh', '--dbhost', help='Database host.', default=DBHOST)
  argparser.add_argument('-db', '--dbname', help='Database name.', default=DBNAME)
  argparser.add_argument('-o', '--outfile', help='Output pickle file.')
  args = argparser.parse_args()
  dba_params = {'dbhost': args.dbhost, 'dbname': args.dbname}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  tct = dba.get_target_count(idg=True)
  if not args.quiet:
    print "\n%s (v%s) [%s]:" % (PROGRAM, __version__, time.strftime("%c"))
    print " Connected to TCRD database %s (schema ver: %s, data ver: %s)" % (args.dbname, dbi['schema_ver'], dbi['data_ver'])
    print " Dumping TCRD IDG Families for %d targets" % tct
  start_time = time.time()
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  idgs = {'GPCR': [], 'oGPCR': [], 'Kinase': [], 'IC': [], 'NR': []}
  ct = 0
  for t in dba.get_targets(idg=True, include_annotations=False):
    ct += 1
    p = t['components']['protein'][0]
    idg = t['idgfam']
    idgs[idg].append({'sym': p['sym'], 'geneid': p['geneid'], 'uniprot': p['uniprot']})
    pbar.update(ct)
  pbar.finish()
  elapsed = time.time() - start_time
  print "%d TCRD targets processed. Elapsed time: %s" % (ct, secs2str(elapsed))
  print "Saving info for the following IDG Family counts to pickle file %s" % args.outfile
  for idgfam in idgs.keys():
    print " %s: %d" % (idgfam, len(idgs[idgfam]))
  pickle.dump(idgs, open(args.outfile, 'wb'))
  print "\n%s: Done.\n" % PROGRAM
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'WikiPathways', 'source': 'File %s' % (BASE_URL + PATHWAYS_FILE), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.wikipathways.org/'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'pathway', 'where_clause': "pwtype = 'WikiPathways'"})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  fn = DOWNLOAD_DIR + PATHWAYS_FILE
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print "\nProcessing {} input lines from WikiPathways file {}".format(line_ct, fn)
  with open(fn, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    # Example line:
    # Apoptosis Modulation and Signaling%WikiPathways_20160516%WP1772%Homo sapiens  http://www.wikipathways.org/instance/WP1772_r85184  843  3725  842 ...
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    gid2pids = defaultdict(list)
    pmark = set()
    notfnd = set()
    pw_ct = 0
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      name = row[0].split('%')[0]
      wpid = row[1].split('/')[-1]
      geneids = row[2:]
      for gid in geneids:
        if gid in gid2pids:
          pids = gid2pids[gid]
        elif gid in notfnd:
          continue
        else:
          targets = dba.find_targets({'geneid': gid})
          if not targets:
            notfnd.add(gid)
            continue
          pids = []
          for t in targets:
            pids.append(t['components']['protein'][0]['id'])
          gid2pids[gid] = pids # save this mapping so we only lookup each target once
        for pid in pids:
          rv = dba.ins_pathway({'protein_id': pid, 'pwtype': 'WikiPathways', 'name': name, 'id_in_source': wpid, 'url': row[1]})
          if rv:
            pw_ct += 1
            pmark.add(pid)
          else:
            dba_err_ct += 1
      pbar.update(ct)
  pbar.finish()
  for gid in notfnd:
    logger.warn("No target found for {}".format(gid))
  print "Processed {} WikiPathways.".format(ct)
  print " Inserted {} pathway rows for {} proteins.".format(pw_ct, len(pmark))
  if notfnd:
    print " No target found for {} Gene IDs. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
def load():
  args = docopt(__doc__, version=__version__)
  debug = int(args['--debug'])
  if debug:
    print "\n[*DEBUG*] ARGS:\n%s\n" % repr(args)
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = "%s.log" % PROGRAM
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not debug:
    logger.propagate = False # turns off console logging when debug is 0
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database %s (schema ver %s; data ver %s)", args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  if not args['--quiet']:
    print "Connected to TCRD database %s (schema ver %s; data ver %s)" % (args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'TechDev Worklist Info', 'source': 'Files from TechDev Groups', 'app': PROGRAM, 'app_version': __version__, 'comments': 'Loading app uses data from spreadsheets submitted by the TechDev groups listing targets being investigated.'} )
  if not dataset_id:
    print "WARNING: Error inserting dataset. See logfile %s for details." % logfile
    sys.exit(1)
  # Provenance
  provs = [ {'dataset_id': dataset_id, 'table_name': 'techdev_contact', 'comment': ""},
            {'dataset_id': dataset_id, 'table_name': 'techdev_info', 'comment': ""} ]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    if not rv:
      print "WARNING: Error inserting provenance. See logfile %s for details." % logfile
      sys.exit(1)
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  for tdid, filename in INPUTFILES.items():
    line_ct = wcl(filename)
    if not args['--quiet']:
      print '\nProcessing %d lines from input file: %s' % (line_ct, filename)
    with open(filename, 'rU') as csvfile:
      csvreader = csv.reader(csvfile)
      pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
      ct = 0
      contact = {}
      skip_ct = 0
      err_ct = 0
      info_ct = 0
      notfnd = []
      dba_err_ct = 0
      for row in csvreader:
        ct += 1
        if row[0] == 'TechDev ID:':
          techdev_id = int(row[1])
          contact['id'] = techdev_id
          continue
        if row[0] == 'Grant Number:':
          contact['grant_number'] = row[1]
          continue
        if row[0] == 'Submitter name:':
          contact['contact_name'] = row[1]
          continue
        if row[0] == 'Contact email:':
          contact['contact_email'] = row[1]
          continue
        if row[0] == 'Submission date:':
          contact['date'] = row[1]
          continue
        if row[0] == 'tcrd_target_id':
          contact['pi'] = PIS[techdev_id]
          contact_id = dba.ins_techdev_contact(contact)
          if not contact_id:
            logger.error("DBA error inserting techdev_contact.")
            print "Exiting due to DBA error inserting techdev_contact. See logfile %s for details." % logfile
            break
          continue
        if not row[6]:
          skip_ct += 1
          continue
        sym = row[1]
        targets = dba.find_targets({'sym': sym})
        if not targets:
          notfnd.append(sym)
          continue
        t = targets[0]
        pid = t['components']['protein'][0]['id']
        init = {'contact_id': contact_id, 'protein_id': pid}
        if not row[7]:
          err_ct += 1
          continue
        init['comment'] = row[7]
        if row[8]:
          init['publication_pcmid'] = row[8]
        if row[9]:
          init['publication_pmid'] = row[9]
        if row[11]:
          init['resource_url'] = row[11]
        if row[10]:
          init['data_url'] = row[10]
        rv = dba.ins_techdev_info(init)
        if rv:
          info_ct += 1
        else:
          dba_err_ct += 1
        pbar.update(ct)
    pbar.finish()
    if not args['--quiet']:
      print "%d lines processed." % ct
      print " Skipped %d lines not under investigation" % skip_ct
      if err_ct > 0:
        print " WARNING: %d lines did not have a comment!" % err_ct
      if notfnd:
        print " WARNING: %d symbols did not find a target!" % len(notfnd)
        for sym in notfnd:
          print "  %s" % sym
      print " Inserted 1 new techdev_contact row"
      print " Inserted %d new techdev_info rows" % info_ct
    if dba_err_ct > 0:
      print "WARNING: %d DB errors occurred. See logfile %s for details." % (dba_err_ct, logfile)
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  if not args['--quiet']:
    print "\nLoading project info from pickle file %s" % PROJECTS_P
  projects = pickle.load( open(PROJECTS_P, 'rb') )
  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'NIH Grant Textmining Info', 'source': 'IDG-KMC generated data by Steve Mathias at UNM.', 'app': PROGRAM, 'app_version': __version__, 'comments': "Grant info is generated from textmining results of running Lars Jensen's tagger software on project info downloaded from NIHExporter."} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  provs = [ {'dataset_id': dataset_id, 'table_name': 'grant'},
            {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': 'itype is "NIHRePORTER 2000-2017 R01 Count"'} ]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  if not args['--quiet']:
    print "\nLoading tagging results in %s" % TAGGING_RESULTS_DIR
  r01cts = {}
  for year in [str(yr) for yr in range(2000, 2018)]: # 2000-2017
    pfile = "%s/Target2AppIDs%s.p" % (TAGGING_RESULTS_DIR, year)
    target2appids = pickle.load( open(pfile, 'rb') )
    tct = len(target2appids.keys())
    if not args['--quiet']:
      print "\nProcessing tagging results for {}: {} targets".format(year, tct)
    pfile = "%s/AppID2Targets%s.p" % (TAGGING_RESULTS_DIR, year)
    appid2targets = pickle.load( open(pfile, 'rb') )
    pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
    ct = 0
    t2g_ct = 0
    dba_err_ct = 0
    for tid,appids in target2appids.items():
      ct += 1
      pbar.update(ct)
      for appid in appids:
        if appid not in appid2targets:
          # need to do this check because of projects removed with > 10 targets tagged
          continue
        app_target_ct = len(appid2targets[appid]) # number of targets tagged in this grant
        ginfo = projects[year][appid]
        # gcost is total grant dollars
        if ginfo['TOTAL_COST']:
          gcost = float(ginfo['TOTAL_COST'])
        elif ginfo['TOTAL_COST_SUB_PROJECT']:
          gcost = float(ginfo['TOTAL_COST_SUB_PROJECT'])
        else:
          continue
        # grant_target_cost is dollars per target for this grant
        grant_target_cost = gcost/app_target_ct
        rv = dba.ins_grant( {'target_id': tid, 'appid': appid, 'year': year, 'full_project_num': ginfo['FULL_PROJECT_NUM'], 'activity': ginfo['ACTIVITY'], 'funding_ics': ginfo['FUNDING_ICs'], 'cost': "%.2f"%grant_target_cost} )
        if not rv:
          dba_err_ct += 1
          continue
        t2g_ct += 1
        # track R01s
        if ginfo['ACTIVITY'] == 'R01':
          if tid in r01cts:
            r01cts[tid] += 1
          else:
            r01cts[tid] = 1
    pbar.finish()
    print "Processed {} target tagging records.".format(ct)
    print " Inserted {} new target2grant rows".format(t2g_ct)
  # Now load 'NIHRePORTER 2000-2017 R01 Count' tdl_infos
  print "\nLoading 'NIHRePORTER 2000-2017 R01 Count' tdl_infos for {} targets".format(len(r01cts))
  ti_ct = 0
  for tid in r01cts:
    rv = dba.ins_tdl_info( {'target_id': tid, 'itype': 'NIHRePORTER 2000-2017 R01 Count', 'integer_value': r01cts[tid]} )
    if not rv:
      dba_err_ct += 1
      continue
    ti_ct += 1
  print " Inserted {} new tdl_info rows".format(ti_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  # DBAdaptor uses same logger as load()
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database %s (schema ver %s; data ver %s)", args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  if not args['--quiet']:
    print "\nConnected to TCRD database %s (schema ver %s; data ver %s)" % (args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'UniProt', 'source': 'Web API at %s'%BASEURL, 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.uniprot.org/uniprot'} )
  if not dataset_id:
    print "WARNING: Error inserting dataset. See logfile %s for details." % logfile
    sys.exit(1)
  # Provenance
  provs = [ {'dataset_id': dataset_id, 'table_name': 'target', 'column_name': 'ttype'},
            {'dataset_id': dataset_id, 'table_name': 'target', 'column_name': 'name'},
            {'dataset_id': dataset_id, 'table_name': 'protein'},
            {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'UniProt Function'"},
            {'dataset_id': dataset_id, 'table_name': 'goa'},
            {'dataset_id': dataset_id, 'table_name': 'expression', 'where_clause': "etype = 'UniProt Tissue'"},
            {'dataset_id': dataset_id, 'table_name': 'pathway', 'where_clause': "type = 'uniprot'"},
            {'dataset_id': dataset_id, 'table_name': 'disease', 'where_clause': "dtype = 'uniprot'"},
            {'dataset_id': dataset_id, 'table_name': 'feature'},
            {'dataset_id': dataset_id, 'table_name': 'xref', 'where_clause': "dataset_id = %d"%dataset_id},
            {'dataset_id': dataset_id, 'table_name': 'alias', 'where_clause': "dataset_id = %d"%dataset_id} ]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    if not rv:
      print "WARNING: Error inserting provenance. See logfile %s for details." % logfile
      sys.exit(1)
  start_time = time.time()
  xtypes = dba.get_xref_types()
  # see EvidenceOntology.ipynb for where this comes from
  e2e = {'ECO:0000250': 'ISS', 'ECO:0000269': 'EXP', 'ECO:0000270': 'IEP', 'ECO:0000303': 'NAS',
         'ECO:0000304': 'TAS', 'ECO:0000305': 'IC', 'ECO:0000314': 'IDA', 'ECO:0000315': 'IMP',
         'ECO:0000316': 'IGI', 'ECO:0000318': 'IBA', 'ECO:0000353': 'IPI', 'ECO:0000501': 'IEA'}
  s = shelve.open(SHELF_FILE, writeback=True)
  s['ups'] = []
  s['loaded'] = {}
  s['retries'] = {}
  s['errors'] = {}
  line_ct = wcl(UPHUMAN_FILE)
  line_ct -= 1 # file has header row
  if not args['--quiet']:
    print "\nProcessing %d records in UniProt file %s" % (line_ct, UPHUMAN_FILE)
  with open(UPHUMAN_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    tsvreader.next() # skip header line
    for row in tsvreader:
      up = row[0]
      s['ups'].append(up)
  print "\nLoading data for %d proteins" % len(s['ups'])
  logger.info("Loading data for %d proteins" % len(s['ups']))
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=len(s['ups'])).start()
  ct = 0
  xml_err_ct = 0
  dba_err_ct = 0
  for i,up in enumerate(s['ups']):
    ct += 1
    logger.info("Processing UniProt entry %d: %s" % (i, up))
    (status, headers, upxml) = get_uniprot(up)
    # Code  Description
    # 200   The request was processed successfully.
    # 300   Obsolete.
    # 400   Bad request. There is a problem with your input.
    # 404   Not found. The resource you requested doesn't exist.
    # 410   Gone. The resource you requested was removed.
    # 500   Internal server error. Most likely a temporary problem, but if the problem persists please contact us.
    # 503   Service not available. The server is being updated, try again later.
    if not status:
      logger.warn("Failed getting accession %s" % up)
      s['retries'][up] = True
      continue
    if status != 200:
      logger.error("Bad UniProt API response for %s: %s" % (up, status))
      s['errors'][up] = status
      continue
    target = uniprotxml2target(up, upxml, dataset_id, xtypes, e2e)
    if not target:
      xml_err_ct += 1
      logger.error("XML Error for %s" % up)
      continue
    tid = dba.ins_target(target)
    if tid:
      logger.debug("Target insert id: %s" % tid)
      s['loaded'][up] = tid
    else:
      dba_err_ct += 1
    time.sleep(0.5)
    pbar.update(ct)
  pbar.finish()
  print "Processed %d UniProt records." % ct
  print " Total loaded targets/proteins: %d" % len(s['loaded'].keys())
  if len(s['retries']) > 0:
    print " Total targets/proteins remaining for retries: %d " % len(s['retries'])
  if len(s['errors']) > 0:
    print "WARNING: %d API errors occurred. See logfile %s for details." % (len(s['errors']), logfile)
  if xml_err_ct > 0:
    print "WARNING: %d XML parsing errors occurred." % xml_err_ct
  if dba_err_ct > 0:
    print "WARNING: %d DB errors occurred. See logfile %s for details." % (dba_err_ct, logfile)
  loop = 1
  while len(s['retries']) > 0:
    print "\nRetry loop %d: Trying to load data for %d proteins" % (loop, len(s['retries']))
    logger.info("Retry loop %d: Trying to load data for %d proteins" % (loop, len(s['retries'])))
    pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=len(s['retries'])).start()
    ct = 0
    tct = 0
    xml_err_ct = 0
    dba_err_ct = 0
    for up,_ in s['retries'].items():
      ct += 1
      logger.info("Processing UniProt entry %s" % up)
      (status, headers, upxml) = get_uniprot(up)
      # Code  Description
      # 200   The request was processed successfully.
      # 300   Obsolete.
      # 400   Bad request. There is a problem with your input.
      # 404   Not found. The resource you requested doesn't exist.
      # 410   Gone. The resource you requested was removed.
      # 500   Internal server error. Most likely a temporary problem, but if the problem persists please contact us.
      # 503   Service not available. The server is being updated, try again later.
      if not status:
        logger.warn("Failed getting accession %s" % up)
        continue
      if status != 200:
        logger.error("Bad UniProt API response for %s: %s" % (up, status))
        s['errors'][up] = status
        continue
      target = uniprotxml2target(up, upxml, dataset_id, xtypes, e2e)
      if not target:
        xml_err_ct += 1
        logger.error("XML Error for %s" % up)
        continue
      tid = dba.ins_target(target)
      if tid:
        tct += 1
        logger.debug("Target insert id: %s" % tid)
        s['loaded'][up] = tid
        del s['retries'][up]
      else:
        dba_err_ct += 1
      time.sleep(0.5)
      pbar.update(ct)
    loop += 1
    pbar.finish()
    print "Processed %d UniProt records." % ct
    print " Loaded %d new targets/proteins" % tct
    print " Total loaded targets/proteins: %d" % len(s['loaded'].keys())
    if len(s['retries']) > 0:
      print " Total targets/proteins remaining for next loop: %d " % len(s['retries'])
    if len(s['errors']) > 0:
      print "WARNING: %d API errors occurred. See logfile %s for details." % (len(s['errors']), logfile)
    if xml_err_ct > 0:
      print "WARNING: %d XML parsing errors occurred." % xml_err_ct
    if dba_err_ct > 0:
      print "WARNING: %d DB errors occurred. See logfile %s for details." % (dba_err_ct, logfile)
  s.close()
args = docopt(__doc__, version=__version__) if args['--debug']: print "\n[*DEBUG*] ARGS:\n%s\n"%repr(args) loglevel = int(args['--loglevel']) logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(LOGFILE) fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__} dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']) start_time = time.time() if args['--command'] == 'map': pickle_sym2pid(args, dba, logger) elif args['--command'] == 'load': load(args, dba, logger) elapsed = time.time() - start_time print "\n{}: Done. Elapsed time: {}\n".format(PROGRAM, slmf.secs2str(elapsed)) # ct = 0 # sym2pid = {}
def load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset dataset_id = dba.ins_dataset({ 'name': 'KEGG Pathways', 'source': 'API at %s' % KEGG_BASE_URL, 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.genome.jp/kegg/pathway.html' }) assert dataset_id, "Error inserting dataset See logfile {} for details.".format( logfile) # Provenance rv = dba.ins_provenance({ 'dataset_id': dataset_id, 'table_name': 'pathway', 'where_clause': "pwtype = 'KEGG'" }) assert rv, "Error inserting provenance. See logfile {} for details.".format( logfile) print "\nMapping KEGG pathways to gene lists" kpw2geneids = {} url = "%s/link/hsa/pathway" % KEGG_BASE_URL r = None attempts = 0 while attempts < 3: try: r = requests.get(url) break except Exception, e: attempts += 1
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'AnimalTFDB', 'source': 'http://www.bioguo.org/AnimalTFDB/BrowseAllTF.php?spe=Homo_sapiens', 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.bioguo.org/AnimalTFDB/'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'Is Transcription Factor'"})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  TDLs = {'Tdark': 0, 'Tbio': 0, 'Tchem': 0, 'Tclin': 0}
  line_ct = slmf.wcl(INFILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in input file {}\n".format(line_ct, INFILE)
  with open(INFILE, 'rU') as tsv:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    ti_ct = 0
    notfnd = []
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      sym = row[3]
      targets = dba.find_targets({'sym': sym})
      if not targets:
        gid = row[2]
        targets = dba.find_targets({'geneid': gid})
        if not targets:
          ensg = row[1]
          targets = dba.find_targets_by_xref({'xtype': 'Ensembl', 'value': ensg})
          if not targets:
            notfnd.append(row)
            continue
      t = targets[0]
      TDLs[t['tdl']] += 1
      pid = t['components']['protein'][0]['id']
      rv = dba.ins_tdl_info({'protein_id': pid, 'itype': 'Is Transcription Factor', 'boolean_value': 1})
      if rv:
        ti_ct += 1
      else:
        dba_err_ct += 1
      pbar.update(ct)
  pbar.finish()
  print "\n{} lines processed.".format(ct)
  print " Inserted {} new Is Transcription Factor tdl_infos".format(ti_ct)
  if notfnd:
    print "No target found for {} rows.".format(len(notfnd))
  if dba_err_ct > 0:
    print "WARNING: %d DB errors occurred. See logfile %s for details." % (dba_err_ct, logfile)
  for tdl in ['Tclin', 'Tchem', 'Tbio', 'Tdark']:
    print "{}: {}".format(tdl, TDLs[tdl])
def main(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  if not args['--quiet']:
    print "\nLoading project info from pickle file {}".format(PROJECTS_P)
  projects = pickle.load(open(PROJECTS_P, 'rb'))
  if not args['--quiet']:
    print "\nCreating Tagger..."
  tgr = Tagger()
  tgr.load_names(ENTITIES_FILE, NAMES_FILE)
  tgr.load_global(GLOBAL_FILE)
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  for year in [str(yr) for yr in range(2000, 2018)]: # 2000-2017
    pct = len(projects[year])
    print "\nTagging {} projects from {}".format(pct, year)
    logger.info("Tagging {} projects from {}".format(pct, year))
    pbar = ProgressBar(widgets=pbar_widgets, maxval=pct).start()
    start_time = time.time()
    ct = 0
    ttag_ct = 0
    abstag_ct = 0
    skip_ct = 0
    ttagsnotfnd = set()
    ttag2targetid = {}
    appid2targets = defaultdict(set)
    target2appids = defaultdict(set)
    for appid in projects[year].keys():
      ct += 1
      logger.debug("  Processing appid {}".format(appid))
      ginfo = projects[year][appid]
      # if there's no $$, we're not interested
      if ginfo['TOTAL_COST']:
        gcost = int(ginfo['TOTAL_COST'])
      elif ginfo['TOTAL_COST_SUB_PROJECT']:
        gcost = int(ginfo['TOTAL_COST_SUB_PROJECT'])
      else:
        continue
      # also, if there's less than $10k we're not interested
      if gcost < 10000:
        skip_ct += 1
        continue
      #
      # tag titles
      #
      matches = tgr.get_matches(projects[year][appid]['PROJECT_TITLE'], appid, [9606])
      if matches:
        ttag_ct += 1
        # the same tag can match multiple times, so get a set of ENSPs
        ensps = set()
        for m in matches:
          ensps.add(m[2][0][1])
        ensps = list(ensps)
        for ensp in ensps:
          if ensp in ttag2targetid:
            tid = ttag2targetid[ensp]
          elif ensp in ttagsnotfnd:
            continue
          else:
            targets = dba.find_targets({'stringid': ensp}, idg=False)
            if not targets:
              targets = dba.find_targets_by_xref({'xtype': 'Ensembl', 'value': ensp}, idg=False)
            if not targets:
              ttagsnotfnd.add(ensp)
              continue
            tid = targets[0]['id']
            ttag2targetid[ensp] = tid # save this so we don't look up the targets again
          appid2targets[appid].add(tid)
          target2appids[tid].add(appid)
      #
      # tag abstracts
      #
      if 'ABSTRACT' in projects[year][appid]:
        matches = tgr.get_matches(projects[year][appid]['ABSTRACT'], appid, [9606])
        if matches:
          abstag_ct += 1
          # the same tag can match multiple times, so get a set of ENSPs
          ensps = set()
          for m in matches:
            ensps.add(m[2][0][1])
          ensps = list(ensps)
          for ensp in ensps:
            if ensp in ttag2targetid:
              tid = ttag2targetid[ensp]
            elif ensp in ttagsnotfnd:
              continue
            else:
              targets = dba.find_targets({'stringid': ensp}, idg=False)
              if not targets:
                targets = dba.find_targets_by_xref({'xtype': 'Ensembl', 'value': ensp}, idg=False)
              if not targets:
                ttagsnotfnd.add(ensp)
                continue
              tid = targets[0]['id']
              ttag2targetid[ensp] = tid # save this so we don't look up the targets again
            appid2targets[appid].add(tid)
            target2appids[tid].add(appid)
      pbar.update(ct)
    pbar.finish()
    del_ct = 0
    for appid, tidset in appid2targets.items():
      if len(tidset) > 10:
        del_ct += 1
        del appid2targets[appid]
    logger.info("{} projects processed.".format(ct))
    logger.info("  Removed {} projects with > 10 targets".format(del_ct))
    logger.info("  Skipped {} projects with funds less than $10k".format(skip_ct))
    logger.info("  {} titles have tagging result(s)".format(ttag_ct))
    logger.info("  {} abstracts have tagging result(s)".format(abstag_ct))
    logger.info("{} total tags map to {}/{} distinct targets".format(len(ttag2targetid.keys()), len(set(ttag2targetid.values())), len(target2appids.keys())))
    logger.info("{} project applications map to target(s)".format(len(appid2targets.keys())))
    if ttagsnotfnd:
      logger.info("  No target found for {} tags".format(len(ttagsnotfnd)))
    pfile = "%s/AppID2Targets%s.p" % (TAGGING_RESULTS_DIR, year)
    pickle.dump(appid2targets, open(pfile, 'wb'))
    logger.info("Tagging results saved to pickle {} for {}".format(pfile, year))
    pfile = "%s/Target2AppIDs%s.p" % (TAGGING_RESULTS_DIR, year)
    pickle.dump(target2appids, open(pfile, 'wb'))
    logger.info("Tagging results saved to pickle {} for {}".format(pfile, year))
  print "{} projects processed. See logfile {} for details.".format(ct, logfile)
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'BioPlex Protein-Protein Interactions', 'source': "Files %s from http://wren.hms.harvard.edu/bioplex/downloadInteractions.php" % ", ".join(SRC_FILES), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://wren.hms.harvard.edu/bioplex/index.php'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'ppi', 'where_clause': "ppitype = 'BioPlex'"})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  f = BIOPLEX_FILE
  line_ct = slmf.wcl(f)
  line_ct -= 1
  if not args['--quiet']:
    print "\nProcessing {} lines from BioPlex PPI file {}".format(line_ct, f)
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  with open(f, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    # GeneA GeneB UniprotA UniprotB SymbolA SymbolB pW pNI pInt
    ct = 0
    ppi_ct = 0
    same12_ct = 0
    k2pid = {}
    notfnd = set()
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      geneid1 = row[0]
      geneid2 = row[1]
      up1 = row[2]
      up2 = row[3]
      sym1 = row[4]
      sym2 = row[5]
      pw = row[6]
      pni = row[7]
      pint = row[8]
      # protein1
      k1 = "%s|%s|%s" % (up1, sym1, geneid1)
      if k1 in k2pid:
        pid1 = k2pid[k1]
      elif k1 in notfnd:
        continue
      else:
        t1 = find_target(dba, k1)
        if not t1:
          notfnd.add(k1)
          continue
        pid1 = t1['components']['protein'][0]['id']
        k2pid[k1] = pid1
      # protein2
      k2 = "%s|%s|%s" % (up2, sym2, geneid2)
      if k2 in k2pid:
        pid2 = k2pid[k2]
      elif k2 in notfnd:
        continue
      else:
        t2 = find_target(dba, k2)
        if not t2:
          notfnd.add(k2)
          continue
        pid2 = t2['components']['protein'][0]['id']
        k2pid[k2] = pid2
      if pid1 == pid2:
        same12_ct += 1
        continue
      # Insert PPI
      rv = dba.ins_ppi( {'ppitype': 'BioPlex', 'p_int': pint, 'p_ni': pni, 'p_wrong': pw, 'protein1_id': pid1, 'protein1_str': k1, 'protein2_id': pid2, 'protein2_str': k2} )
      if rv:
        ppi_ct += 1
      else:
        dba_err_ct += 1
  pbar.finish()
  for k in notfnd:
    logger.warn("No target found for: {}".format(k))
  print "{} BioPlex PPI rows processed.".format(ct)
  print " Inserted {} new ppi rows".format(ppi_ct)
  if same12_ct:
    print " Skipped {} PPIs involving the same protein".format(same12_ct)
  if notfnd:
    print " No target found for {} UniProts/Syms/GeneIDs. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  for f in UPD_FILES[1:]:
    start_time = time.time()
    line_ct = slmf.wcl(f)
    line_ct -= 1
    if not args['--quiet']:
      print "\nProcessing {} lines from BioPlex PPI update file {}".format(line_ct, f)
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    with open(f, 'rU') as tsv:
      tsvreader = csv.reader(tsv, delimiter='\t')
      header = tsvreader.next() # skip header line
      # plate_num well_num db_protein_id symbol gene_id bait_symbol bait_geneid pWrongID pNoInt pInt
      ct = 0
      ppi_ct = 0
      same12_ct = 0
      k2pid = {}
      notfnd = set()
      dba_err_ct = 0
      for row in tsvreader:
        ct += 1
        pbar.update(ct)
        geneid1 = row[6]
        geneid2 = row[4]
        sym1 = row[5]
        sym2 = row[3]
        pw = row[7]
        pni = row[8]
        pint = row[9]
        # protein1
        k1 = "|%s|%s" % (sym1, geneid1)
        if k1 in k2pid:
          pid1 = k2pid[k1]
        elif k1 in notfnd:
          continue
        else:
          t1 = find_target(dba, k1)
          if not t1:
            notfnd.add(k1)
            continue
          pid1 = t1['components']['protein'][0]['id']
          k2pid[k1] = pid1
        # protein2
        k2 = "|%s|%s" % (sym2, geneid2)
        if k2 in k2pid:
          pid2 = k2pid[k2]
        elif k2 in notfnd:
          continue
        else:
          t2 = find_target(dba, k2)
          if not t2:
            notfnd.add(k2)
            continue
          pid2 = t2['components']['protein'][0]['id']
          k2pid[k2] = pid2
        if pid1 == pid2:
          same12_ct += 1
          continue
        # Insert PPI
        rv = dba.ins_ppi( {'ppitype': 'BioPlex', 'p_int': pint, 'p_ni': pni, 'p_wrong': pw, 'protein1_id': pid1, 'protein1_str': k1, 'protein2_id': pid2, 'protein2_str': k2} )
        if rv:
          ppi_ct += 1
        else:
          dba_err_ct += 1
    pbar.finish()
    for k in notfnd:
      logger.warn("No target found for: {}".format(k))
    print "{} BioPlex PPI rows processed.".format(ct)
    print " Inserted {} new ppi rows".format(ppi_ct)
    if same12_ct:
      print " Skipped {} PPIs involving the same protein".format(same12_ct)
    if notfnd:
      print " No target found for {} UniProts/Syms/GeneIDs. See logfile {} for details.".format(len(notfnd), logfile)
    if dba_err_ct > 0:
      print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
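# Note: a minimal sketch of the find_target() helper assumed by the two loops
# above. The "uniprot|sym|geneid" key format comes from the code above; the
# lookup order (uniprot, then sym, then geneid, each via dba.find_targets())
# is an assumption, not the confirmed implementation.
def find_target(dba, k):
  """Return the first target matching a 'uniprot|sym|geneid' key, or None."""
  (up, sym, geneid) = k.split("|")
  if up:
    targets = dba.find_targets({'uniprot': up})
    if targets:
      return targets[0]
  if sym:
    targets = dba.find_targets({'sym': sym})
    if targets:
      return targets[0]
  if geneid:
    targets = dba.find_targets({'geneid': geneid})
    if targets:
      return targets[0]
  return None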
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  # DBAdaptor uses same logger as main()
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'Drugable Epigenome Domains', 'source': 'Files from http://www.nature.com/nrd/journal/v11/n5/suppinfo/nrd3674.html', 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.nature.com/nrd/journal/v11/n5/suppinfo/nrd3674.html'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'Drugable Epigenome Class'"})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  total_ti_ct = 0
  notfnd = set()
  for k,d in FILE_LIST.items():
    if not args['--quiet']:
      print "\nProcessing Epigenetic {}s".format(k)
    for dom,f in d.items():
      f = INPUT_DIR + f
      line_ct = slmf.wcl(f)
      if not args['--quiet']:
        print 'Processing {} lines from {} input file {}'.format(line_ct, dom, f)
      with open(f, 'rU') as csvfile:
        csvreader = csv.reader(csvfile)
        header = csvreader.next() # skip header line
        ct = 0
        not_fnd_ct = 0
        tct = 0
        ti_ct = 0
        dba_err_ct = 0
        for row in csvreader:
          ct += 1
          targets = dba.find_targets({'sym': row[0]})
          if not targets:
            targets = dba.find_targets({'geneid': row[3]})
          if not targets:
            targets = dba.find_targets({'uniprot': row[2]})
          if not targets:
            # key for the not-found set; kept distinct from the class name k
            tk = "%s|%s|%s" % (row[0], row[3], row[2])
            notfnd.add(tk)
            not_fnd_ct += 1
            logger.warn("No target found for: {}".format(tk))
            continue
          tct += 1
          t = targets[0]
          p = t['components']['protein'][0]
          if len(row) == 5:
            val = "Epigenetic %s - %s" % (k, dom)
          else:
            val = "Epigenetic %s - %s %s: %s" % (k, dom, row[4], row[5])
          rv = dba.ins_tdl_info({'protein_id': p['id'], 'itype': 'Drugable Epigenome Class', 'string_value': val})
          if not rv:
            dba_err_ct += 1
            continue
          ti_ct += 1
      if not args['--quiet']:
        print " {} lines processed. Found {}, skipped {}".format(ct, tct, not_fnd_ct)
        print " Inserted {} new tdl_info rows".format(ti_ct)
      if dba_err_ct > 0:
        print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
      total_ti_ct += ti_ct
  if not args['--quiet']:
    print "\nInserted a total of {} new Drugable Epigenome Class tdl_infos".format(total_ti_ct)
  if len(notfnd) > 0:
    print " No target found for {} sym/geneid/uniprots. See logfile {} for details.".format(len(notfnd), logfile)
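# Note: the nested loop above expects FILE_LIST to map an epigenetic target
# class to a dict of {domain name: CSV filename under INPUT_DIR}. A minimal
# sketch of that shape, with hypothetical class, domain, and file names (the
# real mapping is defined elsewhere in the module):
#
#   FILE_LIST = {'Writer': {'Histone acetyltransferase': 'HATs.csv'},
#                'Reader': {'Bromodomain': 'Bromodomains.csv'}}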