def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Parse the MP OWL file
  if not args['--quiet']:
    print "\nParsing Mammalian Phenotype Ontology file {}".format(DOWNLOAD_DIR + FILENAME)
  mp = parse_mp_owl(DOWNLOAD_DIR + FILENAME)
  print "Got {} MP terms".format(len(mp))
  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'Mammalian Phenotype Ontology', 'source': 'OWL file downloaded from %s'%(BASE_URL+FILENAME), 'app': PROGRAM, 'app_version': __version__} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'mpo'})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  if not args['--quiet']:
    print "\nLoading {} Mammalian Phenotype Ontology terms".format(len(mp))
  pbar = ProgressBar(widgets=pbar_widgets, maxval=len(mp)).start()
  ct = 0
  mpo_ct = 0
  dba_err_ct = 0
  for mpd in mp:
    ct += 1
    rv = dba.ins_mpo(mpd)
    if rv:
      mpo_ct += 1
    else:
      dba_err_ct += 1
    pbar.update(ct)
  pbar.finish()
  print "{} terms processed.".format(ct)
  print "  Inserted {} new mpo rows".format(mpo_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
def main():
  argparser = argparse.ArgumentParser(description="Export TCRD target data to a CSV file")
  argparser.add_argument("-o", "--outfile", help='Output file [path/]name', default=OUTFILE)
  argparser.add_argument('-db', '--dbname', help='MySQL database name', default=DBNAME)
  argparser.add_argument("-i", "--idg", help="Export only IDG-Eligible targets", action="store_true", default=False)
  argparser.add_argument("-e", "--expand", help="Export expanded (a LOT of data) CSV version", action="store_true", default=False)
  args = argparser.parse_args()
  dba = DBAdaptor({'dbname': args.dbname})
  dbi = dba.get_dbinfo()
  print "\n%s (v%s) [%s]:" % (PROGRAM, __version__, time.strftime("%c"))
  print "\nConnected to TCRD database %s (schema ver %s, data ver %s)\n" % (dbi['dbname'], dbi['schema_ver'], dbi['data_ver'])
  if args.idg:
    tct = dba.get_target_count(idg=True)
    print "Exporting CSV for %d IDG-Eligible targets from TCRD to file %s" % (tct, args.outfile)
  else:
    tct = dba.get_target_count(idg=False)
    print "Exporting CSV for all %d targets from TCRD to file %s" % (tct, args.outfile)
  header = ['TCRD ID', 'Name', 'Description', 'HGNC Sym', 'NCBI Gene ID', 'UniProt', 'STRING ID', 'TDL', 'IDG Eligible', 'DTO ID', 'DTO Class']
  if args.expand:
    header = header + ['PANTHER Class(es)', 'GeneRIF Count', 'NCBI Gene PubMed Count', 'JensenLab PubMed Score', 'PubTator Score', 'Ab Count', 'Monoclonal Ab Count', 'Activity Count', 'ChEMBL Selective Compound', 'ChEMBL First Reference Year', 'DrugCentral Activity Count', 'PDB Count', 'PDBs', 'GO Annotation Count', 'Experimental MF/BP Leaf Term GOA(s)', 'OMIM Phenotype Count', 'OMIM Phenotype(s)', 'JAX/MGI Human Ortholog Phenotype Count', 'JAX/MGI Human Ortholog Phenotype(s)', 'IMPC Ortholog Phenotype Count', 'IMPC Ortholog Phenotype(s)', 'GWAS Count', 'GWAS Phenotype(s)', 'Pathway Count', 'Pathways', 'Total Disease Count', 'Top 5 Text-Mining DISEASES', 'eRAM Diseases', 'EBI Patent Count', 'Is Transcription Factor', 'TMHMM Prediction', 'HPA Tissue Specificity Index', 'HPM Gene Tissue Specificity Index', 'HPM Protein Tissue Specificity Index', 'TIN-X Novelty', 'Top 5 TIN-X Importance(s)']
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#',left='[',right=']'), ' ', ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  with open(args.outfile, 'wb') as csvout:
    csvwriter = csv.writer(csvout, quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csvwriter.writerow(header)
    ct = 0
    if args.idg:
      for t in dba.get_targets(idg=True, include_annotations=args.expand):
        ct += 1
        if args.expand:
          csvwriter.writerow( target2csv_exp(t) )
        else:
          csvwriter.writerow( target2csv(t) )
        pbar.update(ct)
    else:
      for t in dba.get_targets(idg=False, include_annotations=args.expand):
        #for tid in [9]:
        #  t = dba.get_target(tid, True)
        ct += 1
        if args.expand:
          csvwriter.writerow(target2csv_exp(t))
        else:
          csvwriter.writerow(target2csv(t))
        pbar.update(ct)
  pbar.finish()
  print "%d CSV rows exported" % ct
  print "\n%s: Done." % PROGRAM
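# target2csv() and target2csv_exp() are not shown in this section. A minimal sketch of the
# basic (non-expanded) row builder is given below, assuming the target dict layout used by the
# other loaders in this section (t['components']['protein'][0] with 'sym', 'geneid', 'uniprot',
# 'stringid' keys); the 'idg', 'dtoid' and 'dtoclass' keys are assumptions, not confirmed here.
def target2csv(t):
  p = t['components']['protein'][0]
  # Column order must match the basic header defined in main()
  return [t['id'], p['name'], p['description'], p['sym'], p['geneid'], p['uniprot'],
          p.get('stringid'), t['tdl'], t.get('idg'), p.get('dtoid'), p.get('dtoclass')]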
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'Human Proteome Map', 'source': 'IDG-KMC generated data by Oleg Ursu at UNM.', 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.humanproteomemap.org/'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  provs = [ {'dataset_id': dataset_id, 'table_name': 'expression', 'where_clause': "etype = 'HPM Protein'", 'comment': 'Log Median and qualitative expression values are derived from files from http://www.humanproteomemap.org/download.php'},
            {'dataset_id': dataset_id, 'table_name': 'expression', 'where_clause': "etype = 'HPM Gene'", 'comment': 'Log Median and qualitative expression values are derived from files from http://www.humanproteomemap.org/download.php'},
            {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'HPM Protein Tissue Specificity Index'", 'comment': 'Tissue Specificity scores are derived from files from http://www.humanproteomemap.org/download.php. The score is the Tau value as described in Yanai, I. et. al., Bioinformatics 21(5): 650-659 (2005)'},
            {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'HPM Gene Tissue Specificity Index'", 'comment': 'Tissue Specificity scores are derived from files from http://www.humanproteomemap.org/download.php. The score is the Tau value as described in Yanai, I. et. al., Bioinformatics 21(5): 650-659 (2005)'} ]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  with open(TISSUE2UBERON_FILE, 'r') as ifh:
    tiss2uid = ast.literal_eval(ifh.read())
  if not args['--quiet']:
    print "\nGot {} tissue to Uberon ID mappings from file {}".format(len(tiss2uid), TISSUE2UBERON_FILE)

  #
  # Protein Level Expressions
  #
  line_ct = slmf.wcl(PROTEIN_QUAL_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in HPM file {}".format(line_ct, PROTEIN_QUAL_FILE)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  rs2pids = defaultdict(list)
  notfnd = set()
  nouid = set()
  dba_err_ct = 0
  pmark = {}
  exp_ct = 0
  with open(PROTEIN_QUAL_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      #rs = re.sub('\.\d+$', '', row[0]) # get rid of version
      rs = row[0]
      if rs in rs2pids:
        # we've already found it
        pids = rs2pids[rs]
      elif rs in notfnd:
        # we've already not found it
        continue
      else:
        # look it up
        targets = dba.find_targets_by_xref({'xtype': 'RefSeq', 'value': rs}, False)
        if not targets:
          notfnd.add(rs)
          continue
        pids = []
        for t in targets:
          pids.append(t['components']['protein'][0]['id'])
        rs2pids[rs] = pids # save this mapping so we only lookup each target once
      tissue = row[1]
      if row[3] == 'NA':
        init = {'etype': 'HPM Protein', 'tissue': tissue, 'qual_value': row[4]}
      else:
        init = {'etype': 'HPM Protein', 'tissue': tissue, 'qual_value': row[4], 'number_value': row[3]}
      # Add Uberon ID, if we can find one
      if tissue in tiss2uid:
        uberon_id = tiss2uid[tissue]
      else:
        uberon_id = dba.get_uberon_id({'name': tissue})
      if uberon_id:
        init['uberon_id'] = uberon_id
      else:
        nouid.add(tissue)
      for pid in pids:
        init['protein_id'] = pid
        rv = dba.ins_expression(init)
        if not rv:
          dba_err_ct += 1
          continue
        exp_ct += 1
        pmark[pid] = True
  pbar.finish()
  print "Processed {} lines.".format(ct)
  print "  Inserted {} new expression rows for {} proteins ({} RefSeqs)".format(exp_ct, len(pmark), len(rs2pids))
  if notfnd:
    print "No target found for {} RefSeqs. See logfile {} for details.".format(len(notfnd), logfile)
  if nouid:
    print "No Uberon ID found for {} tissues. See logfile {} for details.".format(len(nouid), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  line_ct = slmf.wcl(PROTEIN_TAU_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in Tissue Specificity Index file {}".format(line_ct, PROTEIN_TAU_FILE)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  dba_err_ct = 0
  pmark = {}
  skip_ct = 0
  ti_ct = 0
  with open(PROTEIN_TAU_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      #rs = re.sub('\.\d+$', '', row[0]) # get rid of version
      rs = row[0]
      tau = row[1]
      if rs not in rs2pids:
        skip_ct += 1
        continue
      for pid in rs2pids[rs]:
        rv = dba.ins_tdl_info({'protein_id': pid, 'itype': 'HPM Protein Tissue Specificity Index', 'number_value': tau})
        if not rv:
          dba_err_ct += 1
          continue
        ti_ct += 1
        pmark[pid] = True
  pbar.finish()
  print "Processed {} lines.".format(ct)
  print "  Inserted {} new HPM Protein Tissue Specificity Index tdl_info rows for {} proteins.".format(ti_ct, len(pmark))
  if skip_ct > 0:
    print "  Skipped {} rows with RefSeqs not in map from expression file.".format(skip_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  #
  # Gene Level Expressions
  #
  line_ct = slmf.wcl(GENE_QUAL_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in HPM file {}".format(line_ct, GENE_QUAL_FILE)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  sym2pids = defaultdict(list)
  notfnd = set()
  nouid = set()
  dba_err_ct = 0
  pmark = {}
  exp_ct = 0
  with open(GENE_QUAL_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      sym = re.sub('\.\d+$', '', row[0]) # get rid of version
      if sym in sym2pids:
        pids = sym2pids[sym]
      elif sym in notfnd:
        # we've already not found it
        continue
      else:
        # look it up
        targets = dba.find_targets({'sym': sym}, False)
        if not targets:
          notfnd.add(sym)
          continue
        pids = []
        for t in targets:
          pids.append(t['components']['protein'][0]['id'])
        sym2pids[sym] = pids # save this mapping so we only lookup each target once
      tissue = row[1]
      if row[3] == 'NA':
        init = {'etype': 'HPM Gene', 'tissue': tissue, 'qual_value': row[4]}
      else:
        init = {'etype': 'HPM Gene', 'tissue': tissue, 'qual_value': row[4], 'number_value': row[3]}
      # Add Uberon ID, if we can find one
      if tissue in tiss2uid:
        uberon_id = tiss2uid[tissue]
      else:
        uberon_id = dba.get_uberon_id({'name': tissue})
      if uberon_id:
        init['uberon_id'] = uberon_id
      else:
        nouid.add(tissue)
      for pid in pids:
        init['protein_id'] = pid
        rv = dba.ins_expression(init)
        if not rv:
          dba_err_ct += 1
          continue
        exp_ct += 1
        pmark[pid] = True
  pbar.finish()
  print "Processed {} lines.".format(ct)
  print "  Inserted {} new expression rows for {} proteins ({} Gene Symbols)".format(exp_ct, len(pmark), len(sym2pids))
  if notfnd:
    print "No target found for {} symbols. See logfile {} for details.".format(len(notfnd), logfile)
  if nouid:
    print "No Uberon ID found for {} tissues. See logfile {} for details.".format(len(nouid), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  line_ct = slmf.wcl(GENE_TAU_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in Tissue Specificity Index file {}".format(line_ct, GENE_TAU_FILE)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  dba_err_ct = 0
  pmark = {}
  skip_ct = 0
  ti_ct = 0
  with open(GENE_TAU_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      sym = re.sub('\.\d+$', '', row[0]) # get rid of version
      tau = row[1]
      if sym not in sym2pids:
        skip_ct += 1
        continue
      for pid in sym2pids[sym]: # was rs2pids[rs] (the protein-level map); this loop keys on gene symbols
        rv = dba.ins_tdl_info({'protein_id': pid, 'itype': 'HPM Gene Tissue Specificity Index', 'number_value': tau})
        if not rv:
          dba_err_ct += 1
          continue
        ti_ct += 1
        pmark[pid] = True
  pbar.finish()
  print "Processed {} lines.".format(ct)
  print "  Inserted {} new HPM Gene Tissue Specificity Index tdl_info rows for {} proteins.".format(ti_ct, len(pmark))
  if skip_ct > 0:
    print "  Skipped {} rows with symbols not in map from expression file".format(skip_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
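# slmf.wcl() is used throughout these loaders to size the progress bars, but the shared slmf
# module is not shown in this section. A minimal sketch, assuming it simply counts lines in a
# file (like `wc -l`); the real shared helper may differ.
def wcl(fname):
  ct = 0
  with open(fname) as f:
    for _ in f:
      ct += 1
  return ct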
def main(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  generifs = dba.get_generifs()
  if not args['--quiet']:
    print "\nProcessing {} GeneRIFs".format(len(generifs))
  logger.info("Processing {} GeneRIFs".format(len(generifs)))
  pbar = ProgressBar(widgets=pbar_widgets, maxval=len(generifs)).start()
  yrre = re.compile(r'^(\d{4})')
  ct = 0
  yr_ct = 0
  skip_ct = 0
  net_err_ct = 0
  pubmed2date = {}
  missing_pmids = set()
  for generif in generifs:
    ct += 1
    pbar.update(ct)
    for pmid in generif['pubmed_ids'].split("|"):
      if pmid in pubmed2date:
        continue
      # See if this PubMed is in TCRD...
      pm = dba.get_pubmed(pmid)
      if pm:
        # if so, get date from there
        if pm['date']:
          pubmed2date[pmid] = pm['date']
      else:
        # if not, we will have to get it via E-Utils
        missing_pmids.add(pmid)
  pbar.finish()
  if not args['--quiet']:
    print "{} GeneRIFs processed.".format(ct)
  in_tcrd_ct = len(pubmed2date)
  print "Got date mapping for {} PubMeds in TCRD".format(in_tcrd_ct)
  if not args['--quiet']:
    print "\nGetting {} missing PubMeds from E-Utils".format(len(missing_pmids))
  logger.debug("Getting {} missing PubMeds from E-Utils".format(len(missing_pmids)))
  chunk_ct = 0
  err_ct = 0
  no_date_ct = 0
  pmids = list(missing_pmids)
  for chunk in chunker(pmids, 200):
    chunk_ct += 1
    if not args['--quiet']:
      print "  Processing chunk {}".format(chunk_ct)
    logger.debug("Chunk {}: {}".format(chunk_ct, chunk))
    r = get_pubmed(chunk)
    if not r or r.status_code != 200:
      # try again (the original retried with a stale single pmid; retry the whole chunk)
      r = get_pubmed(chunk)
      if not r or r.status_code != 200:
        logger.error("Bad E-Utils response for chunk {}".format(chunk_ct))
        net_err_ct += 1
        continue
    soup = BeautifulSoup(r.text, "xml")
    pmas = soup.find('PubmedArticleSet')
    for pma in pmas.findAll('PubmedArticle'):
      pmid = pma.find('PMID').text
      date = get_pubmed_article_date(pma)
      if date:
        pubmed2date[pmid] = date
      else:
        no_date_ct += 1
  elapsed = time.time() - start_time # start_time is expected to be set by the caller/__main__
  if not args['--quiet']:
    print "{} PubMed IDs processed.".format(ct)
    print "Got date mapping for {} PubMeds not in TCRD".format(len(pubmed2date) - in_tcrd_ct)
    print "No date for {} PubMeds".format(no_date_ct)
  if net_err_ct > 0:
    print "WARNING: {} Network/E-Utils errors occurred. See logfile {} for details.".format(net_err_ct, logfile)
  if not args['--quiet']:
    print "Dumping map to file: {}".format(PICKLE_FILE)
  pickle.dump(pubmed2date, open(PICKLE_FILE, 'wb'))
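# chunker() is assumed to be a small helper that yields successive fixed-size slices of a list
# (here, batches of 200 PMIDs per E-Utils request). A minimal sketch of that assumption:
def chunker(lst, size):
  for i in xrange(0, len(lst), size):
    yield lst[i:i + size]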
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'Human Protein Atlas', 'source': 'IDG-KMC generated data by Steve Mathias at UNM from HPA file http://www.proteinatlas.org/download/normal_tissue.tsv.zip.', 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.proteinatlas.org/'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  provs = [ {'dataset_id': dataset_id, 'table_name': 'expression', 'where_clause': "etype = 'HPA'", 'comment': 'Qualitative expression values are derived from files from http://www.proteinatlas.org/'},
            {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'HPA Tissue Specificity Index'", 'comment': 'Tissue Specificity scores are derived from files from http://www.proteinatlas.org/. The score is the Tau value as described in Yanai, I. et. al., Bioinformatics 21(5): 650-659 (2005)'} ]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)
  with open(TISSUE2UBERON_FILE, 'r') as ifh:
    tiss2uid = ast.literal_eval(ifh.read())
  if not args['--quiet']:
    print "\nGot {} tissue to Uberon ID mappings from file {}".format(len(tiss2uid), TISSUE2UBERON_FILE)

  line_ct = slmf.wcl(HPA_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in HPA file {}".format(line_ct, HPA_FILE)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  dba_err_ct = 0
  pmark = {}
  exp_ct = 0
  nouid = set()
  with open(HPA_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      # 0: protein_id  1: Tissue  2: Gene  3: Gene name  4: Level  5: Reliability
      ct += 1
      tissue = row[1]
      init = {'protein_id': row[0], 'etype': 'HPA', 'tissue': tissue, 'qual_value': row[4], 'evidence': row[5]}
      # Add Uberon ID, if we can find one
      if tissue in tiss2uid:
        uberon_id = tiss2uid[tissue]
      else:
        uberon_id = dba.get_uberon_id({'name': tissue})
      if uberon_id:
        init['uberon_id'] = uberon_id
      else:
        nouid.add(tissue)
      rv = dba.ins_expression(init)
      if not rv:
        dba_err_ct += 1
        continue
      exp_ct += 1
      pmark[row[0]] = True # mark by protein_id (column 0), not by tissue
      pbar.update(ct)
  pbar.finish()
  print "Processed {} HPA lines.".format(ct)
  print "  Inserted {} new expression rows for {} proteins.".format(exp_ct, len(pmark))
  if nouid:
    print "No Uberon ID found for {} tissues. See logfile {} for details.".format(len(nouid), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  line_ct = slmf.wcl(HPA_TAU_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in HPA TAU file {}".format(line_ct, HPA_TAU_FILE)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  dba_err_ct = 0
  pmark = {}
  skip_ct = 0
  ti_ct = 0
  with open(HPA_TAU_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      # 0: Gene  1: TAU  2: protein_id
      ct += 1
      pbar.update(ct)
      if row[1] == 'None':
        skip_ct += 1
        continue
      rv = dba.ins_tdl_info({'protein_id': int(row[2]), 'itype': 'HPA Tissue Specificity Index', 'number_value': row[1]})
      if not rv:
        dba_err_ct += 1
        continue
      pmark[row[2]] = True # mark by protein_id (column 2), not by tau value
      ti_ct += 1
  pbar.finish()
  print "Processed {} lines.".format(ct)
  print "  Inserted {} new HPA Tissue Specificity Index tdl_info rows for {} proteins.".format(ti_ct, len(pmark))
  if skip_ct:
    print "  Skipped {} rows with no tau.".format(skip_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging when debug is 0
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  # DBAdaptor uses same logger as load()
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'STRING IDs', 'source': 'Files %s and %s from http://string-db.org/'%(os.path.basename(INFILE1), os.path.basename(INFILE2)), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://string-db.org/'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'protein', 'column_name': 'stringid'})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  aliasmap = {}
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  ct = 0
  skip_ct = 0
  mult_ct = 0
  line_ct = slmf.wcl(INFILE1)
  if not args['--quiet']:
    print "\nProcessing {} input lines in file {}".format(line_ct, INFILE1)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  with open(INFILE1, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      # taxid  uniprot_ac|uniprot_id  string_id  identity  bit_score
      ct += 1
      pbar.update(ct)
      if float(row[3]) != 100:
        skip_ct += 1
        continue
      [uniprot, name] = row[1].split("|")
      ensp = row[2].replace('9606.', '')
      bitscore = float(row[4])
      if uniprot in aliasmap:
        # Save mapping with highest bit score
        if bitscore > aliasmap[uniprot][1]:
          aliasmap[uniprot] = (ensp, bitscore)
      else:
        aliasmap[uniprot] = (ensp, bitscore)
      if name in aliasmap:
        # Save mapping with highest bit score
        if bitscore > aliasmap[name][1]:
          aliasmap[name] = (ensp, bitscore)
      else:
        aliasmap[name] = (ensp, bitscore)
  pbar.finish()
  unmap_ct = len(aliasmap)
  print "{} input lines processed.".format(ct)
  print "  Skipped {} non-identity lines".format(skip_ct)
  print "  Got {} uniprot/name to STRING ID mappings".format(unmap_ct)

  line_ct = slmf.wcl(INFILE2)
  if not args['--quiet']:
    print "\nProcessing {} input lines in file {}".format(line_ct, INFILE2)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  warn_ct = 0
  with open(INFILE2, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      # string_protein_id  alias  source
      ct += 1
      pbar.update(ct)
      alias = row[1]
      ensp = row[0].replace('9606.', '')
      if alias in aliasmap and aliasmap[alias][0] != ensp:
        # do not replace mappings from *human.uniprot_2_string.2018* with aliases
        logger.warn("Different ENSPs found for same alias {}: {} vs {}".format(alias, aliasmap[alias][0], ensp))
        warn_ct += 1
        continue
      aliasmap[alias] = (ensp, None)
  pbar.finish()
  amap_ct = len(aliasmap) - unmap_ct
  print "{} input lines processed.".format(ct)
  print "  Added {} alias to STRING ID mappings".format(amap_ct)
  if warn_ct > 0:
    print "  Skipped {} aliases that would override UniProt mappings. See logfile {} for details.".format(warn_ct, logfile)

  tct = dba.get_target_count(idg=False)
  if not args['--quiet']:
    print "\nLoading STRING IDs for {} TCRD targets".format(tct)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  ct = 0
  upd_ct = 0
  nf_ct = 0
  dba_err_ct = 0
  for target in dba.get_targets(include_annotations=True):
    ct += 1
    pbar.update(ct)
    p = target['components']['protein'][0]
    geneid = 'hsa:' + str(p['geneid'])
    hgncid = None
    if 'HGNC' in p['xrefs']:
      hgncid = p['xrefs']['HGNC'][0]['value']
    ensp = None
    if p['uniprot'] in aliasmap:
      ensp = aliasmap[p['uniprot']][0]
    elif p['name'] in aliasmap:
      ensp = aliasmap[p['name']][0]
    elif geneid in aliasmap:
      ensp = aliasmap[geneid][0]
    elif hgncid and hgncid in aliasmap:
      ensp = aliasmap[hgncid][0]
    if not ensp:
      nf_ct += 1
      logger.warn("No stringid found for protein {} ({})".format(p['id'], p['uniprot']))
      continue
    rv = dba.do_update({'table': 'protein', 'id': p['id'], 'col': 'stringid', 'val': ensp})
    if rv:
      upd_ct += 1
    else:
      dba_err_ct += 1
  pbar.finish()
  print "Updated {} STRING ID values".format(upd_ct)
  if nf_ct > 0:
    print "No stringid found for {} proteins. See logfile {} for details.".format(nf_ct, logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'Jensen Lab TISSUES', 'source': 'Files %s from %s'%(", ".join(SRC_FILES), BASE_URL), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://tissues.jensenlab.org/'} )
  if not dataset_id:
    print "WARNING: Error inserting dataset. See logfile %s for details." % logfile
    sys.exit(1)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'expression', 'where_clause': "type LIKE 'JensenLab %'"})
  if not rv:
    print "WARNING: Error inserting provenance. See logfile %s for details." % logfile
    sys.exit(1)

  with open(TISSUE2UBERON_FILE, 'r') as ifh:
    tiss2uid = ast.literal_eval(ifh.read())
  if not args['--quiet']:
    print "\nGot {} tissue to Uberon ID mappings from file {}".format(len(tiss2uid), TISSUE2UBERON_FILE)

  # this dict will map ENSP|sym from input files to TCRD protein_id(s)
  # so we only have to find target(s) once for each pair.
  # See find_pids() below
  pmap = {}

  # Knowledge channel
  fn = DOWNLOAD_DIR + FILE_K
  line_ct = slmf.wcl(fn)
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  if not args['--quiet']:
    print "\nProcessing {} lines in input file {}".format(line_ct, fn)
  with open(fn, 'rU') as tsv:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    pmark = {}
    exp_ct = 0
    notfnd = set()
    nouid = set()
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      k = "%s|%s" % (row[0], row[1]) # ENSP|sym
      if k in notfnd:
        continue
      pids = find_pids(dba, k, pmap)
      if not pids:
        notfnd.add(k)
        continue
      etype = 'JensenLab Knowledge ' + row[4]
      init = {'etype': etype, 'tissue': row[3], 'boolean_value': 1, 'oid': row[2], 'evidence': row[5], 'conf': row[6]}
      # Add Uberon ID, if we can find one
      uberon_id = None # reset each row so a value cannot carry over from the previous row
      if row[2]:
        uberon_id = dba.get_uberon_id({'oid': row[2]})
      if not uberon_id:
        uberon_id = dba.get_uberon_id({'name': row[3]})
      if not uberon_id and row[3] in tiss2uid:
        uberon_id = tiss2uid[row[3]]
      if uberon_id:
        init['uberon_id'] = uberon_id
      else:
        nouid.add(row[3])
      for pid in pids:
        init['protein_id'] = pid
        rv = dba.ins_expression(init)
        if not rv:
          dba_err_ct += 1
          continue
        exp_ct += 1
        pmark[pid] = True
    pbar.finish()
  for k in notfnd:
    logger.warn("No target found for {}".format(k))
  for t in nouid:
    logger.warn("No Uberon ID found for {}".format(t))
  print "{} rows processed.".format(ct)
  print "  Inserted {} new expression rows for {} proteins".format(exp_ct, len(pmark))
  if notfnd:
    print "No target found for {} stringids/symbols. See logfile {} for details.".format(len(notfnd), logfile)
  if nouid:
    print "No Uberon ID found for {} tissues. See logfile {} for details.".format(len(nouid), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  # Experiment channel
  fn = DOWNLOAD_DIR + FILE_E
  line_ct = slmf.wcl(fn)
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  if not args['--quiet']:
    print "\nProcessing {} lines in input file {}".format(line_ct, fn)
  with open(fn, 'rU') as tsv:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    pmark = {}
    exp_ct = 0
    notfnd = set()
    nouid = set()
    skip_ct = 0
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      if row[6] == '0':
        # skip zero confidence rows
        skip_ct += 1
        continue
      sym = row[1]
      # some rows look like:
      # ['ENSP00000468389', 'PSENEN {ECO:0000313|Ensembl:ENSP00000468593}', 'BTO:0002860', 'Oral mucosa', 'HPA', 'High: 1 antibody', '1']
      if ' ' in sym:
        sym = sym.split()[0]
      k = "%s|%s" % (row[0], sym) # ENSP|sym
      if k in notfnd:
        continue
      try:
        pids = find_pids(dba, k, pmap)
      except ValueError:
        print "[ERROR] Row: %s; k: %s" % (str(row), k)
        continue # pids would be undefined below; skip this row
      if not pids:
        notfnd.add(k)
        continue
      etype = 'JensenLab Experiment ' + row[4]
      init = {'etype': etype, 'tissue': row[3], 'string_value': row[5], 'oid': row[2], 'conf': row[6]}
      # Add Uberon ID, if we can find one
      uberon_id = None # reset each row so a value cannot carry over from the previous row
      if row[2]:
        uberon_id = dba.get_uberon_id({'oid': row[2]})
      if not uberon_id:
        uberon_id = dba.get_uberon_id({'name': row[3]})
      if not uberon_id and row[3] in tiss2uid:
        uberon_id = tiss2uid[row[3]]
      if uberon_id:
        init['uberon_id'] = uberon_id
      else:
        nouid.add(row[3])
      for pid in pids:
        pmark[pid] = True
        init['protein_id'] = pid
        rv = dba.ins_expression(init)
        if not rv:
          dba_err_ct += 1
          continue
        exp_ct += 1
    pbar.finish()
  for k in notfnd:
    logger.warn("No target found for {}".format(k))
  for t in nouid:
    logger.warn("No Uberon ID found for {}".format(t))
  print "{} rows processed.".format(ct)
  print "  Inserted {} new expression rows for {} proteins".format(exp_ct, len(pmark))
  print "  Skipped {} zero confidence rows".format(skip_ct)
  if notfnd:
    print "No target found for {} stringids/symbols. See logfile {} for details.".format(len(notfnd), logfile)
  if nouid:
    print "No Uberon ID found for {} tissues. See logfile {} for details.".format(len(nouid), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  # Text Mining channel
  fn = DOWNLOAD_DIR + FILE_T
  line_ct = slmf.wcl(fn)
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]
  if not args['--quiet']:
    print "\nProcessing {} lines in input file {}".format(line_ct, fn)
  with open(fn, 'rU') as tsv:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    pmark = {}
    exp_ct = 0
    notfnd = set()
    nouid = set()
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      pbar.update(ct)
      k = "%s|%s" % (row[0], row[1]) # ENSP|sym
      if k in notfnd:
        continue
      pids = find_pids(dba, k, pmap)
      if not pids:
        notfnd.add(k)
        logger.warn("No target found for {}".format(k))
        continue
      etype = 'JensenLab Text Mining'
      init = {'etype': etype, 'tissue': row[3], 'boolean_value': 1, 'oid': row[2], 'zscore': row[4], 'conf': row[5], 'url': row[6]}
      # Add Uberon ID, if we can find one
      uberon_id = None # reset each row so a value cannot carry over from the previous row
      if row[2]:
        uberon_id = dba.get_uberon_id({'oid': row[2]})
      if not uberon_id:
        uberon_id = dba.get_uberon_id({'name': row[3]})
      if not uberon_id and row[3] in tiss2uid:
        uberon_id = tiss2uid[row[3]]
      if uberon_id:
        init['uberon_id'] = uberon_id
      else:
        nouid.add(row[3])
      for pid in pids:
        pmark[pid] = True
        init['protein_id'] = pid
        rv = dba.ins_expression(init)
        if not rv:
          dba_err_ct += 1
          continue
        exp_ct += 1
    pbar.finish()
  for k in notfnd:
    logger.warn("No target found for {}".format(k))
  for t in nouid:
    logger.warn("No Uberon ID found for {}".format(t))
  print "{} rows processed.".format(ct)
  print "  Inserted {} new expression rows for {} proteins".format(exp_ct, len(pmark))
  if notfnd:
    print "No target found for {} stringids/symbols. See logfile {} for details.".format(len(notfnd), logfile)
  if nouid:
    print "No Uberon ID found for {} tissues. See logfile {} for details.".format(len(nouid), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
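# find_pids() is referenced above but not defined in this section. Based on how it is called
# (k is "ENSP|sym", pmap caches lookups so each pair is resolved only once), a minimal sketch
# of the assumed behavior is given below; looking the ENSP up as a 'stringid' and the 'sym'
# fallback are assumptions, not confirmed by this section.
def find_pids(dba, k, k2pids):
  # k looks like 'ENSP00000000233|ARF5'
  if k in k2pids:
    return k2pids[k]
  (ensp, sym) = k.split("|")
  targets = dba.find_targets({'stringid': ensp}, False)
  if not targets:
    targets = dba.find_targets({'sym': sym}, False)
  pids = []
  if targets:
    for t in targets:
      pids.append(t['components']['protein'][0]['id'])
  k2pids[k] = pids # cache the result; an empty list means "not found"
  return pids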
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'LINCS', 'source': "CSV file exported from Oleg Ursu's lincs PostgreSQL database on seaborgium. I do not know the origin of this database at this time.", 'app': PROGRAM, 'app_version': __version__, 'url': 'http://lincsproject.org/LINCS/'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'lincs'})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  line_ct = slmf.wcl(INPUT_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in file {}".format(line_ct, INPUT_FILE)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  gid2pids = {}
  notfnd = set()
  dba_err_ct = 0
  pmark = {}
  lincs_ct = 0
  with open(INPUT_FILE, 'rU') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    for row in tsvreader:
      # 0: level5_lm.pr_gene_id
      # 1: level5_lm.zscore
      # 2: perturbagen.dc_id
      # 3: perturbagen.canonical_smiles
      # 4: signature.cell_id
      ct += 1
      gid = row[0]
      if gid in gid2pids:
        # we've already found it
        pids = gid2pids[gid]
      elif gid in notfnd:
        # we've already not found it
        continue
      else:
        # look it up
        targets = dba.find_targets({'geneid': gid}, False)
        if not targets:
          notfnd.add(gid)
          continue
        pids = []
        for t in targets:
          pid = t['components']['protein'][0]['id']
          pids.append(pid)
        gid2pids[gid] = pids # save this mapping so we only lookup each target once
      for pid in pids:
        rv = dba.ins_lincs({'protein_id': pid, 'cellid': row[4], 'zscore': row[1], 'pert_dcid': row[2], 'pert_smiles': row[3]})
        if not rv:
          dba_err_ct += 1
          continue
        pmark[pid] = True
        lincs_ct += 1
      pbar.update(ct)
  pbar.finish()
  for gid in notfnd:
    logger.warn("No target found for {}".format(gid))
  print "{} lines processed.".format(ct)
  print "Loaded {} new lincs rows for {} proteins.".format(lincs_ct, len(pmark))
  if notfnd:
    print "No target found for {} geneids. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  # DBAdaptor uses same logger as load()
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database %s (schema ver %s; data ver %s)", args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  t2up = {}
  with open(INFILE) as ifh:
    for line in ifh:
      line = line.rstrip()
      [up, tid] = line.split(" ")
      t2up[tid] = up
  tct = len(t2up)
  if not args['--quiet']:
    print "\nGot {} UniProt accessions from file {}".format(tct, INFILE)

  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  if not args['--quiet']:
    print "\nLoading Antibodypedia annotations for {} targets".format(tct)
  logger.info("Loading Antibodypedia annotations for {} targets".format(tct))
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  ct = 0
  tiab_ct = 0
  timab_ct = 0
  tiurl_ct = 0
  dba_err_ct = 0
  net_err_ct = 0
  for tid, up in t2up.items():
    ct += 1
    pid = int(tid)
    pbar.update(ct)
    url = ABPC_API_URL + up
    r = None
    attempts = 1
    while attempts <= 5:
      try:
        logger.info("Getting {} [Target {}, attempt {}]".format(url, tid, attempts))
        r = requests.get(url)
        break
      except:
        attempts += 1
        time.sleep(1)
    if not r:
      net_err_ct += 1
      logger.error("No response for {} [Target {}, attempt {}]".format(url, tid, attempts))
      continue
    if r.status_code != 200:
      net_err_ct += 1
      logger.error("Bad response: {} for {} [Target {}, attempt {}]".format(r.status_code, url, tid, attempts))
      continue
    abpd = json.loads(r.text)
    rv = dba.ins_tdl_info({'protein_id': pid, 'itype': 'Ab Count', 'integer_value': int(abpd['num_antibodies'])})
    if rv:
      tiab_ct += 1
    else:
      dba_err_ct += 1
    if 'ab_type_monoclonal' in abpd:
      mab_ct = int(abpd['ab_type_monoclonal'])
    else:
      mab_ct = 0
    rv = dba.ins_tdl_info({'protein_id': pid, 'itype': 'MAb Count', 'integer_value': mab_ct})
    if rv:
      timab_ct += 1
    else:
      dba_err_ct += 1
    rv = dba.ins_tdl_info({'protein_id': pid, 'itype': 'Antibodypedia.com URL', 'string_value': abpd['url']})
    if rv:
      tiurl_ct += 1
    else:
      dba_err_ct += 1
    time.sleep(1)
    pbar.update(ct)
  pbar.finish()
  print "{} TCRD targets processed.".format(ct)
  print "  Inserted {} Ab Count tdl_info rows".format(tiab_ct)
  print "  Inserted {} MAb Count tdl_info rows".format(timab_ct)
  print "  Inserted {} Antibodypedia.com URL tdl_info rows".format(tiurl_ct)
  if net_err_ct > 0:
    print "WARNING: Network error for {} targets. See logfile {} for details.".format(net_err_ct, logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  # DBAdaptor uses same logger as load()
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database %s (schema ver %s; data ver %s)", args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'Antibodypedia.com', 'source': 'Web API at %s' % ABPC_API_URL, 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.antibodypedia.com'} )
  if not dataset_id:
    print "WARNING: Error inserting dataset. See logfile %s for details." % logfile
    sys.exit(1)
  # Provenance
  provs = [ {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'Ab Count'"},
            {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'MAb Count'"},
            {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'Antibodypedia.com URL'"} ]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    if not rv:
      print "WARNING: Error inserting provenance. See logfile {} for details.".format(logfile)
      sys.exit(1)

  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  tct = dba.get_target_count()
  if not args['--quiet']:
    print "\nLoading Antibodypedia annotations for {} TCRD targets".format(tct)
  logger.info("Loading Antibodypedia annotations for {} TCRD targets".format(tct))
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  ct = 0
  tiab_ct = 0
  timab_ct = 0
  tiurl_ct = 0
  dba_err_ct = 0
  net_err_ct = 0
  for target in dba.get_targets():
    ct += 1
    pbar.update(ct)
    tid = target['id']
    p = target['components']['protein'][0]
    pid = p['id']
    url = ABPC_API_URL + p['uniprot']
    r = None
    attempts = 1
    while attempts <= 5:
      try:
        logger.info("Getting {} [Target {}, attempt {}]".format(url, tid, attempts))
        r = requests.get(url)
        break
      except:
        attempts += 1
        time.sleep(1)
    if not r:
      net_err_ct += 1
      logger.error("No response for {} [Target {}, attempt {}]".format(url, tid, attempts))
      continue
    if r.status_code != 200:
      net_err_ct += 1
      logger.error("Bad response: {} for {} [Target {}, attempt {}]".format(r.status_code, url, tid, attempts))
      continue
    abpd = json.loads(r.text)
    rv = dba.ins_tdl_info({'protein_id': pid, 'itype': 'Ab Count', 'integer_value': int(abpd['num_antibodies'])})
    if rv:
      tiab_ct += 1
    else:
      dba_err_ct += 1
    if 'ab_type_monoclonal' in abpd:
      mab_ct = int(abpd['ab_type_monoclonal'])
    else:
      mab_ct = 0
    rv = dba.ins_tdl_info({'protein_id': pid, 'itype': 'MAb Count', 'integer_value': mab_ct})
    if rv:
      timab_ct += 1
    else:
      dba_err_ct += 1
    rv = dba.ins_tdl_info({'protein_id': pid, 'itype': 'Antibodypedia.com URL', 'string_value': abpd['url']})
    if rv:
      tiurl_ct += 1
    else:
      dba_err_ct += 1
    time.sleep(1)
    pbar.update(ct)
  pbar.finish()
  print "{} TCRD targets processed.".format(ct)
  print "  Inserted {} Ab Count tdl_info rows".format(tiab_ct)
  print "  Inserted {} MAb Count tdl_info rows".format(timab_ct)
  print "  Inserted {} Antibodypedia.com URL tdl_info rows".format(tiurl_ct)
  if net_err_ct > 0:
    print "WARNING: Network error for {} targets. See logfile {} for details.".format(net_err_ct, logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'GWAS Catalog', 'source': 'File %s from http://www.ebi.ac.uk/gwas/docs/file-downloads' % os.path.basename(INFILE), 'app': PROGRAM, 'app_version': __version__, 'url': 'https://www.ebi.ac.uk/gwas/home'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'gwas'})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  line_ct = slmf.wcl(INFILE)
  line_ct -= 1
  if not args['--quiet']:
    print '\nProcessing {} lines from input file {}'.format(line_ct, INFILE)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  outlist = []
  with open(INFILE, 'rU') as tsvfile:
    tsvreader = csv.reader(tsvfile, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct = 0
    notfnd = set()
    pmark = {}
    gwas_ct = 0
    dba_err_ct = 0
    # 0: DATE ADDED TO CATALOG
    # 1: PUBMEDID
    # 2: FIRST AUTHOR
    # 3: DATE
    # 4: JOURNAL
    # 5: LINK
    # 6: STUDY
    # 7: DISEASE/TRAIT
    # 8: INITIAL SAMPLE SIZE
    # 9: REPLICATION SAMPLE SIZE
    # 10: REGION
    # 11: CHR_ID
    # 12: CHR_POS
    # 13: REPORTED GENE(S)
    # 14: MAPPED_GENE
    # 15: UPSTREAM_GENE_ID
    # 16: DOWNSTREAM_GENE_ID
    # 17: SNP_GENE_IDS
    # 18: UPSTREAM_GENE_DISTANCE
    # 19: DOWNSTREAM_GENE_DISTANCE
    # 20: STRONGEST SNP-RISK ALLELE
    # 21: SNPS
    # 22: MERGED
    # 23: SNP_ID_CURRENT
    # 24: CONTEXT
    # 25: INTERGENIC
    # 26: RISK ALLELE FREQUENCY
    # 27: P-VALUE
    # 28: PVALUE_MLOG
    # 29: P-VALUE (TEXT)
    # 30: OR or BETA
    # 31: 95% CI (TEXT)
    # 32: PLATFORM [SNPS PASSING QC]
    # 33: CNV
    # 34: MAPPED_TRAIT
    # 35: MAPPED_TRAIT_URI
    # 36: STUDY ACCESSION
    # 37: GENOTYPING TECHNOLOGY
    symregex = re.compile(r' ?[-,;] ?')
    for row in tsvreader:
      ct += 1
      if len(row) < 15: # need at least the MAPPED_GENE column (index 14)
        continue
      symstr = row[14]
      if symstr == 'NR':
        continue
      symlist = symregex.split(symstr)
      for sym in symlist:
        if sym in notfnd:
          continue
        targets = dba.find_targets({'sym': sym})
        if not targets:
          notfnd.add(sym)
          logger.warn("No target found for symbol {}".format(sym))
          continue
        for t in targets:
          p = t['components']['protein'][0]
          try:
            pval = float(row[27])
          except:
            pval = None
          try:
            orbeta = float(row[30])
          except:
            orbeta = None
          if row[25]:
            ig = int(row[25])
          else:
            ig = None
          rv = dba.ins_gwas({'protein_id': p['id'], 'disease_trait': row[7], 'snps': row[21], 'pmid': row[1], 'study': row[6], 'context': row[24], 'intergenic': ig, 'p_value': pval, 'or_beta': orbeta, 'cnv': row[33], 'mapped_trait': row[34], 'mapped_trait_uri': row[35]})
          if not rv:
            dba_err_ct += 1
            continue
          pmark[p['id']] = True
          gwas_ct += 1
      pbar.update(ct)
  pbar.finish()
  print "{} lines processed.".format(ct)
  print "Inserted {} new gwas rows for {} proteins".format(gwas_ct, len(pmark.keys()))
  if notfnd:
    print "No target found for {} symbols. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
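# The MAPPED_GENE field can list several symbols separated by " - ", ", " or "; ", which is why
# the loop above splits it with symregex. An illustrative example (made-up input) of how that
# pattern behaves; note that because the spaces are optional, hyphenated symbols such as
# HLA-DQA1 are themselves split into two pieces:
#   re.compile(r' ?[-,;] ?').split('HLA-DQA1 - HLA-DQB1, TNF; IL6')
#   => ['HLA', 'DQA1', 'HLA', 'DQB1', 'TNF', 'IL6']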
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging when debug is 0
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'JAX/MGI Mouse/Human Orthology Phenotypes', 'source': 'File %s from ftp.informatics.jax.org' % PT_FILE, 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.informatics.jax.org/'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  provs = [ {'dataset_id': dataset_id, 'table_name': 'phenotype', 'where_clause': "ptype = 'JAX/MGI Human Ortholog Phenotype'"} ]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  if not args['--quiet']:
    print "\nParsing Mammalian Phenotype Ontology file {}".format(DOWNLOAD_DIR + MPO_OWL_FILE)
  mpo = parse_mp_owl(DOWNLOAD_DIR + MPO_OWL_FILE) # parse the same path reported above
  if not args['--quiet']:
    print "Got {} MP terms".format(len(mpo))

  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  fn = DOWNLOAD_DIR + PT_FILE
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print "\nProcessing {} lines from input file {}".format(line_ct, fn)
  with open(fn, 'rU') as tsv:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    pt_ct = 0
    skip_ct = 0
    pmark = {}
    notfnd = set()
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      if not row[6] or row[6] == '':
        skip_ct += 1
        continue
      sym = row[0]
      geneid = row[1]
      k = "%s|%s" % (sym, geneid)
      if k in notfnd:
        continue
      targets = dba.find_targets({'sym': sym}, idg=False)
      if not targets:
        targets = dba.find_targets({'geneid': geneid}, idg=False)
      if not targets:
        notfnd.add(k)
        logger.warn("No target found for {}".format(k))
        continue
      for t in targets:
        pid = t['components']['protein'][0]['id']
        pmark[pid] = True
        for mpid in row[6].split():
          rv = dba.ins_phenotype({'protein_id': pid, 'ptype': 'JAX/MGI Human Ortholog Phenotype', 'term_id': mpid, 'term_name': mpo[mpid]['name']})
          if rv:
            pt_ct += 1
          else:
            dba_err_ct += 1
      pbar.update(ct)
  pbar.finish()
  print "{} lines processed.".format(ct)
  print "Loaded {} new phenotype rows for {} proteins".format(pt_ct, len(pmark.keys()))
  print "  Skipped {} lines with no MP terms".format(skip_ct)
  if notfnd:
    print "No target found for {} gene symbols/ids. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'HGNC', 'source': 'Custom download file from https://www.genenames.org/download/custom/', 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.genenames.org/', 'comments': 'File downloaded with the following column data: HGNC ID, Approved symbol, Approved name, Status, UniProt ID, NCBI Gene ID, Mouse genome database ID'} )
  if not dataset_id:
    print "WARNING: Error inserting dataset. See logfile {} for details.".format(logfile)
    sys.exit(1)
  # Provenance
  provs = [ {'dataset_id': dataset_id, 'table_name': 'protein', 'column_name': 'sym', 'comment': "This is only updated with HGNC data if data from UniProt is absent."},
            {'dataset_id': dataset_id, 'table_name': 'protein', 'column_name': 'geneid', 'comment': "This is only updated with HGNC data if data from UniProt is absent."},
            {'dataset_id': dataset_id, 'table_name': 'xref', 'where_clause': "dataset_id = %d" % dataset_id} ]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    if not rv:
      print "WARNING: Error inserting provenance. See logfile {} for details.".format(logfile)
      sys.exit(1)

  line_ct = slmf.wcl(HGNC_TSV_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in file {}".format(line_ct, HGNC_TSV_FILE)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
  ct = 0
  tmark = {}
  hgnc_ct = 0
  mgi_ct = 0
  sym_ct = 0
  symdiscr_ct = 0
  geneid_ct = 0
  geneiddiscr_ct = 0
  nf_ct = 0
  db_err_ct = 0
  with open(HGNC_TSV_FILE, 'rU') as ifh:
    tsvreader = csv.reader(ifh, delimiter='\t')
    header = tsvreader.next() # skip header line
    ct += 1
    for row in tsvreader:
      # 0: HGNC ID
      # 1: Approved symbol
      # 2: Approved name
      # 3: Status
      # 4: UniProt ID
      # 5: NCBI Gene ID
      # 6: Mouse genome database ID
      ct += 1
      pbar.update(ct)
      sym = row[1]
      geneid = row[5]
      up = row[4]
      targets = dba.find_targets({'sym': sym})
      if not targets:
        targets = dba.find_targets({'geneid': geneid})
      if not targets:
        targets = dba.find_targets({'uniprot': up})
      if not targets:
        nf_ct += 1
        #logger.warn("No target found for {}|{}|{}".format(sym, geneid, up))
        continue
      for t in targets:
        p = t['components']['protein'][0]
        pid = p['id']
        tmark[pid] = True
        # HGNC xref
        rv = dba.ins_xref({'protein_id': pid, 'xtype': 'HGNC', 'dataset_id': dataset_id, 'value': row[0]})
        if rv:
          hgnc_ct += 1
        else:
          db_err_ct += 1
        # MGI xref
        rv = dba.ins_xref({'protein_id': pid, 'xtype': 'MGI ID', 'dataset_id': dataset_id, 'value': row[6]})
        if rv:
          mgi_ct += 1
        else:
          db_err_ct += 1
        # Add missing syms
        if p['sym'] == None:
          rv = dba.upd_protein(pid, 'sym', sym)
          if rv:
            logger.info("Inserted new sym {} for protein {}, {}".format(sym, pid, p['uniprot']))
            sym_ct += 1
          else:
            db_err_ct += 1
        else:
          # Check for symbol discrepancies
          if p['sym'] != sym:
            logger.warn("Symbol discrepancy: UniProt=%s, HGNC=%s" % (p['sym'], sym))
            symdiscr_ct += 1
        if geneid:
          # Add missing geneids
          if p['geneid'] == None:
            rv = dba.upd_protein(pid, 'geneid', geneid)
            if rv:
              logger.info("Inserted new geneid {} for protein {}, {}".format(geneid, pid, p['uniprot']))
              geneid_ct += 1
            else:
              db_err_ct += 1
          else:
            # Check for geneid discrepancies
            if p['geneid'] != int(geneid):
              logger.warn("GeneID discrepancy: UniProt={}, HGNC={}".format(p['geneid'], geneid))
              geneiddiscr_ct += 1
  pbar.finish()
  print "Processed {} lines - {} targets annotated.".format(ct, len(tmark))
  print "No target found for {} lines.".format(nf_ct)
  print "  Inserted {} HGNC ID xrefs".format(hgnc_ct)
  print "  Inserted {} MGI ID xrefs".format(mgi_ct)
  if sym_ct > 0:
    print "  Added {} new HGNC symbols".format(sym_ct)
  if symdiscr_ct > 0:
    print "WARNING: {} discrepant HGNC symbols. See logfile {} for details".format(symdiscr_ct, logfile)
  if geneid_ct > 0:
    print "  Added {} new NCBI Gene IDs".format(geneid_ct)
  if geneiddiscr_ct > 0:
    print "WARNING: {} discrepant NCBI Gene IDs. See logfile {} for details".format(geneiddiscr_ct, logfile)
  if db_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(db_err_ct, logfile)
def run_and_load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset dataset_id = dba.ins_dataset({ 'name': 'TMHMM Predictions', 'source': 'Results of running TMHMM on protein sequences.', 'app': PROGRAM, 'app_version': __version__, }) assert dataset_id, "Error inserting dataset See logfile {} for details.".format( logfile) # Provenance rv = dba.ins_provenance({ 'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'TMHMM Prediction'" }) assert rv, "Error inserting provenance. See logfile {} for details.".format( logfile) tct = dba.get_target_count(idg=False) print "\nProcessing {} TCRD targets".format(tct) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start() regex = re.compile(r'PredHel=(\d+)') ct = 0 ti_ct = 0 dba_err_ct = 0 for t in dba.get_targets(idg=False, include_annotations=False): ct += 1 p = t['components']['protein'][0] fasta = ">%s|%s %s\n%s\n" % (t['id'], p['name'], p['description'], p['seq']) #print "[DEBUG] Fasta:\n%s" % fasta fasta_filename = "/tmp/%s.fa" % t['id'] f = open(fasta_filename, 'w') f.write(fasta) f.close() cmd = '%s --short --noplot %s' % (TMHMM_BIN, fasta_filename) #print "[DEBUG] Cmd: %s" % cmd output = '' for line in runProcess(cmd.split()): output += line os.remove(fasta_filename) #print "[DEBUG] Output: %s" % output pred = regex.findall(output)[0] #print "[DEBUG] PredHel: %s" % predhel if pred != '0': rv = dba.ins_tdl_info({ 'protein_id': p['id'], 'itype': 'TMHMM Prediction', 'string_value': output }) if not rv: dba_err_ct += 1 continue ti_ct += 1 pbar.update(ct) pbar.finish() print "{} targets processed.".format(ct) print " Inserted {} new TMHMM Prediction tdl_info rows".format(ti_ct) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. See logfile {} for details.".format( dba_err_ct, logfile)
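# (Illustrative sketch.) run_and_load() above extracts the predicted helix count from
# TMHMM's --short output with the regex r'PredHel=(\d+)'. The parsing step in isolation,
# applied to a made-up example line (real TMHMM output fields may differ):
import re

def parse_predhel_sketch(tmhmm_short_output):
  """Return the PredHel count from TMHMM short-format output, or None if absent."""
  m = re.search(r'PredHel=(\d+)', tmhmm_short_output)
  return int(m.group(1)) if m else None

example = "1\tQ12345\tlen=400\tExpAA=110.2\tFirst60=0.1\tPredHel=5\tTopology=o5-27i"
print parse_predhel_sketch(example)  # -> 5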
def load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset dataset_id = dba.ins_dataset({ 'name': 'TDLs', 'source': 'IDG-KMC generated data by Steve Mathias at UNM.', 'app': PROGRAM, 'app_version': __version__, 'comments': 'TDLs are generated by the loading app from data in TCRD.' }) if not dataset_id: print "WARNING: Error inserting dataset See logfile {} for details.".format( logfile) sys.exit(1) # Provenance rv = dba.ins_provenance({ 'dataset_id': dataset_id, 'table_name': 'target', 'column_name': 'tdl' }) if not rv: print "WARNING: Error inserting provenance. See logfile {} for details.".format( logfile) sys.exit(1) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] tct = dba.get_target_count(idg=False) if not args['--quiet']: print "\nProcessing {} TCRD targets".format(tct) pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start() ct = 0 tdl_cts = {'Tclin': 0, 'Tchem': 0, 'Tbio': 0, 'Tdark': 0} bump_ct = 0 dba_err_ct = 0 upd_ct = 0 for target in dba.get_targets(idg=False, include_annotations=True): ct += 1 pbar.update(ct) (tdl, bump_flag) = get_tdl(target) tdl_cts[tdl] += 1 if bump_flag: bump_ct += 1 rv = dba.upd_target(target['id'], 'tdl', tdl) if rv: upd_ct += 1 else: dba_err_ct += 1 pbar.finish() print "{} TCRD targets processed.".format(ct) print "Set TDL values for {} targets:".format(upd_ct) print " {} targets are Tclin".format(tdl_cts['Tclin']) print " {} targets are Tchem".format(tdl_cts['Tchem']) print " {} targets are Tbio - {} bumped from Tdark".format( tdl_cts['Tbio'], bump_ct) print " {} targets are Tdark".format(tdl_cts['Tdark']) if dba_err_ct > 0: print "WARNING: {} database errors occured. See logfile {} for details.".format( dba_err_ct, logfile)
logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(LOGFILE) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) for ver, fn in INPUTFILES: fn = DATA_DIR + fn load(args, dba, logfile, logger, ver, fn) # Dataset dataset_id = dba.ins_dataset({ 'name': 'IDG Eligible Lists',
def load(ortho_df, args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset dataset_id = dba.ins_dataset({ 'name': 'Orthologs', 'source': 'File %s' % BASE_URL + FILENAME, 'app': PROGRAM, 'app_version': __version__, 'url': 'https://www.genenames.org/cgi-bin/hcop' }) assert dataset_id, "Error inserting dataset See logfile {} for details.".format( logfile) # Provenance provs = [{ 'dataset_id': dataset_id, 'table_name': 'ortholog', 'comment': "Orthologs are majority vote from the OMA, EggNOG and InParanoid resources as per HGNC." }] for prov in provs: rv = dba.ins_provenance(prov) assert rv, "Error inserting provenance. See logfile {} for details.".format( logfile) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] tct = dba.get_target_count() if not args['--quiet']: print "\nLoading ortholog data for {} TCRD targets".format(tct) logger.info("Loading ortholog data for {} TCRD targets".format(tct)) pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start() ct = 0 ortho_ct = 0 tskip_ct = 0 skip_ct = 0 notfnd = set() dba_err_ct = 0 for target in dba.get_targets(): ct += 1 pbar.update(ct) logger.info("Processing target %d" % target['id']) p = target['components']['protein'][0] if p['sym']: # try first by symbol to_df = ortho_df.loc[ortho_df['human_symbol'] == p['sym']] elif p['geneid']: # then try by GeneID to_df = ortho_df.loc[ortho_df['human_entrez_gene'] == p['geneid']] else: tskip_ct += 1 continue if len(to_df) == 0: continue for idx, row in to_df.iterrows(): if row['ortholog_species_symbol'] == '-' and row[ 'ortholog_species_name'] == '-': skip_ct += 1 continue sp = TAXID2SP[row['ortholog_species']] init = { 'protein_id': p['id'], 'taxid': row['ortholog_species'], 'species': sp, 'sources': row['sources'], 'symbol': row['ortholog_species_symbol'], 'name': row['ortholog_species_name'] } # Add MOD DB ID if it's there if row['ortholog_species_db_id'] != '-': init['db_id'] = row['ortholog_species_db_id'] # Add NCBI Gene ID if it's there if row['ortholog_species_entrez_gene'] != '-': init['geneid'] = row['ortholog_species_entrez_gene'] # Construct MOD URLs for mouse, rat, zebrafish, fly, worm and yeast if sp == 'Mouse': init[ 'mod_url'] = 'http://www.informatics.jax.org/marker/' + row[ 'ortholog_species_db_id'] elif sp == 'Rat': rgdid = row['ortholog_species_db_id'].replace('RGD:', '') init[ 'mod_url'] = 'http://rgd.mcw.edu/rgdweb/report/gene/main.html?id=' + rgdid elif sp == 'Zebrafish': init['mod_url'] = 'http://zfin.org/' + row[ 'ortholog_species_db_id'] elif sp == 'Fruitfly': init['mod_url'] = "http://flybase.org/reports/%s.html" % row[ 'ortholog_species_db_id'] elif sp == 'C. 
elegans': init['mod_url'] = 'http://www.wormbase.org/search/gene/' + row[ 'ortholog_species_symbol'] elif sp == 'S.cerevisiae': init['mod_url'] = 'https://www.yeastgenome.org/locus/' + row[ 'ortholog_species_db_id'] rv = dba.ins_ortholog(init) if rv: ortho_ct += 1 else: dba_err_ct += 1 pbar.update(ct) pbar.finish() print "Processed {} targets.".format(ct) print "Loaded {} new ortholog rows".format(ortho_ct) print " Skipped {} empty ortholog entries".format(skip_ct) print " Skipped {} targets with no sym/geneid".format(tskip_ct) if len(notfnd) > 0: print " No orthologs found for {} targets.".format(len(notfnd)) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. See logfile {} for details.".format( dba_err_ct, logfile)
def load(args, dod): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset # data-version field in the header of the OBO file has a relase version: # data-version: releases/2016-03-25 f = os.popen("head %s" % DOWNLOAD_DIR + FILENAME) for line in f: if line.startswith("data-version:"): ver = line.replace('data-version: ', '') break f.close() dataset_id = dba.ins_dataset({ 'name': 'Disease Ontology', 'source': 'File %s, version %s' % (BASE_URL + FILENAME, ver), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://disease-ontology.org/' }) assert dataset_id, "Error inserting dataset See logfile {} for details.".format( logfile) # Provenance rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'do'}) assert rv, "Error inserting provenance. See logfile {} for details.".format( logfile) rv = dba.ins_provenance({ 'dataset_id': dataset_id, 'table_name': 'do_xref' }) assert rv, "Error inserting provenance. See logfile {} for details.".format( logfile) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] if not args['--quiet']: print "\nLoading {} Disease Ontology terms".format(len(dod)) pbar = ProgressBar(widgets=pbar_widgets, maxval=len(dod)).start() ct = 0 do_ct = 0 dba_err_ct = 0 for doid, d in dod.items(): ct += 1 d['doid'] = doid rv = dba.ins_do(d) if rv: do_ct += 1 else: dba_err_ct += 1 pbar.update(ct) pbar.finish() print "{} terms processed.".format(ct) print " Inserted {} new do rows".format(do_ct) if dba_err_ct > 0: print "WARNNING: {} DB errors occurred. See logfile {} for details.".format( dba_err_ct, logfile)
def load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset dataset_id = dba.ins_dataset({ 'name': 'eRAM Disease Associations', 'source': 'Data scraped from eRAM web pages.', 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.unimd.org/eram/' }) assert dataset_id, "Error inserting dataset See logfile {} for details.".format( logfile) # Provenance rv = dba.ins_provenance({ 'dataset_id': dataset_id, 'table_name': 'disease', 'where_clause': "dtype = 'eRAM'" }) assert rv, "Error inserting provenance. See logfile {} for details.".format( logfile) s = shelve.open(ERAM_SHELF_FILE) dis_ct = len(s['disease_names']) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] if not args['--quiet']: print "\nProcessing {} disease names in shelf file {}".format( dis_ct, ERAM_SHELF_FILE) pbar = ProgressBar(widgets=pbar_widgets, maxval=dis_ct).start() ct = 0 pmark = {} skip_ct = 0 dnerr1_ct = 0 dnerr2_ct = 0 notfnd = set() dis_ct = 0 dba_err_ct = 0 for dname in s['disease_names']: ct += 1 try: dname = str(dname) except: dnerr2_ct += 1 logger.warn("UnicodeEncodeError for disease name '{}'".format( dname.encode('ascii', 'ignore'))) continue if dname not in s: dnerr_ct += 1 logger.warn("Disease name '{}' not in shelf".format(dname)) continue if 'currated_genes' not in s[dname]: skip_ct += 1 continue for cg in s[dname]['currated_genes']: sym = cg['sym'] geneid = cg['geneid'] k = "%s|%s" % (sym, geneid) if k in notfnd: continue targets = dba.find_targets({'sym': sym}) if not targets: targets = dba.find_targets({'geneid': geneid}) if not targets: notfnd.add(k) logger.warn("No target found for {}".format(k)) continue for t in targets: p = t['components']['protein'][0] pmark[t['id']] = True for doid in s[dname]['doids']: rv = dba.ins_disease({ 'protein_id': p['id'], 'dtype': 'eRAM', 'name': dname, 'did': doid, 'source': cg['sources'] }) if not rv: dba_err_ct += 1 continue dis_ct += 1 pbar.update(ct) pbar.finish() print "{} lines processed.".format(ct) print "Inserted {} new disease rows for {} proteins".format( dis_ct, len(pmark)) if skip_ct > 0: print "Skipped {} diseases with no currated genes. See logfile {} for details.".format( skip_ct, logfile) if dnerr1_ct > 0: print "{} disease names not found in shelf. See logfile {} for details.".format( dnerr1_ct, logfile) if dnerr2_ct > 0: print "{} disease names cannot be decoded to strs. See logfile {} for details.".format( dnerr2_ct, logfile) if notfnd: print "No target found for {} stringids/symbols. See logfile {} for details.".format( len(notfnd), logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. See logfile {} for details.".format( dba_err_ct, logfile)
def load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) # DBAdaptor uses same logger as main() dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset dataset_id = dba.ins_dataset({ 'name': 'Drug Central', 'source': "Drug Central files download files: %s" % ", ".join(SRC_FILES), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://drugcentral.org/' }) if not dataset_id: print "WARNING: Error inserting dataset. See logfile {} for details.".format( logfile) sys.exit(1) # Provenance provs = [{ 'dataset_id': dataset_id, 'table_name': 'drug_activity' }, { 'dataset_id': dataset_id, 'table_name': 'disease', 'where_clause': "dtype = 'DrugCentral Indication'" }] for prov in provs: rv = dba.ins_provenance(prov) if not rv: print "WARNING: Error inserting provenance. See logfile {} for details.".format( logfile) sys.exit(1) # First get mapping of DrugCentral names to ids name2id = {} line_ct = slmf.wcl(NAME_ID_FILE) if not args['--quiet']: print "\nProcessing {} input lines in file {}".format( line_ct, NAME_ID_FILE) with open(NAME_ID_FILE, 'rU') as tsv: tsvreader = csv.reader(tsv, delimiter='\t') ct = 0 for row in tsvreader: ct += 1 if row[0].startswith('#'): continue name2id[row[0]] = row[1].replace("\n", '') print "{} input lines processed.".format(ct) print "Saved {} keys in infos map".format(len(name2id)) # Next get drug info fields infos = {} line_ct = slmf.wcl(DRUGINFO_FILE) if not args['--quiet']: print "\nProcessing {} input lines in file {}".format( line_ct, DRUGINFO_FILE) with open(DRUGINFO_FILE, 'rU') as tsv: tsvreader = csv.reader(tsv, delimiter='\t') ct = 0 for row in tsvreader: ct += 1 if row[0].startswith('#'): continue infos[row[0]] = row[1].replace("\n", '') print "{} input lines processed.".format(ct) print "Saved {} keys in infos map".format(len(infos)) # # MOA activities # drug2tids = defaultdict(list) line_ct = slmf.wcl(TCLIN_FILE) line_ct -= 1 if not args['--quiet']: print "\nProcessing {} lines from DrugDB MOA activities file {}".format( line_ct, TCLIN_FILE) with open(TCLIN_FILE, 'rU') as tsv: tsvreader = csv.reader(tsv, delimiter='\t') header = tsvreader.next() # skip header line # uniprot swissprot drug_name act_value act_type action_type source_name reference smiles ChEMBL_Id ct = 0 da_ct = 0 err_ct = 0 notfnd = [] dba_err_ct = 0 for row in tsvreader: ct += 1 up = row[0] sp = row[1] drug = row[2] if drug not in name2id: err_ct += 1 logger.warn("No DrugCentral id found for {}".format(drug)) continue dcid = name2id[drug] targets = dba.find_targets({'uniprot': up}) if not targets: targets = dba.find_targets({'name': sp}) if not targets: notfnd.append(up) continue tid = targets[0]['id'] drug2tids[drug].append(tid) init = { 'target_id': tid, 'drug': drug, 'dcid': dcid, 'has_moa': 1, 'source': row[5] 
} if row[3]: init['act_value'] = row[3] if row[4]: init['act_type'] = row[4] if row[5]: init['action_type'] = row[5] if row[6]: init['source'] = row[6] if row[7]: init['reference'] = row[7] if row[8]: init['smiles'] = row[8] if row[9]: init['cmpd_chemblid'] = row[9] if drug in infos: init['nlm_drug_info'] = infos[drug] rv = dba.ins_drug_activity(init) if rv: da_ct += 1 else: dba_err_ct += 1 print "{} DrugCentral Tclin rows processed.".format(ct) print " Inserted {} new drug_activity rows".format(da_ct) if len(notfnd) > 0: print "WARNNING: {} Uniprot/Swissprot Accessions NOT FOUND in TCRD:".format( len(notfnd)) for up in notfnd: print up if err_ct > 0: print "WARNNING: DrugCentral ID not found for {} drug names. See logfile {} for details.".format( err_ct, logfile) if dba_err_ct > 0: print "WARNNING: {} DB errors occurred. See logfile {} for details.".format( dba_err_ct, logfile) # # Non-MOA activities # line_ct = slmf.wcl(TCHEM_FILE) line_ct -= 1 if not args['--quiet']: print "\nProcessing {} lines from Non-MOA activities file {}".format( line_ct, TCHEM_FILE) with open(TCHEM_FILE, 'rU') as tsv: tsvreader = csv.reader(tsv, delimiter='\t') header = tsvreader.next() # skip header line # uniprot swissprot drug_name act_value act_type action_type source_name reference smiles ChEMBL_Id ct = 0 da_ct = 0 err_ct = 0 notfnd = [] dba_err_ct = 0 for row in tsvreader: ct += 1 up = row[0] sp = row[1] drug = row[2] if drug not in name2id: err_ct += 1 logger.warn("No DrugCentral id found for {}".format(drug)) continue dcid = name2id[drug] targets = dba.find_targets({'uniprot': up}) if not targets: targets = dba.find_targets({'name': sp}) if not targets: notfnd.append(up) continue tid = targets[0]['id'] drug2tids[drug].append(tid) init = { 'target_id': tid, 'drug': drug, 'dcid': dcid, 'has_moa': 0, 'source': row[5] } if row[3]: init['act_value'] = row[3] if row[4]: init['act_type'] = row[4] if row[5]: init['action_type'] = row[5] if row[6]: init['source'] = row[6] if row[7]: init['reference'] = row[7] if row[8]: init['smiles'] = row[8] if row[9]: init['chemblid'] = row[9] if drug in infos: init['nlm_drug_info'] = infos[drug] rv = dba.ins_drug_activity(init) if rv: da_ct += 1 else: dba_err_ct += 1 print "{} DrugCentral Tchem rows processed.".format(ct) print " Inserted {} new drug_activity rows".format(da_ct) if len(notfnd) > 0: print "WARNNING: {} DrugDB Uniprot Accessions NOT FOUND in TCRD:".format( len(notfnd)) for up in notfnd: print up if err_ct > 0: print "WARNNING: DrugCentral ID not found for {} drug names. See logfile {} for details.".format( err_ct, logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. 
See logfile {} for details.".format( dba_err_ct, logfile) # # Indications (diseases) # line_ct = slmf.wcl(DRUGIND_FILE) line_ct -= 1 if not args['--quiet']: print "\nProcessing {} lines from indications file {}".format( line_ct, DRUGIND_FILE) with open(DRUGIND_FILE, 'rU') as tsv: tsvreader = csv.reader(tsv, delimiter='\t') header = tsvreader.next() # skip header line # DRUG_ID DRUG_NAME INDICATION_FDB UMLS_CUI SNOMEDCT_CUI DOID ct = 0 t2d_ct = 0 notfnd = {} dba_err_ct = 0 for row in tsvreader: ct += 1 drug = row[1] if drug not in drug2tids: notfnd[drug] = True continue init = { 'dtype': 'DrugCentral Indication', 'name': row[2], 'drug_name': drug } if row[5] != '': init['did'] = row[5] for tid in drug2tids[drug]: # NB> Using target_id as protein_id works for now, but will not if/when we have multiple protein targets init['protein_id'] = tid rv = dba.ins_disease(init) if rv: t2d_ct += 1 else: dba_err_ct += 1 print "{} DrugCentral indication rows processed.".format(ct) print " Inserted {} new disease rows".format(t2d_ct) if len(notfnd.keys()) > 0: print "WARNING: {} drugs NOT FOUND in activity files:".format( len(notfnd)) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. See logfile {} for details.".format( dba_err_ct, logfile)
def load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset dataset_id = dba.ins_dataset({ 'name': 'JensenLab PubMed Text-mining Scores', 'source': 'File %s' % BASE_URL + FILENAME, 'app': PROGRAM, 'app_version': __version__, 'url': BASE_URL }) if not dataset_id: print "WARNING: Error inserting dataset See logfile %s for details." % logfile # Provenance provs = [{ 'dataset_id': dataset_id, 'table_name': 'pmscore' }, { 'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'JensenLab PubMed Score'" }] for prov in provs: rv = dba.ins_provenance(prov) if not rv: print "WARNING: Error inserting provenance. See logfile %s for details." % logfile sys.exit(1) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] ensp2pids = {} pmscores = {} # protein.id => sum(all scores) pms_ct = 0 upd_ct = 0 notfnd = {} dba_err_ct = 0 infile = DOWNLOAD_DIR + FILENAME line_ct = slmf.wcl(infile) if not args['--quiet']: print "\nProcessing {} input lines in file {}".format(line_ct, infile) with open(infile, 'rU') as tsv: pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() tsvreader = csv.reader(tsv, delimiter='\t') ct = 0 for row in tsvreader: # sym year score ct += 1 pbar.update(ct) if not row[0].startswith('ENSP'): continue ensp = row[0] if ensp in ensp2pids: # we've already found it pids = ensp2pids[ensp] elif ensp in notfnd: # we've already not found it continue else: targets = dba.find_targets({'stringid': ensp}) if not targets: targets = dba.find_targets_by_xref({ 'xtype': 'STRING', 'value': '9606.' + ensp }) if not targets: notfnd[ensp] = True logger.warn("No target found for {}".format(ensp)) continue pids = [] for target in targets: pids.append(target['components']['protein'][0]['id']) ensp2pids[ ensp] = pids # save this mapping so we only lookup each target once for pid in pids: rv = dba.ins_pmscore({ 'protein_id': pid, 'year': row[1], 'score': row[2] }) if rv: pms_ct += 1 else: dba_err_ct += 1 if pid in pmscores: pmscores[pid] += float(row[2]) else: pmscores[pid] = float(row[2]) pbar.finish() print "{} input lines processed.".format(ct) print " Inserted {} new pmscore rows for {} targets".format( pms_ct, len(pmscores)) if len(notfnd) > 0: print "No target found for {} STRING IDs. See logfile {} for details.".format( len(notfnd), logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. 
See logfile {} for details.".format( dba_err_ct, logfile) print "\nLoading {} JensenLab PubMed Score tdl_infos".format( len(pmscores.keys())) ct = 0 ti_ct = 0 dba_err_ct = 0 for pid, score in pmscores.items(): ct += 1 rv = dba.ins_tdl_info({ 'protein_id': pid, 'itype': 'JensenLab PubMed Score', 'number_value': score }) if rv: ti_ct += 1 else: dba_err_ct += 1 print "{} pmscores processed.".format(ct) print " Inserted {} new JensenLab PubMed Score tdl_info rows".format( ti_ct) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. See logfile {} for details.".format( dba_err_ct, logfile)
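# (Illustrative sketch.) The pmscore loader above keeps a running per-protein total of the
# per-year JensenLab scores and stores the sum as a 'JensenLab PubMed Score' tdl_info.
# The accumulation step, isolated with hypothetical inputs:
from collections import defaultdict

def sum_pmscores_sketch(rows):
  """rows: iterable of (protein_id, year, score); returns protein_id => summed score."""
  totals = defaultdict(float)
  for pid, year, score in rows:
    totals[pid] += float(score)
  return dict(totals)

print sum_pmscores_sketch([(42, 2016, '1.25'), (42, 2017, '2.25')])  # {42: 3.5}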
def load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset dataset_id = dba.ins_dataset({ 'name': 'Jensen Lab DISEASES', 'source': 'Files %s from %s' % (", ".join(SRC_FILES), BASE_URL), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://diseases.jensenlab.org/' }) assert dataset_id, "Error inserting dataset See logfile {} for details.".format( logfile) # Provenance rv = dba.ins_provenance({ 'dataset_id': dataset_id, 'table_name': 'disease', 'where_clause': "dtype LIKE 'JensenLab %'" }) assert rv, "Error inserting provenance. See logfile {} for details.".format( logfile) # Knowledge channel fn = DOWNLOAD_DIR + FILE_K line_ct = slmf.wcl(fn) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] if not args['--quiet']: print "\nProcessing {} lines in file {}".format(line_ct, fn) with open(fn, 'rU') as tsv: pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() tsvreader = csv.reader(tsv, delimiter='\t') ct = 0 pmark = {} notfnd = set() dis_ct = 0 dba_err_ct = 0 for row in tsvreader: ct += 1 ensp = row[0] sym = row[1] k = "%s|%s" % (ensp, sym) if k in notfnd: continue targets = dba.find_targets({'stringid': ensp}) if not targets: targets = dba.find_targets({'sym': sym}, idg=False) if not targets: notfnd.add(k) logger.warn("No target found for {}".format(k)) continue dtype = 'JensenLab Knowledge ' + row[4] for t in targets: p = t['components']['protein'][0] pmark[p['id']] = True init = { 'protein_id': p['id'], 'dtype': dtype, 'name': row[3], 'did': row[2], 'evidence': row[5], 'conf': row[6] } rv = dba.ins_disease(init) if not rv: dba_err_ct += 1 continue dis_ct += 1 pbar.update(ct) pbar.finish() print "{} lines processed.".format(ct) print "Inserted {} new disease rows for {} proteins".format( dis_ct, len(pmark)) if notfnd: print "No target found for {} stringids/symbols. See logfile {} for details.".format( len(notfnd), logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. 
See logfile {} for details.".format( dba_err_ct, logfile) # Experiment channel fn = DOWNLOAD_DIR + FILE_E line_ct = slmf.wcl(fn) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] if not args['--quiet']: print "\nProcessing {} lines in file {}".format(line_ct, fn) with open(fn, 'rU') as tsv: pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() tsvreader = csv.reader(tsv, delimiter='\t') ct = 0 pmark = {} notfnd = set() dis_ct = 0 skip_ct = 0 dba_err_ct = 0 for row in tsvreader: ct += 1 if row[6] == '0': # skip zero confidence rows skip_ct += 1 continue ensp = row[0] sym = row[1] k = "%s|%s" % (ensp, sym) if k in notfnd: continue targets = dba.find_targets({'stringid': ensp}) if not targets: targets = dba.find_targets({'sym': sym}, idg=False) if not targets: notfnd.add(k) logger.warn("No target found for {}".format(k)) continue dtype = 'JensenLab Experiment ' + row[4] for t in targets: p = t['components']['protein'][0] pmark[p['id']] = True rv = dba.ins_disease({ 'protein_id': p['id'], 'dtype': dtype, 'name': row[3], 'did': row[2], 'evidence': row[5], 'conf': row[6] }) if not rv: dba_err_ct += 1 continue dis_ct += 1 pbar.update(ct) pbar.finish() print "{} lines processed.".format(ct) print "Inserted {} new disease rows for {} proteins".format( dis_ct, len(pmark)) if skip_ct > 0: print "Skipped {} zero confidence rows".format(skip_ct) if notfnd: print "No target found for {} stringids/symbols. See logfile {} for details.".format( len(notfnd), logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. See logfile {} for details.".format( dba_err_ct, logfile) # Text Mining channel fn = DOWNLOAD_DIR + FILE_T line_ct = slmf.wcl(fn) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] if not args['--quiet']: print "\nProcessing {} lines in file {}".format(line_ct, fn) with open(fn, 'rU') as tsv: pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() tsvreader = csv.reader(tsv, delimiter='\t') ct = 0 pmark = {} notfnd = set() dis_ct = 0 dba_err_ct = 0 for row in tsvreader: ct += 1 ensp = row[0] sym = row[1] k = "%s|%s" % (ensp, sym) if k in notfnd: continue targets = dba.find_targets({'stringid': ensp}) if not targets: targets = dba.find_targets({'sym': sym}, idg=False) if not targets: notfnd.add(k) logger.warn("No target found for {}".format(k)) continue dtype = 'JensenLab Text Mining' for t in targets: p = t['components']['protein'][0] pmark[p['id']] = True rv = dba.ins_disease({ 'protein_id': p['id'], 'dtype': dtype, 'name': row[3], 'did': row[2], 'zscore': row[4], 'conf': row[5] }) if not rv: dba_err_ct += 1 continue dis_ct += 1 pbar.update(ct) pbar.finish() print "{} lines processed.".format(ct) print "Inserted {} new disease rows for {} proteins".format( dis_ct, len(pmark)) if notfnd: print "No target found for {} stringids/symbols. See logfile {} for details.".format( len(notfnd), logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. See logfile {} for details.".format( dba_err_ct, logfile)
def load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset dataset_id = dba.ins_dataset({ 'name': 'Expression Atlas', 'source': 'IDG-KMC generated data at UNM.', 'app': PROGRAM, 'app_version': __version__, 'url': 'https://www.ebi.ac.uk/gxa/', 'comment': 'Disease associations are derived from files from ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/atlas/experiments/atlas-latest-data.tar.gz' }) assert dataset_id, "Error inserting dataset See logfile {} for details.".format( logfile) # Provenance rv = dba.ins_provenance({ 'dataset_id': dataset_id, 'table_name': 'disease', 'where_clause': "dtype = 'Expression Atlas'" }) assert rv, "Error inserting provenance. See logfile {} for details.".format( logfile) line_ct = slmf.wcl(INPUT_FILE) if not args['--quiet']: print "\nProcessing {} lines in file {}".format(line_ct, INPUT_FILE) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] with open(INPUT_FILE, 'rU') as tsv: pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() tsvreader = csv.reader(tsv, delimiter='\t') header = tsvreader.next() # skip header line ct = 0 k2pids = {} pmark = {} notfnd = set() dis_ct = 0 dba_err_ct = 0 for row in tsvreader: # 0: "Gene ID" # 1: "DOID" # 2: "Gene Name" # 3: "log2foldchange" # 4: "p-value" # 5: "disease" # 6: "experiment_id" # 7: "contrast_id" ct += 1 sym = row[2] ensg = row[0] k = "%s|%s" % (sym, ensg) if k in k2pids: # we've already found it pids = k2pids[k] elif k in notfnd: # we've already not found it continue else: targets = dba.find_targets({'sym': sym}, idg=False) if not targets: targets = dba.find_targets_by_xref({ 'xtype': 'ENSG', 'value': ensg }) if not targets: notfnd.add(k) logger.warn("No target found for {}".format(k)) continue pids = [] for t in targets: p = t['components']['protein'][0] pmark[p['id']] = True pids.append(p['id']) k2pids[ k] = pids # save this mapping so we only lookup each target once for pid in pids: rv = dba.ins_disease({ 'protein_id': pid, 'dtype': 'Expression Atlas', 'name': row[5], 'did': row[1], 'log2foldchange': "%.3f" % float(row[3]), 'pvalue': row[4] }) if not rv: dba_err_ct += 1 continue dis_ct += 1 pbar.update(ct) pbar.finish() print "{} lines processed.".format(ct) print "Loaded {} new disease rows for {} proteins.".format( dis_ct, len(pmark)) if notfnd: print "No target found for {} symbols/ensgs. See logfile {} for details.".format( len(notfnd), logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. See logfile {} for details.".format( dba_err_ct, logfile)
def load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset dataset_id = dba.ins_dataset({ 'name': 'GeneRIF Years', 'source': 'PubMed records via NCBI E-Utils', 'app': PROGRAM, 'app_version': __version__, 'url': 'https://www.ncbi.nlm.nih.gov/pubmed' }) assert dataset_id, "Error inserting dataset See logfile {} for details.".format( logfile) # Provenance rv = dba.ins_provenance({ 'dataset_id': dataset_id, 'table_name': 'generif', 'column_name': 'years' }) assert rv, "Error inserting provenance. See logfile {} for details.".format( logfile) pubmed2date = pickle.load(open(PICKLE_FILE, 'rb')) if not args['--quiet']: print "\nGot %d PubMed date mappings from file %s" % (len(pubmed2date), PICKLE_FILE) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] generifs = dba.get_generifs() if not args['--quiet']: print "\nProcessing {} GeneRIFs".format(len(generifs)) logger.info("Processing {} GeneRIFs".format(len(generifs))) pbar = ProgressBar(widgets=pbar_widgets, maxval=len(generifs)).start() yrre = re.compile(r'^(\d{4})') ct = 0 yr_ct = 0 skip_ct = 0 net_err_ct = 0 dba_err_ct = 0 for generif in generifs: ct += 1 logger.debug("Processing GeneRIF: {}".format(generif)) # GeneRIFs with multiple refs often have duplicates, so fix that if "|" in generif['pubmed_ids']: pmids = set(generif['pubmed_ids'].split("|")) pmids = list(pmids) rv = dba.do_update({ 'table': 'generif', 'id': generif['id'], 'col': 'pubmed_ids', 'val': "|".join(pmids) }) if not rv: dba_err_ct += 1 else: pmids = [generif['pubmed_ids']] years = list() for pmid in pmids: if pmid in pubmed2date: m = yrre.match(pubmed2date[pmid]) if m: years.append(m.groups(1)[0]) else: years.append('') else: years.append('') # See if we got any years... if any(years): # if so, so do the updates rv = dba.do_update({ 'table': 'generif', 'id': generif['id'], 'col': 'years', 'val': "|".join(years) }) if rv: yr_ct += 1 else: dba_err_ct += 1 else: # if not, skip skip_ct += 1 pbar.update(ct) pbar.finish() if not args['--quiet']: print "{} GeneRIFs processed.".format(ct) print " Updated {} genefifs with years".format(yr_ct) print " Skipped {} generifs with no years.".format(skip_ct) if dba_err_ct > 0: print "WARNNING: {} DB errors occurred. See logfile {} for details.".format( dba_err_ct, logfile) if net_err_ct > 0: print "WARNING: {} Network/E-Utils errors occurred. See logfile {} for details.".format( net_err_ct, logfile)
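# (Illustrative sketch.) The GeneRIF-years loader above de-duplicates the pipe-separated
# PubMed IDs on each generif row and maps each PMID to a four-digit year via the
# pubmed2date pickle. That mapping step in isolation, with a made-up date map:
import re

def pmids_to_years_sketch(pubmed_ids, pubmed2date):
  """Return a '|'-joined string with one (possibly empty) year per unique PMID."""
  yrre = re.compile(r'^(\d{4})')
  years = []
  for pmid in set(pubmed_ids.split("|")):
    m = yrre.match(pubmed2date.get(pmid, ''))
    years.append(m.group(1) if m else '')
  return "|".join(years)

print pmids_to_years_sketch("123|123|456", {'123': '2015 Jun 1', '456': '2018'})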
def tinx(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # The results of parsing the input mentions files will be the following dictionaries: pid2pmids = { } # 'TCRD.protein.id,UniProt' => set of all PMIDs that mention the protein # Including the UniProt accession in the key is just for convenience when # checking the output. It is not used for anything. doid2pmids = {} # DOID => set of all PMIDs that mention the disease pmid_disease_ct = { } # PMID => count of diseases mentioned in a given paper pmid_protein_ct = { } # PMID => count of proteins mentioned in a given paper # First parse the Disease Ontology OBO file to get DO names and defs dofile = DO_DOWNLOAD_DIR + DO_OBO print "\nParsing Disease Ontology file {}".format(dofile) do_parser = obo.Parser(open(dofile)) do = {} for stanza in do_parser: do[stanza.tags['id'][0].value] = stanza.tags print " Got {} Disease Ontology terms".format(len(do)) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] fn = JL_DOWNLOAD_DIR + PROTEIN_FILE line_ct = slmf.wcl(fn) if not args['--quiet']: print "\nProcessing {} lines in protein file {}".format(line_ct, fn) with open(fn, 'rU') as tsvf: pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() ct = 0 skip_ct = 0 notfnd = set() for line in tsvf: ct += 1 pbar.update(ct) if not line.startswith('ENSP'): skip_ct += 1 continue data = line.rstrip().split('\t') ensp = data[0] pmids = set([int(pmid) for pmid in data[1].split()]) targets = dba.find_targets({'stringid': ensp}) if not targets: # if we don't find a target by stringid, which is the more reliable and # prefered way, try by Ensembl xref targets = dba.find_targets_by_xref({ 'xtype': 'Ensembl', 'value': ensp }) if not targets: notfnd.add(ensp) continue for t in targets: p = t['components']['protein'][0] k = "%s,%s" % (p['id'], p['uniprot']) if k in pid2pmids: pid2pmids[k] = pid2pmids[k].union(pmids) else: pid2pmids[k] = set(pmids) for pmid in pmids: if pmid in pmid_protein_ct: pmid_protein_ct[pmid] += 1.0 else: pmid_protein_ct[pmid] = 1.0 pbar.finish() for ensp in notfnd: logger.warn("No target found for {}".format(ensp)) print "{} lines processed.".format(ct) print " Skipped {} non-ENSP lines".format(skip_ct) print " Saved {} protein to PMIDs mappings".format(len(pid2pmids)) print " Saved {} PMID to protein count mappings".format( len(pmid_protein_ct)) if notfnd: print " No target found for {} ENSPs. 
See logfile {} for details.".format( len(notfnd), logfile) fn = JL_DOWNLOAD_DIR + DISEASE_FILE line_ct = slmf.wcl(fn) if not args['--quiet']: print "\nProcessing {} lines in file {}".format(line_ct, fn) with open(fn, 'rU') as tsvf: pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() ct = 0 skip_ct = 0 notfnd = set() for line in tsvf: ct += 1 pbar.update(ct) if not line.startswith('DOID:'): skip_ct += 1 continue data = line.rstrip().split('\t') doid = data[0] pmids = set([int(pmid) for pmid in data[1].split()]) if doid not in do: logger.warn("%s not found in DO" % doid) notfnd.add(doid) continue if doid in doid2pmids: doid2pmids[doid] = doid2pmids[doid].union(pmids) else: doid2pmids[doid] = set(pmids) for pmid in pmids: if pmid in pmid_disease_ct: pmid_disease_ct[pmid] += 1.0 else: pmid_disease_ct[pmid] = 1.0 pbar.finish() print "{} lines processed.".format(ct) print " Skipped {} non-DOID lines".format(skip_ct) print " Saved {} DOID to PMIDs mappings".format(len(doid2pmids)) print " Saved {} PMID to disease count mappings".format( len(pmid_disease_ct)) if notfnd: print "WARNNING: No entry found in DO map for {} DOIDs. See logfile {} for details.".format( len(notfnd), logfile) if not args['--quiet']: print "\nComputing protein novely scores" # To calculate novelty scores, each paper (PMID) is assigned a # fractional target (FT) score of one divided by the number of targets # mentioned in it. The novelty score of a given protein is one divided # by the sum of the FT scores for all the papers mentioning that # protein. ct = 0 with open(PROTEIN_NOVELTY_FILE, 'wb') as pnovf: pnovf.write("Protein ID,UniProt,Novelty\n") for k in pid2pmids.keys(): ct += 1 ft_score_sum = 0.0 for pmid in pid2pmids[k]: ft_score_sum += 1.0 / pmid_protein_ct[pmid] novelty = 1.0 / ft_score_sum pnovf.write("%s,%.8f\n" % (k, novelty)) print " Wrote {} novelty scores to file {}".format( ct, PROTEIN_NOVELTY_FILE) if not args['--quiet']: print "\nComputing disease novely scores" # Exactly as for proteins, but using disease mentions ct = 0 with open(DISEASE_NOVELTY_FILE, 'wb') as dnovf: dnovf.write("DOID,Novelty\n") for doid in doid2pmids.keys(): ct += 1 ft_score_sum = 0.0 for pmid in doid2pmids[doid]: ft_score_sum += 1.0 / pmid_disease_ct[pmid] novelty = 1.0 / ft_score_sum dnovf.write("%s,%.8f\n" % (doid, novelty)) print " Wrote {} novelty scores to file {}".format( ct, DISEASE_NOVELTY_FILE) if not args['--quiet']: print "\nComputing importance scores" # To calculate importance scores, each paper is assigned a fractional # disease-target (FDT) score of one divided by the product of the # number of targets mentioned and the number of diseases # mentioned. The importance score for a given disease-target pair is # the sum of the FDT scores for all papers mentioning that disease and # protein. ct = 0 with open(IMPORTANCE_FILE, 'wb') as impf: impf.write("DOID,Protein ID,UniProt,Score\n") for k, ppmids in pid2pmids.items(): for doid, dpmids in doid2pmids.items(): pd_pmids = ppmids.intersection(dpmids) fdt_score_sum = 0.0 for pmid in pd_pmids: fdt_score_sum += 1.0 / (pmid_protein_ct[pmid] * pmid_disease_ct[pmid]) if fdt_score_sum > 0: ct += 1 impf.write("%s,%s,%.8f\n" % (doid, k, fdt_score_sum)) print " Wrote {} importance scores to file {}".format(ct, IMPORTANCE_FILE) if not args['--quiet']: print "\nComputing PubMed rankings" # PMIDs are ranked for a given disease-target pair based on a score # calculated by multiplying the number of targets mentioned and the # number of diseases mentioned in that paper. 
Lower scores have a lower # rank (higher priority). If the scores do not discriminate, PMIDs are # reverse sorted by value with the assumption that larger PMIDs are # newer and of higher priority. ct = 0 with open(PMID_RANKING_FILE, 'wb') as pmrf: pmrf.write("DOID,Protein ID,UniProt,PubMed ID,Rank\n") for k, ppmids in pid2pmids.items(): for doid, dpmids in doid2pmids.items(): pd_pmids = ppmids.intersection(dpmids) scores = [ ] # scores are tuples of (PMID, protein_mentions*disease_mentions) for pmid in pd_pmids: scores.append( (pmid, pmid_protein_ct[pmid] * pmid_disease_ct[pmid])) if len(scores) > 0: scores.sort(cmp_pmids_scores) for i, t in enumerate(scores): ct += 1 pmrf.write("%s,%s,%d,%d\n" % (doid, k, t[0], i)) print " Wrote {} PubMed rankings to file {}".format(ct, PMID_RANKING_FILE)
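# (Illustrative sketch.) tinx() above computes scores exactly as described in its comments:
# each paper gets a fractional target (FT) score of 1/(number of targets it mentions),
# novelty is 1 over the sum of FT scores for a protein's papers, and importance for a
# disease-target pair sums 1/(targets_mentioned * diseases_mentioned) over shared papers.
# A tiny worked example with hypothetical mention counts:
def novelty_sketch(pmids, pmid_protein_ct):
  return 1.0 / sum(1.0 / pmid_protein_ct[pmid] for pmid in pmids)

def importance_sketch(ppmids, dpmids, pmid_protein_ct, pmid_disease_ct):
  shared = ppmids.intersection(dpmids)
  return sum(1.0 / (pmid_protein_ct[p] * pmid_disease_ct[p]) for p in shared)

prot_ct = {1: 2.0, 2: 4.0}  # paper 1 mentions 2 targets, paper 2 mentions 4
dis_ct = {1: 1.0, 2: 2.0}   # paper 1 mentions 1 disease, paper 2 mentions 2
print novelty_sketch(set([1, 2]), prot_ct)                       # 1/(0.5 + 0.25) = 1.333...
print importance_sketch(set([1, 2]), set([2]), prot_ct, dis_ct)  # 1/(4*2) = 0.125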
def calc_and_load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__} dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset dataset_id = dba.ins_dataset( {'name': 'KEGG Distances', 'source': 'IDG-KMC generated data by Steve Mathias at UNM.', 'app': PROGRAM, 'app_version': __version__, 'comments': 'Directed graphs are produced from KEGG pathway KGML files and all shortest path lengths are then calculated and stored.'} ) assert dataset_id, "Error inserting dataset See logfile {} for details.".format(logfile) # Provenance rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'kegg_distance'}) assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile) kgmls = get_kgmls(KGML_DIR) if not args['--quiet']: print "\nProcessing {} KGML files in {}".format(len(kgmls), KGML_DIR) logger.info("Processing {} KGML files in {}".format(len(kgmls), KGML_DIR)) pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()] pbar = ProgressBar(widgets=pbar_widgets, maxval=len(kgmls)).start() # All pathways shortest path lengths # (node1, node2) => distance all_pws_spls = {} ct = 0 err_ct = 0 for kgml in kgmls: logger.info(" Working on {}".format(kgml)) ct += 1 try: dig = kg.kgml_file_to_digraph(kgml) except: err_ct += 1 logger.error("Error parsing file: {}".format(kgml)) continue aspls = nx.all_pairs_shortest_path_length(dig) dct = 0 for source in aspls: for target in aspls[source]: if source == target: continue st = (source, target) if st in all_pws_spls: if aspls[source][target] < all_pws_spls[st]: all_pws_spls[st] = aspls[source][target] dct += 1 else: all_pws_spls[st] = aspls[source][target] dct += 1 logger.info(" {} has {} non-zero shortest path lengths".format(kgml, dct)) pbar.update(ct) pbar.finish() logger.info("Got {} total unique non-zero shortest path lengths".format(len(all_pws_spls))) if not args['--quiet']: print " Got {} total unique non-zero shortest path lengths".format(len(all_pws_spls)) if err_ct > 0: print "WARNNING: {} parsing errors occurred. 
See logfile {} for details.".format(err_ct, logfile) logger.info("Processing {} KEGG Distances".format(len(all_pws_spls))) if not args['--quiet']: print "\nProcessing {} KEGG Distances".format(len(all_pws_spls)) pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()] pbar = ProgressBar(widgets=pbar_widgets, maxval=len(all_pws_spls)).start() gid2pids = defaultdict(list) # So we only find each target once, # save protein.geneid => protein.id(s) notfnd = set() ct = 0 skip_ct = 0 kd_ct = 0 dba_err_ct = 0 for st,dist in all_pws_spls.items(): ct += 1 geneid1 = re.sub(r'^hsa:', '', st[0]) geneid2 = re.sub(r'^hsa:', '', st[1]) if geneid1 in gid2pids: pids1 = gid2pids[geneid1] elif geneid1 in notfnd: skip_ct += 1 continue else: targets = dba.find_targets({'geneid': geneid1}) if not targets: skip_ct += 1 notfnd.add(geneid1) # add to notfnd so we don't try looking it up again logger.warn("No target found for KEGG Gene ID {}".format(geneid1)) continue pids1 = [] for t in targets: pid = t['components']['protein'][0]['id'] pids1.append(pid) gid2pids[geneid1].append(pid) if geneid2 in gid2pids: pids2 = gid2pids[geneid2] elif geneid2 in notfnd: skip_ct += 1 continue else: targets = dba.find_targets({'geneid': geneid2}) if not targets: skip_ct += 1 notfnd.add(geneid2) # add to notfnd so we don't try looking it up again logger.warn("No target found for KEGG Gene ID {}".format(geneid2)) continue pids2 = [] for t in targets: pid = t['components']['protein'][0]['id'] pids2.append(pid) gid2pids[geneid2].append(pid) for pid1 in pids1: for pid2 in pids2: rv = dba.ins_kegg_distance({'pid1': pid1, 'pid2': pid2, 'distance': dist}) if rv: kd_ct += 1 else: dba_err_ct += 1 pbar.update(ct) pbar.finish() print "{} KEGG Distances processed.".format(ct) print " Inserted {} new kegg_distance rows".format(kd_ct) if skip_ct > 0: print " {} KEGG IDs not found in TCRD - Skipped {} rows. See logfile {} for details.".format(len(notfnd), skip_ct, logfile) if dba_err_ct > 0: print "WARNNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
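# (Illustrative sketch.) calc_and_load() above turns each KGML pathway into a directed
# graph and keeps the minimum shortest-path length seen for every (source, target) pair
# across all pathways. The core of that computation on toy edge lists, assuming networkx
# is available (node names here are hypothetical hsa: gene ids):
import networkx as nx

def min_spls_sketch(edge_lists):
  """edge_lists: one list of (source, target) edges per pathway; returns {(u, v): min distance}."""
  best = {}
  for edges in edge_lists:
    dig = nx.DiGraph(edges)
    # dict() tolerates both the older dict and the newer iterator return types
    aspls = dict(nx.all_pairs_shortest_path_length(dig))
    for source, dists in aspls.items():
      for target, d in dists.items():
        if source == target:
          continue
        if (source, target) not in best or d < best[(source, target)]:
          best[(source, target)] = d
  return best

print min_spls_sketch([[('hsa:1', 'hsa:2'), ('hsa:2', 'hsa:3')], [('hsa:1', 'hsa:3')]])
# ('hsa:1', 'hsa:3') ends up 1, not 2, because the second pathway provides a shorter path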
def load(args): loglevel = int(args['--loglevel']) if args['--logfile']: logfile = args['--logfile'] else: logfile = LOGFILE logger = logging.getLogger(__name__) logger.setLevel(loglevel) if not args['--debug']: logger.propagate = False # turns off console logging fh = logging.FileHandler(logfile) fmtr = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S') fh.setFormatter(fmtr) logger.addHandler(fh) dba_params = { 'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__ } dba = DBAdaptor(dba_params) dbi = dba.get_dbinfo() logger.info( "Connected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver'])) if not args['--quiet']: print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format( args['--dbname'], dbi['schema_ver'], dbi['data_ver']) # Dataset dataset_id = dba.ins_dataset({ 'name': 'OMIM', 'source': 'Files %s downloaded from omim.org' % ", ".join([GENEMAP_FILE, TITLES_FILE, PS_FILE]), 'app': PROGRAM, 'app_version': __version__, 'url': 'http://omim.org/', 'comments': 'Confirmed OMIM phenotypes and OMIM Phenotype Series info' }) assert dataset_id, "Error inserting dataset See logfile {} for details.".format( logfile) # Provenance provs = [{ 'dataset_id': dataset_id, 'table_name': 'omim' }, { 'dataset_id': dataset_id, 'table_name': 'omim_ps' }, { 'dataset_id': dataset_id, 'table_name': 'phenotype', 'where_clause': "ptype = 'OMIM'" }] for prov in provs: rv = dba.ins_provenance(prov) assert rv, "Error inserting provenance. See logfile {} for details.".format( logfile) # OMIMs and Phenotypic Series fname = DOWNLOAD_DIR + TITLES_FILE line_ct = slmf.wcl(fname) if not args['--quiet']: print '\nProcessing %d lines from input file %s' % (line_ct, fname) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() with open(fname, 'rU') as tsv: tsvreader = csv.reader(tsv, delimiter='\t') ct = 0 skip_ct = 0 omim_ct = 0 dba_err_ct = 0 for row in tsvreader: ct += 1 if row[0].startswith('#'): # The file has commented lines skip_ct += 1 continue # The fields are: # 0: Prefix ??? # 1: Mim Number # 2: Preferred Title; symbol Alternative Title(s); symbol(s) # 3: Included Title(s); symbols title = row[2].partition(';')[0] rv = dba.ins_omim({'mim': row[1], 'title': title}) if not rv: dba_err_ct += 1 continue omim_ct += 1 pbar.update(ct) pbar.finish() print "{} lines processed".format(ct) print " Skipped {} commented lines.".format(skip_ct) print "Loaded {} new omim rows".format(omim_ct) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. 
See logfile {} for details.".format( dba_err_ct, logfile) fname = DOWNLOAD_DIR + PS_FILE line_ct = slmf.wcl(fname) if not args['--quiet']: print '\nProcessing %d lines from input file %s' % (line_ct, fname) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() with open(fname, 'rU') as tsv: tsvreader = csv.reader(tsv, delimiter='\t') ct = 0 skip_ct = 0 ps_ct = 0 err_ct = 0 dba_err_ct = 0 for row in tsvreader: ct += 1 if row[0].startswith('#'): # The file has commented lines skip_ct += 1 continue # The fields are: # 0: Phenotypic Series Number # 1: Mim Number # 2: Phenotype if len(row) == 2: init = {'omim_ps_id': row[0], 'title': row[1]} elif len(row) == 3: init = {'omim_ps_id': row[0], 'mim': row[1], 'title': row[2]} else: err_ct += 1 logger.warn("Parsing error for row {}".format(row)) continue rv = dba.ins_omim_ps(init) if not rv: dba_err_ct += 1 continue ps_ct += 1 pbar.update(ct) pbar.finish() print "{} lines processed".format(ct) print " Skipped {} commented lines.".format(skip_ct) print "Loaded {} new omim_ps rows".format(ps_ct) if err_ct > 0: print "WARNING: {} parsing errors occurred. See logfile {} for details.".format( err_ct, logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. See logfile {} for details.".format( dba_err_ct, logfile) # Phenotypes fname = DOWNLOAD_DIR + GENEMAP_FILE line_ct = slmf.wcl(fname) if not args['--quiet']: print '\nProcessing %d lines from input file %s' % (line_ct, fname) pbar_widgets = [ 'Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA() ] pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() with open(fname, 'rU') as tsv: tsvreader = csv.reader(tsv, delimiter='\t') ct = 0 tmark = {} skip_ct = 0 notfnd_ct = 0 prov_ct = 0 dds_ct = 0 pt_ct = 0 dba_err_ct = 0 for row in tsvreader: ct += 1 if row[0].startswith('#'): # The file has commented lines skip_ct += 1 continue # The fields are: # 0 - Sort ??? # 1 - Month # 2 - Day # 3 - Year # 4 - Cytogenetic location # 5 - Gene Symbol(s) # 6 - Confidence # 7 - Gene Name # 8 - MIM Number # 9 - Mapping Method # 10 - Comments # 11 - Phenotypes # 12 - Mouse Gene Symbol pts = row[11] if pts.startswith('?'): prov_ct += 1 continue if '(4)' in pts: # skip deletion/duplication syndrome rows dds_ct += 1 continue trait = "MIM Number: %s" % row[8] if row[11]: trait += "; Phenotype: %s" % pts found = False syms = row[5].split(', ') logger.info("Checking for OMIM syms: {}".format(syms)) for sym in syms: targets = dba.find_targets({'sym': sym}) if targets: found = True for t in targets: p = t['components']['protein'][0] logger.info( " Symbol {} found target {}: {}, {}".format( sym, t['id'], p['name'], p['description'])) rv = dba.ins_phenotype({ 'protein_id': p['id'], 'ptype': 'OMIM', 'trait': trait }) if not rv: dba_err_ct += 1 continue tmark[t['id']] = True pt_ct += 1 if not found: notfnd_ct += 1 logger.warn("No target found for row {}".format(row)) pbar.update(ct) pbar.finish() print "{} lines processed".format(ct) print " Skipped {} commented lines.".format(skip_ct) print " Skipped {} provisional phenotype rows.".format(prov_ct) print " Skipped {} deletion/duplication syndrome rows.".format(dds_ct) print "Loaded {} OMIM phenotypes for {} targets".format(pt_ct, len(tmark)) if notfnd_ct > 0: print "No target found for {} good lines. See logfile {} for details.".format( notfnd_ct, logfile) if dba_err_ct > 0: print "WARNING: {} DB errors occurred. See logfile {} for details.".format( dba_err_ct, logfile)
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'Harmonogram CDFs', 'source': 'IDG-KMC generated data by Steve Mathias at UNM.', 'app': PROGRAM, 'app_version': __version__, 'comments': 'CDFs are calculated by the loader app based on gene_attribute data in TCRD.'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'hgram_cdf'})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  # Create a dictionary of gene_attribute_type.name => [] pairs
  counts = {}
  # Create a dictionary of gene_attribute_type.name => {} pairs
  stats = {}
  gatypes = dba.get_gene_attribute_types()
  for ga in gatypes:
    counts[ga] = []
    stats[ga] = {}
  tct = dba.get_target_count(idg=False)
  if not args['--quiet']:
    print "\nCollecting counts for {} gene attribute types on {} TCRD targets".format(len(gatypes), tct)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  ct = 0
  for t in dba.get_targets(idg=False, include_annotations=True, get_ga_counts=True):
    ct += 1
    pbar.update(ct)
    p = t['components']['protein'][0]
    pid = p['id']
    if not 'gene_attribute_counts' in p:
      continue
    for type, attr_count in p['gene_attribute_counts'].items():
      counts[type].append(attr_count)
  pbar.finish()

  print "\nCalculating Gene Attribute stats. See logfile {}.".format(logfile)
  logger.info("Calculating Gene Attribute stats:")
  for type, l in counts.items():
    if len(l) == 0:
      del counts[type]
      continue
    npa = numpy.array(l)
    logger.info("  %s: %d counts; mean: %.2f; std: %.2f" % (type, len(l), npa.mean(), npa.std()))
    stats[type]['mean'] = npa.mean()
    stats[type]['std'] = npa.std()

  print "\nLoading HGram CDFs for {} TCRD targets".format(tct)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  ct = 0
  nan_ct = 0
  cdf_ct = 0
  dba_err_ct = 0
  for t in dba.get_targets(idg=False, include_annotations=True, get_ga_counts=True):
    ct += 1
    p = t['components']['protein'][0]
    pid = p['id']
    if not 'gene_attribute_counts' in p:
      continue
    for type, attr_count in p['gene_attribute_counts'].items():
      attr_cdf = gaussian_cdf(attr_count, stats[type]['mean'], stats[type]['std'])
      if math.isnan(attr_cdf):
        # fall back to a logistic approximation of the normal CDF
        attr_cdf = 1.0 / (1.0 + math.exp(-1.702 * ((attr_count - stats[type]['mean']) / stats[type]['std'])))
      if math.isnan(attr_cdf):
        nan_ct += 1
        continue
      rv = dba.ins_hgram_cdf({'protein_id': p['id'], 'type': type, 'attr_count': attr_count, 'attr_cdf': attr_cdf})
      if not rv:
        dba_err_ct += 1
        continue
      cdf_ct += 1
    pbar.update(ct)
  pbar.finish()
  print "Processed {} targets.".format(ct)
  print "  Loaded {} new hgram_cdf rows".format(cdf_ct)
  print "  Skipped {} NaN CDFs".format(nan_ct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
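
# gaussian_cdf() is used above but defined elsewhere in this loader. The sketch below
# is only one plausible implementation (an assumption, not the loader's actual code):
# the normal CDF evaluated via the error function. With numpy float inputs a zero std
# can propagate inf/NaN rather than raising, which the caller guards against with the
# logistic fallback and the NaN skip count.
def gaussian_cdf_sketch(x, mu, sigma):
  '''Normal CDF at x for mean mu and std dev sigma (hypothetical stand-in for gaussian_cdf).'''
  return 0.5 * (1.0 + math.erf((x - mu) / (sigma * math.sqrt(2.0))))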
def calc_and_load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'KEGG Nearest Tclins', 'source': 'IDG-KMC generated data by Steve Mathias at UNM.', 'app': PROGRAM, 'app_version': __version__, 'comments': 'Nearest upstream and downstream Tclin targets are found and stored based on KEGG Distances.'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'kegg_nearest_tclin'})
  assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  tct = dba.get_target_count()
  if not args['--quiet']:
    print "\nProcessing {} TCRD targets".format(tct)
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  ct = 0
  uct = 0
  umark = set()
  dct = 0
  dmark = set()
  dba_err_ct = 0
  for target in dba.get_targets():
  #tids = [1983, 7166]
  #for tid in tids:
  #  target = dba.get_target(tid)
    ct += 1
    if target['tdl'] == 'Tclin':
      continue
    pid = target['components']['protein'][0]['id']
    ups = dba.get_nearest_kegg_tclins(pid, 'upstream')
    if ups:
      umark.add(pid)
      for d in ups:
        d['tclin_id'] = d['protein_id']
        d['protein_id'] = pid
        d['direction'] = 'upstream'
        rv = dba.ins_kegg_nearest_tclin(d)
        if rv:
          uct += 1
        else:
          dba_err_ct += 1
    dns = dba.get_nearest_kegg_tclins(pid, 'downstream')
    if dns:
      dmark.add(pid)
      for d in dns:
        d['tclin_id'] = d['protein_id']
        d['protein_id'] = pid
        d['direction'] = 'downstream'
        rv = dba.ins_kegg_nearest_tclin(d)
        if rv:
          dct += 1
        else:
          dba_err_ct += 1
    pbar.update(ct)
  pbar.finish()
  if not args['--quiet']:
    print "\n{} targets processed.".format(ct)
    print "  {} non-Tclin targets have upstream Tclin target(s)".format(len(umark))
    print "    Inserted {} upstream kegg_nearest_tclin rows".format(uct)
    print "  {} non-Tclin targets have downstream Tclin target(s)".format(len(dmark))
    print "    Inserted {} downstream kegg_nearest_tclin rows".format(dct)
  if dba_err_ct > 0:
    print "WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
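
# Illustration of the re-keying done in calc_and_load() above. The shape of the rows
# returned by dba.get_nearest_kegg_tclins() is an assumption here: only 'protein_id'
# is known from the code; 'distance' is hypothetical.
#   returned row:  {'protein_id': 42, 'distance': 3}   # 42 = id of the nearest Tclin protein
#   inserted row:  {'protein_id': pid, 'tclin_id': 42, 'distance': 3, 'direction': 'upstream'}
# That is, the Tclin hit's id moves to 'tclin_id' and 'protein_id' is re-pointed at the
# query (non-Tclin) protein before calling dba.ins_kegg_nearest_tclin().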
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  # DBAdaptor uses same logger as main()
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database %s (schema ver %s; data ver %s)", args['--dbname'], dbi['schema_ver'], dbi['data_ver'])
  if not args['--quiet']:
    print "\nConnected to TCRD database %s (schema ver %s; data ver %s)" % (args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'NCBI Gene', 'source': 'EUtils web API at %s' % EFETCH_GENE_URL, 'app': PROGRAM, 'app_version': __version__, 'url': 'http://www.ncbi.nlm.nih.gov/gene'} )
  if not dataset_id:
    print "WARNING: Error inserting dataset. See logfile %s for details." % logfile
    sys.exit(1)
  # Provenance
  provs = [ {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'NCBI Gene Summary'"},
            {'dataset_id': dataset_id, 'table_name': 'tdl_info', 'where_clause': "itype = 'NCBI Gene PubMed Count'"},
            {'dataset_id': dataset_id, 'table_name': 'generif'},
            {'dataset_id': dataset_id, 'table_name': 'xref', 'where_clause': "dataset_id = %d" % dataset_id},
            {'dataset_id': dataset_id, 'table_name': 'alias', 'where_clause': "dataset_id = %d" % dataset_id} ]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    if not rv:
      print "WARNING: Error inserting provenance. See logfile %s for details." % logfile
      sys.exit(1)

  s = shelve.open(SHELF_FILE, writeback=True)
  s['loaded'] = []
  s['retries'] = {}
  s['counts'] = defaultdict(int)

  tct = dba.get_target_count()
  if not args['--quiet']:
    print "\nLoading NCBI Gene annotations for %d TCRD targets" % tct
  logger.info("Loading NCBI Gene annotations for %d TCRD targets\n" % tct)
  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  pbar = ProgressBar(widgets=pbar_widgets, maxval=tct).start()
  ct = 0
  skip_ct = 0
  for t in dba.get_targets(include_annotations=False):
    tid = t['id']
    ct += 1
    p = t['components']['protein'][0]
    pid = p['id']
    if p['geneid'] == None:
      skip_ct += 1
      continue
    geneid = str(p['geneid'])
    logger.info("Processing target %d: geneid %s" % (tid, geneid))
    (status, headers, xml) = get_ncbigene(geneid)
    if not status:
      logger.warn("Failed getting Gene ID %s" % geneid)
      s['retries'][tid] = True
      continue
    if status != 200:
      logger.warn("Bad API response for Gene ID %s: %s" % (geneid, status))
      s['retries'][tid] = True
      continue
    gene_annotations = parse_genexml(xml)
    if not gene_annotations:
      s['counts']['xml_err'] += 1
      logger.error("XML Error for Gene ID %s" % geneid)
      s['retries'][tid] = True
      continue
    load_annotations(dba, t, dataset_id, gene_annotations, s)
    time.sleep(0.5)
    pbar.update(ct)
  pbar.finish()
  print "Processed %d targets." % ct
  if skip_ct > 0:
    print "Skipped %d targets with no geneid" % skip_ct
  print "Loaded NCBI annotations for %d targets" % len(s['loaded'])
  if len(s['retries']) > 0:
    print "Total targets remaining for retries: %d" % len(s['retries'])

  loop = 1
  while len(s['retries']) > 0:
    print "\nRetry loop %d: Loading NCBI Gene annotations for %d TCRD targets" % (loop, len(s['retries']))
    logger.info("Retry loop %d: Loading NCBI Gene annotations for %d TCRD targets" % (loop, len(s['retries'])))
    pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=len(s['retries'])).start()
    ct = 0
    act = 0
    for tid, _ in s['retries'].items():
      ct += 1
      t = dba.get_target(tid, include_annotations=False)
      geneid = str(t['components']['protein'][0]['geneid'])
      logger.info("Processing target %d: geneid %s" % (tid, geneid))
      (status, headers, xml) = get_ncbigene(geneid)
      if not status:
        logger.warn("Failed getting Gene ID %s" % geneid)
        continue
      if status != 200:
        logger.warn("Bad API response for Gene ID %s: %s" % (geneid, status))
        continue
      gene_annotations = parse_genexml(xml)
      if not gene_annotations:
        s['counts']['xml_err'] += 1
        logger.error("XML Error for Gene ID %s" % geneid)
        continue
      load_annotations(dba, t, dataset_id, gene_annotations, s)
      act += 1
      del s['retries'][tid]
      time.sleep(0.5)
      pbar.update(ct)
    loop += 1
    if loop == 5:
      print "Completed 5 retry loops. Aborting."
      break
    pbar.finish()
    print "Processed %d targets." % ct
    print "  Annotated %d additional targets" % act
    print "  Total annotated targets: %d" % len(s['loaded'])
    if len(s['retries']) > 0:
      print "Total targets remaining for retries: %d" % len(s['retries'])

  print "\nInserted %d aliases" % s['counts']['alias']
  print "Inserted %d NCBI Gene Summary tdl_infos" % s['counts']['summary']
  print "Inserted %d NCBI Gene PubMed Count tdl_infos" % s['counts']['pmc']
  print "Inserted %d GeneRIFs" % s['counts']['generif']
  print "Inserted %d PubMed xrefs" % s['counts']['pmxr']
  #print "Inserted %d other xrefs" % s['counts']['xref']
  if s['counts']['xml_err'] > 0:
    print "WARNING: %d XML parsing errors occurred. See logfile %s for details." % (s['counts']['xml_err'], logfile)
  if s['counts']['dba_err'] > 0:
    print "WARNING: %d DB errors occurred. See logfile %s for details." % (s['counts']['dba_err'], logfile)
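
# get_ncbigene() and parse_genexml() used above are defined elsewhere in this loader.
# The sketch below only illustrates the (status, headers, xml) contract that the retry
# logic relies on; it is an assumption, not the loader's actual HTTP code, and it uses
# the requests library purely for illustration. EFETCH_GENE_URL is assumed to be an
# EUtils efetch URL ending in an id= parameter.
def get_ncbigene_sketch(geneid, attempts=3):
  '''Fetch Gene XML for one NCBI Gene ID (hypothetical stand-in for get_ncbigene).'''
  import requests
  for _ in range(attempts):
    try:
      r = requests.get(EFETCH_GENE_URL + geneid, timeout=30)
      # a 200 status with an XML body is what load() treats as success
      return (r.status_code, r.headers, r.text)
    except requests.RequestException:
      time.sleep(2)
  # a falsy status tells the caller to queue this target for a retry loop
  return (False, None, None)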