def do_tiga(dba, logger, logfile):
  tigas = dba.get_tigas()
  tigact = len(tigas)
  print(f"\nLoading {tigact} TIGA ExtLinks for TCRD proteins")
  ct = 0
  el_ct = 0
  pmark = {}
  dba_err_ct = 0
  for d in tigas:
    ct += 1
    slmf.update_progress(ct/tigact)
    rv = dba.ins_extlink({'source': 'TIGA', 'protein_id': d['protein_id'],
                          'url': TIGA_PAGE_URL.format(d['ensg'])})
    if not rv:
      dba_err_ct += 1
      continue
    el_ct += 1
    pmark[d['protein_id']] = True
  print("Inserted {} new TIGA extlink rows for {} TCRD proteins.".format(el_ct, len(pmark)))
  if dba_err_ct > 0:
    print(f"ERROR: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")
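# slmf refers to the project's shared loader-module helpers; every loader in
# this file calls slmf.update_progress with a completion fraction in [0, 1].
# Below is a minimal sketch of a compatible implementation (an in-place
# console progress bar), shown only to document the expected contract -- the
# real helper may differ.
import sys

def _update_progress_sketch(progress):
  # Render a carriage-return progress bar for a fraction in [0, 1].
  barlen = 50
  filled = int(round(barlen * progress))
  bar = '#' * filled + '-' * (barlen - filled)
  sys.stdout.write(f"\rProgress: [{bar}] {progress * 100:5.1f}%")
  if progress >= 1:
    sys.stdout.write("\n")
  sys.stdout.flush()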
def load_tdls(dba, logfile, logger):
  tids = dba.get_target_ids()
  tct = len(tids)
  print(f"\nCalculating/Loading TDLs for {tct} TCRD targets")
  ct = 0
  tdl_cts = {'Tclin': 0, 'Tchem': 0, 'Tbio': 0, 'Tdark': 0}
  bump_ct = 0
  dba_err_ct = 0
  upd_ct = 0
  for tid in tids:
    tinfo = dba.get_target4tdlcalc(tid)
    ct += 1
    slmf.update_progress(ct/tct)
    (tdl, bump_flag) = compute_tdl(tinfo)
    tdl_cts[tdl] += 1
    if bump_flag:
      bump_ct += 1
    rv = dba.do_update({'table': 'target', 'id': tid, 'col': 'tdl', 'val': tdl})
    if rv:
      upd_ct += 1
    else:
      dba_err_ct += 1
  print(f"{ct} TCRD targets processed.")
  print(f"Set TDL value for {upd_ct} targets:")
  print("  {} targets are Tclin".format(tdl_cts['Tclin']))
  print("  {} targets are Tchem".format(tdl_cts['Tchem']))
  print("  {} targets are Tbio - {} bumped from Tdark".format(tdl_cts['Tbio'], bump_ct))
  print("  {} targets are Tdark".format(tdl_cts['Tdark']))
  if dba_err_ct:
    print(f"ERROR: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")
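# compute_tdl is defined elsewhere in the codebase; the sketch below only
# illustrates the general shape of the IDG Target Development Level rules it
# implements (Tclin: drug with known mechanism of action; Tchem: qualifying
# small-molecule activity; otherwise Tbio/Tdark by knowledge-availability
# criteria). All field names and thresholds here are assumptions for
# illustration, not the project's authoritative logic.
def _compute_tdl_sketch(tinfo):
  bump_flag = False
  if tinfo.get('moa_drug_activity_count', 0) > 0:
    return ('Tclin', bump_flag)  # approved drug with known MoA
  if tinfo.get('drug_activity_count', 0) > 0 or tinfo.get('cmpd_activity_count', 0) > 0:
    return ('Tchem', bump_flag)  # qualifying compound/drug activity
  # Low-knowledge criteria: a target meeting two or more is Tdark.
  dark_pts = 0
  if tinfo.get('pubmed_score', 0.0) < 5:
    dark_pts += 1
  if tinfo.get('generif_count', 0) <= 3:
    dark_pts += 1
  if tinfo.get('antibody_count', 0) <= 50:
    dark_pts += 1
  tdl = 'Tdark' if dark_pts >= 2 else 'Tbio'
  if tdl == 'Tdark' and tinfo.get('experimental_mfbp_leaf_goa', False):
    # An experimental-evidence GO MF/BP leaf-term annotation bumps a would-be
    # Tdark target to Tbio; this is the bump_flag tallied by load_tdls above.
    tdl = 'Tbio'
    bump_flag = True
  return (tdl, bump_flag)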
def load_mondo(args, dba, logger, logfile, mondod, cfgd):
  mondo_ct = len(mondod)
  if not args['--quiet']:
    print(f"Loading {mondo_ct} Mondo terms")
  ct = 0
  ins_ct = 0
  dba_err_ct = 0
  for mondoid, md in mondod.items():
    ct += 1
    md['mondoid'] = mondoid
    rv = dba.ins_mondo(md)
    if rv:
      ins_ct += 1
    else:
      dba_err_ct += 1
    slmf.update_progress(ct/mondo_ct)
  # Dataset
  # The data-version field in the header of the OBO file has a release version:
  # data-version: releases/2016-03-25
  f = os.popen("head %s" % (cfgd['DOWNLOAD_DIR'] + cfgd['FILENAME']))
  for line in f:
    if line.startswith("data-version:"):
      ver = line.replace('data-version: ', '')
      break
  f.close()
  dataset_id = dba.ins_dataset({'name': 'Mondo',
                                'source': 'File %s, version %s' % (cfgd['BASE_URL'] + cfgd['FILENAME'], ver),
                                'app': PROGRAM, 'app_version': __version__,
                                'url': 'https://github.com/monarch-initiative/mondo'})
  assert dataset_id, f"Error inserting dataset. See logfile {logfile} for details."
  # Provenance
  provs = [{'dataset_id': dataset_id, 'table_name': 'mondo'},
           {'dataset_id': dataset_id, 'table_name': 'mondo_parent'},
           {'dataset_id': dataset_id, 'table_name': 'mondo_xref'}]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, f"Error inserting provenance. See logfile {logfile} for details."
  print(f"{ct} terms processed.")
  print(f"  Inserted {ins_ct} new mondo rows")
  if dba_err_ct > 0:
    print(f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")
def export_uniprot_mapping(dba, ofn):
  uptdls = dba.get_uniprots_tdls()
  ct = len(uptdls)
  exp_ct = 0
  print(f"\nExporting UniProts/TDLs for {ct} TCRD targets")
  with open(ofn, 'w') as ofh:
    ofh.write("UniProt_accession\tPharos_target\tTDL\n")
    for d in uptdls:
      # The UniProt accession doubles as the Pharos target identifier here.
      ofh.write(f"{d['uniprot']}\t{d['uniprot']}\t{d['tdl']}\n")
      exp_ct += 1
      slmf.update_progress(exp_ct/ct)
  print(f"Wrote {exp_ct} lines to file {ofn}")
def load_do(args, dba, logger, logfile, dod, cfgd):
  do_ct = len(dod)
  if not args['--quiet']:
    print(f"Loading {do_ct} Disease Ontology terms")
  ct = 0
  ins_ct = 0
  dba_err_ct = 0
  for doid, d in dod.items():
    ct += 1
    d['doid'] = doid
    rv = dba.ins_do(d)
    if rv:
      ins_ct += 1
    else:
      dba_err_ct += 1
    slmf.update_progress(ct/do_ct)
  # Dataset
  # The data-version field in the header of the OBO file has a release version:
  # data-version: releases/2016-03-25
  for line in os.popen("head %s" % (cfgd['DOWNLOAD_DIR'] + cfgd['FILENAME'])):
    if line.startswith("data-version:"):
      ver = line.replace('data-version: ', '')
      break
  dataset_id = dba.ins_dataset({'name': 'Disease Ontology',
                                'source': 'File %s, version %s' % (cfgd['BASE_URL'] + cfgd['FILENAME'], ver),
                                'app': PROGRAM, 'app_version': __version__,
                                'url': 'http://disease-ontology.org/'})
  assert dataset_id, f"Error inserting dataset. See logfile {logfile} for details."
  # Provenance
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'do'})
  assert rv, f"Error inserting provenance. See logfile {logfile} for details."
  rv = dba.ins_provenance({'dataset_id': dataset_id, 'table_name': 'do_xref'})
  assert rv, f"Error inserting provenance. See logfile {logfile} for details."
  print(f"{ct} terms processed.")
  print(f"  Inserted {ins_ct} new do rows")
  if dba_err_ct > 0:
    print(f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")
def load_mouse_rat(args, dba, dataset_id, logger, logfile):
  fn = UP_DOWNLOAD_DIR + UP_RODENT_FILE.replace('.gz', '')
  if not args['--quiet']:
    print(f"\nParsing file {fn}")
  root = objectify.parse(fn).getroot()
  up_ct = len(root.entry)
  if not args['--quiet']:
    print(f"Loading data for {up_ct} UniProt records")
  logger.info(f"Loading data for {up_ct} UniProt records in file {fn}")
  ct = 0
  load_ct = 0
  skip_ct = 0
  xml_err_ct = 0
  dba_err_ct = 0
  for i in range(len(root.entry)):
    ct += 1
    slmf.update_progress(ct/up_ct)
    entry = root.entry[i]
    # filter for mouse and rat records
    # findall (not find) so we iterate over all <name> elements
    for orgname in entry.organism.findall(NS + 'name'):
      if orgname.get('type') == 'scientific':
        break
    if orgname not in ['Mus musculus', 'Rattus norvegicus']:
      skip_ct += 1
      logger.debug("Skipping {} entry {}".format(orgname, entry.accession))
      continue
    logger.info("Processing entry {}".format(entry.accession))
    nhpinit = entry2nhpinit(entry, dataset_id)
    if not nhpinit:
      xml_err_ct += 1
      logger.error("XML Error for {}".format(entry.accession))
      continue
    nhpid = dba.ins_nhprotein(nhpinit)
    if not nhpid:
      dba_err_ct += 1
      continue
    logger.debug("Nhprotein insert id: {}".format(nhpid))
    load_ct += 1
  print(f"Processed {ct} UniProt records.")
  print(f"  Loaded {load_ct} Mouse and Rat nhproteins")
  if skip_ct > 0:
    print(f"  Skipped {skip_ct} non-Mouse/Rat records")
  if xml_err_ct > 0:
    print(f"WARNING: {xml_err_ct} XML parsing errors occurred. See logfile {logfile} for details.")
  if dba_err_ct > 0:
    print(f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")
def do_glygen(dba, logger, logfile):
  proteins = dba.get_proteins()
  pct = len(proteins)
  print(f"\nChecking/Loading GlyGen ExtLinks for {pct} TCRD proteins")
  ct = 0
  el_ct = 0
  notfnd = set()
  api_err_ct = 0
  dba_err_ct = 0
  for p in proteins:
    logger.info(f"Processing protein {p['id']}: {p['uniprot']}")
    ct += 1
    slmf.update_progress(ct/pct)
    ingg = chk_glygen(p['uniprot'])
    if ingg is True:
      rv = dba.ins_extlink({'source': 'GlyGen', 'protein_id': p['id'],
                            'url': GLYGEN_PROTEIN_PAGE_URL.format(p['uniprot'])})
      if not rv:
        dba_err_ct += 1
        continue
      el_ct += 1
    elif ingg is False:
      logger.warn(f"No GlyGen record for {p['uniprot']}")
      notfnd.add(p['uniprot'])
      continue
    else:
      logger.error(f"Unexpected GlyGen API result for {p['uniprot']}")
      api_err_ct += 1
      continue
  print(f"Processed {ct} TCRD proteins.")
  print(f"Inserted {el_ct} new GlyGen extlink rows.")
  if notfnd:
    print("No GlyGen record found for {} TCRD UniProts. See logfile {} for details.".format(len(notfnd), logfile))
  if api_err_ct > 0:
    print(f"WARNING: {api_err_ct} unexpected API responses. See logfile {logfile} for details.")
  if dba_err_ct > 0:
    print(f"ERROR: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")
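# chk_glygen (defined elsewhere) probes the GlyGen API for a UniProt accession
# and returns True if a record exists, False if not, and something else (e.g.
# None) on an unexpected response -- hence the three-way branch above. A
# minimal sketch; the endpoint URL below is a hypothetical placeholder, not a
# documented GlyGen route, and the real function's logic may differ.
import requests

_GLYGEN_API_URL = 'https://api.glygen.org/protein/detail/{}'  # hypothetical

def _chk_glygen_sketch(uniprot):
  try:
    resp = requests.get(_GLYGEN_API_URL.format(uniprot), timeout=10)
  except requests.RequestException:
    return None  # unexpected: network or API failure
  if resp.status_code == 200:
    return True   # GlyGen has a record for this accession
  if resp.status_code == 404:
    return False  # no GlyGen record
  return None     # unexpected status code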
def load_mondo(dba, logger, logfile, mondod, cfgd):
  mondo_ct = len(mondod)
  print(f"Loading {mondo_ct} MonDO terms")
  ct = 0
  ins_ct = 0
  dba_err_ct = 0
  for mondoid, md in mondod.items():
    ct += 1
    md['mondoid'] = mondoid
    if 'xrefs' in md:
      for xref in md['xrefs']:
        if 'source' in xref and 'source="MONDO:equivalentTo"' in xref['source']:
          xref['equiv_to'] = 1
        else:
          xref['equiv_to'] = 0
    rv = dba.ins_mondo(md)
    if rv:
      ins_ct += 1
    else:
      dba_err_ct += 1
    slmf.update_progress(ct/mondo_ct)
  # Dataset
  # The data-version field in the header of the OBO file has a release version:
  # data-version: releases/2016-03-25
  f = os.popen("head %s" % (cfgd['DOWNLOAD_DIR'] + cfgd['FILENAME']))
  for line in f:
    if line.startswith("data-version:"):
      ver = line.replace('data-version: ', '')
      break
  f.close()
  dataset_id = dba.ins_dataset({'name': 'Mondo',
                                'source': 'Mondo file {}, version {}'.format(cfgd['BASE_URL'] + cfgd['FILENAME'], ver),
                                'app': PROGRAM, 'app_version': __version__,
                                'url': 'https://mondo.monarchinitiative.org/'})
  assert dataset_id, f"Error inserting dataset. See logfile {logfile} for details."
  # Provenance
  provs = [{'dataset_id': dataset_id, 'table_name': 'mondo'},
           {'dataset_id': dataset_id, 'table_name': 'mondo_parent'},
           {'dataset_id': dataset_id, 'table_name': 'mondo_xref'}]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, f"Error inserting provenance. See logfile {logfile} for details."
  print(f"{ct} terms processed.")
  print(f"  Inserted {ins_ct} new mondo rows (w/ associated parents and xrefs)")
  if dba_err_ct:
    print(f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")
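# Both load_mondo variants above (and load_do) shell out to `head` via
# os.popen to pull the data-version line from the OBO header. A pure-Python
# equivalent is sketched below as an alternative (an assumption, not the
# project's code); it stops after a bounded number of header lines.
def _obo_data_version_sketch(path, max_header_lines=100):
  # The header of an OBO file carries a release version, e.g.
  #   data-version: releases/2016-03-25
  with open(path) as fh:
    for i, line in enumerate(fh):
      if line.startswith('data-version:'):
        return line.replace('data-version: ', '').strip()
      if i >= max_header_lines:
        break
  return None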
def parse_mappings(fn):
  line_ct = slmf.wcl(fn)
  print(f"\nProcessing {line_ct} input lines in mapping file {fn}")
  up2chembl = {}
  with open(fn, 'r') as ifh:
    tsvreader = csv.reader(ifh, delimiter='\t')
    ct = 0
    for row in tsvreader:
      ct += 1
      slmf.update_progress(ct/line_ct)
      if row[0].startswith('#'):
        continue
      if row[3] != 'SINGLE PROTEIN':
        continue
      if row[0] in up2chembl:
        up2chembl[row[0]].append(row[1])
      else:
        up2chembl[row[0]] = [row[1]]
  return up2chembl
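# parse_mappings expects ChEMBL's tab-separated UniProt mapping file, where
# (per the filters above) column 0 is a UniProt accession, column 1 a ChEMBL
# target id, and column 3 the target type; only 'SINGLE PROTEIN' rows are
# kept. Hypothetical usage (the real path comes from the loader's config):
#
#   up2chembl = parse_mappings('chembl_uniprot_mapping.txt')
#   # => e.g. {'P00533': ['CHEMBL203'], ...}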
def load_human(args, dba, dataset_id, eco_map, logger, logfile):
  fn = UP_DOWNLOAD_DIR + UP_HUMAN_FILE.replace('.gz', '')
  if not args['--quiet']:
    print(f"\nParsing file {fn}")
  root = objectify.parse(fn).getroot()
  up_ct = len(root.entry)
  if not args['--quiet']:
    print(f"Loading data for {up_ct} UniProt records")
  logger.info(f"Loading data for {up_ct} UniProt records in file {fn}")
  ct = 0
  load_ct = 0
  xml_err_ct = 0
  dba_err_ct = 0
  for i in range(len(root.entry)):
    ct += 1
    slmf.update_progress(ct/up_ct)
    entry = root.entry[i]
    logger.info("Processing entry {}".format(entry.accession))
    tinit = entry2tinit(entry, dataset_id, eco_map)
    if not tinit:
      xml_err_ct += 1
      logger.error("XML Error for {}".format(entry.accession))
      continue
    tid = dba.ins_target(tinit)
    if not tid:
      dba_err_ct += 1
      continue
    logger.debug(f"Target insert id: {tid}")
    load_ct += 1
  print(f"Processed {ct} UniProt records.")
  print(f"  Loaded {load_ct} targets/proteins")
  if xml_err_ct > 0:
    print(f"WARNING: {xml_err_ct} XML parsing errors occurred. See logfile {logfile} for details.")
  if dba_err_ct > 0:
    print(f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")
def load_pubmed(curs, logger, logfile):
  st = time.time()
  fn = INFILES['pubmed']
  line_ct = slmf.wcl(fn)
  print(f'\nLoading TIN-X pubmeds from {fn}...')
  ct = 0
  pm_ct = 0
  dup_ct = 0
  err_ct = 0
  with open(fn, 'r') as ifh:
    tsvreader = csv.reader(ifh, delimiter='\t')
    for row in tsvreader:
      if ct == 0:
        header = row  # skip header line
        ct += 1
        continue
      ct += 1
      slmf.update_progress(ct/line_ct)
      try:
        curs.execute(INS_SQL['pubmed'], tuple(row))
        pm_ct += 1
      except Error as e:
        if f"Duplicate entry '{row[0]}'" in e.msg:
          # this should not happen under "production" runs, but it's here for testing/debugging
          dup_ct += 1
          continue
        else:
          err_ct += 1
          logger.error(f"``{e}`` for line {ct}. Data: {row}")
          continue
  ets = slmf.secs2str(time.time() - st)
  print(f"\n  Processed {ct} lines. Inserted {pm_ct} pubmed rows. Elapsed time: {ets}")
  if err_ct:
    print(f"  WARNING: {err_ct} errors occurred. See logfile {logfile} for details.")
  if dup_ct:
    print(f"  Skipped {dup_ct} existing pubmeds.")
  print("Done.")
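# Two more slmf helpers used throughout this file: wcl counts the lines in a
# file (a la `wc -l`) and secs2str formats an elapsed-seconds value for
# display. Minimal sketches of compatible implementations (assumptions, not
# the project's own code):
import datetime

def _wcl_sketch(fname):
  # Count the number of lines in a file.
  with open(fname) as fh:
    return sum(1 for _ in fh)

def _secs2str_sketch(secs):
  # Format elapsed seconds as [H:]MM:SS, e.g. 83.5 -> '0:01:24'.
  return str(datetime.timedelta(seconds=int(round(secs))))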
def load_DISEASES(dba, logger, logfile):
  # Knowledge channel
  fn = JL_DOWNLOAD_DIR + DISEASES_FILE_K
  line_ct = slmf.wcl(fn)
  print(f"Processing {line_ct} lines in DISEASES Knowledge file {fn}")
  with open(fn, 'r') as ifh:
    tsvreader = csv.reader(ifh, delimiter='\t')
    ct = 0
    k2pids = {}  # ENSP|sym => list of TCRD protein ids
    pmark = {}
    skip_ct = 0
    notfnd = set()
    dis_ct = 0
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      slmf.update_progress(ct/line_ct)
      if not row[0].startswith('ENSP'):
        skip_ct += 1
        continue
      ensp = row[0]
      sym = row[1]
      k = "%s|%s" % (ensp, sym)
      if k in k2pids:
        # we've already found it
        pids = k2pids[k]
      elif k in notfnd:
        # we've already not found it
        continue
      else:
        pids = dba.find_protein_ids({'stringid': ensp})
        if not pids:
          pids = dba.find_protein_ids({'sym': sym})
        if not pids:
          notfnd.add(k)
          logger.warn(f"No protein found for {k}")
          continue
        k2pids[k] = pids  # save this mapping so we only lookup each ENSP|sym once
      dtype = 'JensenLab Knowledge ' + row[4]
      for pid in pids:
        rv = dba.ins_disease({'protein_id': pid, 'dtype': dtype, 'name': row[3],
                              'did': row[2], 'evidence': row[5], 'conf': row[6]})
        if rv:
          dis_ct += 1
          pmark[pid] = True
        else:
          dba_err_ct += 1
  print(f"{ct} lines processed.")
  print("  Inserted {} new disease rows for {} proteins".format(dis_ct, len(pmark)))
  if skip_ct:
    print(f"  Skipped {skip_ct} rows w/o ENSP")
  if notfnd:
    print("  No target found for {} stringids/symbols. See logfile {} for details.".format(len(notfnd), logfile))
  if dba_err_ct:
    print(f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")

  # Experiment channel
  fn = JL_DOWNLOAD_DIR + DISEASES_FILE_E
  line_ct = slmf.wcl(fn)
  print(f"Processing {line_ct} lines in DISEASES Experiment file {fn}")
  with open(fn, 'r') as ifh:
    tsvreader = csv.reader(ifh, delimiter='\t')
    ct = 0
    k2pids = {}  # ENSP|sym => list of TCRD protein ids
    pmark = {}
    notfnd = set()
    dis_ct = 0
    skip_ct = 0
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      slmf.update_progress(ct/line_ct)
      if not row[0].startswith('ENSP'):
        skip_ct += 1
        continue
      if row[2].startswith('ENSP'):
        # skip rows where the disease id is an ENSP
        skip_ct += 1
        continue
      ensp = row[0]
      sym = row[1]
      k = "%s|%s" % (ensp, sym)
      if k in k2pids:
        # we've already found it
        pids = k2pids[k]
      elif k in notfnd:
        # we've already not found it
        continue
      else:
        pids = dba.find_protein_ids({'stringid': ensp})
        if not pids:
          pids = dba.find_protein_ids({'sym': sym})
        if not pids:
          notfnd.add(k)
          logger.warn(f"No protein found for {k}")
          continue
        k2pids[k] = pids  # save this mapping so we only lookup each ENSP|sym once
      dtype = 'JensenLab Experiment ' + row[4]
      for pid in pids:
        rv = dba.ins_disease({'protein_id': pid, 'dtype': dtype, 'name': row[3],
                              'did': row[2], 'evidence': row[5], 'conf': row[6]})
        if rv:
          dis_ct += 1
          pmark[pid] = True
        else:
          dba_err_ct += 1
  print(f"{ct} lines processed.")
  print("  Inserted {} new disease rows for {} proteins".format(dis_ct, len(pmark)))
  if skip_ct:
    print(f"  Skipped {skip_ct} rows w/o ENSP or with ENSP did")
  if notfnd:
    print("  No target found for {} stringids/symbols. See logfile {} for details.".format(len(notfnd), logfile))
  if dba_err_ct:
    print(f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")

  # Text Mining channel
  fn = JL_DOWNLOAD_DIR + DISEASES_FILE_T
  line_ct = slmf.wcl(fn)
  print(f"Processing {line_ct} lines in DISEASES Textmining file {fn}")
  with open(fn, 'r') as ifh:
    tsvreader = csv.reader(ifh, delimiter='\t')
    ct = 0
    k2pids = {}  # ENSP|sym => list of TCRD protein ids
    pmark = {}
    notfnd = set()
    dis_ct = 0
    skip_ct = 0
    dba_err_ct = 0
    for row in tsvreader:
      ct += 1
      slmf.update_progress(ct/line_ct)
      if not row[0].startswith('ENSP'):
        skip_ct += 1
        continue
      if float(row[5]) < 3.0:
        # skip rows with confidence < 3.0
        skip_ct += 1
        continue
      ensp = row[0]
      sym = row[1]
      k = "%s|%s" % (ensp, sym)
      if k in k2pids:
        # we've already found it
        pids = k2pids[k]
      elif k in notfnd:
        # we've already not found it
        continue
      else:
        pids = dba.find_protein_ids({'stringid': ensp})
        if not pids:
          pids = dba.find_protein_ids({'sym': sym})
        if not pids:
          notfnd.add(k)
          logger.warn(f"No protein found for {k}")
          continue
        k2pids[k] = pids  # save this mapping so we only lookup each ENSP|sym once
      dtype = 'JensenLab Text Mining'
      for pid in pids:
        rv = dba.ins_disease({'protein_id': pid, 'dtype': dtype, 'name': row[3],
                              'did': row[2], 'zscore': row[4], 'conf': row[5]})
        if rv:
          dis_ct += 1
          pmark[pid] = True
        else:
          dba_err_ct += 1
  print(f"{ct} lines processed.")
  print("  Inserted {} new disease rows for {} proteins".format(dis_ct, len(pmark)))
  if skip_ct:
    print(f"  Skipped {skip_ct} rows w/o ENSP or with confidence < 3")
  if notfnd:
    print("  No target found for {} stringids/symbols. See logfile {} for details.".format(len(notfnd), logfile))
  if dba_err_ct:
    print(f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")
def load(args, dba, logger, logfile):
  line_ct = slmf.wcl(IDG_LIST_FILE)
  print(f"\nProcessing {line_ct} lines in file {IDG_LIST_FILE}")
  logger.info(f"Processing {line_ct} lines in list file {IDG_LIST_FILE}")
  ct = 0
  idg_ct = 0
  fam_ct = 0
  notfnd = []
  multfnd = []
  dba_err_ct = 0
  with open(IDG_LIST_FILE, 'r') as ifh:
    csvreader = csv.reader(ifh)
    for row in csvreader:
      if ct == 0:
        header = row  # header line
        ct += 1
        continue
      ct += 1
      slmf.update_progress(ct/line_ct)
      sym = row[0]
      fam = row[1]
      if fam == 'IonChannel':
        fam = 'IC'
      tids = dba.find_target_ids({'sym': sym})
      if not tids:
        notfnd.append(sym)
        continue
      if len(tids) > 1:
        multfnd.append(sym)
        continue
      rv = dba.do_update({'table': 'target', 'col': 'idg', 'id': tids[0], 'val': 1})
      if rv:
        idg_ct += 1
      else:
        dba_err_ct += 1
      rv = dba.do_update({'table': 'target', 'col': 'fam', 'id': tids[0], 'val': fam})
      if rv:
        fam_ct += 1
      else:
        dba_err_ct += 1
  print(f"{ct} lines processed")
  print(f"{idg_ct} target rows updated with IDG flags")
  print(f"{fam_ct} target rows updated with fams")
  if notfnd:
    print("WARNING: No target found for {} symbols: {}".format(len(notfnd), ", ".join(notfnd)))
  if multfnd:
    print("WARNING: Multiple targets found for {} symbols: {}".format(len(multfnd), ", ".join(multfnd)))
  if dba_err_ct > 0:
    print(f"WARNING: {dba_err_ct} database errors occurred. See logfile {logfile} for details.")
def load(dba, logger, logfile):
  infile = DOWNLOAD_DIR + TIGA_FILE
  line_ct = slmf.wcl(infile)
  print(f"\nProcessing {line_ct} lines in TIGA file {infile}")
  ct = 0
  k2pids = defaultdict(list)  # maps sym|ENSG to TCRD protein_id(s)
  notfnd = set()
  pmark = {}
  tiga_ct = 0
  dba_err_ct = 0
  with open(infile, 'r') as ifh:
    tsvreader = csv.reader(ifh, delimiter='\t')
    for row in tsvreader:
      if ct == 0:
        header = row  # header line
        ct += 1
        continue
      # 0: ensemblId
      # 1: efoId
      # 2: trait
      # 3: n_study
      # 4: n_snp
      # 5: n_snpw
      # 6: geneNtrait
      # 7: geneNstudy
      # 8: traitNgene
      # 9: traitNstudy
      # 10: pvalue_mlog_median
      # 11: pvalue_mlog_max
      # 12: or_median
      # 13: n_beta
      # 14: study_N_mean
      # 15: rcras
      # 16: geneSymbol
      # 17: TDL
      # 18: geneFamily
      # 19: geneIdgList
      # 20: geneName
      # 21: meanRank
      # 22: meanRankScore
      ct += 1
      slmf.update_progress(ct/line_ct)
      sym = row[16]
      ensg = row[0]
      k = sym + '|' + ensg
      pids = []
      if k in k2pids:
        # we've already found it
        pids = k2pids[k]
      elif k in notfnd:
        # we've already not found it
        continue
      else:
        # look it up
        pids = dba.find_protein_ids({'sym': sym})
        if not pids:
          pids = dba.find_protein_ids_by_xref({'xtype': 'Ensembl', 'value': ensg})
        if not pids:
          notfnd.add(k)
          continue
        k2pids[k] = pids  # save this mapping so we only lookup each sym/ENSG once
      init = {'ensg': ensg, 'efoid': row[1], 'trait': row[2], 'n_study': row[3],
              'n_snp': row[4], 'n_snpw': row[5], 'geneNtrait': row[6],
              'geneNstudy': row[7], 'traitNgene': row[8], 'traitNstudy': row[9],
              'pvalue_mlog_median': row[10], 'pvalue_mlog_max': row[11],
              'n_beta': row[13], 'study_N_mean': row[14], 'rcras': row[15],
              'meanRank': row[21], 'meanRankScore': row[22]}
      if row[12] != 'NA':
        init['or_median'] = row[12]
      #if row[] != 'NA':
      #  init[''] = row[]
      for pid in pids:
        init['protein_id'] = pid
        rv = dba.ins_tiga(init)
        if not rv:
          dba_err_ct += 1
          continue
        tiga_ct += 1
        pmark[pid] = True
  for k in notfnd:
    logger.warn(f"No protein found for {k}")
  print(f"Processed {ct} lines")
  print("  Inserted {} new tiga rows for {} proteins".format(tiga_ct, len(pmark)))
  if notfnd:
    print("No target found for {} sym/ENSGs. See logfile {} for details.".format(len(notfnd), logfile))
  if dba_err_ct > 0:
    print(f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")

  infile = DOWNLOAD_DIR + TIGA_PROV_FILE
  line_ct = slmf.wcl(infile)
  print(f"\nProcessing {line_ct} lines in TIGA provenance file {infile}")
  ct = 0
  tigaprov_ct = 0
  dba_err_ct = 0
  with open(infile, 'r') as ifh:
    tsvreader = csv.reader(ifh, delimiter='\t')
    for row in tsvreader:
      if ct == 0:
        header = row  # header line
        ct += 1
        continue
      # 0: ensemblId
      # 1: TRAIT_URI
      # 2: STUDY_ACCESSION
      # 3: PUBMEDID
      # 4: efoId
      ct += 1
      slmf.update_progress(ct/line_ct)
      rv = dba.ins_tiga_provenance({'ensg': row[0], 'efoid': row[4],
                                    'study_acc': row[2], 'pubmedid': row[3]})
      if not rv:
        dba_err_ct += 1
        continue
      tigaprov_ct += 1
  print(f"Processed {ct} lines")
  print(f"  Inserted {tigaprov_ct} new tiga_provenance rows")
  if dba_err_ct > 0:
    print(f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")
def load(args, dba, logger, logfile):
  fn = DOWNLOAD_DIR + GENO_PHENO_FILE.replace('.gz', '')
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print(f"\nProcessing {line_ct} lines in input file {fn}")
  ct = 0
  pt_ct = 0
  pmark = {}
  sym2nhpids = {}
  notfnd = set()
  skip_ct = 0
  dba_err_ct = 0
  with open(fn, 'r') as csvfile:
    csvreader = csv.reader(csvfile)
    for row in csvreader:
      # 0: marker_accession_id
      # 1: marker_symbol
      # 2: phenotyping_center
      # 3: colony_id
      # 4: sex
      # 5: zygosity
      # 6: allele_accession_id
      # 7: allele_symbol
      # 8: allele_name
      # 9: strain_accession_id
      # 10: strain_name
      # 11: project_name
      # 12: project_fullname
      # 13: pipeline_name
      # 14: pipeline_stable_id
      # 15: procedure_stable_id
      # 16: procedure_name
      # 17: parameter_stable_id
      # 18: parameter_name
      # 19: top_level_mp_term_id
      # 20: top_level_mp_term_name
      # 21: mp_term_id
      # 22: mp_term_name
      # 23: p_value
      # 24: percentage_change
      # 25: effect_size
      # 26: statistical_method
      # 27: resource_name
      if ct == 0:
        header = row  # header line
        ct += 1
        continue
      ct += 1
      slmf.update_progress(ct/line_ct)
      sym = row[1]
      if not row[21] and not row[22]:
        # skip data with neither a term_id nor a term_name
        skip_ct += 1
        continue
      if sym in sym2nhpids:
        # we've already found it
        nhpids = sym2nhpids[sym]
      elif sym in notfnd:
        # we've already not found it
        continue
      else:
        nhpids = dba.find_nhprotein_ids({'sym': sym}, species='Mus musculus')
        if not nhpids:
          notfnd.add(sym)
          logger.warn("No nhprotein found for symbol {}".format(sym))
          continue
        sym2nhpids[sym] = nhpids  # save this mapping so we only lookup each symbol once
      pval = None
      if row[23] and row[23] != '':
        try:
          pval = float(row[23])
        except ValueError:
          logger.warn("Problem converting p_value {} for row {}".format(row[23], ct))
      sex = None
      if row[4] and len(row[4]) <= 8:
        sex = row[4]
      for nhpid in nhpids:
        rv = dba.ins_phenotype({'nhprotein_id': nhpid, 'ptype': 'IMPC',
                                'top_level_term_id': row[19], 'top_level_term_name': row[20],
                                'term_id': row[21], 'term_name': row[22], 'p_value': pval,
                                'percentage_change': row[24], 'effect_size': row[25],
                                'procedure_name': row[16], 'parameter_name': row[18],
                                'statistical_method': row[26], 'sex': sex, 'gp_assoc': 1})
        if rv:
          pmark[nhpid] = True
          pt_ct += 1
        else:
          dba_err_ct += 1
  print(f"{ct} lines processed.")
  print("Loaded {} IMPC phenotypes for {} nhproteins".format(pt_ct, len(pmark)))
  if notfnd:
    print("No nhprotein found for {} gene symbols. See logfile {} for details.".format(len(notfnd), logfile))
  if skip_ct > 0:
    print(f"Skipped {skip_ct} lines with no term_id or term_name.")
  if dba_err_ct > 0:
    print(f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")

  fn = DOWNLOAD_DIR + STAT_RES_FILE.replace('.gz', '')
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print(f"\nProcessing {line_ct} lines from input file {fn}")
  ct = 0
  pt_ct = 0
  pmark = {}
  sym2nhpids = {}
  notfnd = set()
  skip_ct = 0
  pv_ct = 0
  dba_err_ct = 0
  with open(fn, 'r') as csvfile:
    csvreader = csv.reader(csvfile)
    for row in csvreader:
      # 0: phenotyping_center
      # 1: intercept_estimate
      # 2: procedure_id
      # 3: mutant_biological_model_id
      # 4: rotated_residuals_test
      # 5: weight_effect_p_value
      # 6: male_mutant_count
      # 7: pipeline_stable_key
      # 8: female_ko_effect_p_value
      # 9: pipeline_stable_id
      # 10: parameter_stable_key
      # 11: data_type
      # 12: parameter_stable_id
      # 13: interaction_significant
      # 14: strain_accession_id
      # 15: control_selection_method
      # 16: parameter_name
      # 17: allele_name
      # 18: phenotyping_center_id
      # 19: weight_effect_stderr_estimate
      # 20: weight_effect_parameter_estimate
      # 21: procedure_stable_id
      # 22: status
      # 23: sex_effect_parameter_estimate
      # 24: female_ko_effect_stderr_estimate
      # 25: female_percentage_change
      # 26: group_2_residuals_normality_test
      # 27: marker_accession_id
      # 28: mp_term_name
      # 29: group_1_residuals_normality_test
      # 30: genotype_effect_p_value
      # 31: dependent_variable
      # 32: resource_name
      # 33: project_id
      # 34: procedure_name
      # 35: doc_id
      # 36: top_level_mp_term_id
      # 37: allele_accession_id
      # 38: blups_test
      # 39: null_test_p_value
      # 40: p_value
      # 41: marker_symbol
      # 42: control_biological_model_id
      # 43: pipeline_name
      # 44: sex
      # 45: interaction_effect_p_value
      # 46: colony_id
      # 47: project_name
      # 48: female_ko_parameter_estimate
      # 49: female_mutant_count
      # 50: organisation_id
      # 51: external_db_id
      # 52: female_control_count
      # 53: intermediate_mp_term_id
      # 54: db_id
      # 55: male_ko_effect_p_value
      # 56: top_level_mp_term_name
      # 57: metadata_group
      # 58: sex_effect_stderr_estimate
      # 59: zygosity
      # 60: male_percentage_change
      # 61: sex_effect_p_value
      # 62: mp_term_id
      # 63: male_ko_effect_stderr_estimate
      # 64: additional_information
      # 65: statistical_method
      # 66: _version_
      # 67: intercept_estimate_stderr_estimate
      # 68: male_control_count
      # 69: intermediate_mp_term_name
      # 70: strain_name
      # 71: classification_tag
      # 72: effect_size
      # 73: procedure_stable_key
      # 74: allele_symbol
      # 75: resource_id
      # 76: group_2_genotype
      # 77: variance_significant
      # 78: pipeline_id
      # 79: group_1_genotype
      # 80: male_ko_parameter_estimate
      # 81: genotype_effect_parameter_estimate
      # 82: categories
      # 83: parameter_id
      # 84: batch_significant
      # 85: genotype_effect_stderr_estimate
      # 86: resource_fullname
      if ct == 0:
        header = row  # header line
        ct += 1
        continue
      ct += 1
      slmf.update_progress(ct/line_ct)
      sym = row[41]
      if not row[62] and not row[28]:
        # skip lines with neither a term_id nor a term_name
        skip_ct += 1
        continue
      if sym in sym2nhpids:
        # we've already found it
        nhpids = sym2nhpids[sym]
      elif sym in notfnd:
        # we've already not found it
        continue
      else:
        nhpids = dba.find_nhprotein_ids({'sym': sym}, species='Mus musculus')
        if not nhpids:
          notfnd.add(sym)
          logger.warn("No nhprotein found for symbol {}".format(sym))
          continue
        sym2nhpids[sym] = nhpids  # save this mapping so we only lookup each symbol once
      pval = None
      if row[40] and row[40] != '':
        try:
          pval = float(row[40])
        except ValueError:
          logger.warn("Problem converting p_value {} for row {}".format(row[40], ct))
      sex = None
      # sex is column 44 in this file (per the column map above)
      if row[44] and len(row[44]) <= 8:
        sex = row[44]
      for nhpid in nhpids:
        rv = dba.ins_phenotype({'nhprotein_id': nhpid, 'ptype': 'IMPC',
                                'top_level_term_id': row[36], 'top_level_term_name': row[56],
                                'term_id': row[62], 'term_name': row[28], 'p_value': pval,
                                'effect_size': row[72], 'procedure_name': row[34],
                                'parameter_name': row[16], 'statistical_method': row[65],
                                'sex': sex, 'gp_assoc': 0})
        if rv:
          pmark[nhpid] = True
          pt_ct += 1
        else:
          dba_err_ct += 1
  print(f"{ct} lines processed.")
  print("Loaded {} IMPC phenotypes for {} nhproteins".format(pt_ct, len(pmark)))
  if notfnd:
    print("No nhprotein found for {} gene symbols. See logfile {} for details.".format(len(notfnd), logfile))
  if skip_ct > 0:
    print(f"Skipped {skip_ct} lines with no term_id or term_name.")
  if dba_err_ct > 0:
    print(f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")
def tinx(args, dba, logger, logfile):
  # The results of parsing the input mentions files will be the following dictionaries:
  pid2pmids = {}  # 'TCRD.protein.id,UniProt' => set of all PMIDs that mention the protein
                  # Including the UniProt accession in the key is just for convenience when
                  # checking the output. It is not used for anything.
  doid2pmids = {}  # DOID => set of all PMIDs that mention the disease
  pmid_disease_ct = {}  # PMID => count of diseases mentioned in a given paper
  pmid_protein_ct = {}  # PMID => count of proteins mentioned in a given paper

  # First parse the Disease Ontology OBO file to get DO names and defs
  dofile = DO_DOWNLOAD_DIR + DO_OBO
  print(f"\nParsing Disease Ontology file {dofile}")
  do_parser = obo.Parser(dofile)
  do = {}
  for stanza in do_parser:
    do[stanza.tags['id'][0].value] = stanza.tags
  print("  Got {} Disease Ontology terms".format(len(do)))

  fn = JL_DOWNLOAD_DIR + PROTEIN_FILE
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print(f"\nProcessing {line_ct} lines in protein file {fn}")
  with open(fn, 'r') as tsvf:
    #pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    skip_ct = 0
    notfnd = set()
    for line in tsvf:
      ct += 1
      slmf.update_progress(ct/line_ct)
      if not line.startswith('ENSP'):
        skip_ct += 1
        continue
      data = line.rstrip().split('\t')
      ensp = data[0]
      pmids = set([int(pmid) for pmid in data[1].split()])
      tids = dba.find_target_ids({'stringid': ensp})
      if not tids:
        # if we don't find a target by stringid, which is the more reliable and
        # preferred way, try by Ensembl xref
        tids = dba.find_target_ids_by_xref({'xtype': 'Ensembl', 'value': ensp})
      if not tids:
        notfnd.add(ensp)
        continue
      for tid in tids:
        t = dba.get_target(tid, annot=False)
        p = t['components']['protein'][0]
        k = "{},{}".format(p['id'], p['uniprot'])
        if k in pid2pmids:
          pid2pmids[k] = pid2pmids[k].union(pmids)
        else:
          pid2pmids[k] = set(pmids)
        for pmid in pmids:
          if pmid in pmid_protein_ct:
            pmid_protein_ct[pmid] += 1.0
          else:
            pmid_protein_ct[pmid] = 1.0
  for ensp in notfnd:
    logger.warn(f"No target found for {ensp}")
  print(f"\n{ct} lines processed")
  print(f"  Skipped {skip_ct} non-ENSP lines")
  print("  Saved {} protein to PMIDs mappings".format(len(pid2pmids)))
  print("  Saved {} PMID to protein count mappings".format(len(pmid_protein_ct)))
  if notfnd:
    print("  No target found for {} ENSPs. See logfile {} for details.".format(len(notfnd), logfile))

  fn = JL_DOWNLOAD_DIR + DISEASE_FILE
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print(f"\nProcessing {line_ct} lines in file {fn}")
  with open(fn, 'r') as tsvf:
    ct = 0
    skip_ct = 0
    notfnd = set()
    for line in tsvf:
      ct += 1
      slmf.update_progress(ct/line_ct)
      if not line.startswith('DOID:'):
        skip_ct += 1
        continue
      data = line.rstrip().split('\t')
      doid = data[0]
      pmids = set([int(pmid) for pmid in data[1].split()])
      if doid not in do:
        logger.warn(f"{doid} not found in DO")
        notfnd.add(doid)
        continue
      if doid in doid2pmids:
        doid2pmids[doid] = doid2pmids[doid].union(pmids)
      else:
        doid2pmids[doid] = set(pmids)
      for pmid in pmids:
        if pmid in pmid_disease_ct:
          pmid_disease_ct[pmid] += 1.0
        else:
          pmid_disease_ct[pmid] = 1.0
  print(f"\n{ct} lines processed.")
  print(f"  Skipped {skip_ct} non-DOID lines")
  print("  Saved {} DOID to PMIDs mappings".format(len(doid2pmids)))
  print("  Saved {} PMID to disease count mappings".format(len(pmid_disease_ct)))
  if notfnd:
    print("WARNING: No entry found in DO map for {} DOIDs. See logfile {} for details.".format(len(notfnd), logfile))

  if not args['--quiet']:
    print("\nComputing protein novelty scores")
  # To calculate novelty scores, each paper (PMID) is assigned a
  # fractional target (FT) score of one divided by the number of targets
  # mentioned in it. The novelty score of a given protein is one divided
  # by the sum of the FT scores for all the papers mentioning that
  # protein.
  ct = 0
  with open(PROTEIN_NOVELTY_FILE, 'w') as pnovf:
    pnovf.write("Protein ID,UniProt,Novelty\n")
    for k in pid2pmids.keys():
      ct += 1
      ft_score_sum = 0.0
      for pmid in pid2pmids[k]:
        ft_score_sum += 1.0 / pmid_protein_ct[pmid]
      novelty = 1.0 / ft_score_sum
      pnovf.write("%s,%.8f\n" % (k, novelty))
  print(f"  Wrote {ct} novelty scores to file {PROTEIN_NOVELTY_FILE}")

  if not args['--quiet']:
    print("\nComputing disease novelty scores")
  # Exactly as for proteins, but using disease mentions
  ct = 0
  with open(DISEASE_NOVELTY_FILE, 'w') as dnovf:
    dnovf.write("DOID,Novelty\n")
    for doid in doid2pmids.keys():
      ct += 1
      ft_score_sum = 0.0
      for pmid in doid2pmids[doid]:
        ft_score_sum += 1.0 / pmid_disease_ct[pmid]
      novelty = 1.0 / ft_score_sum
      dnovf.write("%s,%.8f\n" % (doid, novelty))
  print(f"  Wrote {ct} novelty scores to file {DISEASE_NOVELTY_FILE}")

  if not args['--quiet']:
    print("\nComputing importance scores")
  # To calculate importance scores, each paper is assigned a fractional
  # disease-target (FDT) score of one divided by the product of the
  # number of targets mentioned and the number of diseases
  # mentioned. The importance score for a given disease-target pair is
  # the sum of the FDT scores for all papers mentioning that disease and
  # protein.
  ct = 0
  with open(IMPORTANCE_FILE, 'w') as impf:
    impf.write("DOID,Protein ID,UniProt,Score\n")
    for k, ppmids in pid2pmids.items():
      for doid, dpmids in doid2pmids.items():
        pd_pmids = ppmids.intersection(dpmids)
        fdt_score_sum = 0.0
        for pmid in pd_pmids:
          fdt_score_sum += 1.0 / (pmid_protein_ct[pmid] * pmid_disease_ct[pmid])
        if fdt_score_sum > 0:
          ct += 1
          impf.write("%s,%s,%.8f\n" % (doid, k, fdt_score_sum))
  print(f"  Wrote {ct} importance scores to file {IMPORTANCE_FILE}")

  if not args['--quiet']:
    print("\nComputing PubMed rankings")
  # PMIDs are ranked for a given disease-target pair based on a score
  # calculated by multiplying the number of targets mentioned and the
  # number of diseases mentioned in that paper. Lower scores have a lower
  # rank (higher priority). If the scores do not discriminate, PMIDs are
  # reverse sorted by value with the assumption that larger PMIDs are
  # newer and of higher priority.
  ct = 0
  with open(PMID_RANKING_FILE, 'w') as pmrf:
    pmrf.write("DOID,Protein ID,UniProt,PubMed ID,Rank\n")
    for k, ppmids in pid2pmids.items():
      for doid, dpmids in doid2pmids.items():
        pd_pmids = ppmids.intersection(dpmids)
        scores = []  # scores are tuples of (PMID, protein_mentions*disease_mentions)
        for pmid in pd_pmids:
          scores.append((pmid, pmid_protein_ct[pmid] * pmid_disease_ct[pmid]))
        if len(scores) > 0:
          scores.sort(key=cmp_to_key(cmp_pmids_scores))
          for i, t in enumerate(scores):
            ct += 1
            pmrf.write("%s,%s,%d,%d\n" % (doid, k, t[0], i))
  print(f"  Wrote {ct} PubMed rankings to file {PMID_RANKING_FILE}")
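# cmp_pmids_scores (used with functools.cmp_to_key above) is defined elsewhere;
# the comparator sketched below matches the ranking rules described in the
# comments: ascending by score (lower score = higher priority), and on ties
# larger PMIDs -- assumed newer -- rank first. The project's own comparator
# may differ in detail.
def _cmp_pmids_scores_sketch(a, b):
  # a and b are (pmid, protein_mentions*disease_mentions) tuples.
  if a[1] < b[1]:
    return -1
  if a[1] > b[1]:
    return 1
  # scores tie: reverse-sort by PMID so newer papers come first
  if a[0] > b[0]:
    return -1
  if a[0] < b[0]:
    return 1
  return 0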
def load(args, dba, logger, logfile):
  if not args['--quiet']:
    print("\nGetting target resource data from RSS...")
  target_data = get_target_data()
  assert target_data, "Error getting target data: FATAL"
  rss_ct = len(target_data)
  ct = 0
  skip_ct = 0
  res_ct = 0
  tmark = set()
  notfnd = set()
  mulfnd = set()
  dba_err_ct = 0
  if not args['--quiet']:
    print(f"Processing {rss_ct} target resource records...")
  for td in target_data:
    logger.info("Processing target resource data: {}".format(td))
    ct += 1
    slmf.update_progress(ct/rss_ct)
    if not td['pharosReady']:
      skip_ct += 1
      continue
    sym = td['target']
    #rssid = td['id'].rsplit('/')[-1]
    rssid = td['id']
    resource_data = get_resource_data(td['id'])
    dbjson = json.dumps(resource_data['data'][0]['resource'])
    tids = dba.find_target_ids({'sym': sym})
    if not tids:
      tids = dba.find_target_ids({'sym': sym}, incl_alias=True)
    if not tids:
      notfnd.add(sym)
      logger.warn("No target found for {}".format(sym))
      continue
    if len(tids) > 1:
      mulfnd.add(sym)
      logger.warn("Multiple targets found for {}".format(sym))
    tid = tids[0]
    rv = dba.ins_drgc_resource({'rssid': rssid, 'resource_type': td['resourceType'],
                                'target_id': tid, 'json': dbjson})
    if not rv:
      dba_err_ct += 1
      continue
    tmark.add(tid)
    res_ct += 1
  print(f"{ct} RSS target resource records processed.")
  print(f"  Skipped {skip_ct} non-pharosReady resources.")
  print("Inserted {} new drgc_resource rows for {} targets".format(res_ct, len(tmark)))
  if notfnd:
    print("WARNING: No target found for {} symbols. See logfile {} for details.".format(len(notfnd), logfile))
  if mulfnd:
    print("WARNING: Multiple targets found for {} symbols. See logfile {} for details.".format(len(mulfnd), logfile))
  if dba_err_ct > 0:
    print(f"ERROR: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")
def load_pmscores(dba, logger, logfile):
  ensp2pids = {}  # ENSP => list of TCRD protein ids
  pmscores = {}  # protein.id => sum(all scores)
  pms_ct = 0
  skip_ct = 0
  notfnd = set()
  dba_err_ct = 0
  infile = JL_DOWNLOAD_DIR + PM_SCORES_FILE
  line_ct = slmf.wcl(infile)
  print(f"Processing {line_ct} lines in file {infile}")
  with open(infile, 'r') as tsv:
    tsvreader = csv.reader(tsv, delimiter='\t')
    ct = 0
    for row in tsvreader:
      # ensp  year  score
      ct += 1
      slmf.update_progress(ct/line_ct)
      if not row[0].startswith('ENSP'):
        skip_ct += 1
        continue
      ensp = row[0]
      if ensp in ensp2pids:
        # we've already found it
        pids = ensp2pids[ensp]
      elif ensp in notfnd:
        # we've already not found it
        continue
      else:
        pids = dba.find_protein_ids({'stringid': ensp})
        if not pids:
          pids = dba.find_protein_ids_by_xref({'xtype': 'STRING', 'value': '9606.' + ensp})
        if not pids:
          notfnd.add(ensp)
          logger.warn("No protein found for {}".format(ensp))
          continue
        ensp2pids[ensp] = pids  # save this mapping so we only lookup each ENSP once
      for pid in pids:
        rv = dba.ins_pmscore({'protein_id': pid, 'year': row[1], 'score': row[2]})
        if rv:
          pms_ct += 1
        else:
          dba_err_ct += 1
        if pid in pmscores:
          pmscores[pid] += float(row[2])
        else:
          pmscores[pid] = float(row[2])
  print(f"{ct} input lines processed.")
  print("  Inserted {} new pmscore rows for {} proteins".format(pms_ct, len(pmscores)))
  if skip_ct:
    print(f"  Skipped {skip_ct} rows w/o ENSP")
  if notfnd:
    print("  No protein found for {} STRING IDs. See logfile {} for details.".format(len(notfnd), logfile))
  if dba_err_ct:
    print(f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")

  print("Updating {} JensenLab PubMed Scores...".format(len(pmscores)))
  ct = 0
  ti_ct = 0
  dba_err_ct = 0
  for pid, score in pmscores.items():
    ct += 1
    rv = dba.upd_pms_tdlinfo(pid, score)
    if rv:
      ti_ct += 1
    else:
      dba_err_ct += 1
  print(f"  Updated {ti_ct} 'JensenLab PubMed Score' tdl_info rows")
  if dba_err_ct:
    print(f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")
def load_tinx(args, dba, do, logger, logfile):
  fn = f"{TINX_OUTDIR}ProteinNovelty.csv"
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print(f"\nProcessing {line_ct} lines in file {fn}")
  with open(fn, 'r') as csvfile:
    csvreader = csv.reader(csvfile)
    header = next(csvreader)  # skip header line
    # Protein ID,UniProt,Novelty
    ct = 1
    tn_ct = 0
    dba_err_ct = 0
    for row in csvreader:
      ct += 1
      slmf.update_progress(ct/line_ct)
      pid = row[0]
      rv = dba.ins_tinx_novelty({'protein_id': pid, 'score': float(row[2])})
      if rv:
        tn_ct += 1
      else:
        dba_err_ct += 1
  print(f"{ct} input lines processed.")
  print(f"  Inserted {tn_ct} new tinx_novelty rows")
  if dba_err_ct > 0:
    print(f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")

  dmap = {}
  fn = f"{TINX_OUTDIR}DiseaseNovelty.csv"
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print(f"\nProcessing {line_ct} lines in file {fn}")
  with open(fn, 'r') as csvfile:
    csvreader = csv.reader(csvfile)
    header = next(csvreader)  # skip header line
    # DOID,Novelty
    ct = 1
    dct = 0
    notfnd = set()
    dba_err_ct = 0
    for row in csvreader:
      ct += 1
      slmf.update_progress(ct/line_ct)
      doid = row[0]
      if doid in do:
        if 'name' in do[doid]:
          dname = do[doid]['name'][0].value
        else:
          continue
        if 'def' in do[doid]:
          ddef = do[doid]['def'][0].value
        else:
          ddef = None
      else:
        logger.warn(f"{row[0]} not in DO map")
        notfnd.add(row[0])
        continue
      rv = dba.ins_tinx_disease({'doid': doid, 'name': dname,
                                 'summary': ddef, 'score': float(row[1])})
      if rv:
        dct += 1
        dmap[doid] = rv  # map DOID to tinx_disease.id
      else:
        dba_err_ct += 1
  print(f"{ct} input lines processed.")
  print(f"  Inserted {dct} new tinx_disease rows")
  print("  Saved {} keys in dmap".format(len(dmap)))
  if notfnd:
    print("WARNING: No entry found in DO map for {} DOIDs. See logfile {} for details.".format(len(notfnd), logfile))
  if dba_err_ct > 0:
    print(f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")

  imap = {}
  fn = f"{TINX_OUTDIR}Importance.csv"
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print(f"\nProcessing {line_ct} lines in file {fn}")
  with open(fn, 'r') as csvfile:
    csvreader = csv.reader(csvfile)
    header = next(csvreader)  # skip header line
    # DOID,Protein ID,UniProt,Score
    ct = 1
    ti_ct = 0
    skips1 = set()
    dba_err_ct = 0
    for row in csvreader:
      ct += 1
      slmf.update_progress(ct/line_ct)
      if row[0] not in dmap:
        logger.warn(f"{row[0]} not in dmap")
        skips1.add(row[0])
        continue
      did = dmap[row[0]]
      pid = row[1]
      rv = dba.ins_tinx_importance({'protein_id': pid, 'disease_id': did,
                                    'score': float(row[3])})
      if rv:
        ti_ct += 1
        # map DOID|PID to tinx_importance.id
        k = f"{row[0]}|{row[1]}"
        imap[k] = rv
      else:
        dba_err_ct += 1
  print(f"{ct} input lines processed.")
  print(f"  Inserted {ti_ct} new tinx_importance rows")
  print("  Saved {} keys in imap".format(len(imap)))
  if len(skips1) > 0:
    print("WARNING: No disease found in dmap for {} DOIDs. See logfile {} for details.".format(len(skips1), logfile))
  if dba_err_ct > 0:
    print(f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")

  fn = f"{TINX_OUTDIR}PMIDRanking.csv"
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print(f"\nProcessing {line_ct} lines in file {fn}")
  regex = re.compile(r"^DOID:0*")
  with open(fn, 'r') as csvfile:
    csvreader = csv.reader(csvfile)
    header = next(csvreader)  # skip header line
    # DOID,Protein ID,UniProt,PubMed ID,Rank
    ct = 1
    tar_ct = 0
    skips = set()
    dba_err_ct = 0
    for row in csvreader:
      ct += 1
      slmf.update_progress(ct/line_ct)
      k = "%s|%s" % (row[0], row[1])
      if k not in imap:
        logger.warn(f"{k} not in imap")
        skips.add(k)
        continue
      iid = imap[k]
      rv = dba.ins_tinx_articlerank({'importance_id': iid, 'pmid': row[3], 'rank': row[4]})
      if rv:
        tar_ct += 1
      else:
        dba_err_ct += 1
  print(f"{ct} input lines processed.")
  print(f"  Inserted {tar_ct} new tinx_articlerank rows")
  if len(skips) > 0:
    print("WARNING: No importance found in imap for {} keys. See logfile {} for details.".format(len(skips), logfile))
  if dba_err_ct > 0:
    print(f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")
def load(args, dba, up2chembl, chembldb, logfile, logger):
  upct = len(up2chembl)
  if not args['--quiet']:
    print(f"\nProcessing {upct} UniProt accessions in up2chembl")
  ct = 0
  dba_err_ct = 0
  notfnd = set()
  nic_ct = 0
  nga_ct = 0
  tdl_ct = 0
  ca_ct = 0
  cyti_ct = 0
  csti_ct = 0
  t2acts = {}
  c2acts = {}
  for up in up2chembl.keys():
    ct += 1
    slmf.update_progress(ct/upct)
    tids = dba.find_target_ids({'uniprot': up}, incl_alias=True)
    if not tids:
      notfnd.add(up)
      logger.warn(f"No TCRD target found for UniProt {up}")
      continue
    tid = tids[0]
    tp = dba.get_targetprotein(tid)
    logger.info(f"Processing ChEMBL data for UniProt {up}: target {tid}")
    chembl_acts = []
    for ctid in up2chembl[up]:
      # Query 1
      with closing(chembldb.cursor(dictionary=True)) as curs:
        curs.execute(SQLq1, (ctid,))
        for d in curs:
          if d['year']:
            d['reference'] = "{}, ({}) {}:{}:{}".format(d['journal'], d['year'], d['volume'],
                                                        d['issue'], d['first_page'])
          else:
            d['reference'] = "{}, {}:{}:{}".format(d['journal'], d['volume'], d['issue'],
                                                   d['first_page'])
          for k in ['journal', 'volume', 'issue', 'first_page']:
            del d[k]
          chembl_acts.append(d)
      # Query 2
      with closing(chembldb.cursor(dictionary=True)) as curs:
        curs.execute(SQLq2, (ctid,))
        for d in curs:
          d['reference'] = None
          chembl_acts.append(d)
    if tp['fam'] in CUTOFFS:
      cutoff = CUTOFFS[tp['fam']]
    else:
      cutoff = 6.0  # 1uM for other families
    logger.info(f"Filter cutoff for {up} (target id {tid}) is {cutoff}")
    filtered_acts = [a for a in chembl_acts if a['pchembl_value'] >= cutoff]
    logger.info("{} ChEMBL acts => {} filtered acts".format(len(chembl_acts), len(filtered_acts)))
    if not filtered_acts:
      nga_ct += 1
      continue
    #
    # if we get here, the target has qualifying activities (and is thus Tchem)
    #
    # sort filtered activities by pchembl_value (descending), so that the
    # activity with the largest will be sorted_by_pchembl_value[0]
    sorted_by_pchembl_value = sorted(filtered_acts, key=itemgetter('pchembl_value'), reverse=True)
    # load TCRD cmpd_activities
    # The most potent activity value for a given target will be this one:
    # MIN(cmpd_activity.id) WHERE catype = 'ChEMBL' AND target_id = 3000
    for a in sorted_by_pchembl_value:
      if 'pubmed_id' in a:
        pmid = a['pubmed_id']
      else:
        pmid = None
      try:
        rv = dba.ins_cmpd_activity({'target_id': tid, 'catype': 'ChEMBL',
                                    'cmpd_id_in_src': a['chembl_id'],
                                    'cmpd_name_in_src': a['compound_name'],
                                    'smiles': a['canonical_smiles'], 'reference': a['reference'],
                                    'act_value': a['pchembl_value'], 'act_type': a['standard_type'],
                                    'pubmed_ids': pmid})
      except Exception:
        # some names have weird hex characters and cause errors, so replace w/ ?
        rv = dba.ins_cmpd_activity({'target_id': tid, 'catype': 'ChEMBL',
                                    'cmpd_id_in_src': a['chembl_id'],
                                    'cmpd_name_in_src': '?',
                                    'smiles': a['canonical_smiles'], 'reference': a['reference'],
                                    'act_value': a['pchembl_value'], 'act_type': a['standard_type'],
                                    'pubmed_ids': pmid})
      if rv:
        ca_ct += 1
      else:
        dba_err_ct += 1
    # Save First ChEMBL Reference Year tdl_info, if there is one
    yrs = [a['year'] for a in filtered_acts if 'year' in a and a['year']]
    if len(yrs) > 0:
      first_year = min(yrs)
      rv = dba.ins_tdl_info({'target_id': tid, 'itype': 'ChEMBL First Reference Year',
                             'integer_value': first_year})
      if rv:
        cyti_ct += 1
      else:
        dba_err_ct += 1
    # Save mappings for selective compound calculations
    t2acts[tid] = copy.copy(sorted_by_pchembl_value)
    for a in chembl_acts:
      ac = copy.copy(a)
      smi = ac['canonical_smiles']
      del ac['canonical_smiles']
      ac['tid'] = tid
      ac['tname'] = tp['name']
      if smi in c2acts:
        c2acts[smi].append(ac)
      else:
        c2acts[smi] = [ac]
  print(f"{ct} UniProt accessions processed.")
  if notfnd:
    print("  No TCRD target found for {} UniProt accessions. See logfile {} for details.".format(len(notfnd), logfile))
  if nic_ct > 0:
    print(f"  {nic_ct} targets not found in ChEMBL")
  print(f"  {nga_ct} targets have no qualifying activities in ChEMBL")
  print(f"Inserted {ca_ct} new cmpd_activity rows")
  print(f"Inserted {cyti_ct} new 'ChEMBL First Reference Year' tdl_info rows")
  if dba_err_ct > 0:
    print(f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")

  # Selective compound calculations
  if not args['--quiet']:
    print("\nRunning selective compound analysis...")
  c2macts = {}
  for c, acts in c2acts.items():
    if len(acts) > 1:
      c2macts[c] = list(acts)
  # then sort the activity lists by pchembl_value
  c2smacts = {}
  for c, acts in c2macts.items():
    c2smacts[c] = sorted(acts, key=itemgetter('pchembl_value'))
  selective = []
  for smi in c2smacts.keys():
    i = 1
    while i <= len(c2smacts[smi]) - 1:
      if c2smacts[smi][i]['tid'] == c2smacts[smi][i-1]['tid']:
        i += 1
        continue
      diff = c2smacts[smi][i]['pchembl_value'] - c2smacts[smi][i-1]['pchembl_value']
      if diff >= 2:
        selective.append(smi)
        break
      i += 1
  if not args['--quiet']:
    print("  Found {} selective compounds".format(len(selective)))
  cscti_ct = 0
  dba_err_ct = 0
  for tid, acts in t2acts.items():
    for a in acts:
      if a['canonical_smiles'] in selective:
        # Save ChEMBL Selective Compound tdl_info
        val = "{}|{}".format(a['chembl_id'], a['canonical_smiles'])
        rv = dba.ins_tdl_info({'target_id': tid, 'itype': 'ChEMBL Selective Compound',
                               'string_value': val})
        if rv:
          cscti_ct += 1
        else:
          dba_err_ct += 1
        break
  if not args['--quiet']:
    print(f"Inserted {cscti_ct} new 'ChEMBL Selective Compound' tdl_info rows")
  if dba_err_ct > 0:
    print(f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")
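# Worked example of the selectivity test above (illustrative numbers, not real
# ChEMBL data): a compound's activities are sorted by ascending pchembl_value,
# and the compound is flagged selective when consecutive activities against
# different targets differ by >= 2 log units (a 100-fold potency gap).
_example_acts = [{'tid': 101, 'pchembl_value': 5.1},
                 {'tid': 202, 'pchembl_value': 7.4}]
# tids differ and 7.4 - 5.1 = 2.3 >= 2, so this compound would be recorded as
# a 'ChEMBL Selective Compound' tdl_info for the more potent target (tid 202).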
def load(args, dba, dataset_id, logger, logfile):
  line_ct = slmf.wcl(HGNC_TSV_FILE)
  if not args['--quiet']:
    print(f"\nProcessing {line_ct} lines in file {HGNC_TSV_FILE}")
  ct = 0
  hgnc_ct = 0
  mgi_ct = 0
  chr_ct = 0
  sym_ct = 0
  symdiscr_ct = 0
  geneid_ct = 0
  geneiddiscr_ct = 0
  notfnd = set()
  pmark = {}
  db_err_ct = 0
  with open(HGNC_TSV_FILE, 'r') as ifh:
    tsvreader = csv.reader(ifh, delimiter='\t')
    for row in tsvreader:
      # 0: HGNC ID
      # 1: Approved symbol
      # 2: Approved name
      # 3: Status
      # 4: Chromosome
      # 5: Mouse genome database ID
      # 6: NCBI Gene ID
      # 7: UniProt ID
      if ct == 0:
        header = row  # header line
        ct += 1
        continue
      ct += 1
      slmf.update_progress(ct/line_ct)
      sym = row[1]
      if row[6] != '':
        geneid = int(row[6])
      else:
        geneid = None
      if row[7] != '':
        up = row[7]
      else:
        up = None
      pids = dba.find_protein_ids({'sym': sym})
      if not pids and geneid:
        pids = dba.find_protein_ids({'geneid': geneid})
      if not pids and up:
        pids = dba.find_protein_ids({'uniprot': up})
      if up and not pids:
        notfnd.add(f"{sym}|{geneid}|{up}")
        logger.warn(f"No protein found for {sym}|{geneid}|{up}")
        continue
      for pid in pids:
        # HGNC xref
        hgncid = row[0].replace('HGNC:', '')
        rv = dba.ins_xref({'protein_id': pid, 'xtype': 'HGNC ID',
                           'dataset_id': dataset_id, 'value': hgncid})
        if rv:
          hgnc_ct += 1
        else:
          db_err_ct += 1
        # MGI xref
        if row[5] != '':
          mgiid = row[5].replace('MGI:', '')
          rv = dba.ins_xref({'protein_id': pid, 'xtype': 'MGI ID',
                             'dataset_id': dataset_id, 'value': mgiid})
          if rv:
            mgi_ct += 1
          else:
            db_err_ct += 1
        # Add protein.chr values
        rv = dba.do_update({'table': 'protein', 'col': 'chr', 'id': pid, 'val': row[4]})
        if rv:
          chr_ct += 1
        else:
          db_err_ct += 1
        p = dba.get_protein(pid)
        # Add missing syms
        if p['sym'] is None:
          rv = dba.do_update({'table': 'protein', 'col': 'sym', 'id': pid, 'val': sym})
          if rv:
            logger.info("Inserted new sym {} for protein {}|{}".format(sym, pid, p['uniprot']))
            sym_ct += 1
          else:
            db_err_ct += 1
        else:
          # Check for symbol discrepancies
          if p['sym'] != sym:
            logger.warn("Symbol discrepancy: UniProt's=%s, HGNC's=%s" % (p['sym'], sym))
            symdiscr_ct += 1
        if geneid:
          # Add missing geneids
          if p['geneid'] is None:
            rv = dba.do_update({'table': 'protein', 'col': 'geneid', 'id': pid, 'val': geneid})
            if rv:
              logger.info("Inserted new geneid {} for protein {}, {}".format(geneid, pid, p['uniprot']))
              geneid_ct += 1
            else:
              db_err_ct += 1
          else:
            # Check for geneid discrepancies
            if p['geneid'] != geneid:
              logger.warn("GeneID discrepancy: UniProt's={}, HGNC's={}".format(p['geneid'], geneid))
              geneiddiscr_ct += 1
        pmark[pid] = True
  print("Processed {} lines - {} proteins annotated.".format(ct, len(pmark)))
  if notfnd:
    print("No protein found for {} lines (with UniProts).".format(len(notfnd)))
  print(f"  Updated {chr_ct} protein.chr values.")
  print(f"  Inserted {hgnc_ct} HGNC ID xrefs")
  print(f"  Inserted {mgi_ct} MGI ID xrefs")
  if sym_ct > 0:
    print(f"  Inserted {sym_ct} new HGNC symbols")
  if symdiscr_ct > 0:
    print(f"WARNING: Found {symdiscr_ct} discrepant HGNC symbols. See logfile {logfile} for details")
  if geneid_ct > 0:
    print(f"  Inserted {geneid_ct} new NCBI Gene IDs")
  if geneiddiscr_ct > 0:
    print(f"WARNING: Found {geneiddiscr_ct} discrepant NCBI Gene IDs. See logfile {logfile} for details")
  if db_err_ct > 0:
    print(f"WARNING: {db_err_ct} DB errors occurred. See logfile {logfile} for details.")