def parse_mappings(fn):
    line_ct = slmf.wcl(fn)
    print(f"\nProcessing {line_ct} input lines in mapping file {fn}")
    up2chembl = {}
    with open(fn, 'r') as ifh:
        tsvreader = csv.reader(ifh, delimiter='\t')
        ct = 0
        for row in tsvreader:
            ct += 1
            slmf.update_progress(ct / line_ct)
            if row[0].startswith('#'):
                continue
            if row[3] != 'SINGLE PROTEIN':
                continue
            if row[0] in up2chembl:
                up2chembl[row[0]].append(row[1])
            else:
                up2chembl[row[0]] = [row[1]]
    return up2chembl
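# slmf is a small helper module used by every loader in this section but not
# shown here. A minimal sketch of what its three calls are assumed to do
# follows; the real implementations may differ.

import sys
import time

def wcl(fname):
    """Count lines in a file, like `wc -l`."""
    with open(fname) as f:
        return sum(1 for _ in f)

def update_progress(frac, width=50):
    """Draw a one-line text progress bar for a fraction in [0, 1]."""
    filled = int(width * frac)
    sys.stdout.write("\r[{}{}] {:6.1%}".format('#' * filled, '-' * (width - filled), frac))
    sys.stdout.flush()

def secs2str(secs):
    """Format an elapsed-seconds value as HH:MM:SS."""
    return time.strftime("%H:%M:%S", time.gmtime(secs))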
def parse_protein_mentions(self):
    line_ct = slmf.wcl(self._protein_file)
    self._logger.info("Processing {} lines in protein file {}".format(
        line_ct, self._protein_file))
    with open(self._protein_file, 'r') as tsvf:
        ct = 0
        skip_ct = 0
        notfnd = set()
        for line in tsvf:
            ct += 1
            if not line.startswith('ENSP'):
                skip_ct += 1
                continue
            data = line.rstrip().split('\t')
            ensp = data[0]
            pmids = set([int(pmid) for pmid in data[1].split()])
            pids = self._dba.find_protein_ids({'stringid': ensp})
            if not pids:
                # if we don't find a protein by stringid, which is the more reliable and
                # preferred way, try by Ensembl xref
                pids = self._dba.find_protein_ids_by_xref({
                    'xtype': 'Ensembl',
                    'value': ensp
                })
            if not pids:
                notfnd.add(ensp)
                continue
            for pid in pids:
                self._pid2pmids[pid] = self._pid2pmids[pid].union(pmids)
            for pmid in pmids:
                self._pmid_protein_ct[pmid] += 1.0
    self._logger.info(f"{ct} lines processed")
    self._logger.info(f"  Skipped {skip_ct} non-ENSP lines")
    self._logger.info("  Saved {} protein to PMIDs mappings".format(
        len(self._pid2pmids)))
    self._logger.info("  Saved {} PMID to protein count mappings".format(
        len(self._pmid_protein_ct)))
    if notfnd:
        self._logger.info("  No protein found for {} ENSPs.".format(
            len(notfnd)))
        self._logger.debug("Here they are: {}".format(', '.join(notfnd)))
    return (len(self._pid2pmids), len(self._pmid_protein_ct))
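# parse_protein_mentions() updates self._pid2pmids and self._pmid_protein_ct
# without ever seeding missing keys, so the class is assumed to initialize
# both as defaultdicts. A runnable sketch of that behavior:

from collections import defaultdict

pid2pmids = defaultdict(set)          # protein_id => set of PMIDs
pmid_protein_ct = defaultdict(float)  # PMID => count of proteins mentioned

pid2pmids[42] = pid2pmids[42].union({1234567, 7654321})
pmid_protein_ct[1234567] += 1.0
assert pid2pmids[42] == {1234567, 7654321}
assert pmid_protein_ct[9999999] == 0.0  # missing keys default to 0.0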
def parse_disease_mentions(self):
    line_ct = slmf.wcl(self._disease_file)
    self._logger.info("Processing {} lines in disease file {}".format(
        line_ct, self._disease_file))
    with open(self._disease_file, 'r') as tsvf:
        ct = 0
        skip_ct = 0
        notfnd = set()
        for line in tsvf:
            ct += 1
            if not line.startswith('DOID:'):
                skip_ct += 1
                continue
            data = line.rstrip().split('\t')
            doid = data[0]
            pmids = set([int(pmid) for pmid in data[1].split()])
            if doid not in self._do:
                self._logger.warn(f"{doid} not found in DO")
                notfnd.add(doid)
                continue
            if doid in self._doid2pmids:
                self._doid2pmids[doid] = self._doid2pmids[doid].union(pmids)
            else:
                self._doid2pmids[doid] = set(pmids)
            for pmid in pmids:
                if pmid in self._pmid_disease_ct:
                    self._pmid_disease_ct[pmid] += 1.0
                else:
                    self._pmid_disease_ct[pmid] = 1.0
    self._logger.info(f"{ct} lines processed.")
    self._logger.info(f"  Skipped {skip_ct} non-DOID lines")
    self._logger.info("  Saved {} DOID to PMIDs mappings".format(
        len(self._doid2pmids)))
    self._logger.info("  Saved {} PMID to disease count mappings".format(
        len(self._pmid_disease_ct)))
    if notfnd:
        self._logger.warn(
            "No entry found in DO map for {} DOIDs: {}".format(
                len(notfnd), ', '.join(notfnd)))
    return (len(self._doid2pmids), len(self._pmid_disease_ct))
def load_pubmed(curs, logger, logfile):
    st = time.time()
    fn = INFILES['pubmed']
    line_ct = slmf.wcl(fn)
    print(f'\nLoading TIN-X pubmeds from {fn}...')
    ct = 0
    pm_ct = 0
    dup_ct = 0
    err_ct = 0
    with open(fn, 'r') as ifh:
        tsvreader = csv.reader(ifh, delimiter='\t')
        for row in tsvreader:
            if ct == 0:  # skip header
                header = row  # header line
                ct += 1
                continue
            ct += 1
            slmf.update_progress(ct / line_ct)
            try:
                curs.execute(INS_SQL['pubmed'], tuple(row))
                pm_ct += 1
            except Error as e:
                if f"Duplicate entry '{row[0]}'" in e.msg:
                    # this should not happen under "production" runs, but it's
                    # here for testing/debugging
                    dup_ct += 1
                    continue
                else:
                    err_ct += 1
                    logger.error(f"``{e}`` for line {ct}. Data: {row}")
                    continue
    ets = slmf.secs2str(time.time() - st)
    print(f"\n  Processed {ct} lines. Inserted {pm_ct} pubmed rows. Elapsed time: {ets}")
    if err_ct:
        print(f"  WARNING: {err_ct} errors occurred. See logfile {logfile} for details.")
    if dup_ct:
        print(f"  Skipped {dup_ct} existing pubmeds.")
    print("Done.")
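# load_pubmed() catches Error and reads e.msg, which matches the exception
# class from MySQL Connector/Python; the surrounding imports are assumed to
# look roughly like this (INS_SQL and INFILES are module constants not shown
# in this section):

import csv
import time
from mysql.connector import Error

# Matching duplicates by message text works but is brittle; testing the
# error code is an alternative:
#   except Error as e:
#       if e.errno == 1062:  # ER_DUP_ENTRY
#           dup_ct += 1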
def load(args, dba, logger, logfile):
    line_ct = slmf.wcl(IDG_LIST_FILE)
    print(f"\nProcessing {line_ct} lines in file {IDG_LIST_FILE}")
    logger.info(f"Processing {line_ct} lines in list file {IDG_LIST_FILE}")
    ct = 0
    idg_ct = 0
    fam_ct = 0
    notfnd = []
    multfnd = []
    dba_err_ct = 0
    with open(IDG_LIST_FILE, 'r') as ifh:
        csvreader = csv.reader(ifh)
        for row in csvreader:
            if ct == 0:
                header = row  # header line
                ct += 1
                continue
            ct += 1
            slmf.update_progress(ct / line_ct)
            sym = row[0]
            fam = row[1]
            if fam == 'IonChannel':
                fam = 'IC'
            tids = dba.find_target_ids({'sym': sym})
            if not tids:
                notfnd.append(sym)
                continue
            if len(tids) > 1:
                multfnd.append(sym)
                continue
            rv = dba.do_update({
                'table': 'target',
                'col': 'idg',
                'id': tids[0],
                'val': 1
            })
            if rv:
                idg_ct += 1
            else:
                dba_err_ct += 1
            rv = dba.do_update({
                'table': 'target',
                'col': 'fam',
                'id': tids[0],
                'val': fam
            })
            if rv:
                fam_ct += 1
            else:
                dba_err_ct += 1
    print(f"{ct} lines processed")
    print(f"{idg_ct} target rows updated with IDG flags")
    print(f"{fam_ct} target rows updated with fams")
    if notfnd:
        print("WARNING: No target found for {} symbols: {}".format(
            len(notfnd), ", ".join(notfnd)))
    if multfnd:
        print("WARNING: Multiple targets found for {} symbols: {}".format(
            len(multfnd), ", ".join(multfnd)))
    if dba_err_ct > 0:
        print(
            f"WARNING: {dba_err_ct} database errors occurred. See logfile {logfile} for details."
        )
def tinx(args, dba, logger, logfile):
    # The results of parsing the input mentions files will be the following dictionaries:
    pid2pmids = {}   # 'TCRD.protein.id,UniProt' => set of all PMIDs that mention the protein
                     # Including the UniProt accession in the key is just for convenience when
                     # checking the output. It is not used for anything.
    doid2pmids = {}  # DOID => set of all PMIDs that mention the disease
    pmid_disease_ct = {}  # PMID => count of diseases mentioned in a given paper
    pmid_protein_ct = {}  # PMID => count of proteins mentioned in a given paper

    # First parse the Disease Ontology OBO file to get DO names and defs
    dofile = DO_DOWNLOAD_DIR + DO_OBO
    print(f"\nParsing Disease Ontology file {dofile}")
    do_parser = obo.Parser(dofile)
    do = {}
    for stanza in do_parser:
        do[stanza.tags['id'][0].value] = stanza.tags
    print("  Got {} Disease Ontology terms".format(len(do)))

    fn = JL_DOWNLOAD_DIR + PROTEIN_FILE
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print(f"\nProcessing {line_ct} lines in protein file {fn}")
    with open(fn, 'r') as tsvf:
        #pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        ct = 0
        skip_ct = 0
        notfnd = set()
        for line in tsvf:
            ct += 1
            slmf.update_progress(ct / line_ct)
            if not line.startswith('ENSP'):
                skip_ct += 1
                continue
            data = line.rstrip().split('\t')
            ensp = data[0]
            pmids = set([int(pmid) for pmid in data[1].split()])
            tids = dba.find_target_ids({'stringid': ensp})
            if not tids:
                # if we don't find a target by stringid, which is the more reliable and
                # preferred way, try by Ensembl xref
                tids = dba.find_target_ids_by_xref({'xtype': 'Ensembl', 'value': ensp})
            if not tids:
                notfnd.add(ensp)
                continue
            for tid in tids:
                t = dba.get_target(tid, annot=False)
                p = t['components']['protein'][0]
                k = "{},{}".format(p['id'], p['uniprot'])
                if k in pid2pmids:
                    pid2pmids[k] = pid2pmids[k].union(pmids)
                else:
                    pid2pmids[k] = set(pmids)
            for pmid in pmids:
                if pmid in pmid_protein_ct:
                    pmid_protein_ct[pmid] += 1.0
                else:
                    pmid_protein_ct[pmid] = 1.0
    for ensp in notfnd:
        logger.warn(f"No target found for {ensp}")
    print(f"\n{ct} lines processed")
    print(f"  Skipped {skip_ct} non-ENSP lines")
    print("  Saved {} protein to PMIDs mappings".format(len(pid2pmids)))
    print("  Saved {} PMID to protein count mappings".format(len(pmid_protein_ct)))
    if notfnd:
        print("  No target found for {} ENSPs. See logfile {} for details.".format(len(notfnd), logfile))

    fn = JL_DOWNLOAD_DIR + DISEASE_FILE
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print(f"\nProcessing {line_ct} lines in file {fn}")
    with open(fn, 'r') as tsvf:
        ct = 0
        skip_ct = 0
        notfnd = set()
        for line in tsvf:
            ct += 1
            slmf.update_progress(ct / line_ct)
            if not line.startswith('DOID:'):
                skip_ct += 1
                continue
            data = line.rstrip().split('\t')
            doid = data[0]
            pmids = set([int(pmid) for pmid in data[1].split()])
            if doid not in do:
                logger.warn(f"{doid} not found in DO")
                notfnd.add(doid)
                continue
            if doid in doid2pmids:
                doid2pmids[doid] = doid2pmids[doid].union(pmids)
            else:
                doid2pmids[doid] = set(pmids)
            for pmid in pmids:
                if pmid in pmid_disease_ct:
                    pmid_disease_ct[pmid] += 1.0
                else:
                    pmid_disease_ct[pmid] = 1.0
    print(f"\n{ct} lines processed.")
    print(f"  Skipped {skip_ct} non-DOID lines")
    print("  Saved {} DOID to PMIDs mappings".format(len(doid2pmids)))
    print("  Saved {} PMID to disease count mappings".format(len(pmid_disease_ct)))
    if notfnd:
        print("WARNING: No entry found in DO map for {} DOIDs. See logfile {} for details.".format(len(notfnd), logfile))
    if not args['--quiet']:
        print("\nComputing protein novelty scores")
    # To calculate novelty scores, each paper (PMID) is assigned a
    # fractional target (FT) score of one divided by the number of targets
    # mentioned in it. The novelty score of a given protein is one divided
    # by the sum of the FT scores for all the papers mentioning that
    # protein.
    ct = 0
    with open(PROTEIN_NOVELTY_FILE, 'w') as pnovf:
        pnovf.write("Protein ID,UniProt,Novelty\n")
        for k in pid2pmids.keys():
            ct += 1
            ft_score_sum = 0.0
            for pmid in pid2pmids[k]:
                ft_score_sum += 1.0 / pmid_protein_ct[pmid]
            novelty = 1.0 / ft_score_sum
            pnovf.write("%s,%.8f\n" % (k, novelty))
    print(f"  Wrote {ct} novelty scores to file {PROTEIN_NOVELTY_FILE}")

    if not args['--quiet']:
        print("\nComputing disease novelty scores")
    # Exactly as for proteins, but using disease mentions
    ct = 0
    with open(DISEASE_NOVELTY_FILE, 'w') as dnovf:
        dnovf.write("DOID,Novelty\n")
        for doid in doid2pmids.keys():
            ct += 1
            ft_score_sum = 0.0
            for pmid in doid2pmids[doid]:
                ft_score_sum += 1.0 / pmid_disease_ct[pmid]
            novelty = 1.0 / ft_score_sum
            dnovf.write("%s,%.8f\n" % (doid, novelty))
    print(f"  Wrote {ct} novelty scores to file {DISEASE_NOVELTY_FILE}")

    if not args['--quiet']:
        print("\nComputing importance scores")
    # To calculate importance scores, each paper is assigned a fractional
    # disease-target (FDT) score of one divided by the product of the
    # number of targets mentioned and the number of diseases
    # mentioned. The importance score for a given disease-target pair is
    # the sum of the FDT scores for all papers mentioning that disease and
    # protein.
    ct = 0
    with open(IMPORTANCE_FILE, 'w') as impf:
        impf.write("DOID,Protein ID,UniProt,Score\n")
        for k, ppmids in pid2pmids.items():
            for doid, dpmids in doid2pmids.items():
                pd_pmids = ppmids.intersection(dpmids)
                fdt_score_sum = 0.0
                for pmid in pd_pmids:
                    fdt_score_sum += 1.0 / (pmid_protein_ct[pmid] * pmid_disease_ct[pmid])
                if fdt_score_sum > 0:
                    ct += 1
                    impf.write("%s,%s,%.8f\n" % (doid, k, fdt_score_sum))
    print(f"  Wrote {ct} importance scores to file {IMPORTANCE_FILE}")

    if not args['--quiet']:
        print("\nComputing PubMed rankings")
    # PMIDs are ranked for a given disease-target pair based on a score
    # calculated by multiplying the number of targets mentioned and the
    # number of diseases mentioned in that paper. Lower scores have a lower
    # rank (higher priority). If the scores do not discriminate, PMIDs are
    # reverse sorted by value with the assumption that larger PMIDs are
    # newer and of higher priority.
    ct = 0
    with open(PMID_RANKING_FILE, 'w') as pmrf:
        pmrf.write("DOID,Protein ID,UniProt,PubMed ID,Rank\n")
        for k, ppmids in pid2pmids.items():
            for doid, dpmids in doid2pmids.items():
                pd_pmids = ppmids.intersection(dpmids)
                scores = []  # scores are tuples of (PMID, protein_mentions*disease_mentions)
                for pmid in pd_pmids:
                    scores.append((pmid, pmid_protein_ct[pmid] * pmid_disease_ct[pmid]))
                if len(scores) > 0:
                    scores.sort(key=cmp_to_key(cmp_pmids_scores))
                    for i, t in enumerate(scores):
                        ct += 1
                        pmrf.write("%s,%s,%d,%d\n" % (doid, k, t[0], i))
    print(f"  Wrote {ct} PubMed rankings to file {PMID_RANKING_FILE}")
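# tinx() sorts with cmp_to_key(cmp_pmids_scores), but the comparator itself
# is not shown in this section. A sketch that matches the ranking rule in the
# comment above (ascending score; ties broken by descending PMID, so newer
# papers rank first) would be:

from functools import cmp_to_key

def cmp_pmids_scores(a, b):
    """Compare (pmid, score) tuples: lower score first, then larger PMID."""
    if a[1] != b[1]:
        return -1 if a[1] < b[1] else 1
    if a[0] != b[0]:
        return -1 if a[0] > b[0] else 1  # reverse PMID order breaks ties
    return 0

# e.g. sorted([(11, 2.0), (99, 1.0), (55, 1.0)], key=cmp_to_key(cmp_pmids_scores))
# => [(99, 1.0), (55, 1.0), (11, 2.0)]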
def load_pmscores(dba, logger, logfile):
    ensp2pids = {}  # ENSP => list of TCRD protein ids
    pmscores = {}   # protein.id => sum(all scores)
    pms_ct = 0
    skip_ct = 0
    notfnd = set()
    dba_err_ct = 0
    infile = JL_DOWNLOAD_DIR + PM_SCORES_FILE
    line_ct = slmf.wcl(infile)
    print(f"Processing {line_ct} lines in file {infile}")
    with open(infile, 'r') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        for row in tsvreader:
            # sym  year  score
            ct += 1
            slmf.update_progress(ct / line_ct)
            if not row[0].startswith('ENSP'):
                skip_ct += 1
                continue
            ensp = row[0]
            if ensp in ensp2pids:
                # we've already found it
                pids = ensp2pids[ensp]
            elif ensp in notfnd:
                # we've already not found it
                continue
            else:
                pids = dba.find_protein_ids({'stringid': ensp})
                if not pids:
                    pids = dba.find_protein_ids_by_xref({
                        'xtype': 'STRING',
                        'value': '9606.' + ensp
                    })
                if not pids:
                    notfnd.add(ensp)
                    logger.warn("No protein found for {}".format(ensp))
                    continue
                ensp2pids[ensp] = pids  # save this mapping so we only lookup each ENSP once
            for pid in pids:
                rv = dba.ins_pmscore({
                    'protein_id': pid,
                    'year': row[1],
                    'score': row[2]
                })
                if rv:
                    pms_ct += 1
                else:
                    dba_err_ct += 1
                if pid in pmscores:
                    pmscores[pid] += float(row[2])
                else:
                    pmscores[pid] = float(row[2])
    print(f"{ct} input lines processed.")
    print("  Inserted {} new pmscore rows for {} proteins".format(
        pms_ct, len(pmscores)))
    if skip_ct:
        print(f"  Skipped {skip_ct} rows w/o ENSP")
    if notfnd:
        print(
            "  No protein found for {} STRING IDs. See logfile {} for details."
            .format(len(notfnd), logfile))
    if dba_err_ct:
        print(
            f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details."
        )

    print("Updating {} JensenLab PubMed Scores...".format(len(pmscores)))
    ct = 0
    ti_ct = 0
    dba_err_ct = 0
    for pid, score in pmscores.items():
        ct += 1
        rv = dba.upd_pms_tdlinfo(pid, score)
        if rv:
            ti_ct += 1
        else:
            dba_err_ct += 1
    print(f"  Updated {ti_ct} 'JensenLab PubMed Score' tdl_info rows")
    if dba_err_ct:
        print(
            f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details."
        )
def load_DISEASES(dba, logger, logfile):
    # Knowledge channel
    fn = JL_DOWNLOAD_DIR + DISEASES_FILE_K
    line_ct = slmf.wcl(fn)
    print(f"Processing {line_ct} lines in DISEASES Knowledge file {fn}")
    with open(fn, 'r') as ifh:
        tsvreader = csv.reader(ifh, delimiter='\t')
        ct = 0
        k2pids = {}  # ENSP|sym => list of TCRD protein ids
        pmark = {}
        skip_ct = 0
        notfnd = set()
        dis_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            slmf.update_progress(ct / line_ct)
            if not row[0].startswith('ENSP'):
                skip_ct += 1
                continue
            ensp = row[0]
            sym = row[1]
            k = "%s|%s" % (ensp, sym)
            if k in k2pids:
                # we've already found it
                pids = k2pids[k]
            elif k in notfnd:
                # we've already not found it
                continue
            else:
                pids = dba.find_protein_ids({'stringid': ensp})
                if not pids:
                    pids = dba.find_protein_ids({'sym': sym})
                if not pids:
                    notfnd.add(k)
                    logger.warn(f"No protein found for {k}")
                    continue
                k2pids[k] = pids  # save this mapping so we only lookup each ENSP|sym once
            dtype = 'JensenLab Knowledge ' + row[4]
            for pid in pids:
                rv = dba.ins_disease({
                    'protein_id': pid,
                    'dtype': dtype,
                    'name': row[3],
                    'did': row[2],
                    'evidence': row[5],
                    'conf': row[6]
                })
                if rv:
                    dis_ct += 1
                    pmark[pid] = True
                else:
                    dba_err_ct += 1
    print(f"{ct} lines processed.")
    print("  Inserted {} new disease rows for {} proteins".format(
        dis_ct, len(pmark)))
    if skip_ct:
        print(f"  Skipped {skip_ct} rows w/o ENSP")
    if notfnd:
        print(
            "  No target found for {} stringids/symbols. See logfile {} for details."
            .format(len(notfnd), logfile))
    if dba_err_ct:
        print(
            f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details."
        )

    # Experiment channel
    fn = JL_DOWNLOAD_DIR + DISEASES_FILE_E
    line_ct = slmf.wcl(fn)
    print(f"Processing {line_ct} lines in DISEASES Experiment file {fn}")
    with open(fn, 'r') as ifh:
        tsvreader = csv.reader(ifh, delimiter='\t')
        ct = 0
        k2pids = {}  # ENSP|sym => list of TCRD protein ids
        pmark = {}
        notfnd = set()
        dis_ct = 0
        skip_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            slmf.update_progress(ct / line_ct)
            if not row[0].startswith('ENSP'):
                skip_ct += 1
                continue
            if row[2].startswith('ENSP'):
                skip_ct += 1
                continue
            ensp = row[0]
            sym = row[1]
            k = "%s|%s" % (ensp, sym)
            if k in k2pids:
                # we've already found it
                pids = k2pids[k]
            elif k in notfnd:
                # we've already not found it
                continue
            else:
                pids = dba.find_protein_ids({'stringid': ensp})
                if not pids:
                    pids = dba.find_protein_ids({'sym': sym})
                if not pids:
                    notfnd.add(k)
                    logger.warn(f"No protein found for {k}")
                    continue
                k2pids[k] = pids  # save this mapping so we only lookup each ENSP|sym once
            dtype = 'JensenLab Experiment ' + row[4]
            for pid in pids:
                rv = dba.ins_disease({
                    'protein_id': pid,
                    'dtype': dtype,
                    'name': row[3],
                    'did': row[2],
                    'evidence': row[5],
                    'conf': row[6]
                })
                if rv:
                    dis_ct += 1
                    pmark[pid] = True
                else:
                    dba_err_ct += 1
    print(f"{ct} lines processed.")
    print("  Inserted {} new disease rows for {} proteins".format(
        dis_ct, len(pmark)))
    if skip_ct:
        print(f"  Skipped {skip_ct} rows w/o ENSP or with ENSP did")
    if notfnd:
        print(
            "  No target found for {} stringids/symbols. See logfile {} for details."
            .format(len(notfnd), logfile))
    if dba_err_ct:
        print(
            f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details."
        )
    # Text Mining channel
    fn = JL_DOWNLOAD_DIR + DISEASES_FILE_T
    line_ct = slmf.wcl(fn)
    print(f"Processing {line_ct} lines in DISEASES Textmining file {fn}")
    with open(fn, 'r') as ifh:
        tsvreader = csv.reader(ifh, delimiter='\t')
        ct = 0
        k2pids = {}  # ENSP|sym => list of TCRD protein ids
        pmark = {}
        notfnd = set()
        dis_ct = 0
        skip_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            slmf.update_progress(ct / line_ct)
            if not row[0].startswith('ENSP'):
                skip_ct += 1
                continue
            if float(row[5]) < 3.0:
                # skip rows with confidence < 3.0
                skip_ct += 1
                continue
            ensp = row[0]
            sym = row[1]
            k = "%s|%s" % (ensp, sym)
            if k in k2pids:
                # we've already found it
                pids = k2pids[k]
            elif k in notfnd:
                # we've already not found it
                continue
            else:
                pids = dba.find_protein_ids({'stringid': ensp})
                if not pids:
                    pids = dba.find_protein_ids({'sym': sym})
                if not pids:
                    notfnd.add(k)
                    logger.warn(f"No protein found for {k}")
                    continue
                k2pids[k] = pids  # save this mapping so we only lookup each ENSP|sym once
            dtype = 'JensenLab Text Mining'
            for pid in pids:
                rv = dba.ins_disease({
                    'protein_id': pid,
                    'dtype': dtype,
                    'name': row[3],
                    'did': row[2],
                    'zscore': row[4],
                    'conf': row[5]
                })
                if rv:
                    dis_ct += 1
                    pmark[pid] = True
                else:
                    dba_err_ct += 1
    print(f"{ct} lines processed.")
    print("  Inserted {} new disease rows for {} proteins".format(
        dis_ct, len(pmark)))
    if skip_ct:
        print(f"  Skipped {skip_ct} rows w/o ENSP or with confidence < 3")
    if notfnd:
        print(
            "  No target found for {} stringids/symbols. See logfile {} for details."
            .format(len(notfnd), logfile))
    if dba_err_ct:
        print(
            f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details."
        )
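# All three DISEASES channels repeat the same ENSP|sym => protein-id
# resolution with positive and negative caching. A hedged sketch of how that
# could be factored out (the helper name and signature are my own, not part
# of the loader):

def find_pids_cached(dba, logger, ensp, sym, k2pids, notfnd):
    """Resolve an ENSP|sym pair to TCRD protein ids, caching hits and misses."""
    k = f"{ensp}|{sym}"
    if k in k2pids:       # positive cache: we've already found it
        return k2pids[k]
    if k in notfnd:       # negative cache: we've already not found it
        return None
    pids = dba.find_protein_ids({'stringid': ensp})
    if not pids:
        pids = dba.find_protein_ids({'sym': sym})
    if not pids:
        notfnd.add(k)
        logger.warn(f"No protein found for {k}")
        return None
    k2pids[k] = pids      # save this mapping so we only lookup each pair once
    return pids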
def load(args, dba, dataset_id, logger, logfile):
    line_ct = slmf.wcl(HGNC_TSV_FILE)
    if not args['--quiet']:
        print(f"\nProcessing {line_ct} lines in file {HGNC_TSV_FILE}")
    ct = 0
    hgnc_ct = 0
    mgi_ct = 0
    chr_ct = 0
    sym_ct = 0
    symdiscr_ct = 0
    geneid_ct = 0
    geneiddiscr_ct = 0
    notfnd = set()
    pmark = {}
    db_err_ct = 0
    with open(HGNC_TSV_FILE, 'r') as ifh:
        tsvreader = csv.reader(ifh, delimiter='\t')
        for row in tsvreader:
            # 0: HGNC ID
            # 1: Approved symbol
            # 2: Approved name
            # 3: Status
            # 4: Chromosome
            # 5: Mouse genome database ID
            # 6: NCBI Gene ID
            # 7: UniProt ID
            if ct == 0:
                header = row  # header line
                ct += 1
                continue
            ct += 1
            slmf.update_progress(ct / line_ct)
            sym = row[1]
            if row[6] != '':
                geneid = int(row[6])
            else:
                geneid = None
            if row[7] != '':
                up = row[7]
            else:
                up = None
            pids = dba.find_protein_ids({'sym': sym})
            if not pids and geneid:
                pids = dba.find_protein_ids({'geneid': geneid})
            if not pids and up:
                pids = dba.find_protein_ids({'uniprot': up})
            if up and not pids:
                notfnd.add(f"{sym}|{geneid}|{up}")
                logger.warn(f"No protein found for {sym}|{geneid}|{up}")
                continue
            for pid in pids:
                # HGNC xref
                hgncid = row[0].replace('HGNC:', '')
                rv = dba.ins_xref({'protein_id': pid, 'xtype': 'HGNC ID',
                                   'dataset_id': dataset_id, 'value': hgncid})
                if rv:
                    hgnc_ct += 1
                else:
                    db_err_ct += 1
                # MGI xref
                if row[5] != '':
                    mgiid = row[5].replace('MGI:', '')
                    rv = dba.ins_xref({'protein_id': pid, 'xtype': 'MGI ID',
                                       'dataset_id': dataset_id, 'value': mgiid})
                    if rv:
                        mgi_ct += 1
                    else:
                        db_err_ct += 1
                # Add protein.chr values
                rv = dba.do_update({'table': 'protein', 'col': 'chr', 'id': pid, 'val': row[4]})
                if rv:
                    chr_ct += 1
                else:
                    db_err_ct += 1
                p = dba.get_protein(pid)
                # Add missing syms
                if p['sym'] is None:
                    rv = dba.do_update({'table': 'protein', 'col': 'sym', 'id': pid, 'val': sym})
                    if rv:
                        logger.info("Inserted new sym {} for protein {}|{}".format(sym, pid, p['uniprot']))
                        sym_ct += 1
                    else:
                        db_err_ct += 1
                else:
                    # Check for symbol discrepancies
                    if p['sym'] != sym:
                        logger.warn("Symbol discrepancy: UniProt's=%s, HGNC's=%s" % (p['sym'], sym))
                        symdiscr_ct += 1
                if geneid:
                    # Add missing geneids
                    if p['geneid'] is None:
                        rv = dba.do_update({'table': 'protein', 'col': 'geneid', 'id': pid, 'val': geneid})
                        if rv:
                            logger.info("Inserted new geneid {} for protein {}, {}".format(geneid, pid, p['uniprot']))
                            geneid_ct += 1
                        else:
                            db_err_ct += 1
                    else:
                        # Check for geneid discrepancies
                        if p['geneid'] != geneid:
                            logger.warn("GeneID discrepancy: UniProt's={}, HGNC's={}".format(p['geneid'], geneid))
                            geneiddiscr_ct += 1
                pmark[pid] = True
    print("Processed {} lines - {} proteins annotated.".format(ct, len(pmark)))
    if notfnd:
        print("No protein found for {} lines (with UniProts).".format(len(notfnd)))
    print(f"  Updated {chr_ct} protein.chr values.")
    print(f"  Inserted {hgnc_ct} HGNC ID xrefs")
    print(f"  Inserted {mgi_ct} MGI ID xrefs")
    if sym_ct > 0:
        print(f"  Inserted {sym_ct} new HGNC symbols")
    if symdiscr_ct > 0:
        print(f"WARNING: Found {symdiscr_ct} discrepant HGNC symbols. See logfile {logfile} for details")
    if geneid_ct > 0:
        print(f"  Inserted {geneid_ct} new NCBI Gene IDs")
    if geneiddiscr_ct > 0:
        print(f"WARNING: Found {geneiddiscr_ct} discrepant NCBI Gene IDs. See logfile {logfile} for details")
    if db_err_ct > 0:
        print(f"WARNING: {db_err_ct} DB errors occurred. See logfile {logfile} for details.")
def load_tinx(args, dba, do, logger, logfile):
    fn = f"{TINX_OUTDIR}ProteinNovelty.csv"
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print(f"\nProcessing {line_ct} lines in file {fn}")
    with open(fn, 'r') as csvfile:
        csvreader = csv.reader(csvfile)
        header = next(csvreader)  # skip header line
        # Protein ID,UniProt,Novelty
        ct = 1
        tn_ct = 0
        dba_err_ct = 0
        for row in csvreader:
            ct += 1
            slmf.update_progress(ct / line_ct)
            pid = row[0]
            rv = dba.ins_tinx_novelty({'protein_id': pid, 'score': float(row[2])})
            if rv:
                tn_ct += 1
            else:
                dba_err_ct += 1
    print(f"{ct} input lines processed.")
    print(f"  Inserted {tn_ct} new tinx_novelty rows")
    if dba_err_ct > 0:
        print(f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")

    dmap = {}
    fn = f"{TINX_OUTDIR}DiseaseNovelty.csv"
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print(f"\nProcessing {line_ct} lines in file {fn}")
    with open(fn, 'r') as csvfile:
        csvreader = csv.reader(csvfile)
        header = next(csvreader)  # skip header line
        # DOID,Novelty
        ct = 1
        dct = 0
        notfnd = set()
        dba_err_ct = 0
        for row in csvreader:
            ct += 1
            slmf.update_progress(ct / line_ct)
            doid = row[0]
            if doid in do:
                if 'name' in do[doid]:
                    dname = do[doid]['name'][0].value
                else:
                    continue
                if 'def' in do[doid]:
                    ddef = do[doid]['def'][0].value
                else:
                    ddef = None
            else:
                logger.warn(f"{row[0]} not in DO map")
                notfnd.add(row[0])
                continue
            rv = dba.ins_tinx_disease({'doid': doid, 'name': dname,
                                       'summary': ddef, 'score': float(row[1])})
            if rv:
                dct += 1
                dmap[doid] = rv  # map DOID to tinx_disease.id
            else:
                dba_err_ct += 1
    print(f"{ct} input lines processed.")
    print(f"  Inserted {dct} new tinx_disease rows")
    print("  Saved {} keys in dmap".format(len(dmap)))
    if notfnd:
        print("WARNING: No entry found in DO map for {} DOIDs. See logfile {} for details.".format(len(notfnd), logfile))
    if dba_err_ct > 0:
        print(f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")

    imap = {}
    fn = f"{TINX_OUTDIR}Importance.csv"
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print(f"\nProcessing {line_ct} lines in file {fn}")
    with open(fn, 'r') as csvfile:
        csvreader = csv.reader(csvfile)
        header = next(csvreader)  # skip header line
        # DOID,Protein ID,UniProt,Score
        ct = 1
        ti_ct = 0
        skips1 = set()
        dba_err_ct = 0
        for row in csvreader:
            ct += 1
            slmf.update_progress(ct / line_ct)
            if row[0] not in dmap:
                logger.warn(f"{row[0]} not in dmap")
                skips1.add(row[0])
                continue
            did = dmap[row[0]]
            pid = row[1]
            rv = dba.ins_tinx_importance({'protein_id': pid, 'disease_id': did,
                                          'score': float(row[3])})
            if rv:
                ti_ct += 1
                # map DOID|PID to tinx_importance.id
                k = f"{row[0]}|{row[1]}"
                imap[k] = rv
            else:
                dba_err_ct += 1
    print(f"{ct} input lines processed.")
    print(f"  Inserted {ti_ct} new tinx_importance rows")
    print("  Saved {} keys in imap".format(len(imap)))
    if len(skips1) > 0:
        print("WARNING: No disease found in dmap for {} DOIDs. See logfile {} for details.".format(len(skips1), logfile))
    if dba_err_ct > 0:
        print(f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")
    fn = f"{TINX_OUTDIR}PMIDRanking.csv"
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print(f"\nProcessing {line_ct} lines in file {fn}")
    regex = re.compile(r"^DOID:0*")
    with open(fn, 'r') as csvfile:
        csvreader = csv.reader(csvfile)
        header = next(csvreader)  # skip header line
        # DOID,Protein ID,UniProt,PubMed ID,Rank
        ct = 1
        tar_ct = 0
        skips = set()
        dba_err_ct = 0
        for row in csvreader:
            ct += 1
            slmf.update_progress(ct / line_ct)
            k = "%s|%s" % (row[0], row[1])
            if k not in imap:
                logger.warn(f"{k} not in imap")
                skips.add(k)
                continue
            iid = imap[k]
            rv = dba.ins_tinx_articlerank({'importance_id': iid, 'pmid': row[3], 'rank': row[4]})
            if rv:
                tar_ct += 1
            else:
                dba_err_ct += 1
    print(f"{ct} input lines processed.")
    print(f"  Inserted {tar_ct} new tinx_articlerank rows")
    if len(skips) > 0:
        print("WARNING: No importance found in imap for {} keys. See logfile {} for details.".format(len(skips), logfile))
    if dba_err_ct > 0:
        print(f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")
def load(args, dba, logger, logfile):
    fn = DOWNLOAD_DIR + GENO_PHENO_FILE.replace('.gz', '')
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print(f"\nProcessing {line_ct} lines in input file {fn}")
    ct = 0
    pt_ct = 0
    pmark = {}
    sym2nhpids = {}
    notfnd = set()
    skip_ct = 0
    dba_err_ct = 0
    with open(fn, 'r') as csvfile:
        csvreader = csv.reader(csvfile)
        for row in csvreader:
            # 0: marker_accession_id
            # 1: marker_symbol
            # 2: phenotyping_center
            # 3: colony_id
            # 4: sex
            # 5: zygosity
            # 6: allele_accession_id
            # 7: allele_symbol
            # 8: allele_name
            # 9: strain_accession_id
            # 10: strain_name
            # 11: project_name
            # 12: project_fullname
            # 13: pipeline_name
            # 14: pipeline_stable_id
            # 15: procedure_stable_id
            # 16: procedure_name
            # 17: parameter_stable_id
            # 18: parameter_name
            # 19: top_level_mp_term_id
            # 20: top_level_mp_term_name
            # 21: mp_term_id
            # 22: mp_term_name
            # 23: p_value
            # 24: percentage_change
            # 25: effect_size
            # 26: statistical_method
            # 27: resource_name
            if ct == 0:
                header = row  # header line
                ct += 1
                continue
            ct += 1
            slmf.update_progress(ct / line_ct)
            sym = row[1]
            if not row[21] and not row[22]:
                # skip data with neither a term_id nor a term_name
                skip_ct += 1
                continue
            if sym in sym2nhpids:
                # we've already found it
                nhpids = sym2nhpids[sym]
            elif sym in notfnd:
                # we've already not found it
                continue
            else:
                nhpids = dba.find_nhprotein_ids({'sym': sym}, species='Mus musculus')
                if not nhpids:
                    notfnd.add(sym)
                    logger.warn("No nhprotein found for symbol {}".format(sym))
                    continue
                sym2nhpids[sym] = nhpids  # save this mapping so we only lookup each symbol once
            pval = None
            if row[23] and row[23] != '':
                try:
                    pval = float(row[23])
                except ValueError:
                    logger.warn("Problem converting p_value {} for row {}".format(row[23], ct))
            sex = None
            if row[4] and len(row[4]) <= 8:
                sex = row[4]
            for nhpid in nhpids:
                rv = dba.ins_phenotype({'nhprotein_id': nhpid, 'ptype': 'IMPC',
                                        'top_level_term_id': row[19], 'top_level_term_name': row[20],
                                        'term_id': row[21], 'term_name': row[22],
                                        'p_value': pval, 'percentage_change': row[24],
                                        'effect_size': row[25], 'procedure_name': row[16],
                                        'parameter_name': row[18], 'statistical_method': row[26],
                                        'sex': sex, 'gp_assoc': 1})
                if rv:
                    pmark[nhpid] = True
                    pt_ct += 1
                else:
                    dba_err_ct += 1
    print(f"{ct} lines processed.")
    print("Loaded {} IMPC phenotypes for {} nhproteins".format(pt_ct, len(pmark)))
    if notfnd:
        print("No nhprotein found for {} gene symbols. See logfile {} for details.".format(len(notfnd), logfile))
    if skip_ct > 0:
        print(f"Skipped {skip_ct} lines with no term_id or term_name.")
    if dba_err_ct > 0:
        print(f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")
    fn = DOWNLOAD_DIR + STAT_RES_FILE.replace('.gz', '')
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print(f"\nProcessing {line_ct} lines from input file {fn}")
    ct = 0
    pt_ct = 0
    pmark = {}
    sym2nhpids = {}
    notfnd = set()
    skip_ct = 0
    pv_ct = 0
    dba_err_ct = 0
    with open(fn, 'r') as csvfile:
        csvreader = csv.reader(csvfile)
        for row in csvreader:
            # 0: phenotyping_center
            # 1: intercept_estimate
            # 2: procedure_id
            # 3: mutant_biological_model_id
            # 4: rotated_residuals_test
            # 5: weight_effect_p_value
            # 6: male_mutant_count
            # 7: pipeline_stable_key
            # 8: female_ko_effect_p_value
            # 9: pipeline_stable_id
            # 10: parameter_stable_key
            # 11: data_type
            # 12: parameter_stable_id
            # 13: interaction_significant
            # 14: strain_accession_id
            # 15: control_selection_method
            # 16: parameter_name
            # 17: allele_name
            # 18: phenotyping_center_id
            # 19: weight_effect_stderr_estimate
            # 20: weight_effect_parameter_estimate
            # 21: procedure_stable_id
            # 22: status
            # 23: sex_effect_parameter_estimate
            # 24: female_ko_effect_stderr_estimate
            # 25: female_percentage_change
            # 26: group_2_residuals_normality_test
            # 27: marker_accession_id
            # 28: mp_term_name
            # 29: group_1_residuals_normality_test
            # 30: genotype_effect_p_value
            # 31: dependent_variable
            # 32: resource_name
            # 33: project_id
            # 34: procedure_name
            # 35: doc_id
            # 36: top_level_mp_term_id
            # 37: allele_accession_id
            # 38: blups_test
            # 39: null_test_p_value
            # 40: p_value
            # 41: marker_symbol
            # 42: control_biological_model_id
            # 43: pipeline_name
            # 44: sex
            # 45: interaction_effect_p_value
            # 46: colony_id
            # 47: project_name
            # 48: female_ko_parameter_estimate
            # 49: female_mutant_count
            # 50: organisation_id
            # 51: external_db_id
            # 52: female_control_count
            # 53: intermediate_mp_term_id
            # 54: db_id
            # 55: male_ko_effect_p_value
            # 56: top_level_mp_term_name
            # 57: metadata_group
            # 58: sex_effect_stderr_estimate
            # 59: zygosity
            # 60: male_percentage_change
            # 61: sex_effect_p_value
            # 62: mp_term_id
            # 63: male_ko_effect_stderr_estimate
            # 64: additional_information
            # 65: statistical_method
            # 66: _version_
            # 67: intercept_estimate_stderr_estimate
            # 68: male_control_count
            # 69: intermediate_mp_term_name
            # 70: strain_name
            # 71: classification_tag
            # 72: effect_size
            # 73: procedure_stable_key
            # 74: allele_symbol
            # 75: resource_id
            # 76: group_2_genotype
            # 77: variance_significant
            # 78: pipeline_id
            # 79: group_1_genotype
            # 80: male_ko_parameter_estimate
            # 81: genotype_effect_parameter_estimate
            # 82: categories
            # 83: parameter_id
            # 84: batch_significant
            # 85: genotype_effect_stderr_estimate
            # 86: resource_fullname
            if ct == 0:
                header = row  # header line
                ct += 1
                continue
            ct += 1
            slmf.update_progress(ct / line_ct)
            sym = row[41]
            if not row[62] and not row[28]:
                # skip lines with neither a term_id nor a term_name
                skip_ct += 1
                continue
            if sym in sym2nhpids:
                # we've already found it
                nhpids = sym2nhpids[sym]
            elif sym in notfnd:
                # we've already not found it
                continue
            else:
                nhpids = dba.find_nhprotein_ids({'sym': sym}, species='Mus musculus')
                if not nhpids:
                    notfnd.add(sym)
                    logger.warn("No nhprotein found for symbol {}".format(sym))
                    continue
                sym2nhpids[sym] = nhpids  # save this mapping so we only lookup each symbol once
            pval = None
            if row[40] and row[40] != '':
                try:
                    pval = float(row[40])
                except ValueError:
                    logger.warn("Problem converting p_value {} for row {}".format(row[40], ct))
            sex = None
            if row[44] and len(row[44]) <= 8:  # 44: sex
                sex = row[44]
            for nhpid in nhpids:
                rv = dba.ins_phenotype({'nhprotein_id': nhpid, 'ptype': 'IMPC',
                                        'top_level_term_id': row[36], 'top_level_term_name': row[56],
                                        'term_id': row[62],
                                        'term_name': row[28], 'p_value': pval,
                                        'effect_size': row[72], 'procedure_name': row[34],
                                        'parameter_name': row[16], 'statistical_method': row[65],
                                        'sex': sex, 'gp_assoc': 0})
                if rv:
                    pmark[nhpid] = True
                    pt_ct += 1
                else:
                    dba_err_ct += 1
    print(f"{ct} lines processed.")
    print("Loaded {} IMPC phenotypes for {} nhproteins".format(pt_ct, len(pmark)))
    if notfnd:
        print("No nhprotein found for {} gene symbols. See logfile {} for details.".format(len(notfnd), logfile))
    if skip_ct > 0:
        print(f"Skipped {skip_ct} lines with no term_id or term_name.")
    if dba_err_ct > 0:
        print(f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details.")
def load(dba, logger, logfile):
    infile = DOWNLOAD_DIR + TIGA_FILE
    line_ct = slmf.wcl(infile)
    print(f"\nProcessing {line_ct} lines in TIGA file {infile}")
    ct = 0
    k2pids = defaultdict(list)  # maps sym|ENSG to TCRD protein_id(s)
    notfnd = set()
    pmark = {}
    tiga_ct = 0
    dba_err_ct = 0
    with open(infile, 'r') as ifh:
        tsvreader = csv.reader(ifh, delimiter='\t')
        for row in tsvreader:
            if ct == 0:  # skip header
                header = row  # header line
                ct += 1
                continue
            # 0: ensemblId
            # 1: efoId
            # 2: trait
            # 3: n_study
            # 4: n_snp
            # 5: n_snpw
            # 6: geneNtrait
            # 7: geneNstudy
            # 8: traitNgene
            # 9: traitNstudy
            # 10: pvalue_mlog_median
            # 11: pvalue_mlog_max
            # 12: or_median
            # 13: n_beta
            # 14: study_N_mean
            # 15: rcras
            # 16: geneSymbol
            # 17: TDL
            # 18: geneFamily
            # 19: geneIdgList
            # 20: geneName
            # 21: meanRank
            # 22: meanRankScore
            ct += 1
            slmf.update_progress(ct / line_ct)
            sym = row[16]
            ensg = row[0]
            k = sym + '|' + ensg
            pids = []
            if k in k2pids:
                # we've already found it
                pids = k2pids[k]
            elif k in notfnd:
                # we've already not found it
                continue
            else:
                # look it up
                pids = dba.find_protein_ids({'sym': sym})
                if not pids:
                    pids = dba.find_protein_ids_by_xref({
                        'xtype': 'Ensembl',
                        'value': ensg
                    })
                if not pids:
                    notfnd.add(k)
                    continue
                k2pids[k] = pids  # save this mapping so we only lookup each sym/ENSG once
            init = {
                'ensg': ensg,
                'efoid': row[1],
                'trait': row[2],
                'n_study': row[3],
                'n_snp': row[4],
                'n_snpw': row[5],
                'geneNtrait': row[6],
                'geneNstudy': row[7],
                'traitNgene': row[8],
                'traitNstudy': row[9],
                'pvalue_mlog_median': row[10],
                'pvalue_mlog_max': row[11],
                'n_beta': row[13],
                'study_N_mean': row[14],
                'rcras': row[15],
                'meanRank': row[21],
                'meanRankScore': row[22]
            }
            if row[12] != 'NA':
                init['or_median'] = row[12]
            #if row[] != 'NA':
            #    init[''] = row[]
            for pid in pids:
                init['protein_id'] = pid
                rv = dba.ins_tiga(init)
                if not rv:
                    dba_err_ct += 1
                    continue
                tiga_ct += 1
                pmark[pid] = True
    for k in notfnd:
        logger.warn(f"No protein found for {k}")
    print(f"Processed {ct} lines")
    print("  Inserted {} new tiga rows for {} proteins".format(
        tiga_ct, len(pmark)))
    if notfnd:
        print("No target found for {} sym/ENSGs. See logfile {} for details.".format(
            len(notfnd), logfile))
    if dba_err_ct > 0:
        print(
            f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details."
        )

    infile = DOWNLOAD_DIR + TIGA_PROV_FILE
    line_ct = slmf.wcl(infile)
    print(f"\nProcessing {line_ct} lines in TIGA provenance file {infile}")
    ct = 0
    tigaprov_ct = 0
    dba_err_ct = 0
    with open(infile, 'r') as ifh:
        tsvreader = csv.reader(ifh, delimiter='\t')
        for row in tsvreader:
            if ct == 0:  # skip header
                header = row  # header line
                ct += 1
                continue
            # 0: ensemblId
            # 1: TRAIT_URI
            # 2: STUDY_ACCESSION
            # 3: PUBMEDID
            # 4: efoId
            ct += 1
            slmf.update_progress(ct / line_ct)
            rv = dba.ins_tiga_provenance({
                'ensg': row[0],
                'efoid': row[4],
                'study_acc': row[2],
                'pubmedid': row[3]
            })
            if not rv:
                dba_err_ct += 1
                continue
            tigaprov_ct += 1
    print(f"Processed {ct} lines")
    print(f"  Inserted {tigaprov_ct} new tiga_provenance rows")
    if dba_err_ct > 0:
        print(
            f"WARNING: {dba_err_ct} DB errors occurred. See logfile {logfile} for details."
        )
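# The loaders above index rows by position and document the columns in
# comment blocks. An alternative that makes the column names load-bearing is
# csv.DictReader; a hedged sketch against the TIGA file layout described
# above (read_tiga_rows is my own name, not part of the loader):

import csv

def read_tiga_rows(infile):
    """Yield TIGA rows as dicts keyed by the file's own header names."""
    with open(infile, 'r') as ifh:
        for row in csv.DictReader(ifh, delimiter='\t'):
            yield {
                'ensg': row['ensemblId'],
                'efoid': row['efoId'],
                'trait': row['trait'],
                'meanRankScore': row['meanRankScore'],
                # 'NA' marks a missing odds ratio in the source file
                'or_median': None if row['or_median'] == 'NA' else float(row['or_median']),
            }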
def load(args, dba, logger, logfile):
    # OMIMs and Phenotypic Series
    fname = DOWNLOAD_DIR + TITLES_FILE
    line_ct = slmf.wcl(fname)
    if not args['--quiet']:
        print(f"\nProcessing {line_ct} lines from input file {fname}")
    pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    with open(fname, 'r') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        skip_ct = 0
        omim_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            if row[0].startswith('#'):
                # The file has commented lines
                skip_ct += 1
                continue
            # The fields are:
            # 0: Prefix ???
            # 1: Mim Number
            # 2: Preferred Title; symbol  Alternative Title(s); symbol(s)
            # 3: Included Title(s); symbols
            title = row[2].partition(';')[0]
            rv = dba.ins_omim({'mim': row[1], 'title': title})
            if not rv:
                dba_err_ct += 1
                continue
            omim_ct += 1
            pbar.update(ct)
    pbar.finish()
    print("{} lines processed".format(ct))
    print("  Skipped {} commented lines.".format(skip_ct))
    print("Loaded {} new omim rows".format(omim_ct))
    if dba_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile))

    fname = DOWNLOAD_DIR + PS_FILE
    line_ct = slmf.wcl(fname)
    if not args['--quiet']:
        print('\nProcessing %d lines from input file %s' % (line_ct, fname))
    pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    with open(fname, 'r') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        skip_ct = 0
        ps_ct = 0
        err_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            if row[0].startswith('#'):
                # The file has commented lines
                skip_ct += 1
                continue
            # The fields are:
            # 0: Phenotypic Series Number
            # 1: Mim Number
            # 2: Phenotype
            if len(row) == 2:
                init = {'omim_ps_id': row[0], 'title': row[1]}
            elif len(row) == 3:
                init = {'omim_ps_id': row[0], 'mim': row[1], 'title': row[2]}
            else:
                err_ct += 1
                logger.warn("Parsing error for row {}".format(row))
                continue
            rv = dba.ins_omim_ps(init)
            if not rv:
                dba_err_ct += 1
                continue
            ps_ct += 1
            pbar.update(ct)
    pbar.finish()
    print("{} lines processed".format(ct))
    print("  Skipped {} commented lines.".format(skip_ct))
    print("Loaded {} new omim_ps rows".format(ps_ct))
    if err_ct > 0:
        print("WARNING: {} parsing errors occurred. See logfile {} for details.".format(err_ct, logfile))
    if dba_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile))

    # Phenotypes
    fname = DOWNLOAD_DIR + GENEMAP_FILE
    line_ct = slmf.wcl(fname)
    if not args['--quiet']:
        print('\nProcessing %d lines from input file %s' % (line_ct, fname))
    pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    with open(fname, 'r') as tsv:
        tsvreader = csv.reader(tsv, delimiter='\t')
        ct = 0
        tmark = {}
        skip_ct = 0
        notfnd_ct = 0
        prov_ct = 0
        dds_ct = 0
        pt_ct = 0
        dba_err_ct = 0
        for row in tsvreader:
            ct += 1
            if row[0].startswith('#'):
                # The file has commented lines
                skip_ct += 1
                continue
            # The fields are:
            # 0 - Sort ???
            # 1 - Month
            # 2 - Day
            # 3 - Year
            # 4 - Cytogenetic location
            # 5 - Gene Symbol(s)
            # 6 - Confidence
            # 7 - Gene Name
            # 8 - MIM Number
            # 9 - Mapping Method
            # 10 - Comments
            # 11 - Phenotypes
            # 12 - Mouse Gene Symbol
            pts = row[11]
            if pts.startswith('?'):
                prov_ct += 1
                continue
            if '(4)' in pts:
                dds_ct += 1
                continue
            trait = "MIM Number: %s" % row[8]
            if row[11]:
                trait += "; Phenotype: %s" % pts
            found = False
            syms = row[5].split(', ')
            logger.info("Checking for OMIM syms: {}".format(syms))
            for sym in syms:
                targets = dba.find_targets({'sym': sym})
                if targets:
                    found = True
                    for t in targets:
                        p = t['components']['protein'][0]
                        logger.info("  Symbol {} found target {}: {}, {}".format(sym, t['id'], p['name'], p['description']))
                        rv = dba.ins_phenotype({'protein_id': p['id'], 'ptype': 'OMIM', 'trait': trait})
                        if not rv:
                            dba_err_ct += 1
                            continue
                        tmark[t['id']] = True
                        pt_ct += 1
            if not found:
                notfnd_ct += 1
                logger.warn("No target found for row {}".format(row))
            pbar.update(ct)
    pbar.finish()
    print("{} lines processed".format(ct))
    print("  Skipped {} commented lines.".format(skip_ct))
    print("  Skipped {} provisional phenotype rows.".format(prov_ct))
    print("  Skipped {} deletion/duplication syndrome rows.".format(dds_ct))
    print("Loaded {} OMIM phenotypes for {} targets".format(pt_ct, len(tmark)))
    if notfnd_ct > 0:
        print("No target found for {} good lines. See logfile {} for details.".format(notfnd_ct, logfile))
    if dba_err_ct > 0:
        print("WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile))