def parse_do(f):
  do = {}
  with open(f, 'r') as fh:
    do_parser = obo.Parser(fh)
    for stanza in do_parser:
      do[stanza.tags['id'][0].value] = stanza.tags
  return do
def parse_do(args, dofile):
  if not args['--quiet']:
    print(f"\nParsing Disease Ontology file {dofile}")
  do_parser = obo.Parser(dofile)
  do = {}
  for stanza in do_parser:
    do[stanza.tags['id'][0].value] = stanza.tags
  if not args['--quiet']:
    print("  Got {} Disease Ontology terms".format(len(do)))
  return do
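# A minimal usage sketch for parse_do() above (the args dict shape and the
# 'doid.obo' filename are illustrative, not part of this module):
#   do = parse_do({'--quiet': True}, 'doid.obo')
#   do['DOID:4']['name'][0].value  # -> 'disease' (the DO root term)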
def parse_uberon(args, fn):
  if not args['--quiet']:
    print(f"Parsing Uberon Ontology file {fn}")
  uber_parser = obo.Parser(fn)
  raw_uber = {}
  for stanza in uber_parser:
    if stanza.name != 'Term':
      continue
    raw_uber[stanza.tags['id'][0].value] = stanza.tags
  uberd = {}
  for uid, ud in raw_uber.items():
    if 'is_obsolete' in ud:
      continue
    if 'name' not in ud:
      continue
    init = {'uid': uid, 'name': ud['name'][0].value}
    if 'def' in ud:
      init['def'] = ud['def'][0].value
    if 'comment' in ud:
      init['comment'] = ud['comment'][0].value
    if 'is_a' in ud:
      init['parents'] = []
      for parent in ud['is_a']:
        # some parent values have a source ie. 'UBERON:0010134 {source="MA"}'
        # get rid of this for now
        cp = parent.value.split(' ')[0]
        init['parents'].append(cp)
    if 'xref' in ud:
      init['xrefs'] = []
      for xref in ud['xref']:
        if xref.value.startswith('http') or xref.value.startswith('url'):
          continue
        if len(xref.value.split(' ')) == 1:
          (db, val) = xref.value.split(':')
          if db.endswith('_RETIRED'):
            continue
          init['xrefs'].append({'db': db, 'value': val})
        else:
          (dbval, src) = xref.value.split(' ', 1)
          (db, val) = dbval.split(':')
          if db.endswith('_RETIRED'):
            continue
          init['xrefs'].append({'db': db, 'value': val, 'source': src})
    uberd[uid] = init
  if not args['--quiet']:
    print("  Got {} Uberon Ontology terms".format(len(uberd)))
  return uberd
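# Illustrative shape of one entry in the dict parse_uberon() returns (the IDs
# and values here are hypothetical examples, not taken from a real run):
#   uberd['UBERON:0000955'] == {
#       'uid': 'UBERON:0000955', 'name': 'brain', 'def': '...',
#       'parents': ['UBERON:0000062'],
#       'xrefs': [{'db': 'MA', 'value': '0000168'}]}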
def parse_rdo_obo(args, fn):
  if not args['--quiet']:
    print("\nParsing RGD Disease Ontology file {}".format(fn))
  rdo_parser = obo.Parser(open(fn))
  raw_rdo = {}
  for stanza in rdo_parser:
    if stanza.name != 'Term':
      continue
    raw_rdo[stanza.tags['id'][0].value] = stanza.tags
  rdod = {}
  for doid, d in raw_rdo.items():
    if not doid.startswith('DOID:'):
      continue
    if 'is_obsolete' in d:
      continue
    init = {'doid': doid, 'name': d['name'][0].value}
    if 'def' in d:
      init['def'] = d['def'][0].value
    # if 'is_a' in d:
    #   init['parents'] = []
    #   for parent in d['is_a']:
    #     init['parents'].append(parent.value)
    if 'alt_id' in d:
      init['xrefs'] = []
      for aid in d['alt_id']:
        if aid.value.startswith('http'):
          continue
        try:
          (db, val) = aid.value.split(':')
        except ValueError:
          # malformed alt_id; skip rather than reuse a stale (db, val)
          continue
        init['xrefs'].append({'db': db, 'value': val})
    if 'xref' in d:
      if 'xrefs' not in init:
        init['xrefs'] = []
      for xref in d['xref']:
        if xref.value.startswith('http'):
          continue
        try:
          (db, val) = xref.value.split(':')
        except ValueError:
          continue
        init['xrefs'].append({'db': db, 'value': val})
    rdod[doid] = init
  if not args['--quiet']:
    print("Got {} RGD Disease Ontology terms".format(len(rdod)))
  return rdod
def parse_uberon_obo(args, fn):
  if not args['--quiet']:
    print("Parsing Uberon Ontology file {}".format(fn))
  uber_parser = obo.Parser(open(fn))
  raw_uber = {}
  for stanza in uber_parser:
    if stanza.name != 'Term':
      continue
    raw_uber[stanza.tags['id'][0].value] = stanza.tags
  uberd = {}
  for uid, ud in raw_uber.items():
    if 'is_obsolete' in ud:
      continue
    if 'name' not in ud:
      continue
    init = {'uid': uid, 'name': ud['name'][0].value}
    if 'def' in ud:
      init['def'] = ud['def'][0].value
    if 'comment' in ud:
      init['comment'] = ud['comment'][0].value
    if 'is_a' in ud:
      init['parents'] = []
      for parent in ud['is_a']:
        # some parent values have a source ie. 'UBERON:0010134 {source="MA"}'
        # get rid of this for now
        cp = parent.value.split(' ')[0]
        init['parents'].append(cp)
    if 'xref' in ud:
      init['xrefs'] = []
      for xref in ud['xref']:
        if xref.value.startswith('http'):
          continue
        try:
          (db, val) = xref.value.split(':')
        except ValueError:
          # malformed xref; skip rather than reuse a stale (db, val)
          continue
        if not db.isupper():
          # there are all kinds of xrefs like 'xref: Wolffian:duct'; skip these
          continue
        if db.endswith('_RETIRED'):
          continue
        init['xrefs'].append({'db': db, 'value': val})
    uberd[uid] = init
  if not args['--quiet']:
    print("  Got {} Uberon Ontology terms".format(len(uberd)))
  return uberd
def parse_mondo(args, fn):
  if not args['--quiet']:
    print(f"Parsing Mondo file {fn}")
  mondo_parser = obo.Parser(fn)
  raw_mondo = {}
  for stanza in mondo_parser:
    if stanza.name != 'Term':
      continue
    raw_mondo[stanza.tags['id'][0].value] = stanza.tags
  mondod = {}
  for mondoid, md in raw_mondo.items():
    if 'is_obsolete' in md:
      continue
    if 'name' not in md:
      continue
    init = {'mondoid': mondoid, 'name': md['name'][0].value}
    if 'def' in md:
      init['def'] = md['def'][0].value
    if 'comment' in md:
      init['comment'] = md['comment'][0].value
    if 'is_a' in md:
      init['parents'] = []
      for parent in md['is_a']:
        # for now, just ignore parent source infos, if any.
        cp = parent.value.split(' ')[0]
        init['parents'].append(cp)
    if 'xref' in md:
      init['xrefs'] = []
      for xref in md['xref']:
        if xref.value.startswith('http') or xref.value.startswith('url'):
          continue
        if len(xref.value.split(' ')) == 1:
          (db, val) = xref.value.split(':')
          init['xrefs'].append({'db': db, 'value': val})
        else:
          (dbval, src) = xref.value.split(' ', 1)
          (db, val) = dbval.split(':')
          init['xrefs'].append({'db': db, 'value': val, 'source': src})
    mondod[mondoid] = init
  if not args['--quiet']:
    print("  Got {} Mondo terms".format(len(mondod)))
  return mondod
def mk_eco_map():
  print("\nParsing Evidence Ontology file {}".format(ECO_OBO_FILE))
  parser = obo.Parser(ECO_OBO_FILE)
  eco = {}
  for stanza in parser:
    eco[stanza.tags['id'][0].value] = stanza.tags
  regex = re.compile(r'GOECO:([A-Z]{2,3})')
  eco_map = {}
  for e, d in eco.items():
    if not e.startswith('ECO:'):
      continue
    if 'xref' in d:
      for x in d['xref']:
        m = regex.match(x.value)
        if m:
          eco_map[e] = m.group(1)
  return eco_map
def mk_eco_map(args):
  """
  Return a mapping of Evidence Ontology ECO IDs to GO Evidence Codes.
  """
  fn = ECO_DOWNLOAD_DIR + ECO_OBO
  if not args['--quiet']:
    print(f"\nParsing Evidence Ontology file {fn}")
  eco = {}
  eco_map = {}
  parser = obo.Parser(fn)
  for stanza in parser:
    eco[stanza.tags['id'][0].value] = stanza.tags
  regex = re.compile(r'GOECO:([A-Z]{2,3})')
  for e, d in eco.items():
    if not e.startswith('ECO:'):
      continue
    if 'xref' in d:
      for x in d['xref']:
        m = regex.match(x.value)
        if m:
          eco_map[e] = m.group(1)
  return eco_map
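# Usage sketch for mk_eco_map(): translate an ECO ID from an annotation record
# into its GO evidence code. The specific pair shown is illustrative:
#   eco_map = mk_eco_map(args)
#   eco_map.get('ECO:0000314')  # e.g. 'IDA' (inferred from direct assay)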
def parse_do(args, fn):
  if not args['--quiet']:
    print(f"Parsing Disease Ontology file {fn}")
  do_parser = obo.Parser(fn)
  raw_do = {}
  for stanza in do_parser:
    if stanza.name != 'Term':
      continue
    raw_do[stanza.tags['id'][0].value] = stanza.tags
  dod = {}
  for doid, d in raw_do.items():
    if not doid.startswith('DOID:'):
      continue
    if 'is_obsolete' in d:
      continue
    init = {'doid': doid, 'name': d['name'][0].value}
    if 'def' in d:
      init['def'] = d['def'][0].value
    if 'is_a' in d:
      init['parents'] = []
      for parent in d['is_a']:
        init['parents'].append(parent.value)
    if 'xref' in d:
      init['xrefs'] = []
      for xref in d['xref']:
        if xref.value.startswith('http'):
          continue
        try:
          (db, val) = xref.value.split(':')
        except ValueError:
          # malformed xref; skip rather than reuse a stale (db, val)
          continue
        init['xrefs'].append({'db': db, 'value': val})
    dod[doid] = init
  if not args['--quiet']:
    print("  Got {} Disease Ontology terms".format(len(dod)))
  return dod
def parse_mpo(args, fn):
  if not args['--quiet']:
    print(f"Parsing Mammalian Phenotype Ontology file {fn}")
  mpo_parser = obo.Parser(open(fn))
  raw_mpo = {}
  for stanza in mpo_parser:
    if stanza.name != 'Term':
      continue
    raw_mpo[stanza.tags['id'][0].value] = stanza.tags
  mpod = {}
  for mpoid, d in raw_mpo.items():
    #if not mpoid.startswith('MP:'):
    #  continue
    if 'is_obsolete' in d:
      continue
    init = {'mpoid': mpoid, 'name': d['name'][0].value}
    if 'def' in d:
      init['def'] = d['def'][0].value
    if 'is_a' in d:
      init['parents'] = []
      for parent in d['is_a']:
        init['parents'].append(parent.value)
    if 'xref' in d:
      init['xrefs'] = []
      for xref in d['xref']:
        if xref.value.startswith('http'):
          continue
        try:
          (db, val) = xref.value.split(':')
        except ValueError:
          continue
        init['xrefs'].append({'db': db, 'value': val})
    mpod[mpoid] = init
  if not args['--quiet']:
    print("  Got {} Mammalian Phenotype Ontology terms".format(len(mpod)))
  return mpod
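# Quick sanity-check sketch for parse_mpo() (the filename is illustrative;
# MP:0000001 is the ontology's root term):
#   mpod = parse_mpo({'--quiet': True}, 'mp.obo')
#   mpod['MP:0000001']['name']  # -> 'mammalian phenotype'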
def tinx(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print("\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

  # The results of parsing the input mentions files will be the following dictionaries:
  pid2pmids = {}  # 'TCRD.protein.id,UniProt' => set of all PMIDs that mention the protein
                  # Including the UniProt accession in the key is just for convenience when
                  # checking the output. It is not used for anything.
  doid2pmids = {}  # DOID => set of all PMIDs that mention the disease
  pmid_disease_ct = {}  # PMID => count of diseases mentioned in a given paper
  pmid_protein_ct = {}  # PMID => count of proteins mentioned in a given paper

  # First parse the Disease Ontology OBO file to get DO names and defs
  dofile = DO_DOWNLOAD_DIR + DO_OBO
  print("\nParsing Disease Ontology file {}".format(dofile))
  do_parser = obo.Parser(open(dofile))
  do = {}
  for stanza in do_parser:
    do[stanza.tags['id'][0].value] = stanza.tags
  print("  Got {} Disease Ontology terms".format(len(do)))

  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]
  fn = JL_DOWNLOAD_DIR + PROTEIN_FILE
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print("\nProcessing {} lines in protein file {}".format(line_ct, fn))
  with open(fn, 'r') as tsvf:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    skip_ct = 0
    notfnd = set()
    for line in tsvf:
      ct += 1
      pbar.update(ct)
      if not line.startswith('ENSP'):
        skip_ct += 1
        continue
      data = line.rstrip().split('\t')
      ensp = data[0]
      pmids = set([int(pmid) for pmid in data[1].split()])
      targets = dba.find_targets({'stringid': ensp})
      if not targets:
        # if we don't find a target by stringid, which is the more reliable and
        # preferred way, try by Ensembl xref
        targets = dba.find_targets_by_xref({'xtype': 'Ensembl', 'value': ensp})
      if not targets:
        notfnd.add(ensp)
        continue
      for t in targets:
        p = t['components']['protein'][0]
        k = "%s,%s" % (p['id'], p['uniprot'])
        if k in pid2pmids:
          pid2pmids[k] = pid2pmids[k].union(pmids)
        else:
          pid2pmids[k] = set(pmids)
        for pmid in pmids:
          if pmid in pmid_protein_ct:
            pmid_protein_ct[pmid] += 1.0
          else:
            pmid_protein_ct[pmid] = 1.0
    pbar.finish()
  for ensp in notfnd:
    logger.warning("No target found for {}".format(ensp))
  print("{} lines processed.".format(ct))
  print("  Skipped {} non-ENSP lines".format(skip_ct))
  print("  Saved {} protein to PMIDs mappings".format(len(pid2pmids)))
  print("  Saved {} PMID to protein count mappings".format(len(pmid_protein_ct)))
  if notfnd:
    print("  No target found for {} ENSPs. See logfile {} for details.".format(len(notfnd), logfile))

  fn = JL_DOWNLOAD_DIR + DISEASE_FILE
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print("\nProcessing {} lines in file {}".format(line_ct, fn))
  with open(fn, 'r') as tsvf:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    ct = 0
    skip_ct = 0
    notfnd = set()
    for line in tsvf:
      ct += 1
      pbar.update(ct)
      if not line.startswith('DOID:'):
        skip_ct += 1
        continue
      data = line.rstrip().split('\t')
      doid = data[0]
      pmids = set([int(pmid) for pmid in data[1].split()])
      if doid not in do:
        logger.warning("%s not found in DO" % doid)
        notfnd.add(doid)
        continue
      if doid in doid2pmids:
        doid2pmids[doid] = doid2pmids[doid].union(pmids)
      else:
        doid2pmids[doid] = set(pmids)
      for pmid in pmids:
        if pmid in pmid_disease_ct:
          pmid_disease_ct[pmid] += 1.0
        else:
          pmid_disease_ct[pmid] = 1.0
    pbar.finish()
  print("{} lines processed.".format(ct))
  print("  Skipped {} non-DOID lines".format(skip_ct))
  print("  Saved {} DOID to PMIDs mappings".format(len(doid2pmids)))
  print("  Saved {} PMID to disease count mappings".format(len(pmid_disease_ct)))
  if notfnd:
    print("WARNING: No entry found in DO map for {} DOIDs. See logfile {} for details.".format(len(notfnd), logfile))

  if not args['--quiet']:
    print("\nComputing protein novelty scores")
  # To calculate novelty scores, each paper (PMID) is assigned a
  # fractional target (FT) score of one divided by the number of targets
  # mentioned in it. The novelty score of a given protein is one divided
  # by the sum of the FT scores for all the papers mentioning that
  # protein.
  ct = 0
  with open(PROTEIN_NOVELTY_FILE, 'w') as pnovf:
    pnovf.write("Protein ID,UniProt,Novelty\n")
    for k in pid2pmids.keys():
      ct += 1
      ft_score_sum = 0.0
      for pmid in pid2pmids[k]:
        ft_score_sum += 1.0 / pmid_protein_ct[pmid]
      novelty = 1.0 / ft_score_sum
      pnovf.write("%s,%.8f\n" % (k, novelty))
  print("  Wrote {} novelty scores to file {}".format(ct, PROTEIN_NOVELTY_FILE))

  if not args['--quiet']:
    print("\nComputing disease novelty scores")
  # Exactly as for proteins, but using disease mentions
  ct = 0
  with open(DISEASE_NOVELTY_FILE, 'w') as dnovf:
    dnovf.write("DOID,Novelty\n")
    for doid in doid2pmids.keys():
      ct += 1
      ft_score_sum = 0.0
      for pmid in doid2pmids[doid]:
        ft_score_sum += 1.0 / pmid_disease_ct[pmid]
      novelty = 1.0 / ft_score_sum
      dnovf.write("%s,%.8f\n" % (doid, novelty))
  print("  Wrote {} novelty scores to file {}".format(ct, DISEASE_NOVELTY_FILE))

  if not args['--quiet']:
    print("\nComputing importance scores")
  # To calculate importance scores, each paper is assigned a fractional
  # disease-target (FDT) score of one divided by the product of the
  # number of targets mentioned and the number of diseases
  # mentioned. The importance score for a given disease-target pair is
  # the sum of the FDT scores for all papers mentioning that disease and
  # protein.
  ct = 0
  with open(IMPORTANCE_FILE, 'w') as impf:
    impf.write("DOID,Protein ID,UniProt,Score\n")
    for k, ppmids in pid2pmids.items():
      for doid, dpmids in doid2pmids.items():
        pd_pmids = ppmids.intersection(dpmids)
        fdt_score_sum = 0.0
        for pmid in pd_pmids:
          fdt_score_sum += 1.0 / (pmid_protein_ct[pmid] * pmid_disease_ct[pmid])
        if fdt_score_sum > 0:
          ct += 1
          impf.write("%s,%s,%.8f\n" % (doid, k, fdt_score_sum))
  print("  Wrote {} importance scores to file {}".format(ct, IMPORTANCE_FILE))

  if not args['--quiet']:
    print("\nComputing PubMed rankings")
  # PMIDs are ranked for a given disease-target pair based on a score
  # calculated by multiplying the number of targets mentioned and the
  # number of diseases mentioned in that paper. Lower scores have a lower
  # rank (higher priority). If the scores do not discriminate, PMIDs are
  # reverse sorted by value with the assumption that larger PMIDs are
  # newer and of higher priority.
  ct = 0
  with open(PMID_RANKING_FILE, 'w') as pmrf:
    pmrf.write("DOID,Protein ID,UniProt,PubMed ID,Rank\n")
    for k, ppmids in pid2pmids.items():
      for doid, dpmids in doid2pmids.items():
        pd_pmids = ppmids.intersection(dpmids)
        scores = []  # scores are tuples of (PMID, protein_mentions*disease_mentions)
        for pmid in pd_pmids:
          scores.append((pmid, pmid_protein_ct[pmid] * pmid_disease_ct[pmid]))
        if len(scores) > 0:
          scores.sort(key=cmp_to_key(cmp_pmids_scores))
          for i, t in enumerate(scores):
            ct += 1
            pmrf.write("%s,%s,%d,%d\n" % (doid, k, t[0], i))
  print("  Wrote {} PubMed rankings to file {}".format(ct, PMID_RANKING_FILE))
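# cmp_pmids_scores() is defined elsewhere in this codebase; the ranking above
# assumes a comparator like the following sketch: sort ascending by score and,
# on tied scores, put larger (presumed newer) PMIDs first. The name below is
# hypothetical, kept distinct so it cannot shadow the real function.
def _example_cmp_pmids_scores(a, b):
  # a and b are (pmid, protein_mentions * disease_mentions) tuples
  if a[1] == b[1]:
    return b[0] - a[0]  # tied scores: reverse-sort by PMID
  return (a[1] > b[1]) - (a[1] < b[1])  # otherwise ascending by score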
def tinx(args, dba, logger, logfile):
  # The results of parsing the input mentions files will be the following dictionaries:
  pid2pmids = {}  # 'TCRD.protein.id,UniProt' => set of all PMIDs that mention the protein
                  # Including the UniProt accession in the key is just for convenience when
                  # checking the output. It is not used for anything.
  doid2pmids = {}  # DOID => set of all PMIDs that mention the disease
  pmid_disease_ct = {}  # PMID => count of diseases mentioned in a given paper
  pmid_protein_ct = {}  # PMID => count of proteins mentioned in a given paper

  # First parse the Disease Ontology OBO file to get DO names and defs
  dofile = DO_DOWNLOAD_DIR + DO_OBO
  print(f"\nParsing Disease Ontology file {dofile}")
  do_parser = obo.Parser(dofile)
  do = {}
  for stanza in do_parser:
    do[stanza.tags['id'][0].value] = stanza.tags
  print("  Got {} Disease Ontology terms".format(len(do)))

  fn = JL_DOWNLOAD_DIR + PROTEIN_FILE
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print(f"\nProcessing {line_ct} lines in protein file {fn}")
  with open(fn, 'r') as tsvf:
    ct = 0
    skip_ct = 0
    notfnd = set()
    for line in tsvf:
      ct += 1
      slmf.update_progress(ct/line_ct)
      if not line.startswith('ENSP'):
        skip_ct += 1
        continue
      data = line.rstrip().split('\t')
      ensp = data[0]
      pmids = set([int(pmid) for pmid in data[1].split()])
      tids = dba.find_target_ids({'stringid': ensp})
      if not tids:
        # if we don't find a target by stringid, which is the more reliable and
        # preferred way, try by Ensembl xref
        tids = dba.find_target_ids_by_xref({'xtype': 'Ensembl', 'value': ensp})
      if not tids:
        notfnd.add(ensp)
        continue
      for tid in tids:
        t = dba.get_target(tid, annot=False)
        p = t['components']['protein'][0]
        k = "{},{}".format(p['id'], p['uniprot'])
        if k in pid2pmids:
          pid2pmids[k] = pid2pmids[k].union(pmids)
        else:
          pid2pmids[k] = set(pmids)
        for pmid in pmids:
          if pmid in pmid_protein_ct:
            pmid_protein_ct[pmid] += 1.0
          else:
            pmid_protein_ct[pmid] = 1.0
  for ensp in notfnd:
    logger.warning(f"No target found for {ensp}")
  print(f"\n{ct} lines processed.")
  print(f"  Skipped {skip_ct} non-ENSP lines")
  print("  Saved {} protein to PMIDs mappings".format(len(pid2pmids)))
  print("  Saved {} PMID to protein count mappings".format(len(pmid_protein_ct)))
  if notfnd:
    print("  No target found for {} ENSPs. See logfile {} for details.".format(len(notfnd), logfile))

  fn = JL_DOWNLOAD_DIR + DISEASE_FILE
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print(f"\nProcessing {line_ct} lines in file {fn}")
  with open(fn, 'r') as tsvf:
    ct = 0
    skip_ct = 0
    notfnd = set()
    for line in tsvf:
      ct += 1
      slmf.update_progress(ct/line_ct)
      if not line.startswith('DOID:'):
        skip_ct += 1
        continue
      data = line.rstrip().split('\t')
      doid = data[0]
      pmids = set([int(pmid) for pmid in data[1].split()])
      if doid not in do:
        logger.warning(f"{doid} not found in DO")
        notfnd.add(doid)
        continue
      if doid in doid2pmids:
        doid2pmids[doid] = doid2pmids[doid].union(pmids)
      else:
        doid2pmids[doid] = set(pmids)
      for pmid in pmids:
        if pmid in pmid_disease_ct:
          pmid_disease_ct[pmid] += 1.0
        else:
          pmid_disease_ct[pmid] = 1.0
  print(f"\n{ct} lines processed.")
  print(f"  Skipped {skip_ct} non-DOID lines")
  print("  Saved {} DOID to PMIDs mappings".format(len(doid2pmids)))
  print("  Saved {} PMID to disease count mappings".format(len(pmid_disease_ct)))
  if notfnd:
    print("WARNING: No entry found in DO map for {} DOIDs. See logfile {} for details.".format(len(notfnd), logfile))

  if not args['--quiet']:
    print("\nComputing protein novelty scores")
  # To calculate novelty scores, each paper (PMID) is assigned a
  # fractional target (FT) score of one divided by the number of targets
  # mentioned in it. The novelty score of a given protein is one divided
  # by the sum of the FT scores for all the papers mentioning that
  # protein.
  ct = 0
  with open(PROTEIN_NOVELTY_FILE, 'w') as pnovf:
    pnovf.write("Protein ID,UniProt,Novelty\n")
    for k in pid2pmids.keys():
      ct += 1
      ft_score_sum = 0.0
      for pmid in pid2pmids[k]:
        ft_score_sum += 1.0 / pmid_protein_ct[pmid]
      novelty = 1.0 / ft_score_sum
      pnovf.write("%s,%.8f\n" % (k, novelty))
  print(f"  Wrote {ct} novelty scores to file {PROTEIN_NOVELTY_FILE}")

  if not args['--quiet']:
    print("\nComputing disease novelty scores")
  # Exactly as for proteins, but using disease mentions
  ct = 0
  with open(DISEASE_NOVELTY_FILE, 'w') as dnovf:
    dnovf.write("DOID,Novelty\n")
    for doid in doid2pmids.keys():
      ct += 1
      ft_score_sum = 0.0
      for pmid in doid2pmids[doid]:
        ft_score_sum += 1.0 / pmid_disease_ct[pmid]
      novelty = 1.0 / ft_score_sum
      dnovf.write("%s,%.8f\n" % (doid, novelty))
  print(f"  Wrote {ct} novelty scores to file {DISEASE_NOVELTY_FILE}")

  if not args['--quiet']:
    print("\nComputing importance scores")
  # To calculate importance scores, each paper is assigned a fractional
  # disease-target (FDT) score of one divided by the product of the
  # number of targets mentioned and the number of diseases
  # mentioned. The importance score for a given disease-target pair is
  # the sum of the FDT scores for all papers mentioning that disease and
  # protein.
  ct = 0
  with open(IMPORTANCE_FILE, 'w') as impf:
    impf.write("DOID,Protein ID,UniProt,Score\n")
    for k, ppmids in pid2pmids.items():
      for doid, dpmids in doid2pmids.items():
        pd_pmids = ppmids.intersection(dpmids)
        fdt_score_sum = 0.0
        for pmid in pd_pmids:
          fdt_score_sum += 1.0 / (pmid_protein_ct[pmid] * pmid_disease_ct[pmid])
        if fdt_score_sum > 0:
          ct += 1
          impf.write("%s,%s,%.8f\n" % (doid, k, fdt_score_sum))
  print(f"  Wrote {ct} importance scores to file {IMPORTANCE_FILE}")

  if not args['--quiet']:
    print("\nComputing PubMed rankings")
  # PMIDs are ranked for a given disease-target pair based on a score
  # calculated by multiplying the number of targets mentioned and the
  # number of diseases mentioned in that paper. Lower scores have a lower
  # rank (higher priority). If the scores do not discriminate, PMIDs are
  # reverse sorted by value with the assumption that larger PMIDs are
  # newer and of higher priority.
  ct = 0
  with open(PMID_RANKING_FILE, 'w') as pmrf:
    pmrf.write("DOID,Protein ID,UniProt,PubMed ID,Rank\n")
    for k, ppmids in pid2pmids.items():
      for doid, dpmids in doid2pmids.items():
        pd_pmids = ppmids.intersection(dpmids)
        scores = []  # scores are tuples of (PMID, protein_mentions*disease_mentions)
        for pmid in pd_pmids:
          scores.append((pmid, pmid_protein_ct[pmid] * pmid_disease_ct[pmid]))
        if len(scores) > 0:
          scores.sort(key=cmp_to_key(cmp_pmids_scores))
          for i, t in enumerate(scores):
            ct += 1
            pmrf.write("%s,%s,%d,%d\n" % (doid, k, t[0], i))
  print(f"  Wrote {ct} PubMed rankings to file {PMID_RANKING_FILE}")
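# Worked example of the scoring above (numbers illustrative): a paper that
# mentions 2 proteins and 5 diseases contributes an FT score of 1/2 to each
# protein and an FDT score of 1/(2*5) = 0.1 to each of its 10 protein-disease
# pairs. A protein mentioned only in three papers with protein counts 2, 4,
# and 5 has FT sum 1/2 + 1/4 + 1/5 = 0.95, so its novelty is 1/0.95 ~= 1.053.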
print(f"Error deleting JensenLab rows from disease... Exiting.") exit(1) # load new DISEAESES load_DISEASES(args, dba, logger, logfile) # update dataset upds = {'app': PROGRAM, 'app_version': __version__, 'datetime': time.strftime("%Y-%m-%d %H:%M:%S")} rv = upd_dataset_by_name(self, 'Jensen Lab DISEASES', upds): assert rv "Error updating dataset 'Jensen Lab DISEASES'. Exiting." print("\Generating new TIN-X Files...") # parse the Disease Ontology OBO file to get DO names and defs dofile = DO_DOWNLOAD_DIR+DO_OBO if not args['--quiet']: print(f"\nParsing Disease Ontology file {dofile}") do_parser = obo.Parser(dofile) do = {} for stanza in do_parser: do[stanza.tags['id'][0].value] = stanza.tags if not args['--quiet']: print(" Got {} Disease Ontology terms".format(len(do))) tinx_logfile = LOGDIR+'TINX.log' tinx = TINX({'TINX_PROTEIN_FILE': DOWNLOAD_DIR+TINX_PROTEIN_FILE, 'TINX_DISEASE_FILE': DOWNLOAD_DIR+TINX_DISEASE_FILE, 'logfile': tinx_logfile, 'OUTDIR': TINX_OUTDIR}, dba, do) (ct1, ct2) = tinx.parse_protein_mentions() if not args['--quiet']: print(f"Saved {ct1} protein to PMIDs mappings and {ct2} PMID to protein count mappings. See logfile {tinx_logfile} for details.") (ct1, ct2) = tinx.parse_disease_mentions() if not args['--quiet']: print(f"Saved {ct1} disease to PMIDs mappings and {ct2} PMID to disease count mappings. See logfile {tinx_logfile} for details.")
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)
  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print("\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'TIN-X Data', 'source': 'IDG-KMC generated data by Steve Mathias at UNM.', 'app': PROGRAM, 'app_version': __version__, 'comments': 'Data is generated by python/TIN-X.py from mentions files http://download.jensenlab.org/human_textmining_mentions.tsv and http://download.jensenlab.org/disease_textmining_mentions.tsv.'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  provs = [ {'dataset_id': dataset_id, 'table_name': 'tinx_novelty', 'comment': "Protein novelty scores are generated from results of JensenLab textmining of PubMed in the file http://download.jensenlab.org/human_textmining_mentions.tsv. To calculate novelty scores, each paper (PMID) is assigned a fractional target (FT) score of one divided by the number of targets mentioned in it. The novelty score of a given protein is one divided by the sum of the FT scores for all the papers mentioning that protein."},
            {'dataset_id': dataset_id, 'table_name': 'tinx_disease', 'comment': "Disease novelty scores are generated from results of JensenLab textmining of PubMed in the file http://download.jensenlab.org/disease_textmining_mentions.tsv. To calculate novelty scores, each paper (PMID) is assigned a fractional disease (FD) score of one divided by the number of diseases mentioned in it. The novelty score of a given disease is one divided by the sum of the FD scores for all the papers mentioning that disease."},
            {'dataset_id': dataset_id, 'table_name': 'tinx_importance', 'comment': "To calculate importance scores, each paper is assigned a fractional disease-target (FDT) score of one divided by the product of the number of targets mentioned and the number of diseases mentioned. The importance score for a given disease-target pair is the sum of the FDT scores for all papers mentioning that disease and protein."},
            {'dataset_id': dataset_id, 'table_name': 'tinx_articlerank', 'comment': "PMIDs are ranked for a given disease-target pair based on a score calculated by multiplying the number of targets mentioned and the number of diseases mentioned in that paper. Lower scores have a lower rank (higher priority). If the scores do not discriminate, PMIDs are reverse sorted by value with the assumption that larger PMIDs are newer and of higher priority."}]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  # First parse the Disease Ontology OBO file to get DO names and defs
  print("\nParsing Disease Ontology file {}".format(DISEASE_ONTOLOGY_OBO))
  do_parser = obo.Parser(open(DISEASE_ONTOLOGY_OBO))
  do = {}
  for stanza in do_parser:
    do[stanza.tags['id'][0].value] = stanza.tags
  print("  Got {} Disease Ontology terms".format(len(do)))

  pbar_widgets = ['Progress: ', Percentage(), ' ', Bar(marker='#', left='[', right=']'), ' ', ETA()]

  dmap = {}
  line_ct = slmf.wcl(DISEASE_NOVELTY_FILE)
  if not args['--quiet']:
    print("\nProcessing {} lines in file {}".format(line_ct, DISEASE_NOVELTY_FILE))
  with open(DISEASE_NOVELTY_FILE, 'r') as csvfile:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    csvreader = csv.reader(csvfile)
    header = next(csvreader)  # skip header line
    # DOID,Novelty
    ct = 0
    dct = 0
    notfnd = set()
    dba_err_ct = 0
    for row in csvreader:
      ct += 1
      pbar.update(ct)
      doid = row[0]
      if doid in do:
        if 'name' in do[doid]:
          dname = do[doid]['name'][0].value
        else:
          continue
        if 'def' in do[doid]:
          ddef = do[doid]['def'][0].value
        else:
          ddef = None
      else:
        logger.warning("%s not in DO map" % row[0])
        notfnd.add(row[0])
        continue
      rv = dba.ins_tinx_disease( {'doid': doid, 'name': dname, 'summary': ddef, 'score': float(row[1])} )
      if rv:
        dct += 1
        dmap[doid] = rv # map DOID to tinx_disease.id
      else:
        dba_err_ct += 1
    pbar.finish()
  print("{} lines processed.".format(ct))
  print("  Inserted {} new tinx_disease rows".format(dct))
  print("  Saved {} keys in dmap".format(len(dmap)))
  if notfnd:
    print("WARNING: No entry found in DO map for {} DOIDs. See logfile {} for details.".format(len(notfnd), logfile))
  if dba_err_ct > 0:
    print("WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile))

  line_ct = slmf.wcl(PROTEIN_NOVELTY_FILE)
  if not args['--quiet']:
    print("\nProcessing {} lines in file {}".format(line_ct, PROTEIN_NOVELTY_FILE))
  with open(PROTEIN_NOVELTY_FILE, 'r') as csvfile:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    csvreader = csv.reader(csvfile)
    header = next(csvreader)  # skip header line
    # Protein ID,UniProt,Novelty
    ct = 0
    tn_ct = 0
    dba_err_ct = 0
    for row in csvreader:
      ct += 1
      pbar.update(ct)
      pid = row[0]
      rv = dba.ins_tinx_novelty( {'protein_id': pid, 'score': float(row[2])} )
      if rv:
        tn_ct += 1
      else:
        dba_err_ct += 1
    pbar.finish()
  print("{} lines processed.".format(ct))
  print("  Inserted {} new tinx_novelty rows".format(tn_ct))
  if dba_err_ct > 0:
    print("WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile))

  imap = {}
  line_ct = slmf.wcl(IMPORTANCE_FILE)
  if not args['--quiet']:
    print("\nProcessing {} lines in file {}".format(line_ct, IMPORTANCE_FILE))
  with open(IMPORTANCE_FILE, 'r') as csvfile:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    csvreader = csv.reader(csvfile)
    header = next(csvreader)  # skip header line
    # DOID,Protein ID,UniProt,Score
    ct = 0
    ti_ct = 0
    skips1 = set()
    dba_err_ct = 0
    for row in csvreader:
      ct += 1
      pbar.update(ct)
      if row[0] not in dmap:
        logger.error("%s not in dmap" % row[0])
        skips1.add(row[0])
        continue
      did = dmap[row[0]]
      pid = row[1]
      rv = dba.ins_tinx_importance( {'protein_id': pid, 'disease_id': did, 'score': float(row[3])} )
      if rv:
        ti_ct += 1
        # map DOID|PID to tinx_importance.id
        k = "%s|%s" % (row[0], row[1])
        imap[k] = rv
      else:
        dba_err_ct += 1
    pbar.finish()
  print("{} lines processed.".format(ct))
  print("  Inserted {} new tinx_importance rows".format(ti_ct))
  print("  Saved {} keys in imap".format(len(imap)))
  if len(skips1) > 0:
    print("WARNING: No disease found in dmap for {} DOIDs. See logfile {} for details.".format(len(skips1), logfile))
  if dba_err_ct > 0:
    print("WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile))

  line_ct = slmf.wcl(PMID_RANKING_FILE)
  if not args['--quiet']:
    print("\nProcessing {} lines in file {}".format(line_ct, PMID_RANKING_FILE))
  regex = re.compile(r"^DOID:0*")
  with open(PMID_RANKING_FILE, 'r') as csvfile:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
    csvreader = csv.reader(csvfile)
    header = next(csvreader)  # skip header line
    # DOID,Protein ID,UniProt,PubMed ID,Rank
    ct = 0
    tar_ct = 0
    skips = set()
    dba_err_ct = 0
    for row in csvreader:
      ct += 1
      pbar.update(ct)
      k = "%s|%s" % (row[0], row[1])
      if k not in imap:
        logger.warning("%s not in imap" % k)
        skips.add(k)
        continue
      iid = imap[k]
      rv = dba.ins_tinx_articlerank( {'importance_id': iid, 'pmid': row[3], 'rank': row[4]} )
      if rv:
        tar_ct += 1
      else:
        dba_err_ct += 1
    pbar.finish()
  print("{} lines processed.".format(ct))
  print("  Inserted {} new tinx_articlerank rows".format(tar_ct))
  if len(skips) > 0:
    print("WARNING: No importance found in imap for {} keys. See logfile {} for details.".format(len(skips), logfile))
  if dba_err_ct > 0:
    print("WARNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile))
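# Hypothetical driver sketch for load() (the keys mirror the docopt-style
# options this module reads; the values shown are illustrative):
#   args = {'--dbhost': 'localhost', '--dbname': 'tcrd', '--logfile': None,
#           '--loglevel': '20', '--debug': False, '--quiet': False}
#   load(args)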