Example #1
def parse_do(f):
  do = {}
  with open(f, 'r') as fh:
    do_parser = obo.Parser(fh)
    for stanza in do_parser:
      do[stanza.tags['id'][0].value] = stanza.tags
  return do
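A minimal usage sketch for the function above; the filename 'doid.obo' and the DOID lookup are illustrative, and the .value access assumes the same obo parser module used throughout these examples:

do = parse_do('doid.obo')
term = do.get('DOID:4')            # root 'disease' term, if present in the file
if term and 'name' in term:
  print(term['name'][0].value)     # tag values are reached through their .value attribute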
Example #2
def parse_do(args, dofile):
  if not args['--quiet']:
    print(f"\nParsing Disease Ontology file {dofile}")
  do_parser = obo.Parser(dofile)
  do = {}
  for stanza in do_parser:
    do[stanza.tags['id'][0].value] = stanza.tags
  if not args['--quiet']:
    print("  Got {} Disease Ontology terms".format(len(do)))
  return do
Example #3
def parse_uberon(args, fn):
    if not args['--quiet']:
        print(f"Parsing Uberon Ontology file {fn}")
    uber_parser = obo.Parser(fn)
    raw_uber = {}
    for stanza in uber_parser:
        if stanza.name != 'Term':
            continue
        raw_uber[stanza.tags['id'][0].value] = stanza.tags
    uberd = {}
    for uid, ud in raw_uber.items():
        if 'is_obsolete' in ud:
            continue
        if 'name' not in ud:
            continue
        init = {'uid': uid, 'name': ud['name'][0].value}
        if 'def' in ud:
            init['def'] = ud['def'][0].value
        if 'comment' in ud:
            init['comment'] = ud['comment'][0].value
        if 'is_a' in ud:
            init['parents'] = []
            for parent in ud['is_a']:
                # some parent values have a source, e.g. 'UBERON:0010134 {source="MA"}';
                # get rid of this for now
                cp = parent.value.split(' ')[0]
                init['parents'].append(cp)
        if 'xref' in ud:
            init['xrefs'] = []
            for xref in ud['xref']:
                if xref.value.startswith('http') or xref.value.startswith(
                        'url'):
                    continue
                if len(xref.value.split(' ')) == 1:
                    (db, val) = xref.value.split(':')
                    if db.endswith('_RETIRED'):
                        continue
                    init['xrefs'].append({'db': db, 'value': val})
                else:
                    (dbval, src) = xref.value.split(' ', 1)
                    (db, val) = dbval.split(':')
                    if db.endswith('_RETIRED'):
                        continue
                    init['xrefs'].append({
                        'db': db,
                        'value': val,
                        'source': src
                    })
        uberd[uid] = init
    if not args['--quiet']:
        print("  Got {} Uberon Ontology terms".format(len(uberd)))
    return uberd
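For reference, a made-up Uberon-style [Term] stanza (IDs and text are invented) showing the value shapes the code above handles: an is_a line with a trailing {source=...} modifier, and xref values with and without a trailing source annotation:

[Term]
id: UBERON:9999999
name: example structure
def: "An invented definition, for illustration only."
is_a: UBERON:0010134 {source="MA"}
xref: MA:0000004
xref: EMAPA:16105 {source="MA"}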
Example #4
def parse_rdo_obo(args, fn):
    if not args['--quiet']:
        print "\nParsing RGD Disease Ontology file {}".format(fn)
    rdo_parser = obo.Parser(open(fn))
    raw_rdo = {}
    for stanza in rdo_parser:
        if stanza.name != 'Term':
            continue
        raw_rdo[stanza.tags['id'][0].value] = stanza.tags
    rdod = {}
    for doid, d in raw_rdo.items():
        if not doid.startswith('DOID:'):
            continue
        if 'is_obsolete' in d:
            continue
        init = {'doid': doid, 'name': d['name'][0].value}
        if 'def' in d:
            init['def'] = d['def'][0].value
        # if 'is_a' in d:
        #   init['parents'] = []
        #   for parent in d['is_a']:
        #     init['parents'].append(parent.value)
        if 'alt_id' in d:
            init['xrefs'] = []
            for aid in d['alt_id']:
                if aid.value.startswith('http'):
                    continue
                try:
                    (db, val) = aid.value.split(':')
                except ValueError:
                    # skip malformed values rather than reusing stale db/val
                    continue
                init['xrefs'].append({'db': db, 'value': val})
        if 'xref' in d:
            if 'xrefs' not in init:
                init['xrefs'] = []
            for xref in d['xref']:
                if xref.value.startswith('http'):
                    continue
                try:
                    (db, val) = xref.value.split(':')
                except ValueError:
                    # skip malformed xrefs rather than reusing stale db/val
                    continue
                init['xrefs'].append({'db': db, 'value': val})
        rdod[doid] = init
    if not args['--quiet']:
        print "Got {} RGD Disease Ontology terms".format(len(rdod))
    return rdod
Example #5
def parse_uberon_obo(args, fn):
    if not args['--quiet']:
        print "Parsing Uberon Ontology file {}".format(fn)
    uber_parser = obo.Parser(open(fn))
    raw_uber = {}
    for stanza in uber_parser:
        if stanza.name != 'Term':
            continue
        raw_uber[stanza.tags['id'][0].value] = stanza.tags
    uberd = {}
    for uid, ud in raw_uber.items():
        if 'is_obsolete' in ud:
            continue
        if 'name' not in ud:
            continue
        init = {'uid': uid, 'name': ud['name'][0].value}
        if 'def' in ud:
            init['def'] = ud['def'][0].value
        if 'comment' in ud:
            init['comment'] = ud['comment'][0].value
        if 'is_a' in ud:
            init['parents'] = []
            for parent in ud['is_a']:
                # some parent values have a source, e.g. 'UBERON:0010134 {source="MA"}';
                # get rid of this for now
                cp = parent.value.split(' ')[0]
                init['parents'].append(cp)
        if 'xref' in ud:
            init['xrefs'] = []
            for xref in ud['xref']:
                if xref.value.startswith('http'):
                    continue
                try:
                    (db, val) = xref.value.split(':')
                except ValueError:
                    # skip malformed xrefs rather than reusing stale db/val
                    continue
                if not db.isupper():
                    # there are all kinds of xrefs like xref: Wolffian:duct
                    # skip these
                    continue
                if db.endswith('_RETIRED'):
                    continue
                init['xrefs'].append({'db': db, 'value': val})
        uberd[uid] = init
    if not args['--quiet']:
        print "  Got {} Uberon Ontology terms".format(len(uberd))
    return uberd
Example #6
def parse_mondo(args, fn):
    if not args['--quiet']:
        print(f"Parsing Mondo file {fn}")
    mondo_parser = obo.Parser(fn)
    raw_mondo = {}
    for stanza in mondo_parser:
        if stanza.name != 'Term':
            continue
        raw_mondo[stanza.tags['id'][0].value] = stanza.tags
    mondod = {}
    for mondoid, md in raw_mondo.items():
        if 'is_obsolete' in md:
            continue
        if 'name' not in md:
            continue
        init = {'mondoid': mondoid, 'name': md['name'][0].value}
        if 'def' in md:
            init['def'] = md['def'][0].value
        if 'comment' in md:
            init['comment'] = md['comment'][0].value
        if 'is_a' in md:
            init['parents'] = []
            for parent in md['is_a']:
                # for now, just ignore parent source infos, if any.
                cp = parent.value.split(' ')[0]
                init['parents'].append(cp)
        if 'xref' in md:
            init['xrefs'] = []
            for xref in md['xref']:
                if xref.value.startswith('http') or xref.value.startswith(
                        'url'):
                    continue
                if len(xref.value.split(' ')) == 1:
                    (db, val) = xref.value.split(':')
                    init['xrefs'].append({'db': db, 'value': val})
                else:
                    (dbval, src) = xref.value.split(' ', 1)
                    (db, val) = dbval.split(':')
                    init['xrefs'].append({
                        'db': db,
                        'value': val,
                        'source': src
                    })
        mondod[mondoid] = init
    if not args['--quiet']:
        print("  Got {} Mondo terms".format(len(mondod)))
    return mondod
Example #7
def mk_eco_map():
  print "\nParsing Evidence Ontology file {}".format(ECO_OBO_FILE)
  parser = obo.Parser(ECO_OBO_FILE)
  eco = {}
  for stanza in parser:
    eco[stanza.tags['id'][0].value] = stanza.tags
  regex = re.compile(r'GOECO:([A-Z]{2,3})')
  eco_map = {}
  for e,d in eco.items():
    if not e.startswith('ECO:'):
      continue
    if 'xref' in d:
      for x in d['xref']:
        m = regex.match(x.value)
        if m:
          eco_map[e] = m.group(1)
  return eco_map
Example #8
def mk_eco_map(args):
    """
  Return a mapping of Evidence Ontology ECO IDs to Go Evidence Codes.
  """
    fn = ECO_DOWNLOAD_DIR + ECO_OBO
    if not args['--quiet']:
        print(f"\nParsing Evidence Ontology file {fn}")
    eco = {}
    eco_map = {}
    parser = obo.Parser(fn)
    for stanza in parser:
        eco[stanza.tags['id'][0].value] = stanza.tags
    regex = re.compile(r'GOECO:([A-Z]{2,3})')
    for e, d in eco.items():
        if not e.startswith('ECO:'):
            continue
        if 'xref' in d:
            for x in d['xref']:
                m = regex.match(x.value)
                if m:
                    eco_map[e] = m.group(1)
    return eco_map
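A minimal usage sketch; the ECO ID below is illustrative, and the resulting code depends on the GOECO xrefs actually present in the OBO file:

eco_map = mk_eco_map(args)
goec = eco_map.get('ECO:0000250')   # e.g. 'ISS', if the file carries a GOECO:ISS xref for this term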
Example #9
def parse_do(args, fn):
    if not args['--quiet']:
        print(f"Parsing Disease Ontology file {fn}")
    do_parser = obo.Parser(fn)
    raw_do = {}
    for stanza in do_parser:
        if stanza.name != 'Term':
            continue
        raw_do[stanza.tags['id'][0].value] = stanza.tags
    dod = {}
    for doid, d in raw_do.items():
        if not doid.startswith('DOID:'):
            continue
        if 'is_obsolete' in d:
            continue
        init = {'doid': doid, 'name': d['name'][0].value}
        if 'def' in d:
            init['def'] = d['def'][0].value
        if 'is_a' in d:
            init['parents'] = []
            for parent in d['is_a']:
                init['parents'].append(parent.value)
        if 'xref' in d:
            init['xrefs'] = []
            for xref in d['xref']:
                if xref.value.startswith('http'):
                    continue
                try:
                    (db, val) = xref.value.split(':')
                except ValueError:
                    # skip malformed xrefs rather than reusing stale db/val
                    continue
                init['xrefs'].append({'db': db, 'value': val})
        dod[doid] = init
    if not args['--quiet']:
        print("  Got {} Disease Ontology terms".format(len(dod)))
    return dod
Example #10
def parse_mpo(args, fn):
    if not args['--quiet']:
        print(f"Parsing Mammalian Phenotype Ontology file {fn}")
    mpo_parser = obo.Parser(open(fn))
    raw_mpo = {}
    for stanza in mpo_parser:
        if stanza.name != 'Term':
            continue
        raw_mpo[stanza.tags['id'][0].value] = stanza.tags
    mpod = {}
    for mpoid, d in raw_mpo.items():
        #if not mpoid.startswith('MPOID:'):
        #  continue
        if 'is_obsolete' in d:
            continue
        init = {'mpoid': mpoid, 'name': d['name'][0].value}
        if 'def' in d:
            init['def'] = d['def'][0].value
        if 'is_a' in d:
            init['parents'] = []
            for parent in d['is_a']:
                init['parents'].append(parent.value)
        if 'xref' in d:
            init['xrefs'] = []
            for xref in d['xref']:
                if xref.value.startswith('http'):
                    continue
                try:
                    (db, val) = xref.value.split(':')
                except ValueError:
                    # skip malformed xrefs rather than reusing stale db/val
                    continue
                init['xrefs'].append({'db': db, 'value': val})
        mpod[mpoid] = init
    if not args['--quiet']:
        print("  Got {} Mammalian Phenotype Ontology terms".format(len(mpod)))
    return mpod
Example #11
def tinx(args):
    loglevel = int(args['--loglevel'])
    if args['--logfile']:
        logfile = args['--logfile']
    else:
        logfile = LOGFILE
    logger = logging.getLogger(__name__)
    logger.setLevel(loglevel)
    if not args['--debug']:
        logger.propagate = False  # turns off console logging
    fh = logging.FileHandler(logfile)
    fmtr = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S')
    fh.setFormatter(fmtr)
    logger.addHandler(fh)

    dba_params = {
        'dbhost': args['--dbhost'],
        'dbname': args['--dbname'],
        'logger_name': __name__
    }
    dba = DBAdaptor(dba_params)
    dbi = dba.get_dbinfo()
    logger.info(
        "Connected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
    if not args['--quiet']:
        print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(
            args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

    # The results of parsing the input mentions files will be the following dictionaries:
    pid2pmids = {}  # 'TCRD.protein.id,UniProt' => set of all PMIDs that mention the protein
    # Including the UniProt accession in the key is just for convenience when
    # checking the output. It is not used for anything.
    doid2pmids = {}  # DOID => set of all PMIDs that mention the disease
    pmid_disease_ct = {}  # PMID => count of diseases mentioned in a given paper
    pmid_protein_ct = {}  # PMID => count of proteins mentioned in a given paper

    # First parse the Disease Ontology OBO file to get DO names and defs
    dofile = DO_DOWNLOAD_DIR + DO_OBO
    print "\nParsing Disease Ontology file {}".format(dofile)
    do_parser = obo.Parser(open(dofile))
    do = {}
    for stanza in do_parser:
        do[stanza.tags['id'][0].value] = stanza.tags
    print "  Got {} Disease Ontology terms".format(len(do))

    pbar_widgets = [
        'Progress: ',
        Percentage(), ' ',
        Bar(marker='#', left='[', right=']'), ' ',
        ETA()
    ]

    fn = JL_DOWNLOAD_DIR + PROTEIN_FILE
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print "\nProcessing {} lines in protein file {}".format(line_ct, fn)
    with open(fn, 'rU') as tsvf:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        ct = 0
        skip_ct = 0
        notfnd = set()
        for line in tsvf:
            ct += 1
            pbar.update(ct)
            if not line.startswith('ENSP'):
                skip_ct += 1
                continue
            data = line.rstrip().split('\t')
            ensp = data[0]
            pmids = set([int(pmid) for pmid in data[1].split()])
            targets = dba.find_targets({'stringid': ensp})
            if not targets:
                # if we don't find a target by stringid, which is the more reliable and
                # preferred way, try by Ensembl xref
                targets = dba.find_targets_by_xref({
                    'xtype': 'Ensembl',
                    'value': ensp
                })
            if not targets:
                notfnd.add(ensp)
                continue
            for t in targets:
                p = t['components']['protein'][0]
                k = "%s,%s" % (p['id'], p['uniprot'])
                if k in pid2pmids:
                    pid2pmids[k] = pid2pmids[k].union(pmids)
                else:
                    pid2pmids[k] = set(pmids)
                for pmid in pmids:
                    if pmid in pmid_protein_ct:
                        pmid_protein_ct[pmid] += 1.0
                    else:
                        pmid_protein_ct[pmid] = 1.0
    pbar.finish()
    for ensp in notfnd:
        logger.warn("No target found for {}".format(ensp))
    print "{} lines processed.".format(ct)
    print "  Skipped {} non-ENSP lines".format(skip_ct)
    print "  Saved {} protein to PMIDs mappings".format(len(pid2pmids))
    print "  Saved {} PMID to protein count mappings".format(
        len(pmid_protein_ct))
    if notfnd:
        print "  No target found for {} ENSPs. See logfile {} for details.".format(
            len(notfnd), logfile)

    fn = JL_DOWNLOAD_DIR + DISEASE_FILE
    line_ct = slmf.wcl(fn)
    if not args['--quiet']:
        print "\nProcessing {} lines in file {}".format(line_ct, fn)
    with open(fn, 'rU') as tsvf:
        pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start()
        ct = 0
        skip_ct = 0
        notfnd = set()
        for line in tsvf:
            ct += 1
            pbar.update(ct)
            if not line.startswith('DOID:'):
                skip_ct += 1
                continue
            data = line.rstrip().split('\t')
            doid = data[0]
            pmids = set([int(pmid) for pmid in data[1].split()])
            if doid not in do:
                logger.warn("%s not found in DO" % doid)
                notfnd.add(doid)
                continue
            if doid in doid2pmids:
                doid2pmids[doid] = doid2pmids[doid].union(pmids)
            else:
                doid2pmids[doid] = set(pmids)
            for pmid in pmids:
                if pmid in pmid_disease_ct:
                    pmid_disease_ct[pmid] += 1.0
                else:
                    pmid_disease_ct[pmid] = 1.0
    pbar.finish()
    print "{} lines processed.".format(ct)
    print "  Skipped {} non-DOID lines".format(skip_ct)
    print "  Saved {} DOID to PMIDs mappings".format(len(doid2pmids))
    print "  Saved {} PMID to disease count mappings".format(
        len(pmid_disease_ct))
    if notfnd:
        print "WARNNING: No entry found in DO map for {} DOIDs. See logfile {} for details.".format(
            len(notfnd), logfile)

    if not args['--quiet']:
        print "\nComputing protein novely scores"
    # To calculate novelty scores, each paper (PMID) is assigned a
    # fractional target (FT) score of one divided by the number of targets
    # mentioned in it. The novelty score of a given protein is one divided
    # by the sum of the FT scores for all the papers mentioning that
    # protein.
    ct = 0
    with open(PROTEIN_NOVELTY_FILE, 'wb') as pnovf:
        pnovf.write("Protein ID,UniProt,Novelty\n")
        for k in pid2pmids.keys():
            ct += 1
            ft_score_sum = 0.0
            for pmid in pid2pmids[k]:
                ft_score_sum += 1.0 / pmid_protein_ct[pmid]
            novelty = 1.0 / ft_score_sum
            pnovf.write("%s,%.8f\n" % (k, novelty))
    print "  Wrote {} novelty scores to file {}".format(
        ct, PROTEIN_NOVELTY_FILE)

    if not args['--quiet']:
        print "\nComputing disease novely scores"
    # Exactly as for proteins, but using disease mentions
    ct = 0
    with open(DISEASE_NOVELTY_FILE, 'wb') as dnovf:
        dnovf.write("DOID,Novelty\n")
        for doid in doid2pmids.keys():
            ct += 1
            ft_score_sum = 0.0
            for pmid in doid2pmids[doid]:
                ft_score_sum += 1.0 / pmid_disease_ct[pmid]
            novelty = 1.0 / ft_score_sum
            dnovf.write("%s,%.8f\n" % (doid, novelty))
    print "  Wrote {} novelty scores to file {}".format(
        ct, DISEASE_NOVELTY_FILE)

    if not args['--quiet']:
        print "\nComputing importance scores"
    # To calculate importance scores, each paper is assigned a fractional
    # disease-target (FDT) score of one divided by the product of the
    # number of targets mentioned and the number of diseases
    # mentioned. The importance score for a given disease-target pair is
    # the sum of the FDT scores for all papers mentioning that disease and
    # protein.
    ct = 0
    with open(IMPORTANCE_FILE, 'wb') as impf:
        impf.write("DOID,Protein ID,UniProt,Score\n")
        for k, ppmids in pid2pmids.items():
            for doid, dpmids in doid2pmids.items():
                pd_pmids = ppmids.intersection(dpmids)
                fdt_score_sum = 0.0
                for pmid in pd_pmids:
                    fdt_score_sum += 1.0 / (pmid_protein_ct[pmid] *
                                            pmid_disease_ct[pmid])
                if fdt_score_sum > 0:
                    ct += 1
                    impf.write("%s,%s,%.8f\n" % (doid, k, fdt_score_sum))
    print "  Wrote {} importance scores to file {}".format(ct, IMPORTANCE_FILE)

    if not args['--quiet']:
        print "\nComputing PubMed rankings"
    # PMIDs are ranked for a given disease-target pair based on a score
    # calculated by multiplying the number of targets mentioned and the
    # number of diseases mentioned in that paper. Lower scores have a lower
    # rank (higher priority). If the scores do not discriminate, PMIDs are
    # reverse sorted by value with the assumption that larger PMIDs are
    # newer and of higher priority.
    ct = 0
    with open(PMID_RANKING_FILE, 'wb') as pmrf:
        pmrf.write("DOID,Protein ID,UniProt,PubMed ID,Rank\n")
        for k, ppmids in pid2pmids.items():
            for doid, dpmids in doid2pmids.items():
                pd_pmids = ppmids.intersection(dpmids)
                scores = []  # scores are tuples of (PMID, protein_mentions*disease_mentions)
                for pmid in pd_pmids:
                    scores.append(
                        (pmid, pmid_protein_ct[pmid] * pmid_disease_ct[pmid]))
                if len(scores) > 0:
                    scores.sort(cmp_pmids_scores)
                    for i, t in enumerate(scores):
                        ct += 1
                        pmrf.write("%s,%s,%d,%d\n" % (doid, k, t[0], i))
    print "  Wrote {} PubMed rankings to file {}".format(ct, PMID_RANKING_FILE)
Example #12
def tinx(args, dba, logger, logfile):
  # The results of parsing the input mentions files will be the following dictionaries:
  pid2pmids = {}  # 'TCRD.protein.id,UniProt' => set of all PMIDs that mention the protein
                  # Including the UniProt accession in the key is just for convenience when
                  # checking the output. It is not used for anything.
  doid2pmids = {} # DOID => set of all PMIDs that mention the disease
  pmid_disease_ct = {} # PMID => count of diseases mentioned in a given paper 
  pmid_protein_ct = {} # PMID => count of proteins mentioned in a given paper 

  # First parse the Disease Ontology OBO file to get DO names and defs
  dofile = DO_DOWNLOAD_DIR + DO_OBO
  print(f"\nParsing Disease Ontology file {dofile}")
  do_parser = obo.Parser(dofile)
  do = {}
  for stanza in do_parser:
    do[stanza.tags['id'][0].value] = stanza.tags
  print("  Got {} Disease Ontology terms".format(len(do)))

  fn = JL_DOWNLOAD_DIR+PROTEIN_FILE
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print(f"\nProcessing {line_ct} lines in protein file {fn}")
  with open(fn, 'r') as tsvf:
    #pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
    ct = 0
    skip_ct = 0
    notfnd = set()
    for line in tsvf:
      ct += 1
      slmf.update_progress(ct/line_ct)
      if not line.startswith('ENSP'):
        skip_ct += 1
        continue
      data = line.rstrip().split('\t')
      ensp = data[0]
      pmids = set([int(pmid) for pmid in data[1].split()])
      tids = dba.find_target_ids({'stringid': ensp})
      if not tids:
        # if we don't find a target by stringid, which is the more reliable and
        # preferred way, try by Ensembl xref
        tids = dba.find_target_ids_by_xref({'xtype': 'Ensembl', 'value': ensp})
      if not tids:
        notfnd.add(ensp)
        continue
      for tid in tids:
        t = dba.get_target(tid, annot=False)
        p = t['components']['protein'][0]
        k = "{},{}".format(p['id'], p['uniprot'])
        if k in pid2pmids:
          pid2pmids[k] = pid2pmids[k].union(pmids)
        else:
          pid2pmids[k] = set(pmids)
        for pmid in pmids:
          if pmid in pmid_protein_ct:
            pmid_protein_ct[pmid] += 1.0
          else:
            pmid_protein_ct[pmid] = 1.0
  for ensp in notfnd:
    logger.warn(f"No target found for {ensp}")
  print(f"\n{ct} lines processed")
  print(f"  Skipped {skip_ct} non-ENSP lines")
  print("  Saved {} protein to PMIDs mappings".format(len(pid2pmids)))
  print("  Saved {} PMID to protein count mappings".format(len(pmid_protein_ct)))
  if notfnd:
    print("  No target found for {} ENSPs. See logfile {} for details.".format(len(notfnd), logfile))

  fn = JL_DOWNLOAD_DIR+DISEASE_FILE
  line_ct = slmf.wcl(fn)
  if not args['--quiet']:
    print(f"\nProcessing {line_ct} lines in file {fn}")
  with open(fn, 'r') as tsvf:
    ct = 0
    skip_ct = 0
    notfnd = set()
    for line in tsvf:
      ct += 1
      slmf.update_progress(ct/line_ct)
      if not line.startswith('DOID:'):
        skip_ct += 1
        continue
      data = line.rstrip().split('\t')
      doid = data[0]
      pmids = set([int(pmid) for pmid in data[1].split()])
      if doid not in do:
        logger.warn(f"{doid} not found in DO")
        notfnd.add(doid)
        continue
      if doid in doid2pmids:
        doid2pmids[doid] = doid2pmids[doid].union(pmids)
      else:
        doid2pmids[doid] = set(pmids)
      for pmid in pmids:
        if pmid in pmid_disease_ct:
          pmid_disease_ct[pmid] += 1.0
        else:
          pmid_disease_ct[pmid] = 1.0
  print(f"\n{ct} lines processed.")
  print(f"  Skipped {skip_ct} non-DOID lines")
  print("  Saved {} DOID to PMIDs mappings".format(len(doid2pmids)))
  print("  Saved {} PMID to disease count mappings".format(len(pmid_disease_ct)))
  if notfnd:
    print("WARNNING: No entry found in DO map for {} DOIDs. See logfile {} for details.".format(len(notfnd), logfile))

  if not args['--quiet']:
    print("\nComputing protein novely scores")
  # To calculate novelty scores, each paper (PMID) is assigned a
  # fractional target (FT) score of one divided by the number of targets
  # mentioned in it. The novelty score of a given protein is one divided
  # by the sum of the FT scores for all the papers mentioning that
  # protein.
  ct = 0
  with open(PROTEIN_NOVELTY_FILE, 'w') as pnovf:
    pnovf.write("Protein ID,UniProt,Novelty\n")
    for k in pid2pmids.keys():
      ct += 1
      ft_score_sum = 0.0
      for pmid in pid2pmids[k]:
        ft_score_sum += 1.0 / pmid_protein_ct[pmid]
      novelty = 1.0 / ft_score_sum
      pnovf.write( "%s,%.8f\n" % (k, novelty) )
  print(f"  Wrote {ct} novelty scores to file {PROTEIN_NOVELTY_FILE}")

  if not args['--quiet']:
    print("\nComputing disease novely scores")
  # Exactly as for proteins, but using disease mentions
  ct = 0
  with open(DISEASE_NOVELTY_FILE, 'w') as dnovf:
    dnovf.write("DOID,Novelty\n")
    for doid in doid2pmids.keys():
      ct += 1
      ft_score_sum = 0.0
      for pmid in doid2pmids[doid]:
        ft_score_sum += 1.0 / pmid_disease_ct[pmid]
      novelty = 1.0 / ft_score_sum
      dnovf.write( "%s,%.8f\n" % (doid, novelty) )
  print(f"  Wrote {ct} novelty scores to file {DISEASE_NOVELTY_FILE}")

  if not args['--quiet']:
    print("\nComputing importance scores")
  # To calculate importance scores, each paper is assigned a fractional
  # disease-target (FDT) score of one divided by the product of the
  # number of targets mentioned and the number of diseases
  # mentioned. The importance score for a given disease-target pair is
  # the sum of the FDT scores for all papers mentioning that disease and
  # protein.
  ct = 0
  with open(IMPORTANCE_FILE, 'w') as impf:
    impf.write("DOID,Protein ID,UniProt,Score\n")
    for k,ppmids in pid2pmids.items():
      for doid,dpmids in doid2pmids.items():
        pd_pmids = ppmids.intersection(dpmids)
        fdt_score_sum = 0.0
        for pmid in pd_pmids:
          fdt_score_sum += 1.0 / ( pmid_protein_ct[pmid] * pmid_disease_ct[pmid] )
        if fdt_score_sum > 0:
          ct += 1
          impf.write( "%s,%s,%.8f\n" % (doid, k, fdt_score_sum) )
  print(f"  Wrote {ct} importance scores to file {IMPORTANCE_FILE}")

  if not args['--quiet']:
    print("\nComputing PubMed rankings")
  # PMIDs are ranked for a given disease-target pair based on a score
  # calculated by multiplying the number of targets mentioned and the
  # number of diseases mentioned in that paper. Lower scores have a lower
  # rank (higher priority). If the scores do not discriminate, PMIDs are
  # reverse sorted by value with the assumption that larger PMIDs are
  # newer and of higher priority.
  ct = 0
  with open(PMID_RANKING_FILE, 'w') as pmrf:
    pmrf.write("DOID,Protein ID,UniProt,PubMed ID,Rank\n")
    for k,ppmids in pid2pmids.items():
      for doid,dpmids in doid2pmids.items():
        pd_pmids = ppmids.intersection(dpmids)
        scores = [] # scores are tuples of (PMID, protein_mentions*disease_mentions)
        for pmid in pd_pmids:
          scores.append( (pmid, pmid_protein_ct[pmid] * pmid_disease_ct[pmid]) )
        if len(scores) > 0:
          scores.sort(key = cmp_to_key(cmp_pmids_scores))
          for i,t in enumerate(scores):
            ct += 1
            pmrf.write( "%s,%s,%d,%d\n" % (doid, k, t[0], i) )
  print(f"  Wrote {ct} PubMed rankings to file {PMID_RANKING_FILE}")
Example #13
    print(f"Error deleting JensenLab rows from disease... Exiting.")
    exit(1)
  # load new DISEASES
  load_DISEASES(args, dba, logger, logfile)
  # update dataset
  upds = {'app': PROGRAM, 'app_version': __version__,
          'datetime': time.strftime("%Y-%m-%d %H:%M:%S")}
  rv = upd_dataset_by_name(self, 'Jensen Lab DISEASES', upds)
  assert rv, "Error updating dataset 'Jensen Lab DISEASES'. Exiting."

  print("\Generating new TIN-X Files...")
  # parse the Disease Ontology OBO file to get DO names and defs
  dofile = DO_DOWNLOAD_DIR+DO_OBO
  if not args['--quiet']:
    print(f"\nParsing Disease Ontology file {dofile}")
  do_parser = obo.Parser(dofile)
  do = {}
  for stanza in do_parser:
    do[stanza.tags['id'][0].value] = stanza.tags
  if not args['--quiet']:
    print("  Got {} Disease Ontology terms".format(len(do)))
  tinx_logfile = LOGDIR+'TINX.log'
  tinx = TINX({'TINX_PROTEIN_FILE': DOWNLOAD_DIR+TINX_PROTEIN_FILE,
               'TINX_DISEASE_FILE': DOWNLOAD_DIR+TINX_DISEASE_FILE,
               'logfile': tinx_logfile, 'OUTDIR': TINX_OUTDIR}, dba, do)
  (ct1, ct2) = tinx.parse_protein_mentions()
  if not args['--quiet']:
    print(f"Saved {ct1} protein to PMIDs mappings and {ct2} PMID to protein count mappings. See logfile {tinx_logfile} for details.")
  (ct1, ct2) = tinx.parse_disease_mentions()
  if not args['--quiet']:
    print(f"Saved {ct1} disease to PMIDs mappings and {ct2} PMID to disease count mappings. See logfile {tinx_logfile} for details.")
Example #14
def load(args):
  loglevel = int(args['--loglevel'])
  if args['--logfile']:
    logfile = args['--logfile']
  else:
    logfile = LOGFILE
  logger = logging.getLogger(__name__)
  logger.setLevel(loglevel)
  if not args['--debug']:
    logger.propagate = False # turns off console logging
  fh = logging.FileHandler(logfile)
  fmtr = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
  fh.setFormatter(fmtr)
  logger.addHandler(fh)

  dba_params = {'dbhost': args['--dbhost'], 'dbname': args['--dbname'], 'logger_name': __name__}
  dba = DBAdaptor(dba_params)
  dbi = dba.get_dbinfo()
  logger.info("Connected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver']))
  if not args['--quiet']:
    print "\nConnected to TCRD database {} (schema ver {}; data ver {})".format(args['--dbname'], dbi['schema_ver'], dbi['data_ver'])

  # Dataset
  dataset_id = dba.ins_dataset( {'name': 'TIN-X Data', 'source': 'IDG-KMC generated data by Steve Mathias at UNM.', 'app': PROGRAM, 'app_version': __version__, 'comments': 'Data is generated by python/TIN-X.py from mentions files http://download.jensenlab.org/human_textmining_mentions.tsv and http://download.jensenlab.org/disease_textmining_mentions.tsv.'} )
  assert dataset_id, "Error inserting dataset. See logfile {} for details.".format(logfile)
  # Provenance
  provs = [ {'dataset_id': dataset_id, 'table_name': 'tinx_novelty', 'comment': "Protein novelty scores are generated from results of JensenLab textmining of PubMed in the file http://download.jensenlab.org/human_textmining_mentions.tsv. To calculate novelty scores, each paper (PMID) is assigned a fractional target (FT) score of one divided by the number of targets mentioned in it. The novelty score of a given protein is one divided by the sum of the FT scores for all the papers mentioning that protein."},
            {'dataset_id': dataset_id, 'table_name': 'tinx_disease', 'comment': "Disease novelty scores are generated from results of JensenLab textmining of PubMed in the file http://download.jensenlab.org/disease_textmining_mentions.tsv. To calculate novelty scores, each paper (PMID) is assigned a fractional disease (FD) score of one divided by the number of diseases mentioned in it. The novelty score of a given disease is one divided by the sum of the FD scores for all the papers mentioning that disease."},
            {'dataset_id': dataset_id, 'table_name': 'tinx_importance', 'comment': "To calculate importance scores, each paper is assigned a fractional disease-target (FDT) score of one divided by the product of the number of targets mentioned and the number of diseases mentioned. The importance score for a given disease-target pair is the sum of the FDT scores for all papers mentioning that disease and protein."},
            {'dataset_id': dataset_id, 'table_name': 'tinx_articlerank', 'comment': "PMIDs are ranked for a given disease-target pair based on a score calculated by multiplying the number of targets mentioned and the number of diseases mentioned in that paper. Lower scores have a lower rank (higher priority). If the scores do not discriminate, PMIDs are reverse sorted by value with the assumption that larger PMIDs are newer and of higher priority."}]
  for prov in provs:
    rv = dba.ins_provenance(prov)
    assert rv, "Error inserting provenance. See logfile {} for details.".format(logfile)

  # First parse the Disease Ontology OBO file to get DO names and defs
  print "\nParsing Disease Ontology file {}".format(DISEASE_ONTOLOGY_OBO)
  do_parser = obo.Parser(open(DISEASE_ONTOLOGY_OBO))
  do = {}
  for stanza in do_parser:
    do[stanza.tags['id'][0].value] = stanza.tags
  print "  Got {} Disease Ontology terms".format(len(do))
  
  pbar_widgets = ['Progress: ',Percentage(),' ',Bar(marker='#',left='[',right=']'),' ',ETA()]

  dmap = {}
  line_ct = slmf.wcl(DISEASE_NOVELTY_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in file {}".format(line_ct, DISEASE_NOVELTY_FILE)
  with open(DISEASE_NOVELTY_FILE, 'rU') as csvfile:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
    csvreader = csv.reader(csvfile)
    header = csvreader.next() # skip header line
    # DOID,Novelty
    ct = 0
    dct = 0
    notfnd = set()
    dba_err_ct = 0
    for row in csvreader:
      ct += 1
      pbar.update(ct)
      doid = row[0]
      if doid in do:
        if 'name' in do[doid]:
          dname = do[doid]['name'][0].value
        else:
          continue
        if 'def' in do[doid]:
          ddef = do[doid]['def'][0].value
        else:
          ddef = None
      else:
        logger.warn("%s not in DO map" % row[0])
        notfnd.add(row[0])
        continue
      rv = dba.ins_tinx_disease( {'doid': doid, 'name': dname, 
                                  'summary': ddef, 'score': float(row[1])} )
      if rv:
        dct += 1
        dmap[doid] = rv # map DOID to tinx_disease.id
      else:
        dba_err_ct += 1
  pbar.finish()
  print "{} lines processed.".format(ct)
  print "  Inserted {} new tinx_disease rows".format(dct)
  print "  Saved {} keys in dmap".format(len(dmap))
  if notfnd:
    print "WARNNING: No entry found in DO map for {} DOIDs. See logfile {} for details.".format(len(notfnd), logfile)
  if dba_err_ct > 0:
    print "WARNNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)
  
  line_ct = slmf.wcl(PROTEIN_NOVELTY_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in file {}".format(line_ct, PROTEIN_NOVELTY_FILE)
  with open(PROTEIN_NOVELTY_FILE, 'rU') as csvfile:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
    csvreader = csv.reader(csvfile)
    header = csvreader.next() # skip header line
    # Protein ID,UniProt,Novelty
    ct = 0
    tn_ct = 0
    dba_err_ct = 0
    for row in csvreader:
      ct += 1
      pbar.update(ct)
      pid = row[0]
      rv = dba.ins_tinx_novelty( {'protein_id': pid, 'score': float(row[2])} )
      if rv:
        tn_ct += 1
      else:
        dba_err_ct += 1
  pbar.finish()
  print "{} lines processed.".format(ct)
  print "  Inserted {} new tinx_novelty rows".format(tn_ct)
  if dba_err_ct > 0:
    print "WARNNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  imap = {}
  line_ct = slmf.wcl(IMPORTANCE_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in file {}".format(line_ct, IMPORTANCE_FILE)
  with open(IMPORTANCE_FILE, 'rU') as csvfile:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
    csvreader = csv.reader(csvfile)
    header = csvreader.next() # skip header line
    # DOID,Protein ID,UniProt,Score
    ct = 0
    ti_ct = 0
    skips1 = set()
    dba_err_ct = 0
    for row in csvreader:
      ct += 1
      pbar.update(ct)
      if row[0] not in dmap:
        logger.error("%s not in dmap" % row[0])
        skips1.add(row[0])
        continue
      did = dmap[row[0]]
      pid = row[1]
      rv = dba.ins_tinx_importance( {'protein_id': pid, 'disease_id': did,
                                     'score': float(row[3])} )
      if rv:
        ti_ct += 1
        # map DOID|PID to tinx_importance.id
        k = "%s|%s"%(row[0],row[1])
        imap[k] = rv 
      else:
        dba_err_ct += 1
  pbar.finish()
  print "{} lines processed.".format(ct)
  print "  Inserted {} new tinx_importance rows".format(ti_ct)
  print "  Saved {} keys in imap".format(len(imap))
  if len(skips1) > 0:
    print "WARNNING: No disease found in dmap for {} DOIDs. See logfile {} for details.".format(len(skips1), logfile)
  if dba_err_ct > 0:
    print "WARNNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)

  line_ct = slmf.wcl(PMID_RANKING_FILE)
  if not args['--quiet']:
    print "\nProcessing {} lines in file {}".format(line_ct, PMID_RANKING_FILE)
  regex = re.compile(r"^DOID:0*")
  with open(PMID_RANKING_FILE, 'rU') as csvfile:
    pbar = ProgressBar(widgets=pbar_widgets, maxval=line_ct).start() 
    csvreader = csv.reader(csvfile)
    header = csvreader.next() # skip header line
    # DOID,Protein ID,UniProt,PubMed ID,Rank
    ct = 0
    tar_ct = 0
    skips = set()
    dba_err_ct = 0
    for row in csvreader:
      ct += 1
      pbar.update(ct)
      k = "%s|%s"%(row[0],row[1])
      if k not in imap:
        logger.warn("%s not in imap" % k)
        skips.add(k)
        continue
      iid = imap[k]
      rv = dba.ins_tinx_articlerank( {'importance_id': iid, 'pmid': row[3], 'rank': row[4]} )
      if rv:
        tar_ct += 1
      else:
        dba_err_ct += 1
  pbar.finish()
  print "{} lines processed.".format(ct)
  print "  Inserted {} new tinx_articlerank rows".format(tar_ct)
  if len(skips) > 0:
    print "WARNNING: No importance found in imap for {} keys. See logfile {} for details.".format(len(skips), logfile)
  if dba_err_ct > 0:
    print "WARNNING: {} DB errors occurred. See logfile {} for details.".format(dba_err_ct, logfile)