### ATTENTION!!!! PLEASE PIPE THE OUTPUT OF THIS SCRIPT THROUGH sort | uniq !!! ###
### Doing it within python is a waste of resources. Linux does it much faster.  ###


def get_parents(bottom_id, dag, root_id='HP:0000118'):
    """Return ``bottom_id`` together with all of its ancestors in ``dag``.

    The walk follows ``dag.edges[node]`` (an iterable of parent ids) upward
    and does not climb past ``root_id``: the root itself is included in the
    result when reached, but its own parents are not explored.

    Args:
        bottom_id: id of the term to start from; always part of the result.
        dag: object exposing ``edges``, a mapping from a node id to its
            parent ids. Nodes missing from ``edges`` have no parents.
        root_id: id at which the upward walk stops.

    Returns:
        A set of node ids containing ``bottom_id`` and every ancestor found.
    """
    ancestors = set()
    pending = [bottom_id]
    while pending:
        node = pending.pop()
        if node in ancestors:
            # Already handled via another path; the visited check also
            # keeps the walk finite if the graph contains a cycle.
            continue
        ancestors.add(node)
        if node == root_id:
            # Honour the caller's root at every depth, not just the first.
            continue
        if node in dag.edges:
            pending.extend(dag.edges[node])
    return ancestors


if __name__ == '__main__':
    # Emit "<hpo_id>\t<name>" for every row of hpo_phenotypes.tsv whose term
    # has HP:0000118 among its ancestors (or is that term itself).
    ontology = dutil.read_hpo_dag()
    phenotype_path = '%s/onto/data/hpo_phenotypes.tsv' % APP_HOME
    with open(phenotype_path) as phenotype_file:
        for record in phenotype_file:
            fields = record.strip().split('\t')
            hpo_id, pheno_name = fields[0], fields[1]

            # get_parents returns the term itself along with its ancestors.
            ancestors = get_parents(hpo_id, ontology)
            assert hpo_id in ancestors

            if 'HP:0000118' in ancestors:
                sys.stdout.write('\t'.join((hpo_id, pheno_name)) + '\n')
                sys.stdout.flush()
Ejemplo n.º 2
0
def main(id_file, candidate_file):
  """Measure how well extracted phenotype candidates cover MeSH supervision.

  Args:
    id_file: path to a file with one document ID per line.
    candidate_file: path to a TSV file of ``doc_id<TAB>hpo_id`` rows.

  Progress messages and the recall / annotation statistics are written to
  stderr. A random sample of at most 100 missed (pmid, hpo) pairs is written
  to stdout, one tab-separated row per pair: PubMed URL, HPO URL, then the
  first three columns of that term's row in hpo_phenotypes.tsv.
  """
  # Load list of all pubmed IDs in the dataset
  sys.stderr.write('Loading list of pubmed IDs from doc ID list.\n')
  doi_to_pmid = dutil.read_doi_to_pmid()
  pmids_in_data = set()
  num_docs = 0
  with open(id_file) as f:
    for line in f:
      doc_id = line.strip()
      pmid = dutil.get_pubmed_id_for_doc(doc_id, doi_to_pmid=doi_to_pmid)
      if pmid:
        pmids_in_data.add(pmid)
      num_docs += 1
  sys.stderr.write('%d/%d documents have PubMed IDs.\n' % (
      len(pmids_in_data), num_docs))

  # Load map from Pubmed ID to HPO term via MeSH
  sys.stderr.write('Loading supervision data via MeSH\n')
  mesh_supervision = collections.defaultdict(set)
  with open('%s/onto/data/hpo_to_pmid_via_mesh.tsv' % util.APP_HOME) as f:
    for line in f:
      hpo_id, pmid = line.strip().split('\t')
      if pmid in pmids_in_data:
        mesh_supervision[pmid].add(hpo_id)

  # Identify all true pairs from MeSH
  true_pairs = set()
  for pmid in pmids_in_data:
    for hpo in mesh_supervision[pmid]:
      true_pairs.add((pmid, hpo))

  # Load map from Pubmed ID to HPO term based on extracted candidates
  sys.stderr.write('Loading extracted pheno candidates\n')
  candidates = collections.defaultdict(set)
  with open(candidate_file) as f:
    for line in f:
      doc_id, hpo_id = line.strip().split('\t')
      pmid = dutil.get_pubmed_id_for_doc(doc_id, doi_to_pmid=doi_to_pmid)
      if pmid:
        candidates[pmid].add(hpo_id)

  # Load HPO DAG
  # We say we found a HPO term if we find either the exact HPO term
  # or one of its children
  hpo_dag = dutil.read_hpo_dag()

  # Determine which true pairs had candidate mentions for them
  found_pairs = set()
  missed_pairs = set()
  for pmid, hpo in true_pairs:
    for cand_hpo in candidates[pmid]:
      # '\\N' is the two-character backslash-N marker; written with an
      # explicit escape so the literal means the same on every Python.
      if cand_hpo == '\\N':
        continue
      if hpo_dag.has_child(hpo, cand_hpo):
        found_pairs.add((pmid, hpo))
        break
    else:
      # No candidate matched this supervised term.
      missed_pairs.add((pmid, hpo))

  # Compute recall (reported as nan when there is nothing to recall)
  num_true = len(true_pairs)
  num_found = len(found_pairs)
  recall = float(num_found) / num_true if num_true else float('nan')
  sys.stderr.write('Recall: %d/%d = %g\n' % (num_found, num_true, recall))

  # Compute other statistics
  num_article = len(pmids_in_data)
  num_annotated = sum(1 for x in pmids_in_data if len(mesh_supervision[x]) > 0)
  annotated_frac = (
      float(num_annotated) / num_article if num_article else float('nan'))
  sys.stderr.write('%d/%d = %g pubmed articles had HPO annotation\n' % (
      num_annotated, num_article, annotated_frac))

  # Read in HPO information
  hpo_info_dict = dict()
  with open('%s/onto/data/hpo_phenotypes.tsv' % util.APP_HOME) as f:
    for line in f:
      toks = line.strip('\r\n').split('\t')
      hpo_id = toks[0]
      hpo_info_dict[hpo_id] = toks[0:3]

  # Sample some error cases. random.sample raises ValueError when asked for
  # more items than exist, so cap the request at the number of misses.
  sample_size = min(100, len(missed_pairs))
  missed_sample = random.sample(list(missed_pairs), sample_size)
  for pmid, hpo in missed_sample:
    hpo_info = hpo_info_dict[hpo]
    pubmed_url = 'http://www.ncbi.nlm.nih.gov/pubmed/%s' % pmid
    hpo_url = 'www.human-phenotype-ontology.org/hpoweb/showterm?id=%s' % hpo
    toks = [pubmed_url, hpo_url] + hpo_info
    sys.stdout.write('\t'.join(toks) + '\n')
Ejemplo n.º 3
0
                         ('section_id', 'text'), ('sent_id', 'int'),
                         ('gene_mention_id', 'text'), ('gene_name', 'text'),
                         ('gene_wordidxs', 'int[]'),
                         ('gene_is_correct', 'boolean'),
                         ('pheno_mention_id', 'text'),
                         ('pheno_entity', 'text'), ('pheno_wordidxs', 'int[]'),
                         ('pheno_is_correct', 'boolean'), ('words', 'text[]'),
                         ('lemmas', 'text[]'), ('poses', 'text[]'),
                         ('ners', 'text[]'), ('dep_paths', 'text[]'),
                         ('dep_parents', 'int[]')])

# Output record type: one feature name, keyed by the document, section and
# relation it belongs to.
Feature = collections.namedtuple(
    'Feature', 'doc_id section_id relation_id name')

# HPO ontology DAG, read once when this module is loaded and kept in a
# module-level name.
HPO_DAG = dutil.read_hpo_dag()


def replace_opts(opts, replaceList):
    """Build a new options dict with placeholder substrings substituted.

    Args:
        opts: mapping from option name to a list of strings.
        replaceList: sequence of ``(pattern, subst)`` pairs, applied in
            order with plain ``str.replace`` to every string of every option.

    Returns:
        A new dict with the same keys as ``opts``. For option names ending
        in ``'rgx'`` the substitute text is passed through ``re.escape``
        before being inserted. When ``replaceList`` is empty the original
        value objects are reused unchanged.
    """
    substituted = {}
    for key, values in opts.items():
        needs_escape = key.endswith('rgx')
        for placeholder, replacement in replaceList:
            if needs_escape:
                # The text lands inside a regex, so neutralise metacharacters.
                replacement = re.escape(replacement)
            values = [v.replace(placeholder, replacement) for v in values]
        substituted[key] = values
    return substituted


# Module-level dict, empty at import time.
# NOTE(review): presumably a memoization cache for code defined later in the
# file; its readers and writers are not visible here - confirm.
CACHE = {}
#! /usr/bin/env python

from data_util import get_hpo_phenos, get_parents, read_hpo_dag, read_hpo_synonyms

if __name__ == "__main__":
  # This script only imports names from data_util at the top, yet it writes
  # to sys.stderr below; import sys here so that path cannot hit a NameError.
  import sys

  hpo_dag = read_hpo_dag()
  # NOTE(review): presumably read_hpo_synonyms(1) yields primary names and the
  # no-argument call yields synonyms; both are indexed by HPO id below -
  # confirm against data_util.
  names = read_hpo_synonyms(1)
  synonyms = read_hpo_synonyms()
  allowed_phenos = set(get_hpo_phenos(hpo_dag))

  # Iterate over a snapshot because ancestors are added to the set as we go.
  for hpo_id in allowed_phenos.copy():
    parent_ids = get_parents(hpo_id, hpo_dag)  # includes the original hpo_id
    assert hpo_id in parent_ids
    if 'HP:0000118' not in parent_ids:
      sys.stderr.write('"{0}": not a phenotypic abnormality\n'.format(hpo_id.strip()))
      continue
    # Every ancestor except the HP:0000118 root itself becomes allowed too.
    parent_ids.remove('HP:0000118')
    for parent_id in parent_ids:
      allowed_phenos.add(parent_id)

  # One row per allowed term: id, names and synonyms, each list joined by |^|.
  for hpo_id in allowed_phenos:
    sys.stdout.write("%s\t%s\t%s\n" % (
        hpo_id, '|^|'.join(names[hpo_id]), '|^|'.join(synonyms[hpo_id])))