#!/usr/bin/env python
### ATTENTION!!!! PLEASE PIPE THE OUTPUT OF THIS SCRIPT THROUGH sort | uniq !!! ###
### Doing it within python is a waste of resources. Linux does it much faster.  ###
import sys

import data_util as dutil
from util import APP_HOME


def get_parents(bottom_id, dag, root_id='HP:0000118'):
    """Return bottom_id plus all of its ancestors in the DAG, up to root_id."""
    if bottom_id == root_id:
        return set([bottom_id])
    rv = set()
    if bottom_id in dag.edges:
        for parent in dag.edges[bottom_id]:
            rv |= get_parents(parent, dag, root_id)
    rv.add(bottom_id)
    return rv


if __name__ == '__main__':
    hpo_dag = dutil.read_hpo_dag()
    with open('%s/onto/data/hpo_phenotypes.tsv' % APP_HOME) as f:
        for line in f:
            toks = line.strip().split('\t')
            hpo_id = toks[0]
            pheno_name = toks[1]
            parent_ids = get_parents(hpo_id, hpo_dag)  # includes the original hpo_id
            assert hpo_id in parent_ids
            # Keep only terms that descend from HP:0000118 (phenotypic abnormality)
            if 'HP:0000118' not in parent_ids:
                continue
            sys.stdout.write(hpo_id + '\t' + pheno_name + '\n')
            sys.stdout.flush()
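# --- A minimal sanity check for get_parents (illustrative only) ---
# Assumes only what the code above relies on: a DAG object exposing an
# `edges` dict that maps a child ID to a list of its parent IDs. The class
# and the toy IDs below are hypothetical stand-ins for dutil's real DAG.
class _ToyDag(object):
    def __init__(self, edges):
        self.edges = edges

_dag = _ToyDag({'HP:CHILD': ['HP:MID'], 'HP:MID': ['HP:0000118']})
# The result contains the starting node and every ancestor up to the root.
assert get_parents('HP:CHILD', _dag) == \
    set(['HP:CHILD', 'HP:MID', 'HP:0000118'])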
#!/usr/bin/env python
import collections
import random
import sys

import data_util as dutil
import util


def main(id_file, candidate_file):
    # Load list of all pubmed IDs in the dataset
    print >> sys.stderr, 'Loading list of pubmed IDs from doc ID list.'
    doi_to_pmid = dutil.read_doi_to_pmid()
    pmids_in_data = set()
    num_docs = 0
    with open(id_file) as f:
        for line in f:
            doc_id = line.strip()
            pmid = dutil.get_pubmed_id_for_doc(doc_id, doi_to_pmid=doi_to_pmid)
            if pmid:
                pmids_in_data.add(pmid)
            num_docs += 1
    print >> sys.stderr, '%d/%d documents have PubMed IDs.' % (
        len(pmids_in_data), num_docs)

    # Load map from Pubmed ID to HPO term via MeSH
    print >> sys.stderr, 'Loading supervision data via MeSH'
    mesh_supervision = collections.defaultdict(set)
    with open('%s/onto/data/hpo_to_pmid_via_mesh.tsv' % util.APP_HOME) as f:
        for line in f:
            hpo_id, pmid = line.strip().split('\t')
            if pmid in pmids_in_data:
                mesh_supervision[pmid].add(hpo_id)

    # Identify all true pairs from MeSH
    true_pairs = set()
    for pmid in pmids_in_data:
        for hpo in mesh_supervision[pmid]:
            true_pairs.add((pmid, hpo))

    # Load map from Pubmed ID to HPO term based on extracted candidates
    print >> sys.stderr, 'Loading extracted pheno candidates'
    candidates = collections.defaultdict(set)
    with open(candidate_file) as f:
        for line in f:
            doc_id, hpo_id = line.strip().split('\t')
            pmid = dutil.get_pubmed_id_for_doc(doc_id, doi_to_pmid=doi_to_pmid)
            if pmid:
                candidates[pmid].add(hpo_id)

    # Load HPO DAG
    # We say we found an HPO term if we find either the exact HPO term
    # or one of its children
    hpo_dag = dutil.read_hpo_dag()

    # Determine which true pairs had candidate mentions for them
    found_pairs = set()
    missed_pairs = set()
    for pmid, hpo in true_pairs:
        found_hpo_ids = candidates[pmid]
        for cand_hpo in found_hpo_ids:
            if cand_hpo == r'\N':  # raw string: Postgres-style NULL marker
                continue
            if hpo_dag.has_child(hpo, cand_hpo):
                found_pairs.add((pmid, hpo))
                break
        else:
            missed_pairs.add((pmid, hpo))

    # Compute recall
    num_true = len(true_pairs)
    num_found = len(found_pairs)
    print >> sys.stderr, 'Recall: %d/%d = %g' % (
        num_found, num_true, float(num_found) / num_true)

    # Compute other statistics
    num_article = len(pmids_in_data)
    num_annotated = sum(1 for x in pmids_in_data
                        if len(mesh_supervision[x]) > 0)
    print >> sys.stderr, '%d/%d = %g pubmed articles had HPO annotation' % (
        num_annotated, num_article, float(num_annotated) / num_article)

    # Read in HPO information
    hpo_info_dict = dict()
    with open('%s/onto/data/hpo_phenotypes.tsv' % util.APP_HOME) as f:
        for line in f:
            toks = line.strip('\r\n').split('\t')
            hpo_id = toks[0]
            hpo_info_dict[hpo_id] = toks[0:3]

    # Sample some error cases (up to 100, in case there are fewer misses)
    missed_sample = random.sample(list(missed_pairs),
                                  min(100, len(missed_pairs)))
    for pmid, hpo in missed_sample:
        hpo_info = hpo_info_dict[hpo]
        pubmed_url = 'http://www.ncbi.nlm.nih.gov/pubmed/%s' % pmid
        hpo_url = ('http://www.human-phenotype-ontology.org/hpoweb/'
                   'showterm?id=%s' % hpo)
        toks = [pubmed_url, hpo_url] + hpo_info
        print '\t'.join(toks)
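# Hypothetical entry point, assuming the script is invoked as
#   python <this script> <doc-id file> <pheno candidate file>
# (the argument order is inferred from main's signature, not from the source)
if __name__ == '__main__':
    if len(sys.argv) != 3:
        print >> sys.stderr, 'Usage: %s id_file candidate_file' % sys.argv[0]
        sys.exit(1)
    main(sys.argv[1], sys.argv[2])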
#!/usr/bin/env python
import collections
import re

import data_util as dutil
import util

# This defines the input Row object read by the extractor.
# (The opening of this column list was truncated in the source; the
# surrounding util.RowParser call and the leading 'doc_id' column are
# assumptions, restored to match the Feature fields below.)
parser = util.RowParser([
    ('doc_id', 'text'),
    ('section_id', 'text'),
    ('sent_id', 'int'),
    ('gene_mention_id', 'text'),
    ('gene_name', 'text'),
    ('gene_wordidxs', 'int[]'),
    ('gene_is_correct', 'boolean'),
    ('pheno_mention_id', 'text'),
    ('pheno_entity', 'text'),
    ('pheno_wordidxs', 'int[]'),
    ('pheno_is_correct', 'boolean'),
    ('words', 'text[]'),
    ('lemmas', 'text[]'),
    ('poses', 'text[]'),
    ('ners', 'text[]'),
    ('dep_paths', 'text[]'),
    ('dep_parents', 'int[]')])

# This defines the output Relation object
Feature = collections.namedtuple(
    'Feature', ['doc_id', 'section_id', 'relation_id', 'name'])

HPO_DAG = dutil.read_hpo_dag()


def replace_opts(opts, replaceList):
    """Substitute (pattern, subst) pairs into every option string; text
    substituted into regex-valued options (keys ending in 'rgx') is escaped
    first so it matches literally."""
    ret = {}
    for name in opts:
        strings = opts[name]
        for (pattern, subst) in replaceList:
            if name.endswith('rgx'):
                subst = re.escape(subst)
            strings = [s.replace(pattern, subst) for s in strings]
        ret[name] = strings
    return ret


CACHE = {}
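# A small illustration of replace_opts (option names and values are made up
# for the example): keys ending in 'rgx' hold regex templates, so substituted
# text is escaped before insertion; other option strings get plain replacement.
_opts = {'left_rgx': ['{{P}} (was|is)'], 'phrase': ['{{P}} syndrome']}
_filled = replace_opts(_opts, [('{{P}}', 'Rett')])
assert _filled == {'left_rgx': ['Rett (was|is)'], 'phrase': ['Rett syndrome']}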
#! /usr/bin/env python
import sys

from data_util import get_hpo_phenos, get_parents, read_hpo_dag, read_hpo_synonyms

if __name__ == "__main__":
    hpo_dag = read_hpo_dag()
    names = read_hpo_synonyms(1)
    synonyms = read_hpo_synonyms()
    allowed_phenos = set(get_hpo_phenos(hpo_dag))
    # Expand the allowed set to include every ancestor of each phenotype,
    # stopping just below the root HP:0000118 (phenotypic abnormality)
    for hpo_id in allowed_phenos.copy():
        parent_ids = get_parents(hpo_id, hpo_dag)  # includes the original hpo_id
        assert hpo_id in parent_ids
        if 'HP:0000118' not in parent_ids:
            sys.stderr.write('"{0}": not a phenotypic abnormality\n'.format(hpo_id.strip()))
            continue
        parent_ids.remove('HP:0000118')
        for parent_id in parent_ids:
            allowed_phenos.add(parent_id)
    for hpo_id in allowed_phenos:
        print "%s\t%s\t%s" % (hpo_id,
                              '|^|'.join(names[hpo_id]),
                              '|^|'.join(synonyms[hpo_id]))
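# Hypothetical invocation (script and output file names assumed, not
# confirmed by the source):
#   python get_allowed_phenos.py > onto/manual/allowed_phenos.tsv
# Each output row is:
#   hpo_id <tab> '|^|'-joined names <tab> '|^|'-joined synonyms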