class Ontology:
    """A class for an ontology instance to work with the PRO (Protein Ontology) data."""

    config = ProjectConfig()
    name = 'pro'

    def __init__(self, obo_file=config.pro_data_path, id_filter=[], human_filter=False):
        """Initializes the ontology instance by reading an obo_file."""
        self.id_filter = id_filter
        # maps each alternative ID to its original ID
        self.alt2id = altID2ID(self, obo_file)
        # maps each ID to its alternative IDs
        self.id2alt = ID2altID(self, obo_file)
        # maps each term ID to its parent(s)
        self.id2par = is_a_mapping(self, obo_file)
        # maps each term ID to its name
        self.id2name = id2name(self, obo_file, human_filter=human_filter)
        # maps each synonym and name to the term ID
        self.name2ID = name_synonym2ID(self, obo_file)
        # maps each term ID to its child terms
        self.term2child = parse_top_down(self, obo_file, self.id2alt, self.alt2id)
        # cache of already computed paths to the root (term ID -> set of ancestor IDs)
        self.id2root_path = {}

    def collect_terms_to_root(self, termID, path=None):
        """Recursively collects all terms on the path to the root of the
        ontology by using the id2par dict."""
        if path is None:
            path = set()
        # check if the term is the root (to stop the recursion in that case)
        if not self.id2par[termID]:
            return path
        for par in self.id2par[termID]:
            # check if the path is already known
            if par in self.id2root_path:
                path.update(self.id2root_path[par])
            else:
                path.add(par)
                # also add alternative IDs for the parent, if available
                if par in self.id2alt:
                    path.update(self.id2alt[par])
                # recursively add the rest of the path
                path.update(self.collect_terms_to_root(par, path))
        return path
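# A minimal usage sketch (assuming config.pro_data_path points to a local
# pro.obo file; the term ID below is illustrative, not taken from the code above):
#
#   pro = Ontology(human_filter=True)
#   ancestors = pro.collect_terms_to_root("PR:000000001")
#   for term_id in ancestors:
#       print(term_id, pro.id2name.get(term_id))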
class Ontology:
    config = ProjectConfig()
    name = 'mesh'
    IDinit = 'D'

    def __init__(self, xml_files=[config.mesh_data_path], extend_with_umls=False):
        """Initializes the ontology instance by reading the MeSH descriptor XML file(s)."""
        sys.stdout.write("Loading MeSH ")
        map = mappings.mapper()
        records = []
        # simple progress indicator
        sys.stdout.write("%s" % (" " * 11))
        sys.stdout.flush()
        sys.stdout.write("\b" * (10 + 1))
        for file in xml_files:
            records.extend(parse_mesh(file))
        print("")
        self.id2name = {}
        self.name2ID = {}
        for rec in records:
            id = rec.ui
            names = set()
            names.add(rec.name)
            for conc in rec.concepts:
                for term in conc.terms:
                    names.add(term.term_name)
            self.id2name[id] = names
            for name in names:
                if name not in self.name2ID:
                    self.name2ID[name] = set()
                self.name2ID[name].add(id)
                # also index the lower-cased form of each name
                if name.lower() not in self.name2ID:
                    self.name2ID[name.lower()] = set()
                self.name2ID[name.lower()].add(id)
        # restrict the UMLS extension to concepts that are linked to MeSH IDs
        restricted_umls_set = set()
        for id in self.id2name.keys():
            if id in map.mesh2umls:
                restricted_umls_set.update(map.mesh2umls[id])
        if extend_with_umls:
            for id in self.id2name.keys():
                extended_lex = map.get_umls_lexicalizations(id, restrict_to=restricted_umls_set)
                print("done for", id)
                self.id2name[id].update(extended_lex)
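# A minimal usage sketch (assuming config.mesh_data_path points to a local
# MeSH descriptor XML dump; the descriptor name below is illustrative):
#
#   mesh = Ontology()
#   ids = mesh.name2ID.get("asthma", set())   # lower-cased forms are also indexed
#   for mesh_id in ids:
#       print(mesh_id, mesh.id2name[mesh_id])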
def __init__(self, remove_parentheses=True, keep_only=[], discard_stop_concepts=True, prune=False):
    self.id2name = {}
    self.name2ID = {}
    config = ProjectConfig()
    file = config.snomed_data_path
    self.active_IDs = self.get_active_IDs(config.snomed_active_concepts)
    if prune:
        keeps = config.snomed_filtered_branches
        self.keepIDs = self.get_keep_IDs(config.snomed_filtered_concepts_path, keeps)
    self.parse_data(file,
                    remove_parentheses=remove_parentheses,
                    keep_only=keep_only,
                    discard_inactive=discard_stop_concepts,
                    prune=prune)
class Ontology:
    name = 'fma'
    config = ProjectConfig()

    def __init__(self, file=config.fma_data_path):
        """Initializes the ontology instance by reading the FMA data file."""
        self.id2name = {}
        self.name2ID = {}
        self.parse_data(file)

    def parse_data(self, path):
        with open(path) as ifile:
            for line in ifile:
                if line.startswith("#"):
                    continue
                comps = line.strip().split(",")
                if len(comps) < 3:
                    continue
                # the ID is the last component of the concept URI
                id = comps[0].split("/")[-1]
                if 'fma' not in id:
                    continue
                title = comps[1]
                self.name2ID[title] = set([id])
                self.id2name[id] = set([title])
                # synonyms are pipe-separated in the third column
                for syn in comps[2].split("|"):
                    if len(syn) > 0 and len(id) > 0:
                        if syn not in self.name2ID:
                            self.name2ID[syn] = set()
                        self.name2ID[syn].add(id)
                        self.id2name[id].add(syn)
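# The parser above assumes comma-separated lines of the form
# <concept URI>,<preferred title>,<pipe-separated synonyms>, for example
# (an illustrative line, not taken from the actual FMA export):
#
#   http://purl.org/sig/ont/fma/fma7088,Heart,Cor|Heart proper
#
# which would yield id2name["fma7088"] == {"Heart", "Cor", "Heart proper"}.
# Note that the naive split(",") would break on titles or synonyms that
# themselves contain commas.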
def __init__(self):
    config = ProjectConfig()
    self.gnd = grounding.grounding()
    p = WordPreprocessor()
    with open(config.phenebank_data_tag_vocab, 'rb') as handle:
        p.vocab_tag = pickle.load(handle)
    with open(config.phenebank_data_word_vocab, 'rb') as handle:
        p.vocab_word = pickle.load(handle)
    with open(config.phenebank_data_char_vocab, 'rb') as handle:
        p.vocab_char = pickle.load(handle)
    model_config = ModelConfig()
    model_config.vocab_size = len(p.vocab_word)
    model_config.char_vocab_size = len(p.vocab_char)
    self.main_anago_tagger = tgger.Tagger(model_config,
                                          config.anago_model,
                                          save_path=config.anago_models_dir,
                                          preprocessor=p)
class Ontology:
    name = 'omim'
    config = ProjectConfig()

    def __init__(self, file=config.omim_data_path):
        """Initializes the ontology instance by reading the OMIM data file."""
        self.id2name = {}
        self.id2abbrv = {}
        self.name2ID = {}
        self.abbr2ID = {}
        self.abbreviations = {}
        self.parse_data(file)

    def parse_data(self, path):
        with open(path) as ifile:
            for line in ifile:
                if line.startswith("#"):
                    continue
                comps = line.strip().split("\t")
                if len(comps) > 1:
                    id = comps[1]
                    id = "OMIM:" + id
                    titles = set()
                    abbrvs = set()
                    for t in comps[2:]:
                        # entries are separated by ";;"; within an entry, the
                        # title is followed by ";"-separated abbreviations
                        for d in t.split(";;"):
                            if ";" in d:
                                tcomps = d.split(";")
                                title = tcomps[0].strip()
                                titles.add(title)
                                for abbrv in tcomps[1:]:
                                    abbrv = abbrv.strip()
                                    abbrvs.add(abbrv)
                                    if abbrv not in self.abbreviations:
                                        self.abbreviations[abbrv] = set()
                                    self.abbreviations[abbrv].add(title)
                            else:
                                titles.add(d)
                    self.id2name[id] = titles
                    for title in titles:
                        if title not in self.name2ID:
                            self.name2ID[title] = set()
                        self.name2ID[title].add(id)
                    for abbrv in abbrvs:
                        if abbrv not in self.abbr2ID:
                            self.abbr2ID[abbrv] = set()
                        self.abbr2ID[abbrv].add(id)
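# For illustration, a title field such as
# "Marfan syndrome; MFS;;Marfan syndrome, type I"
# (a hypothetical example, not taken from the OMIM file itself) would be
# parsed into the titles {"Marfan syndrome", "Marfan syndrome, type I"} and
# the abbreviation "MFS", with abbr2ID["MFS"] pointing to the record's OMIM ID.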
class grounding():
    config = ProjectConfig()

    def __init__(self, ontology_space_path=config.ontology_embedding_path,
                 base_embedding_path=config.base_embedding_path, ontologies=[]):
        self.ontologies = ontologies
        if len(ontologies) == 0:
            self.ontologies = [Ontology.HPO, Ontology.SNOMED, Ontology.MESH,
                               Ontology.FMA, Ontology.PRO]
        if Ontology.HPO in self.ontologies:
            self.hp = hpo.Ontology()
        if Ontology.SNOMED in self.ontologies:
            self.snmd = snomed.Ontology(prune=False)
        if Ontology.MESH in self.ontologies:
            self.msh = mesh.Ontology()
        if Ontology.FMA in self.ontologies:
            self.fma = fma.Ontology()
        if Ontology.PRO in self.ontologies:
            self.pro = pro.Ontology(human_filter=True)
        self.id2string_path = ontology_space_path.replace(".bin", ".key.txt")
        self.id2string = {}

        def get_dict(path):
            map = {}
            with open(path) as ifile:
                for line in ifile:
                    comps = line.strip().split("\t")
                    if len(comps) > 1:
                        map[comps[0]] = comps[1]
                    else:
                        map[comps[0]] = ""
            return map

        self.ngram_vectors = {}
        self.word_vectors = {}
        for ont in self.ontologies:
            this_path = ontology_space_path.replace('###', ont.value)
            print("Loading ontology vectors from",
                  this_path.replace(".bin", ".words.bin"), "and",
                  this_path.replace(".bin", ".ngrams.bin"))
            self.word_vectors[ont] = KeyedVectors.load_word2vec_format(
                this_path.replace(".bin", ".words.bin"), binary=True)
            self.ngram_vectors[ont] = KeyedVectors.load_word2vec_format(
                this_path.replace(".bin", ".ngrams.bin"), binary=True)
            print("vocab:", len(self.word_vectors[ont].vocab),
                  len(self.ngram_vectors[ont].vocab))
            print("vector size:", self.word_vectors[ont].vector_size)
            self.id2string.update(get_dict(this_path.replace(".bin", ".key.txt")))
        self.word2index_map = {}
        print("[Vectors loaded!]")
        self.name_embedding = ne.namemb(base_embedding_path=base_embedding_path)
        self.factor = 0.5
        # if the word-based similarity is lower than this, back off to ngram similarity
        self.word_threshold = 0.9

    def get_ontologies(self, type, object=False):
        if type == EntityType.Phenotype or type == EntityType.Disease:
            if object:
                return self.get_ontology(["HP", "SCTID", "D"])
            return ["HP", "SCTID", "D"]
        elif type == EntityType.GPR:
            if object:
                return self.get_ontology(["PR", "SCTID"])
            return ["PR", "SCTID"]
        elif type == EntityType.Anatomy:
            if object:
                return self.get_ontology(["SCTID", "fma"])
            return ["SCTID", "fma"]
        elif type == EntityType.Molecule:
            if object:
                return self.get_ontology(["SCTID"])
            return ["SCTID"]
        elif type == EntityType.Cell:
            if object:
                return self.get_ontology(["SCTID"])
            return ["SCTID"]
        elif type == EntityType.Gene_variant or type == EntityType.Pathway:
            if object:
                return self.get_ontology(["SCTID"])
            return ["SCTID"]
        else:
            return None

    def get_ontology(self, ids):
        onts = []
        for id in ids:
            if id.startswith("SCTID"):
                onts.append(self.snmd)
            elif id.startswith("HP"):
                onts.append(self.hp)
            elif re.match("^D[0-9]+", id):
                onts.append(self.msh)
            elif id.startswith("f"):
                onts.append(self.fma)
            elif id.startswith("P"):
                onts.append(self.pro)
        return onts

    def get_ontology_by_name(self, ids):
        out_onts = set()
        for id in ids:
            if id == Ontology.SNOMED:
                out_onts.add(self.snmd)
            elif id == Ontology.MESH:
                out_onts.add(self.msh)
            elif id == Ontology.HPO:
                out_onts.add(self.hp)
            elif id == Ontology.FMA:
                out_onts.add(self.fma)
            elif id == Ontology.PRO:
                out_onts.add(self.pro)
        return out_onts

    def length_effect(self, name, items):
        # penalize candidates whose surface length differs much from the query
        out_items = {}
        for item in items:
            title = self.id2string[item[0]]
            ratio = 2.0 * abs(len(name) - len(title)) / (len(name) + len(title))
            score = item[1] - ratio / 10
            out_items[item[0]] = score
        sorted_items = []
        for k in sorted(out_items.items(), reverse=True, key=operator.itemgetter(1)):
            sorted_items.append((k[0], k[1]))
        return sorted_items

    def filter_IDs(self, items, ID_set):
        filtered = []
        for item in items:
            id = item[0]
            for ont in ID_set:
                if id.startswith(ont):
                    filtered.append((item[0], item[1]))
        return filtered

    def get_alternatives(self, name):
        alternatives = [name.lower()]
        if re.match("[A-Z0-9]+ gene", name):
            alternatives.append(name.replace('gene', '').strip())
        if re.match("[A-Z0-9]+ genes", name):
            alternatives.append(name.replace('genes', '').strip())
        if re.match("[A-Z0-9]+ protein", name):
            alternatives.append(name.replace('protein', '').strip())
        return alternatives

    def get_closests_match(self, name, type, restricted_ontologies=[], replace_unk=True,
                           ngram_backoff=True, topn=3, alternatives=False, keep_id=False):
        if len(restricted_ontologies) > 0:
            ontologies = self.get_ontology_by_name(restricted_ontologies)
        else:
            ontologies = self.get_ontologies(type, object=True)
        ID_set = self.get_ontologies(type)
        names = [name]
        if alternatives:
            names += self.get_alternatives(name)
        # first try an exact (or lower-cased) dictionary lookup in the ontologies
        for nm in names:
            if ontologies is None:
                return []
            for ontology in ontologies:
                if nm in ontology.name2ID or nm.lower() in ontology.name2ID:
                    try:
                        fid = list(ontology.name2ID[nm])[0]
                    except KeyError:
                        fid = list(ontology.name2ID[nm.lower()])[0]
                    if len(ID_set) > 0:
                        for ids in ID_set:
                            if fid.startswith(ids):
                                return [(fid, 1.0)]
                    else:
                        return [(fid, 1.0)]
        # all-uppercase names (e.g. gene symbols) are not suited to ngram backoff
        if name.isupper():
            ngram_backoff = False
        wemb, cemb = self.name_embedding.get_embedding(name, replace_unk=replace_unk,
                                                       ngram_backoff=ngram_backoff)
        results = self.get_candidates(name, self.word_vectors, wemb, topn, ID_set,
                                      keep_id=keep_id)
        oov, total = self.name_embedding.get_oov(name)
        if (results[0][1] < self.word_threshold and ngram_backoff
                and oov > 0 and oov < 3 and (oov == 2 and total > 4)):
            print(name)
            print(">", results)
            results2 = self.get_candidates(name, self.ngram_vectors, cemb, topn, ID_set,
                                           keep_id=keep_id)
            print(">>", results2)
            return results2
        return results

    def get_candidates(self, name, vectors, emb, topn, ID_set, keep_id=False):
        most_similars = []
        for ont in self.ontologies:
            if get_IDinit(ont) in ID_set:
                most_similars.extend(vectors[ont].similar_by_vector(emb, topn=10))
        most_similars = self.length_effect(name, most_similars)
        pruned_set = []
        done = 1
        for sim in most_similars:
            if done > topn:
                continue
            if keep_id:
                pruned_set.append((sim[0], sim[1]))
            else:
                pruned_set.append((sim[0].split("#")[0], sim[1]))
            done += 1
        return pruned_set
def get_IDinit(name):
    # ID prefix for each ontology; the HPO and SNOMED values are assumed from
    # the prefixes used in get_ontologies() and get_ontology()
    if name == Ontology.HPO:
        return "HP"
    if name == Ontology.SNOMED:
        return "SCTID"
    if name == Ontology.MESH:
        return "D"
    if name == Ontology.FMA:
        return "f"
    if name == Ontology.PRO:
        return "P"


if __name__ == "__main__":
    #g = grounding(ontology_space_path='/home/pilehvar/taher/grounding-release/outputs/###.v3.name.embedding.50d.txt',
    #              base_embedding_path='/media/Data/taher-data/embeddings/PubMed-2016.fasttext.50d.vec.bin',
    #              ontologies=[Ontology.HPO])
    #print(g.get_closests_match("copper deficiency", EntityType.Phenotype, topn=10, restricted_ontologies=[Ontology.HPO]))
    #exit()
    config = ProjectConfig()
    g = grounding(ontology_space_path=config.ontology_embedding_path,
                  ontologies=[Ontology.HPO, Ontology.SNOMED, Ontology.MESH],
                  base_embedding_path=config.base_embedding_path)
    # Python 2/3 compatibility for interactive input
    try:
        input = raw_input
    except NameError:
        pass
    while True:
        inp = input("Phenotype: ")
        results = g.get_closests_match(inp, EntityType.Phenotype, ngram_backoff=True,
                                       restricted_ontologies=[Ontology.HPO], topn=5)
        print(results)