Example #1
class Ontology:
    """A class for an ontology instance to work with the HPO data """
    config = ProjectConfig()

    name = 'pro'
    def __init__(self, obo_file=config.pro_data_path, id_filter=None, human_filter=False):
        """ initializes the ontology instance by reading an OBO file """

        # use None instead of a mutable default argument
        self.id_filter = id_filter if id_filter is not None else []

        # maps each alternative ID to its original ID
        self.alt2id = altID2ID(self, obo_file)

        # maps each ID to its alternative IDs
        self.id2alt = ID2altID(self, obo_file)

        # maps each term ID to its parent(s)
        self.id2par = is_a_mapping(self, obo_file)

        # maps each term ID to its name:
        self.id2name = id2name(self, obo_file, human_filter=human_filter)

        # maps each name and synonym to the term ID
        self.name2ID = name_synonym2ID(self, obo_file)

        # maps each term ID to its child terms
        self.term2child = parse_top_down(self, obo_file, self.id2alt, self.alt2id)


    def collect_terms_to_root(self, termID, path=None):
        """recursively collects all terms on the path to the root of the ontology using the id2par dict """

        # use None instead of a mutable default argument, which would be
        # shared across calls
        if path is None:
            path = set()

        # check if the term is the root (to stop the recursion in that case)
        if not self.id2par[termID]:
            return path

        for par in self.id2par[termID]:

            # check if the path is already known:
            if par in self.id2root_path:
                path.update(self.id2root_path[par])

            else:
                path.add(par)

                # also add alternative IDs for the parent, if available
                if par in self.id2alt:
                    path.update(self.id2alt[par])

                # recursively add the rest of the path
                path.update(self.collect_terms_to_root(par, path))

        return path
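
Note that collect_terms_to_root reads self.id2root_path as a memoization cache, but the __init__ shown above never creates it. A minimal sketch of how that cache might be wired up; the loop below is an assumption for illustration, not part of the original source:

# hypothetical wiring for the id2root_path cache read above
ont = Ontology()
ont.id2root_path = {}
for term_id in ont.id2par:
    ont.id2root_path[term_id] = ont.collect_terms_to_root(term_id)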
Example #2
class Ontology:

    config = ProjectConfig()

    name = 'mesh'
    IDinit = 'D'
    def __init__(self, xml_files=[config.mesh_data_path], extend_with_umls=False):
        """ initializes the ontology instance by reading an obo_file """
        sys.stdout.write("Loading MeSH ")

        map = mappings.mapper()
        records = []

        sys.stdout.write("%s" % (" " * 11))
        sys.stdout.flush()
        sys.stdout.write("\b" * (10+1))

        for file in xml_files:
            records.extend(parse_mesh(file))
        print("")

        self.id2name = {}
        self.name2ID = {}

        for rec in records:
            id = rec.ui
            names = set()
            names.add(rec.name)
            for conc in rec.concepts:
                for term in conc.terms:
                    names.add(term.term_name)

            self.id2name[id] = names

            for name in names:
                if name not in self.name2ID:
                    self.name2ID[name] = set()
                self.name2ID[name].add(id)

                # also map the lowercased form of each name to the ID
                if name.lower() not in self.name2ID:
                    self.name2ID[name.lower()] = set()
                self.name2ID[name.lower()].add(id)

        restricted_umls_set = set()
        for id in self.id2name.keys():
            if id in map.mesh2umls:
                restricted_umls_set.update(map.mesh2umls[id])

        if extend_with_umls:
            for id in self.id2name.keys():
                extended_lex = map.get_umls_lexicalizations(id, restrict_to=restricted_umls_set)
                print("done for", id)
                self.id2name[id].update(extended_lex)
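
A minimal usage sketch; "asthma" is purely an illustrative key and assumes a term with that lowercased name appears in the loaded XML:

mesh_ont = Ontology()
# name2ID maps both the original and the lowercased term names to descriptor UIs
for ui in mesh_ont.name2ID.get("asthma", set()):
    print(ui, mesh_ont.id2name[ui])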
Example #3
    def __init__(self, remove_parentheses=True, keep_only=[], discard_stop_concepts=True, prune=False):
        self.id2name = {}
        self.name2ID = {}

        config = ProjectConfig()
        file = config.snomed_data_path
 
        self.active_IDs = self.get_active_IDs(config.snomed_active_concepts)
        if prune:
            keeps = config.snomed_filtered_branches
            self.keepIDs = self.get_keep_IDs(config.snomed_filtered_concepts_path, keeps)

        self.parse_data(file, remove_parentheses=remove_parentheses, keep_only=keep_only, discard_inactive=discard_stop_concepts, prune=prune)
Example #4
class Ontology:

    name = 'fma'

    config = ProjectConfig()

    def __init__(self, file=config.fma_data_path):
        """ initializes the ontology instance by reading an obo_file """
        self.id2name = {}
        self.name2ID = {}

        self.parse_data(file)

    def parse_data(self, path):

        with open(path) as ifile:
            for line in ifile:
                if line.startswith("#"):
                    continue

                comps = line.strip().split(",")
                if len(comps) < 3:
                    continue
                id = comps[0].split("/")[-1]
                if 'fma' not in id:
                    continue

                title = comps[1]

                self.name2ID[title] = {id}
                self.id2name[id] = {title}

                for syn in comps[2].split("|"):
                    # the ID was already validated above, so only the
                    # synonym needs to be non-empty
                    if syn:
                        if syn not in self.name2ID:
                            self.name2ID[syn] = set()
                        self.name2ID[syn].add(id)
                        self.id2name[id].add(syn)
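
The parser above assumes comma-separated lines of the form URI, preferred name, pipe-separated synonyms. A hypothetical input line and its result (illustrative only, not copied from a real FMA export):

# hypothetical line in the expected format:
#   http://purl.org/sig/ont/fma/fma12345,Example structure,example part|sample organ
# -> id2name["fma12345"] == {"Example structure", "example part", "sample organ"}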
Example #5
    def __init__(self):
        config = ProjectConfig()

        self.gnd = grounding.grounding()
        p = WordPreprocessor()

        with open(config.phenebank_data_tag_vocab, 'rb') as handle:
            p.vocab_tag = pickle.load(handle)

        with open(config.phenebank_data_word_vocab, 'rb') as handle:
            p.vocab_word = pickle.load(handle)

        with open(config.phenebank_data_char_vocab, 'rb') as handle:
            p.vocab_char = pickle.load(handle)

        model_config = ModelConfig()
        model_config.vocab_size = len(p.vocab_word)
        model_config.char_vocab_size = len(p.vocab_char)

        self.main_anago_tagger = tgger.Tagger(
            model_config,
            config.anago_model,
            save_path=config.anago_models_dir,
            preprocessor=p)
Example #6
class Ontology:

    name = 'omim'
    config = ProjectConfig()

    def __init__(self, file=config.omim_data_path):
        """ initializes the ontology instance by reading an obo_file """
        self.id2name = {}
        self.id2abbrv = {}
        self.name2ID = {}
        self.abbr2ID = {}
        self.abbreviations = {}

        self.parse_data(file)

    def parse_data(self, path):
        with open(path) as ifile:
            for line in ifile:
                if line.startswith("#"):
                    continue

                comps = line.strip().split("\t")

                if len(comps) > 1:
                    id = comps[1]
                    id = "OMIM:" + id

                    titles = set()
                    abbrvs = set()

                    for t in comps[2:]:
                        for d in t.split(";;"):
                            if ";" in d:
                                tcomps = d.split(";")
                                title = tcomps[0].strip()
                                titles.add(title)

                                for abbrv in tcomps[1:]:
                                    abbrv = abbrv.strip()
                                    abbrvs.add(abbrv)

                                    # map each abbreviation to its title
                                    if abbrv not in self.abbreviations:
                                        self.abbreviations[abbrv] = set()
                                    self.abbreviations[abbrv].add(title)

                            else:
                                titles.add(d)

                    self.id2name[id] = titles

                    for title in titles:
                        if title not in self.name2ID:
                            self.name2ID[title] = set()

                        self.name2ID[title].add(id)

                    for abbrv in abbrvs:
                        if abbrv not in self.abbr2ID:
                            self.abbr2ID[abbrv] = set()

                        self.abbr2ID[abbrv].add(id)
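
The parser expects tab-separated lines with the MIM number in the second column, followed by title fields in which ";;" separates alternative titles and ";" separates a title from its abbreviations. A hypothetical record and the entries it would produce (illustrative only, not copied from the real OMIM file):

# hypothetical input line:
#   Prefix\t123456\tEXAMPLE SYNDROME; EXS;;EXAMPLE DISEASE, TYPE 1; EXD1
# parsed result:
#   id2name["OMIM:123456"] == {"EXAMPLE SYNDROME", "EXAMPLE DISEASE, TYPE 1"}
#   abbreviations["EXS"]   == {"EXAMPLE SYNDROME"}
#   abbreviations["EXD1"]  == {"EXAMPLE DISEASE, TYPE 1"}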
Example #7
class grounding():

    config = ProjectConfig()

    def __init__(self,
                 ontology_space_path=config.ontology_embedding_path,
                 base_embedding_path=config.base_embedding_path,
                 ontologies=[]):

        self.ontologies = ontologies

        if len(ontologies) == 0:
            self.ontologies = [
                Ontology.HPO, Ontology.SNOMED, Ontology.MESH, Ontology.FMA,
                Ontology.PRO
            ]

        if Ontology.HPO in self.ontologies:
            self.hp = hpo.Ontology()
        if Ontology.SNOMED in self.ontologies:
            self.snmd = snomed.Ontology(prune=False)
        if Ontology.MESH in self.ontologies:
            self.msh = mesh.Ontology()
        if Ontology.FMA in self.ontologies:
            self.fma = fma.Ontology()
        if Ontology.PRO in self.ontologies:
            self.pro = pro.Ontology(human_filter=True)

        self.id2string_path = ontology_space_path.replace(".bin", ".key.txt")
        self.id2string = {}

        def get_dict(path):
            map = {}
            with open(path) as ifile:
                for line in ifile:
                    comps = line.strip().split("\t")
                    if len(comps) > 1:
                        map[comps[0]] = comps[1]
                    else:
                        map[comps[0]] = ""
            return map

        self.ngram_vectors = {}
        self.word_vectors = {}

        for ont in self.ontologies:
            this_path = ontology_space_path.replace('###', ont.value)
            print("Loading ontology vectors from",
                  this_path.replace(".bin", ".words.bin"), "and",
                  this_path.replace(".bin", ".ngrams.bin"))
            self.word_vectors[ont] = KeyedVectors.load_word2vec_format(
                this_path.replace(".bin", ".words.bin"), binary=True)
            self.ngram_vectors[ont] = KeyedVectors.load_word2vec_format(
                this_path.replace(".bin", ".ngrams.bin"), binary=True)
            print("vocab:", len(self.word_vectors[ont].vocab),
                  len(self.ngram_vectors[ont].vocab))
            print("vector size:", self.word_vectors[ont].vector_size)
            self.id2string.update(
                get_dict(this_path.replace(".bin", ".key.txt")))

        self.word2index_map = {}

        print("[Vectors loaded!]")

        self.name_embedding = ne.namemb(
            base_embedding_path=base_embedding_path)

        self.factor = 0.5

        # if the word-based similarity is lower than this, it will back off to ngram similarity
        self.word_threshold = 0.9

    def get_ontologies(self, type, object=False):
        if type == EntityType.Phenotype or type == EntityType.Disease:
            if object:
                return self.get_ontology(["HP", "SCTID", "D"])
            return ["HP", "SCTID", "D"]

        elif type == EntityType.GPR:
            if object:
                return self.get_ontology(["PR", "SCTID"])
            return ["PR", "SCTID"]

        elif type == EntityType.Anatomy:
            if object:
                return self.get_ontology(["SCTID", "fma"])
            return ["SCTID", "fma"]

        elif type in (EntityType.Molecule, EntityType.Cell,
                      EntityType.Gene_variant, EntityType.Pathway):
            if object:
                return self.get_ontology(["SCTID"])
            return ["SCTID"]
        else:
            return None

    def get_ontology(self, ids):
        onts = []
        for id in ids:
            if id.startswith("SCTID"):
                onts.append(self.snmd)
            elif id.startswith("HP"):
                onts.append(self.hp)
            elif re.match("^D[0-9]+", id):
                onts.append(self.msh)
            elif id.startswith("f"):
                onts.append(self.fma)
            elif id.startswith("P"):
                onts.append(self.pro)

        return onts

    def get_ontology_by_name(self, ids):
        out_onts = set()
        for id in ids:
            if id == Ontology.SNOMED:
                out_onts.add(self.snmd)
            elif id == Ontology.MESH:
                out_onts.add(self.msh)
            elif id == Ontology.HPO:
                out_onts.add(self.hp)
            elif id == Ontology.FMA:
                out_onts.add(self.fma)
            elif id == Ontology.PRO:
                out_onts.add(self.pro)
        return out_onts

    def length_effect(self, name, items):
        out_items = {}

        for item in items:
            title = self.id2string[item[0]]
            ratio = 2.0 * abs(len(name) - len(title)) / (len(name) +
                                                         len(title))
            score = item[1] - ratio / 10
            out_items[item[0]] = score

        # sort the candidates by adjusted score, highest first
        sorted_items = sorted(out_items.items(),
                              key=operator.itemgetter(1),
                              reverse=True)

        return sorted_items
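
    # Worked example of the length penalty above (illustrative numbers):
    #   name = "heart" (len 5), candidate title = "heart valve" (len 11)
    #   ratio = 2.0 * |5 - 11| / (5 + 11) = 0.75
    #   adjusted score = raw similarity - 0.75 / 10, e.g. 0.90 -> 0.825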

    def filter_IDs(self, items, ID_set):
        filtered = []
        for item in items:
            id = item[0]
            for ont in ID_set:
                if id.startswith(ont):
                    filtered.append((item[0], item[1]))
        return filtered

    def get_alternatives(self, name):
        alternatives = [name.lower()]
        if re.match("[A-Z0-9]+ gene", name):
            alternatives.append(name.replace('gene', '').strip())
        if re.match("[A-Z0-9]+ genes", name):
            alternatives.append(name.replace('genes', '').strip())
        if re.match("[A-Z0-9]+ protein", name):
            alternatives.append(name.replace('protein', '').strip())
        return alternatives
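
    # e.g. get_alternatives("BRCA1 gene") returns ["brca1 gene", "BRCA1"]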

    def get_closests_match(self,
                           name,
                           type,
                           restricted_ontologies=[],
                           replace_unk=True,
                           ngram_backoff=True,
                           topn=3,
                           alternatives=False,
                           keep_id=False):

        if len(restricted_ontologies) > 0:
            ontologies = self.get_ontology_by_name(restricted_ontologies)
        else:
            ontologies = self.get_ontologies(type, object=True)

        ID_set = self.get_ontologies(type)

        names = [name]
        if alternatives:
            names += self.get_alternatives(name)

        for nm in names:
            if ontologies is None:
                return []
            for ontology in ontologies:
                if nm in ontology.name2ID or nm.lower() in ontology.name2ID:
                    try:
                        fid = list(ontology.name2ID[nm])[0]
                    except KeyError:
                        fid = list(ontology.name2ID[nm.lower()])[0]
                    if len(ID_set) > 0:
                        for ids in ID_set:
                            if fid.startswith(ids):
                                return [(fid, 1.0)]
                    else:
                        return [(fid, 1.0)]

        if name.isupper():
            ngram_backoff = False

        wemb, cemb = self.name_embedding.get_embedding(
            name, replace_unk=replace_unk, ngram_backoff=ngram_backoff)

        results = self.get_candidates(name,
                                      self.word_vectors,
                                      wemb,
                                      topn,
                                      ID_set,
                                      keep_id=keep_id)

        oov, total = self.name_embedding.get_oov(name)

        # back off to the ngram space when the word-based score is low and
        # some of the words were out of vocabulary
        if (results[0][1] < self.word_threshold and ngram_backoff
                and 0 < oov < 3 and (oov == 2 and total > 4)):
            print(name)
            print(">", results)
            results2 = self.get_candidates(name,
                                           self.ngram_vectors,
                                           cemb,
                                           topn,
                                           ID_set,
                                           keep_id=keep_id)
            print(">>", results2)
            return results2

        return results

    def get_candidates(self, name, vectors, emb, topn, ID_set, keep_id=False):
        most_similars = []
        for ont in self.ontologies:
            if get_IDinit(ont) in ID_set:
                most_similars.extend(
                    vectors[ont].similar_by_vector(emb, topn=10))
        most_similars = self.length_effect(name, most_similars)

        # keep only the topn best candidates; unless keep_id is set,
        # truncate each key at "#"
        pruned_set = []
        for sim in most_similars[:topn]:
            if keep_id:
                pruned_set.append((sim[0], sim[1]))
            else:
                pruned_set.append((sim[0].split("#")[0], sim[1]))

        return pruned_set
Example #8
def get_IDinit(name):
    """Returns the term-ID prefix used by the given ontology (cf. the prefixes checked in grounding.get_ontology)."""
    if name == Ontology.HPO:
        return "HP"
    if name == Ontology.SNOMED:
        return "SCTID"
    if name == Ontology.MESH:
        return "D"
    if name == Ontology.FMA:
        return "f"
    if name == Ontology.PRO:
        return "P"


if __name__ == "__main__":

    #g = grounding(ontology_space_path='/home/pilehvar/taher/grounding-release/outputs/###.v3.name.embedding.50d.txt',
    #              base_embedding_path='/media/Data/taher-data/embeddings/PubMed-2016.fasttext.50d.vec.bin', ontologies=[Ontology.HPO])
    #print(g.get_closests_match("copper deficiency", EntityType.Phenotype, topn=10, restricted_ontologies=[Ontology.HPO]))
    #exit()

    config = ProjectConfig()
    g = grounding(ontology_space_path=config.ontology_embedding_path,
                  ontologies=[Ontology.HPO, Ontology.SNOMED, Ontology.MESH],
                  base_embedding_path=config.base_embedding_path)

    # Python 2/3 compatibility: fall back to the built-in input on Python 3
    try:
        input = raw_input
    except NameError:
        pass

    while True:
        inp = input("Phenotype: ")
        results = g.get_closests_match(inp,
                                       EntityType.Phenotype,
                                       ngram_backoff=True,
                                       restricted_ontologies=[Ontology.HPO],
                                       topn=5)
        print(results)