Example #1
    def _compute_tokens(self, ent):
        """
        Compute tokens from given entity
        :param ent:
        :return:
        """
        name_string = string_utils.normalize_string(ent['canonical_name'])
        name_tokens = string_utils.tokenize_string(name_string, self.tokenizer,
                                                   self.STOP)
        stemmed_tokens = tuple(self.stemmer.stem(w) for w in name_tokens)
        lemmatized_tokens = tuple(
            self.lemmatizer.lemmatize(w) for w in name_tokens)
        character_tokens = tuple(
            string_utils.get_character_n_grams(name_string,
                                               constants.NGRAM_SIZE))

        alias_tokens = []

        for a in ent['aliases']:
            alias_tokens.append(
                string_utils.tokenize_string(string_utils.normalize_string(a),
                                             self.tokenizer, self.STOP))

        parent_names = ent['par_relations']
        child_names = ent['chd_relations']

        return [
            name_tokens, stemmed_tokens, lemmatized_tokens, character_tokens,
            alias_tokens,
            set(parent_names),
            set(child_names)
        ]
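The snippet relies on a string_utils helper module that is not shown on this page. A minimal, hypothetical sketch of those helpers, with names and signatures inferred from the calls above (the real implementations may differ):

import re

def normalize_string(s):
    # stand-in: lowercase and strip punctuation
    return re.sub(r'[^a-z0-9 ]+', ' ', s.lower()).strip()

def tokenize_string(s, tokenizer, stop):
    # stand-in: tokenize with the given callable and drop stopwords
    return [t for t in tokenizer(s) if t not in stop]

def get_character_n_grams(s, n):
    # stand-in: sliding window of character n-grams, each a tuple of chars
    return [tuple(s[i:i + n]) for i in range(len(s) - n + 1)]

# tokenize_string(normalize_string('Heart Attack!'), str.split, {'the'})
# -> ['heart', 'attack']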
Example #2
def _normalize_ent(ent):
    """
    Normalize all strings in the given entity
    :param ent: entity dict with canonical_name, aliases, definition,
        par_relations, and chd_relations fields
    :return: normalized entity dict
    """
    norm_ent = dict()
    norm_ent['canonical_name'] = string_utils.normalize_string(
        ent['canonical_name'])
    norm_ent['aliases'] = [
        string_utils.normalize_string(a) for a in ent['aliases']
    ]
    norm_ent['definition'] = string_utils.normalize_string(
        ent['definition'])
    norm_ent['par_relations'] = set(
        string_utils.normalize_string(i) for i in ent['par_relations'])
    norm_ent['chd_relations'] = set(
        string_utils.normalize_string(i) for i in ent['chd_relations'])
    return norm_ent
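For illustration, a made-up record of the shape _normalize_ent expects (the field names come from the function body; the values are hypothetical):

raw_ent = {
    'canonical_name': 'Myocardial Infarction',
    'aliases': ['Heart Attack', 'MI'],
    'definition': 'Necrosis of the myocardium ...',
    'par_relations': ['Cardiovascular Diseases'],
    'chd_relations': ['Acute Myocardial Infarction'],
}

norm_ent = _normalize_ent(raw_ent)
# par_relations and chd_relations come back as sets of normalized strings;
# the other fields are normalized copies of the input.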
Example #3
    def _get_ent_names_from_relations(self, ent, kb, rel_types):
        """
        fetch the set of entity names that are related to the given entity
        :param ent:
        :param kb:
        :param rel_types: set of relations to extract
        :return:
        """
        matching_rels = [kb.relations[rel_id] for rel_id in ent.relation_ids]

        ent_ids = [
            rel.entity_ids[1] for rel in matching_rels
            if rel.relation_type in rel_types
            and rel.entity_ids[1] in kb.research_entity_id_to_entity_index
        ]

        ent_names = []
        for ent_id in ent_ids:
            rel_ent = kb.get_entity_by_research_entity_id(ent_id)
            if rel_ent:
                ent_names.append(
                    tuple(
                        string_utils.tokenize_string(
                            string_utils.normalize_string(
                                rel_ent.canonical_name),
                            self.tokenizer, self.STOP)))

        return ent_names
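This method assumes a kb object exposing a relations table and entities addressable by research entity id. A minimal sketch of that data model, with attribute names taken from the calls above and everything else assumed:

from dataclasses import dataclass, field
from typing import List, Tuple

@dataclass
class KBRelation:
    relation_type: str
    entity_ids: Tuple[str, str]  # (source entity id, target entity id)

@dataclass
class KBEntity:
    research_entity_id: str
    canonical_name: str
    relation_ids: List[int] = field(default_factory=list)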
Example #4
    def _generate_token_map(self, ents: List[KBEntity]):
        """
        Generates token-to-entity and entity-to-token map for an input list
        of KBEntity objects
        :param ents: list of KBEntity objects
        :return: token-to-entity dict and entity-to-token dict
        """
        # maps entity id key to word tokens in entity
        ent_to_tokens = dict()

        # maps token key to entities that have that token
        token_to_ents = defaultdict(set)

        for ent in ents:
            ent_id = ent.research_entity_id

            # tokenize all names and definitions
            name_tokens = []
            char_tokens = []
            for name in ent.aliases:
                name_tokens += string_utils.tokenize_string(
                    name, self.tokenizer, self.STOP)
                char_tokens += [
                    ''.join(c) for c in string_utils.get_character_n_grams(
                        string_utils.normalize_string(name),
                        constants.NGRAM_SIZE)
                ]

            def_tokens = string_utils.tokenize_string(ent.definition,
                                                      self.tokenizer,
                                                      self.STOP)

            # combine tokens
            tokens = set(name_tokens).union(set(char_tokens)).union(
                set(def_tokens))

            # add to ent-to-token map
            ent_to_tokens[ent_id] = tokens

            # add to token-to-ent map; the character n-grams were already
            # folded into the token set above, so one pass covers them
            for tok in tokens:
                token_to_ents[tok].add(ent_id)
        return token_to_ents, ent_to_tokens
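One plausible downstream use of the two maps is candidate retrieval: collect every entity that shares at least one token with a query string. A hedged sketch (the matcher and kb variables are hypothetical):

token_to_ents, ent_to_tokens = matcher._generate_token_map(kb.entities)

query_tokens = string_utils.tokenize_string(
    'myocardial infarction', matcher.tokenizer, matcher.STOP)

candidates = set()
for tok in query_tokens:
    # token_to_ents is a defaultdict(set), so unseen tokens yield empty sets
    candidates |= token_to_ents[tok]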
Example #5
    def normalize_kb(self):
        """
        Normalize all strings in kb
        :param kb:
        :return:
        """
        for ent in self.entities:
            ent.canonical_name = string_utils.normalize_string(
                ent.canonical_name)
            ent.aliases = [
                string_utils.normalize_string(a) for a in ent.aliases
            ]
            ent.definition = string_utils.normalize_string(ent.definition)

            ent.additional_details['wiki_entities'] = [
                string_utils.normalize_string(i)
                for i in ent.additional_details['wiki_entities']
            ] if 'wiki_entities' in ent.additional_details else []

            ent.additional_details['mesh_synonyms'] = [
                string_utils.normalize_string(i)
                for i in ent.additional_details['mesh_synonyms']
            ] if 'mesh_synonyms' in ent.additional_details else []

            ent.additional_details['dbpedia_synonyms'] = [
                string_utils.normalize_string(i)
                for i in ent.additional_details['dbpedia_synonyms']
            ] if 'dbpedia_synonyms' in ent.additional_details else []

            all_rels = [self.relations[r_id] for r_id in ent.relation_ids]
            par_ents = [
                r.entity_ids[1] for r in all_rels
                if r.relation_type in constants.UMLS_PARENT_REL_LABELS
            ]
            chd_ents = [
                r.entity_ids[1] for r in all_rels
                if r.relation_type in constants.UMLS_CHILD_REL_LABELS
            ]
            sib_ents = [
                r.entity_ids[1] for r in all_rels
                if r.relation_type in constants.UMLS_SIBLING_REL_LABELS
            ]
            syn_ents = [
                r.entity_ids[1] for r in all_rels
                if r.relation_type in constants.UMLS_SYNONYM_REL_LABELS
            ]

            ent.additional_details['par_relations'] = list(set(par_ents))
            ent.additional_details['chd_relations'] = list(set(chd_ents))
            ent.additional_details['sib_relations'] = list(set(sib_ents))
            ent.additional_details['syn_relations'] = list(set(syn_ents))

        return
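The relation buckets above depend on label sets from a constants module that is not shown here. Plausible values, using the standard UMLS REL abbreviations (an assumption; the project's actual constants may differ):

UMLS_PARENT_REL_LABELS = {'PAR', 'RB'}   # parent / broader-than
UMLS_CHILD_REL_LABELS = {'CHD', 'RN'}    # child / narrower-than
UMLS_SIBLING_REL_LABELS = {'SIB'}        # sibling in a source vocabulary
UMLS_SYNONYM_REL_LABELS = {'SY'}         # source-asserted synonymy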
Example #6
    def get_synonyms_to_entity(self, aliases: List[str]):
        """
        Return MeSH and DBpedia synonyms matching the given entity aliases
        :param aliases: entity aliases
        :return: list of unique MeSH synonyms and list of unique
            DBpedia synonyms
        """
        # normalize aliases
        norm_aliases = [string_utils.normalize_string(a) for a in aliases]

        # initialize synonym lists
        mesh_syns = []
        dbpedia_syns = []

        # get synonyms from synonym dicts
        for a in norm_aliases:
            mesh_syns += self.mesh_synonyms[a]
            dbpedia_syns += self.dbpedia_synonyms[a]

        return list(set(mesh_syns)), list(set(dbpedia_syns))
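A sketch of how this could be called, assuming mesh_synonyms and dbpedia_synonyms are defaultdict(list) attributes keyed by normalized alias (inferred from the lookups above; the kb variable is hypothetical):

from collections import defaultdict

kb.mesh_synonyms = defaultdict(
    list, {'heart attack': ['myocardial infarction']})
kb.dbpedia_synonyms = defaultdict(list)

mesh_syns, dbpedia_syns = kb.get_synonyms_to_entity(['Heart Attack'])
# mesh_syns -> ['myocardial infarction']; dbpedia_syns -> []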