def _compute_tokens(self, ent): """ Compute tokens from given entity :param ent: :return: """ name_tokens = string_utils.tokenize_string(ent['canonical_name'], self.tokenizer, self.STOP) stemmed_tokens = tuple([self.stemmer.stem(w) for w in name_tokens]) lemmatized_tokens = tuple( [self.lemmatizer.lemmatize(w) for w in name_tokens]) character_tokens = tuple( string_utils.get_character_n_grams(ent['canonical_name'], constants.NGRAM_SIZE)) alias_tokens = [ string_utils.tokenize_string(a, self.tokenizer, self.STOP) for a in ent['aliases'] ] def_tokens = string_utils.tokenize_string(ent['definition'], self.tokenizer, self.STOP) return [ name_tokens, stemmed_tokens, lemmatized_tokens, character_tokens, alias_tokens, def_tokens ]
def _compute_tokens(self, ent): """ Compute tokens from given entity :param ent: :return: """ name_string = string_utils.normalize_string(ent['canonical_name']) name_tokens = string_utils.tokenize_string(name_string, self.tokenizer, self.STOP) stemmed_tokens = tuple([self.stemmer.stem(w) for w in name_tokens]) lemmatized_tokens = tuple( [self.lemmatizer.lemmatize(w) for w in name_tokens]) character_tokens = tuple( string_utils.get_character_n_grams(name_string, constants.NGRAM_SIZE)) alias_tokens = [] for a in ent['aliases']: alias_tokens.append( string_utils.tokenize_string(string_utils.normalize_string(a), self.tokenizer, self.STOP)) parent_names = ent['par_relations'] child_names = ent['chd_relations'] return [ name_tokens, stemmed_tokens, lemmatized_tokens, character_tokens, alias_tokens, set(parent_names), set(child_names) ]
def _tokenize(self, s):
    """
    Tokenize string s
    :param s: input string
    :return: tokens of s
    """
    return string_utils.tokenize_string(s, self.tokenizer, self.STOP)
def _get_ent_names_from_relations(self, ent, kb, rel_types):
    """
    Fetch the set of entity names that are related to the given entity
    :param ent: entity whose relations are traversed
    :param kb: knowledge base containing the entities and relations
    :param rel_types: set of relation types to extract
    :return: list of normalized, tokenized names of related entities
    """
    matching_rels = [kb.relations[rel_id] for rel_id in ent.relation_ids]
    ent_ids = [
        rel.entity_ids[1] for rel in matching_rels
        if rel.relation_type in rel_types
        and rel.entity_ids[1] in kb.research_entity_id_to_entity_index
    ]
    ent_names = []
    for ent_id in ent_ids:
        # use a distinct name for the related entity to avoid shadowing `ent`
        rel_ent = kb.get_entity_by_research_entity_id(ent_id)
        if rel_ent:
            ent_names.append(
                tuple(
                    string_utils.tokenize_string(
                        string_utils.normalize_string(rel_ent.canonical_name),
                        self.tokenizer, self.STOP)))
    return ent_names
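# Usage sketch (hedged): pulling parent and child names for an entity via its
# KB relations. `matcher`, `ent`, and `kb` are assumed to exist elsewhere, and
# the relation type labels below are illustrative assumptions, not necessarily
# the labels used by the actual KB.
parent_names = matcher._get_ent_names_from_relations(ent, kb, rel_types={'PAR'})
child_names = matcher._get_ent_names_from_relations(ent, kb, rel_types={'CHD'})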
def _generate_token_map(self, ents: List[KBEntity]):
    """
    Generates token-to-entity and entity-to-token maps for an input list of
    KBEntity objects
    :param ents: list of KBEntity objects
    :return: token-to-entity dict and entity-to-token dict
    """
    # maps entity id key to word tokens in entity
    ent_to_tokens = dict()

    # maps token key to entities that have that token
    token_to_ents = defaultdict(set)

    for ent in ents:
        ent_id = ent.research_entity_id

        # tokenize all names and definitions
        name_tokens = []
        char_tokens = []
        for name in ent.aliases:
            name_tokens += string_utils.tokenize_string(
                name, self.tokenizer, self.STOP)
            char_tokens += [
                ''.join(c) for c in string_utils.get_character_n_grams(
                    string_utils.normalize_string(name), constants.NGRAM_SIZE)
            ]
        def_tokens = string_utils.tokenize_string(ent.definition,
                                                  self.tokenizer, self.STOP)

        # combine word, character n-gram, and definition tokens
        tokens = set(name_tokens).union(set(char_tokens)).union(set(def_tokens))

        # add to ent-to-token map
        ent_to_tokens[ent_id] = tokens

        # add to token-to-ent map
        for tok in tokens:
            token_to_ents[tok].add(ent_id)

        # index character n-grams for all aliases (already covered by the
        # union above, so this pass only re-adds existing keys)
        for ng in char_tokens:
            token_to_ents[ng].add(ent_id)

    return token_to_ents, ent_to_tokens
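# Usage sketch (hedged): one way the inverted index from _generate_token_map
# could drive candidate selection. `matcher`, `kb.entities`, and `query_ent`
# are assumptions for illustration only.
token_to_ents, ent_to_tokens = matcher._generate_token_map(kb.entities)

# entities sharing at least one word, n-gram, or definition token with the
# query entity become alignment candidates
candidates = set()
for tok in ent_to_tokens[query_ent.research_entity_id]:
    candidates |= token_to_ents[tok]
candidates.discard(query_ent.research_entity_id)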