def builtin_entity_match(tokens, token_index):
    """Tagging-scheme feature for builtin entities.

    Returns the scheme prefix (e.g. B-/I-/L-/U- depending on
    ``self.tagging_scheme``) when the token at *token_index* overlaps a
    parsed builtin entity of type ``builtin_entity``, or None otherwise.
    """
    text = initial_string_from_tokens(tokens)
    token_start = tokens[token_index].start
    token_end = tokens[token_index].end

    # Parse the whole utterance, restricted to the targeted builtin entity
    matches = self.builtin_entity_parser.parse(
        text, scope=[builtin_entity], use_cache=True)
    # Only keep builtin entities (of type `builtin_entity`) which
    # overlap with the current token
    matches = [m for m in matches
               if entity_filter(m, token_start, token_end)]
    if not matches:
        return None

    # In most cases, 0 or 1 entity will be found. We fall back to
    # the first entity if 2 or more were found
    best = matches[0]
    ent_start = best[RES_MATCH_RANGE][START]
    ent_end = best[RES_MATCH_RANGE][END]
    covered = [i for i, tok in enumerate(tokens)
               if ent_start <= tok.start < ent_end
               and ent_start < tok.end <= ent_end]
    return get_scheme_prefix(token_index, covered, self.tagging_scheme)
def entity_match(tokens, token_index):
    """Tagging-scheme feature for custom entities.

    Returns the scheme prefix when the token at *token_index* overlaps a
    parsed custom entity of type ``entity``, or None otherwise.
    """
    transformed_tokens = self._transform(tokens)
    text = initial_string_from_tokens(transformed_tokens)
    current = transformed_tokens[token_index]

    matches = self.custom_entity_parser.parse(
        text, scope=[entity], use_cache=True)
    # Only keep custom entities (of type `entity`) which overlap with
    # the current token
    matches = [m for m in matches
               if entity_filter(m, current.start, current.end)]
    if not matches:
        return None

    # In most cases, 0 or 1 entity will be found. We fall back to
    # the first entity if 2 or more were found
    best = matches[0]
    covered = [i for i, tok in enumerate(transformed_tokens)
               if entity_filter(best, tok.start, tok.end)]
    return get_scheme_prefix(token_index, covered, self.tagging_scheme)
def collection_match(tokens, token_index):
    """Tagging-scheme feature for collection (gazetteer) matches.

    Returns the scheme prefix for the longest ngram that contains the
    token at *token_index* and appears in ``collection_set``, or None
    when no such ngram exists.
    """
    normalized_tokens = list(map(self._transform, tokens))
    # Candidate ngrams containing the current token, longest first so
    # the widest collection match wins
    candidates = [ng for ng in get_all_ngrams(normalized_tokens)
                  if token_index in ng[TOKEN_INDEXES]]
    candidates.sort(key=lambda ng: len(ng[TOKEN_INDEXES]), reverse=True)
    for candidate in candidates:
        if candidate[NGRAM] in collection_set:
            return get_scheme_prefix(token_index,
                                     sorted(candidate[TOKEN_INDEXES]),
                                     self.tagging_scheme)
    return None
def entity_match(tokens, token_index):
    """Tagging-scheme feature for custom entities.

    Returns the scheme prefix computed from the first custom entity (of
    type ``entity``) overlapping the token at *token_index*, or None
    (implicitly) when no entity overlaps.
    """
    transformed_tokens = self._transform(tokens)
    text = initial_string_from_tokens(transformed_tokens)
    current = transformed_tokens[token_index]

    matches = custom_entity_parser.parse(
        text, scope=[entity], use_cache=True)
    # Only keep entities overlapping the current token
    matches = [m for m in matches
               if entity_filter(m, current.start, current.end)]
    # Only the first overlapping entity is ever used: the loop body
    # returns on its first iteration
    for match in matches:
        covered = [i for i, tok in enumerate(transformed_tokens)
                   if entity_filter(match, tok.start, tok.end)]
        return get_scheme_prefix(token_index, covered,
                                 self.tagging_scheme)
def builtin_entity_match(tokens, token_index):
    """Tagging-scheme feature for builtin entities.

    Returns the scheme prefix computed from the first builtin entity (of
    type ``builtin_entity``) overlapping the token at *token_index*, or
    None (implicitly) when no entity overlaps.
    """
    text = initial_string_from_tokens(tokens)
    token_start = tokens[token_index].start
    token_end = tokens[token_index].end

    matches = get_builtin_entities(
        text, self.language, scope=[builtin_entity])
    # Only keep entities overlapping the current token
    matches = [m for m in matches
               if entity_filter(m, token_start, token_end)]
    # Only the first overlapping entity is ever used: the loop body
    # returns on its first iteration
    for match in matches:
        ent_start = match[RES_MATCH_RANGE][START]
        ent_end = match[RES_MATCH_RANGE][END]
        covered = [i for i, tok in enumerate(tokens)
                   if ent_start <= tok.start < ent_end
                   and ent_start < tok.end <= ent_end]
        return get_scheme_prefix(token_index, covered,
                                 self.tagging_scheme)