def prepare_ngrams_in_text(text: str, n: int) \
        -> Generator[Tuple[List[int], str, int, int], None, None]:
    """Yield a context-window feature vector for every token in *text*.

    The text is tokenized with the module-level ``TOKENIZER``, aligned back
    to character offsets, and POS-tagged with NLTK.  For each token, the
    per-word feature vectors of the tokens in its context window are
    concatenated; window positions that fall outside the text are padded
    with ``address_features.ZERO_FEATURES`` so every yielded feature vector
    has the same length.

    :param text: source text to tokenize, tag and featurize.
    :param n: half-width of the context window.
    :return: generator of ``(features, word, start_offset, end_offset)``
        tuples, where ``text[start_offset:end_offset] == word``.
    """
    tokens = TOKENIZER.tokenize(text)
    token_spans = align_tokens(tokens, text)
    tagged_words = nltk.pos_tag(tokens)
    # First pass: compute each word's feature vector once, so the windowing
    # pass below can reuse them instead of recomputing per window position.
    words2 = []
    for (_, pos), span in zip(tagged_words, token_spans):
        # Take the surface form from the original text via the span, not
        # from the (possibly normalized) token string.
        word = text[span[0]:span[1]]
        features = address_features.get_word_features(word, pos)
        words2.append((word, span, pos, features))
    # Second pass: concatenate the window's feature vectors for each token.
    for i, (word, (word_start_pos, word_end_pos), _, _) in enumerate(words2):
        features = []
        # NOTE(review): offsets covered are -n .. n-2, which is asymmetric
        # around the current token — confirm this is intended (a symmetric
        # window would be range(i - n, i + n + 1)).  Not changed here, since
        # the padded vector length is part of the downstream model contract.
        for j in range(i - n, i + n - 1):
            if 0 <= j < len(words2):
                features.extend(words2[j][3])
            else:
                features.extend(address_features.ZERO_FEATURES)
        yield features, word, word_start_pos, word_end_pos
def prepare_ngrams_in_text(text: str, window_half_width: int, window_step: int) \
        -> Generator[Tuple[List[int], str, int, int], None, None]:
    """Yield a context-window feature vector for every ``window_step``-th token.

    Tokenization and POS information come from ``TOKENIZER.get_token_spans``.
    For each selected token, the per-word feature vectors of the tokens in
    its context window are concatenated; window positions outside the text
    are padded with ``address_features.ZERO_FEATURES`` so every yielded
    feature vector has the same length.

    :param text: source text to tokenize and featurize.
    :param window_half_width: half-width of the context window.
    :param window_step: stride between consecutive yielded tokens
        (must be >= 1).
    :return: generator of ``(features, word, start_offset, end_offset)``
        tuples, where ``text[start_offset:end_offset] == word``.
    """
    # First pass: compute each word's feature vector once.
    words2 = []
    for word, pos_token, word_start_pos, word_end_pos in TOKENIZER.get_token_spans(text):
        word_features = address_features.get_word_features(word, pos_token)
        # Our tokenizer returns an inclusive end offset; store the exclusive
        # form so that text[word_start_pos:word_end_pos] == word.
        words2.append((word, pos_token, word_start_pos, word_end_pos + 1, word_features))
    # Second pass: walk the tokens with the requested stride and concatenate
    # each selected token's window of feature vectors.
    for i in range(0, len(words2), window_step):
        word, _, word_start_pos, word_end_pos, _ = words2[i]
        features = []
        # NOTE(review): offsets covered are -w .. w-1, which is asymmetric
        # around the current token — confirm this is intended (a symmetric
        # window would be range(i - w, i + w + 1)).  Not changed here, since
        # the padded vector length is part of the downstream model contract.
        for j in range(i - window_half_width, i + window_half_width):
            if 0 <= j < len(words2):
                features.extend(words2[j][4])
            else:
                features.extend(address_features.ZERO_FEATURES)
        yield features, word, word_start_pos, word_end_pos