def __call__(self, text):
    # Tokenize and lemmatize the input, then tag it with the NER network.
    tokens = list(self.tokenizer(text))
    tokens_lemmas = lemmatize([t.text for t in tokens], self._morph)
    tags = self.network.predict_for_token_batch([tokens_lemmas])[0]
    previous_tag = null_tag = 'O'
    previous_tokens = []
    # A trailing (None, 'O') sentinel flushes the last open entity span.
    for token, current_tag in zip(
            itertools.chain(tokens, [None]),
            itertools.chain(tags, [null_tag])):
        if current_tag.startswith('I'):
            # Continuation of the current entity.
            previous_tokens.append(token)
        elif previous_tag != null_tag:
            # The entity ended on the previous token: emit a match.
            yield Match(
                previous_tokens,
                Span(
                    previous_tokens[0].span[0],
                    previous_tokens[-1].span[1],
                ),
                previous_tag[-3:],
            )
        if current_tag.startswith('B'):
            # Start of a new entity.
            previous_tokens = [token]
        previous_tag = current_tag
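# A minimal, self-contained sketch of the same BIO-decoding idea with the
# sentinel trick, assuming hypothetical token/tag inputs (plain strings
# instead of the Match/Span objects above; entity types stripped with
# [2:] rather than [-3:]):
import itertools

def decode_bio(tokens, tags, null_tag='O'):
    previous_tag = null_tag
    span = []
    for token, tag in zip(itertools.chain(tokens, [None]),
                          itertools.chain(tags, [null_tag])):
        if tag.startswith('I'):
            span.append(token)
        elif previous_tag != null_tag:
            yield previous_tag[2:], span
        if tag.startswith('B'):
            span = [token]
        previous_tag = tag

print(list(decode_bio(['John', 'lives', 'in', 'New', 'York'],
                      ['B-PER', 'O', 'O', 'B-LOC', 'I-LOC'])))
# [('PER', ['John']), ('LOC', ['New', 'York'])]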
def print_predict(sentence):
    # Split sentence into tokens
    tokens = tokenize(sentence)
    # Lemmatize every token
    # Example: был -> быть ("was" -> "to be"), его -> он ("him" -> "he")
    tokens_lemmas = lemmatize(tokens)
    tags = network.predict_for_token_batch([tokens_lemmas])[0]
    for token, tag in zip(tokens, tags):
        print(token, tag)
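# Hypothetical usage, assuming a trained `network` and the tokenize/lemmatize
# helpers are in scope; the sentence and the predicted tags are illustrative.
print_predict('Александр Пушкин родился в Москве')
# Александр B-PER
# Пушкин    I-PER
# родился   O
# в         O
# Москве    B-LOC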
def print_predict(sentence, network, f=sys.stdout, threshold=0.2):
    # Split sentence into tokens
    tokens = tokenize(sentence)
    # Lemmatize every token
    tokens_lemmas = lemmatize(tokens)
    tags, logits = network.predict_for_token_batch([tokens_lemmas])
    tags, logits = tags[0], logits[0]
    o_idx = network.corpus.tag_dict.toks2idxs(['O'])  # unused below
    predicted_tags = []
    last_number = None
    last_tk = None
    for token, tag, l in zip(tokens, tags, logits):
        # Remember the most recent numeric token for slots such as
        # 'nights' and 'people' below.
        if is_number(token.lower()):
            last_number = normalize_num(token)
        second_best = np.argsort(l)[-2]
        third_best = np.argsort(l)[-3]
        if tag == 'O':
            # If 'O' wins but the runner-up clearly dominates the
            # third-best score, adopt the runner-up tag instead.
            ratio = l[second_best] / l[third_best]
            if (ratio * l[second_best] > threshold
                    and token not in '.?,\':!'
                    and token not in stopwords.words('english')):
                tag = network.corpus.tag_dict.idxs2toks([second_best])[0]
        elif tag.startswith('B') and (token in stopwords.words('english')
                                      or token in '.?,\':!'):
            # Never start an entity on a stopword or punctuation.
            tag = 'O'
        # print(token, tag, file=f)
        if (tag.startswith('B') and token not in stopwords.words('english')
                and token not in '.?,\':!'):
            # Map tags to slot names, attaching the last seen number
            # where the slot expects one.
            if 'calendric' in tag and 'night' in token and last_number is not None:
                predicted_tags.append(('nights', last_number))
                last_number = None
            elif 'people' in tag and last_number is not None:
                predicted_tags.append(('people', last_number))
                last_number = None
            elif 'performers' in tag and last_number is not None:
                predicted_tags.append(('stars', last_number))
                last_number = None
            elif 'gpe' in tag:
                # Disambiguate origin vs. destination by the preceding token.
                if last_tk == 'to':
                    predicted_tags.append(('toloc.city_name', token))
                else:
                    predicted_tags.append(('fromloc.city_name', token))
            else:
                predicted_tags.append((tag[2:], token))
        last_tk = token
    return predicted_tags
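# Toy illustration of the second-best override above: when 'O' wins but
# the runner-up score dwarfs the third-best one, the runner-up tag is
# adopted. The scores and tag vocabulary here are invented.
import numpy as np

tag_vocab = ['O', 'B-fromloc.city_name', 'B-toloc.city_name']
l = np.array([0.50, 0.45, 0.05])        # per-tag scores for one token
second_best = np.argsort(l)[-2]         # -> 1
third_best = np.argsort(l)[-3]          # -> 2
ratio = l[second_best] / l[third_best]  # 0.45 / 0.05 = 9.0
if ratio * l[second_best] > 0.2:        # 4.05 > 0.2, so override 'O'
    print(tag_vocab[second_best])       # B-fromloc.city_name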
def _preprocess_task(self, task):
    # tokens = tokenize(task)
    tokens = task.split(' ')
    tokens_lemmas = lemmatize(tokens)
    return tokens_lemmas
def _preprocess_humaninput(self, task):
    tokens = nltk.tokenize.wordpunct_tokenize(task)
    tokens_lemmas = lemmatize(tokens)
    return tokens_lemmas
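# The two preprocessors differ only in tokenization: _preprocess_task
# assumes the task string is already space-delimited, while
# _preprocess_humaninput splits punctuation off free-form input.
# The sample text below is an invented illustration.
import nltk

text = 'flights from Boston to Denver, please'
print(text.split(' '))
# ['flights', 'from', 'Boston', 'to', 'Denver,', 'please']
print(nltk.tokenize.wordpunct_tokenize(text))
# ['flights', 'from', 'Boston', 'to', 'Denver', ',', 'please']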