def filter_bad(row):
    original = row['original'].strip('\n')
    paraphrase = row['paraphrase'].strip('\n')

    def basic_processing(x):
        # Lowercase, normalize 'ё' and strip punctuation (including the em dash).
        return x.lower().replace('ё', 'е').translate(
            str.maketrans('', '', string.punctuation + '—'))

    # Drop pairs containing non-Cyrillic characters or ellipses.
    if any(not_cyrillic(text) or '...' in text for text in [original, paraphrase]):
        row['original'], row['paraphrase'] = None, None
        return row
    original = [_.text for _ in razdel.tokenize(basic_processing(original))]
    paraphrase = [_.text for _ in razdel.tokenize(basic_processing(paraphrase))]
    n_tokens = [len(original), len(paraphrase)]
    # Drop pairs that are too short or too long, share the same token set,
    # or consist of a single repeated token.
    if any(n_tok < 2 or n_tok > 20 for n_tok in n_tokens) \
            or set(original) == set(paraphrase) \
            or any(len(toks) > 1 and len(set(toks)) == 1 for toks in [original, paraphrase]):
        row['original'], row['paraphrase'] = None, None
    return row
def postprocess(ref, hyp, is_multiple_ref=False, detokenize_after=False, tokenize_after=True):
    if is_multiple_ref:
        reference_sents = ref.split(" s_s ")
        decoded_sents = hyp.split("s_s")
        # Escape angle brackets so ROUGE's XML input stays well-formed.
        hyp = [w.replace("<", "&lt;").replace(">", "&gt;").strip() for w in decoded_sents]
        ref = [w.replace("<", "&lt;").replace(">", "&gt;").strip() for w in reference_sents]
        hyp = " ".join(hyp)
        ref = " ".join(ref)
    ref = ref.strip()
    hyp = hyp.strip()
    if detokenize_after:
        hyp = punct_detokenize(hyp)
        ref = punct_detokenize(ref)
    if tokenize_after:
        hyp = hyp.replace("@@UNKNOWN@@", "<unk>")
        hyp = " ".join([token.text for token in razdel.tokenize(hyp)])
        ref = " ".join([token.text for token in razdel.tokenize(ref)])
    return ref, hyp
def calc_extraction_score(text, summary, threshold=2):
    text_tokens = [t.text for t in razdel.tokenize(text.lower())]
    summary_tokens = [t.text for t in razdel.tokenize(summary.lower())]
    # find_acs (defined elsewhere) is expected to return the lengths of common
    # token sequences of at least `threshold` tokens, longest first.
    acs = find_acs(text_tokens, summary_tokens, threshold)
    answer = 0.0
    for s in acs:
        s = s / len(summary_tokens)
        answer += s * (math.exp(s - 1) - (1 - s) / math.exp(1))
    return answer, acs[0] / len(summary_tokens) if acs else 0.0
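# A quick numeric check of the weighting term summed above (no assumptions
# about find_acs): a sequence covering the whole summary (s = 1) contributes
# exactly 1.0, and the contribution vanishes as s approaches 0.
import math

def lcs_weight(s):
    # Same term calc_extraction_score adds for each normalized length s.
    return s * (math.exp(s - 1) - (1 - s) / math.exp(1))

print(lcs_weight(1.0))            # 1.0
print(round(lcs_weight(0.5), 4))  # 0.2113
print(lcs_weight(0.0))            # 0.0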
def postprocess(refs, hyp, tokenize_after, lower):
    refs = [ref.strip() for ref in refs]
    hyp = hyp.strip()
    if tokenize_after:
        hyp = " ".join([token.text for token in razdel.tokenize(hyp)])
        refs = [" ".join([token.text for token in razdel.tokenize(ref)]) for ref in refs]
    if lower:
        hyp = hyp.lower()
        refs = [ref.lower() for ref in refs]
    return refs, hyp
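# A minimal check of the helper above (razdel installed; the texts are made up):
import razdel

refs, hyp = postprocess(["Первый текст."], " Второй текст. ",
                        tokenize_after=True, lower=True)
print(refs, hyp)  # ['первый текст .'] второй текст .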
def main(gold_path, predicted_path, metric, tokenize_after):
    refs = []
    hyps = []
    with open(gold_path, "r") as gold, open(predicted_path, "r") as pred:
        for gold_summary, pred_summary in zip(gold, pred):
            # Drop the first and last pieces (assumed BOS/EOS markers) and
            # restore spaces from the sentencepiece '▁' marker.
            gold_summary = "".join(gold_summary.split(" ")[1:-1]).replace("▁", " ").strip()
            pred_summary = "".join(pred_summary.split(" ")[1:-1]).replace("▁", " ").strip()
            if tokenize_after:
                pred_summary = " ".join([token.text for token in razdel.tokenize(pred_summary)])
                pred_summary = pred_summary.replace("@ @ UNKNOWN @ @", "@@UNKNOWN@@")
                gold_summary = " ".join([token.text for token in razdel.tokenize(gold_summary)])
            refs.append(gold_summary)
            hyps.append(pred_summary)
    print_metrics(refs, hyps, metric)
def prepare_dataset(df_train, df_val, df_test):
    train_labels = df_train["label"].tolist()
    val_labels = df_val["label"].tolist()
    test_labels = df_test["label"].tolist()
    # text -> list of tokens
    train_texts = [[token.text for token in tokenize(text)] for text in df_train["text"].tolist()]
    val_texts = [[token.text for token in tokenize(text)] for text in df_val["text"].tolist()]
    test_texts = [[token.text for token in tokenize(text)] for text in df_test["text"].tolist()]
    return (train_texts, train_labels), (val_texts, val_labels), (test_texts, test_labels)
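# A quick usage sketch (pandas installed; `tokenize` is razdel.tokenize;
# the one-row toy frame stands in for real train/val/test splits):
import pandas as pd
from razdel import tokenize

df = pd.DataFrame({"text": ["Привет, мир!"], "label": [1]})
(train_texts, train_labels), _, _ = prepare_dataset(df, df, df)
print(train_texts)  # [['Привет', ',', 'мир', '!']]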
def tokenize_roles(role_data):
    tokenized = []
    for set_ in role_data:
        data = {'roles1': [], 'roles2': [], 'y': []}
        for datum in set_:
            # label = int(datum['is_duplicate']) if int(datum['is_duplicate']) != -1 else 2
            data['roles1'].append([token.text for token in tokenize(datum['roles1'])])
            data['roles2'].append([token.text for token in tokenize(datum['roles2'])])
            # -1 -> 0, 0 -> 1, 1 -> 2
            data['y'].append(int(datum['is_duplicate']) + 1)
        tokenized.append(data)
    return tokenized
def get_texts(dataset):
    texts = []
    for text in dataset["text"]:
        for sentence in sentenize(text):
            texts.append([
                token.text.lower() for token in tokenize(sentence.text)
                if token.text not in punctuation
            ])
    for title in dataset["title"]:
        texts.append([
            token.text.lower() for token in tokenize(title)
            if token.text not in punctuation
        ])
    return texts
def stop_words_remove(text):
    '''
    Removes stop words from a text.
    Output: the text cleaned of stop words.
    '''
    # [0] coerce non-string input
    if not isinstance(text, str):
        text = str(text)
    # [1] tokenize
    tokens = list(tokenize(text))
    words = [_.text for _ in tokens]
    words_lem = []
    for w in words:
        if w[0] == '-':  # [2] strip a leading hyphen
            w = w[1:]
        if len(w) > 1:  # [3] skip one-character tokens
            if w in cache:  # [4] reuse a cached lemma
                words_lem.append(cache[w])
            else:  # [5] lemmatize and cache
                temp_cach = cache[w] = morph.parse(w)[0].normal_form
                words_lem.append(temp_cach)
    words_lem_without_stopwords = [i for i in words_lem if i not in stopword_ru]  # [6]
    # Join into a plain string: the docstring promises text, not a printed list.
    return " ".join(words_lem_without_stopwords)
def lemmatization(text):
    '''
    Lemmatizes a text:
    [0] - if the type != "str", make it "str"
    [1] - tokenizes the text
    [2] - strips a leading "-" from words
    [3] - removes stopwords
    [4] - removes words with len <= 1
    [5] - tries to find a word in the cache
    [6] - lemmatizes words
    Output: lemmatized text
    '''
    global cache
    # [0]
    if not isinstance(text, str):
        text = str(text)
    # [1]
    tokens = list(tokenize(text))
    words = [_.text for _ in tokens]
    words_lem = []
    for w in words:
        if w[0] == '-':  # [2]
            w = w[1:]
        if w not in stopword_ru:  # [3]
            if len(w) > 1:  # [4]
                if w in cache:  # [5]
                    words_lem.append(cache[w])
                else:  # [6]
                    temp_cach = cache[w] = morph.parse(w)[0].normal_form
                    words_lem.append(temp_cach)
    return words_lem
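# Assumed module-level setup for the two helpers above (a sketch; pymorphy2,
# razdel and NLTK's Russian stop list stand in for the original globals):
import pymorphy2
from razdel import tokenize
from nltk.corpus import stopwords

cache = {}
morph = pymorphy2.MorphAnalyzer()
stopword_ru = set(stopwords.words("russian"))

print(lemmatization("Мамы мыли рамы"))  # normal forms, e.g. ['мама', 'мыть', 'рама']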
def __call__(self, text: str):
    """Performs tokenization and sentence splitting.

    Args:
        text(str): text.

    Returns:
        Dictionary that contains:
        1. tokens - list of Token objects.
        2. sentences - list of Sentence objects.
    """
    ann_tokens = [
        ann.Token(text=token.text, begin=token.start, end=token.stop)
        for token in razdel.tokenize(text)
    ]
    sentences = [
        ProcessorRazdel.offset_to_tokens(offset.start, offset.stop, ann_tokens)
        for offset in razdel.sentenize(text)
    ]
    ann_sentences = [ann.Sentence(begin, end) for begin, end in sentences]
    return {'tokens': ann_tokens, 'sentences': ann_sentences}
def anaphoras_to_corpus(self, anaphoras: List) -> List[Dict]:
    '''Converts the discovered anaphoric links into a corpus.'''
    corpus: List[Dict] = []  # accumulate across all items, not per item
    for item in anaphoras:
        sequence = list(tokenize(item['text']))
        for i, s in enumerate(sequence):
            if s.start == item['antecedent']['start']:
                antecedent = {
                    'token': item['antecedent']['text'],
                    'lemma': item['antecedent']['text'],
                    'start': i,
                    'end': i,
                }
                break
        for i, s in enumerate(sequence):
            if s.start == item['anaphor']['start']:
                mentions = [{
                    'token': item['anaphor']['text'],
                    'lemma': item['anaphor']['text'],
                    'start': i,
                    'end': i,
                    'coref': antecedent['start'],
                }]
                break
        corpus.append({
            'text': item['text'],
            'sequence': [s.text for s in sequence],
            'coreferences': [{
                'antecedent': antecedent,
                'mentions': mentions,
            }],
        })
    return corpus
def generate_errors(text):
    tokens = list(tokenize(text))
    pos = 0
    while pos < len(tokens):
        token = tokens[pos]
        token_text = token.text
        if token_text.lower() in {'тоже', 'также'}:
            # 'тоже'/'также' -> 'то же'/'так же'
            yield text[:token.start] + token_text[:-2] + ' же' + text[token.stop:]
            pos += 1
        elif token_text.lower() == 'чтоб':
            yield text[:token.start] + token_text[:-1] + ' б' + text[token.stop:]
            pos += 1
        elif token_text.lower() == 'чтобы':
            yield text[:token.start] + token_text[:-2] + ' бы' + text[token.stop:]
            pos += 1
        elif pos + 1 < len(tokens) and token_text.lower() in {'то', 'так'} and tokens[pos + 1].text == 'же':
            # 'то же'/'так же' -> 'тоже'/'также'
            yield text[:token.start] + token_text + 'же' + text[tokens[pos + 1].stop:]
            pos += 2
        elif pos + 1 < len(tokens) and token_text.lower() == 'что' and tokens[pos + 1].text in {'б', 'бы'}:
            yield text[:token.start] + token_text + tokens[pos + 1].text + text[tokens[pos + 1].stop:]
            pos += 2
        else:
            pos += 1
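# A quick demo on a made-up sentence; each yielded string contains exactly one
# injected 'тоже/то же'-style error (razdel's tokenize must be in scope):
from razdel import tokenize

for bad in generate_errors("Мы тоже хотим, чтобы всё было так же."):
    print(bad)
# Мы то же хотим, чтобы всё было так же.
# Мы тоже хотим, что бы всё было так же.
# Мы тоже хотим, чтобы всё было также.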
def ru_tokenizer(text: str) -> list:
    """
    Tokenizes texts in Russian.

    Args:
        text (str): input text

    Returns:
        list of flair Token objects
    """
    all_sentences = []
    for paragraph in split_newline(text):
        sentences = [x.text for x in sentenize(paragraph)]
        all_sentences.extend(sentences)
    words = []
    for sentence in all_sentences:
        sentence_tokens = [x.text for x in tokenize(sentence)]
        words.extend(sentence_tokens)
    # Recover each word's offset in the original text.
    prev_start_position = 0
    tokens = []
    for word in words:
        start_position = text[prev_start_position:].index(word)
        token = Token(text=word,
                      start_position=prev_start_position + start_position,
                      whitespace_after=False)
        tokens.append(token)
        prev_start_position = start_position + prev_start_position + len(word)
    return tokens
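# A quick check (flair and razdel installed; split_newline is an assumed
# helper that splits the text on newline characters):
from flair.data import Token
from razdel import sentenize, tokenize

split_newline = lambda t: t.split("\n")  # assumed stand-in
tokens = ru_tokenizer("Привет, мир!\nКак дела?")
print([t.text for t in tokens])
# ['Привет', ',', 'мир', '!', 'Как', 'дела', '?']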
def _calc_embedding(self, text: List[str], return_numpy=False) -> Union[torch.Tensor, np.ndarray]:
    """
    Calculate sentence embeddings.

    Args:
        text (List[str]): list of sentences
        return_numpy (bool): if True return np.ndarray, else torch.Tensor

    Returns:
        sentence embeddings with shape (batch, bert hidden_size (default: 768))
    """
    sentences = [[token.text.lower() for token in razdel.tokenize(sent)] for sent in text]
    encoded = self.tokenizer.batch_encode_plus(
        sentences,
        padding="longest",
        is_split_into_words=True,
        truncation="longest_first",
        max_length=256,
    )
    input_ids = torch.tensor(encoded["input_ids"]).to("cuda")
    attention_mask = torch.tensor(encoded["attention_mask"]).to("cuda")
    with torch.no_grad():
        hidden_states = self.bert(input_ids, attention_mask)["last_hidden_state"]
    # Use the [CLS] vector as the sentence embedding.
    sentence_embedding = hidden_states[:, 0, :]
    if return_numpy:
        return sentence_embedding.cpu().numpy()
    return sentence_embedding
def predict(text: str) -> List[Dict[str, Union[List[Dict[str, Any]], dict]]]:
    tokens = [t.text for t in tokenize(text)]
    cache = {
        t: {"loanword": 0, "obscene": 0, "expressive": 0}
        for t in set(t.lower() for t in tokens)
    }
    for t in cache:
        if is_word(t, min_len=3, max_len=30, s_words=stops):
            cache[t]["emb"] = model[t]
            cache[t]["loanword"] = loanword_clf.predict([cache[t]["emb"]]).item()
            cache[t]["obscene"] = obscene_clf.predict([cache[t]["emb"]]).item()
            cache[t]["expressive"] = expressive_clf.predict([cache[t]["emb"]]).item()
    analysis = [{
        "word": t,
        "loanword": cache[t.lower()]["loanword"],
        "obscene": cache[t.lower()]["obscene"],
        "expressive": cache[t.lower()]["expressive"],
    } for t in tokens]
    a = [{
        "word": d["word"],
        "categories": [k for k, v in d.items() if v and k != "word"],
    } for d in analysis]
    return [{"analysis": a, "statistics": statistics(analysis)}]
def _fix_dictionary(original_sentences):
    # First pass: substring-level fixes, including capitalized variants.
    for i, sentence in enumerate(original_sentences):
        for key, value in substrings_fixes.items():
            if key in sentence or key.capitalize() in sentence:
                original_sentences[i] = sentence.replace(key, value)
                original_sentences[i] = original_sentences[i].replace(key.capitalize(), value.capitalize())
    tokenized_sentences = [(sentence, list(tokenize(sentence))) for sentence in original_sentences]
    # Second pass: token-level fixes, shifting offsets as token lengths change.
    fixed_sentences = []
    for sentence, tokens in tokenized_sentences:
        offset = 0
        for i, token in enumerate(tokens):
            tokens[i].start += offset
            tokens[i].stop += offset
            token_text = token.text
            fixed_token_text = tokens_fixes.get(token_text, None)
            if fixed_token_text is not None:
                tokens[i].text = fixed_token_text
                offset += len(fixed_token_text) - len(token_text)
        fixed_sentence = sentence
        for token in tokens:
            fixed_sentence = fixed_sentence[:token.start] + token.text + fixed_sentence[token.stop:]
        fixed_sentences.append(fixed_sentence)
    return fixed_sentences
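# A sketch with made-up fix tables; in the original code, substrings_fixes
# and tokens_fixes are module-level dictionaries:
from razdel import tokenize

substrings_fixes = {"щас": "сейчас"}
tokens_fixes = {"че": "что"}
print(_fix_dictionary(["а че щас делать?"]))
# ['а что сейчас делать?']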
def __TextTokenize(self, text):
    tokens = sent_tokenize(text)
    for sentence in range(len(tokens)):
        tokens[sentence] = list(tokenize(tokens[sentence]))
        tokens[sentence] = [_.text for _ in tokens[sentence]]
    return tokens
def find_jokes(query, index):
    result = Counter()
    for token in tokenize(query):
        lemma = lemmatize(token.text)
        docs = index.get(lemma, [])  # find documents with this word
        for doc in docs:
            # Downweight lemmas that occur in many documents (a crude IDF).
            result[doc] += 1 / max(1.0, math.log(len(docs) + 1))
    return result
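# A toy run, assuming `lemmatize` returns a word's normal form (e.g. via
# pymorphy2) and `index` maps lemma -> list of document ids; the dictionary
# stub below replaces the real lemmatizer:
import math
from collections import Counter
from razdel import tokenize

lemmatize = lambda w: {"коты": "кот", "собаки": "собака"}.get(w, w)  # stub
index = {"кот": [0, 2], "собака": [1]}
print(find_jokes("коты и собаки", index).most_common())
# [(1, 1.0), (0, 0.91...), (2, 0.91...)]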
def tokenize(string):
    # Keep letters, digits and a small set of punctuation; normalize quotes.
    string = re.sub(r"[^A-Za-zА-Яа-я0-9()\-,!?\'\`’:]", " ", string)
    string = re.sub(r"’|`", "'", string)
    # `znaks` (defined elsewhere) pads punctuation marks with spaces.
    string = re.sub(znaks, lambda x: f' {x.group()} ', string)
    string = re.sub(r"\s{2,}", " ", string)
    string = string.strip().lower()
    tokens = [token.text for token in razdel.tokenize(string)]
    return tokens
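# A quick check, assuming `znaks` matches the punctuation retained above
# (the pattern below is an assumed definition):
import re
import razdel

znaks = re.compile(r"[(),!?:]")  # assumed
print(tokenize("Привет, мир!"))
# ['привет', ',', 'мир', '!']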
def evaluate(test_path, batch_size, metric, max_count, report_every, is_multiple_ref=False,
             model_path=None, model_config_path=None, baseline=None, reader_config_path=None,
             detokenize_after=False, tokenize_after=False):
    reader_params = get_reader_params(reader_config_path, model_config_path, model_path)
    is_subwords = "tokenizer" in reader_params and reader_params["tokenizer"]["type"] == "subword"
    reader = DatasetReader.from_params(reader_params)
    run_model = get_model_runner(model_path, reader) if not baseline else None
    hyps = []
    refs = []
    for batch in get_batches(reader, test_path, batch_size):
        batch_refs, batch_hyps = run_model(batch) if not baseline else run_baseline(batch, baseline)
        for ref, hyp in zip(batch_refs, batch_hyps):
            hyp = hyp if not is_subwords else "".join(hyp.split(" ")).replace("▁", " ")
            if is_multiple_ref:
                reference_sents = ref.split(" s_s ")
                decoded_sents = hyp.split("s_s")
                # Escape angle brackets so ROUGE's XML input stays well-formed.
                hyp = [w.replace("<", "&lt;").replace(">", "&gt;").strip() for w in decoded_sents]
                ref = [w.replace("<", "&lt;").replace(">", "&gt;").strip() for w in reference_sents]
                hyp = " ".join(hyp)
                ref = " ".join(ref)
            ref = ref.strip()
            hyp = hyp.strip()
            if detokenize_after:
                hyp = detokenize(hyp)
                ref = detokenize(ref)
            if tokenize_after:
                hyp = " ".join([token.text for token in razdel.tokenize(hyp)])
                hyp = hyp.replace("@ @ UNKNOWN @ @", "@@UNKNOWN@@")
                ref = " ".join([token.text for token in razdel.tokenize(ref)])
            if isinstance(ref, str) and len(ref) <= 1:
                ref = "some content"
                print("Empty ref")
            if isinstance(hyp, str) and len(hyp) <= 1:
                hyp = "some content"
                print("Empty hyp. Ref: ", ref)
            refs.append(ref)
            hyps.append(hyp)
            if len(hyps) % report_every == 0:
                calc_metrics(refs, hyps, metric)
        if max_count and len(hyps) >= max_count:
            break
    calc_metrics(refs, hyps, metric)
def parse(refs_path, hyps_path, num_refs, lng='en'):
    logging.info('STARTING TO PARSE INPUTS...')
    print('STARTING TO PARSE INPUTS...')
    # references
    references = []
    for i in range(num_refs):
        fname = refs_path + str(i) if num_refs > 1 else refs_path
        with codecs.open(fname, 'r', 'utf-8') as f:
            texts = f.read().split('\n')
            for j, text in enumerate(texts):
                if len(references) <= j:
                    references.append([text])
                else:
                    references[j].append(text)
    # references tokenized
    references_tok = copy.copy(references)
    for i, refs in enumerate(references_tok):
        if lng == 'ru':
            references_tok[i] = [' '.join([_.text for _ in tokenize(ref)]) for ref in refs]
        else:
            references_tok[i] = [' '.join(nltk.word_tokenize(ref)) for ref in refs]
    # hypothesis
    with codecs.open(hyps_path, 'r', 'utf-8') as f:
        hypothesis = f.read().split('\n')
    # hypothesis tokenized
    hypothesis_tok = copy.copy(hypothesis)
    if lng == 'ru':
        hypothesis_tok = [' '.join([_.text for _ in tokenize(hyp)]) for hyp in hypothesis_tok]
    else:
        hypothesis_tok = [' '.join(nltk.word_tokenize(hyp)) for hyp in hypothesis_tok]
    logging.info('FINISHED PARSING INPUTS...')
    print('FINISHED PARSING INPUTS...')
    return references, references_tok, hypothesis, hypothesis_tok
def my_preprocess(text: str):
    text = str(text)
    text = text.replace("\n", " ").replace('/', ' ')
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokenized_text = list(tokenize(text))
    lemm = [morph.parse(i.text)[0].normal_form for i in tokenized_text]
    words = [i for i in lemm if i not in stop]
    return " ".join(words)
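# Assumed setup for the globals used above (a sketch; pymorphy2 and NLTK's
# Russian stop list stand in for the original `morph` and `stop`):
import string
import pymorphy2
from razdel import tokenize
from nltk.corpus import stopwords

morph = pymorphy2.MorphAnalyzer()
stop = set(stopwords.words("russian"))
print(my_preprocess("Мамы мыли рамы"))  # e.g. 'мама мыть рама'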
def preprocess_for_conds(self, text):
    '''Preprocesses a string for the rule conditions.'''
    text = text.lower()
    tokens = list(razdel.tokenize(text))
    # Keep only alphanumeric tokens and lemmatize them.
    return ' '.join([self.lemmatize(t.text) for t in tokens if regex.search(r'^[\w\d]*$', t.text)])
def new_model(self, arr: list):
    if arr == []:
        return 1
    with open('dict.json') as fil:
        self.dict0 = json.loads(fil.read())
    with open('dialog.txt') as fil:
        dialog = fil.read().split("\n")
    with open('text.txt', "w") as fil:
        fil.write(" ".join(dialog))
    # Even lines are prompts (a), odd lines are replies (b).
    a = []
    b = []
    for i in range(len(dialog)):
        if i % 2:
            b += [dialog[i]]
        else:
            a += [dialog[i]]
    with open('train.a', 'w') as fil:
        fil.write(json.dumps(a, ensure_ascii=False))
    with open('train.b', 'w') as fil:
        fil.write(json.dumps(b, ensure_ascii=False))
    with open('text.txt') as fil:
        text = fil.read()
    # Build the vocabulary from all razdel tokens of the lowercased dialog.
    self.dict0 = list(set([_.text for _ in tokenize(text.lower())]))
    with open('dict.json', 'w') as fil:
        fil.write(json.dumps(self.dict0, ensure_ascii=False))
    with open('train.a') as fil:
        a = json.loads(fil.read())
    with open('train.b') as fil:
        b = json.loads(fil.read())
    for i in range(len(a)):
        a[i] = self.text2dict1(a[i])
        b[i] = self.text2dict1(b[i])
    self.x = np.asarray(a)
    self.y = np.asarray(b)
    self.model = models.Sequential()
    self.model.add(layers.Dense(arr[0], input_dim=len(self.dict0), activation='tanh'))
    for i in arr[1:]:
        self.model.add(layers.Dense(i, activation='tanh'))
    self.model.add(layers.Dense(len(self.dict0), activation='tanh'))
    self.model.compile(optimizer=tf.train.AdamOptimizer(0.001),
                       loss='mse',
                       metrics=['mae'])
def generate_errors(text):
    for token in tokenize(text):
        token_text = token.text
        # Swap the '-тся' and '-ться' verb endings.
        if token_text.endswith('тся'):
            yield text[:token.start] + token_text[:-3] + 'ться' + text[token.stop:]
        if token_text.endswith('ться'):
            yield text[:token.start] + token_text[:-4] + 'тся' + text[token.stop:]
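# A quick demo on a made-up sentence, one injected ending error per yield
# (razdel's tokenize must be in scope):
from razdel import tokenize

for bad in generate_errors("Он боится ошибиться."):
    print(bad)
# Он боиться ошибиться.
# Он боится ошибится.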
def words(self, fileids=None, categories=None):
    """
    Tokenizes every sentence of each news item and yields
    the razdel tokens for each word across all news items.
    """
    for sentence in self.sents(fileids, categories):
        for word in sentence:
            if len(word) == 0:
                continue
            tokens = list(tokenize(word))
            yield [_.text for _ in tokens]
def transform_token_answer_words(self, data):
    question, answer = data.question, data.answer
    words = list(razdel.tokenize(answer))  # materialize: we index from the end
    answer_list = list(answer)
    # Walk right-to-left so earlier insertions don't shift later offsets.
    for j in range(len(words) - 1, -1, -1):
        word = words[j]
        text = word.text
        if text not in string.punctuation:
            # razdel substrings expose .start/.stop offsets.
            answer_list.insert(word.start, self.token_unk)
    return f'{question}{self.token_unk}{"".join(answer_list)}'
def save_syntax_analysis_by_text(self, text, file, is_many_sentences=False):
    f = open(file, 'a')
    sys.stdout = f  # redirect prints into the output file
    print('-' * 100)
    if text != 'None':
        if not is_many_sentences:
            chunk = list()
            for sent in sentenize(text):
                tokens = [_.text for _ in tokenize(sent.text)]
                chunk.append(tokens)
            markup = next(self.syntax.map(chunk))
            words, deps = list(), list()
            for token in markup.tokens:
                words.append(token.text)
                source = int(token.head_id) - 1
                target = int(token.id) - 1
                if source > 0 and source != target:
                    deps.append([source, target, token.rel])
            show_markup(words, deps)
        else:
            for sentence in text.split('.'):
                if len(sentence.split()) > 5:
                    chunk = list()
                    for sent in sentenize(sentence):
                        tokens = [_.text for _ in tokenize(sent.text)]
                        chunk.append(tokens)
                    markup = next(self.syntax.map(chunk))
                    words, deps = list(), list()
                    for token in markup.tokens:
                        words.append(token.text)
                        source = int(token.head_id) - 1
                        target = int(token.id) - 1
                        if source > 0 and source != target:
                            deps.append([source, target, token.rel])
                    show_markup(words, deps)
    else:
        print('None')
    print('-' * 100)
    sys.stdout = sys.__stdout__  # restore stdout and release the file
    f.close()
def __TextTokenize(self, text):
    punct = string.punctuation
    punct += '—–...«»***\n '
    tokens = sent_tokenize(text)
    for sentence in range(len(tokens)):
        tokens[sentence] = list(tokenize(tokens[sentence].lower()))
        tokens[sentence] = [
            _.text for _ in tokens[sentence] if _.text not in punct
        ]
    return tokens