@classmethod
def setUpClass(cls):
    cls._text = 'Texto para teste. Este texto contém 3 frases. 3a frase.'
    cls._st1, cls._st2, cls._st3 = utils.get_sentence_tokens(cls._text)
    cls._st1_words = utils.get_word_tokens(cls._st1._text)
    cls._st2_words = utils.get_word_tokens(cls._st2._text)
    cls._st3_words = utils.get_word_tokens(cls._st3._text)
    cls._created_file_name = "./resources/test.conll"
    cls._correct_file_name = "./resources/correct_test.conll"
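# The tests in this section assume a Token value object exposing `_text`
# plus inclusive `_init_index`/`_end_index` character offsets into the
# source text, and an optional label. A minimal sketch of that assumed
# shape (hypothetical; the real class lives elsewhere in the project):
class Token:
    def __init__(self, text, init_index, end_index, label=None):
        self._text = text
        self._init_index = init_index  # offset of the first character
        self._end_index = end_index    # offset of the last character (inclusive)
        self._label = label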
import numpy as np
import pandas as pd


def get_preds_df(preds, input_texts):
    # Flatten the 5 candidates per input into one row per (input, candidate).
    preds_df = pd.DataFrame({"pred": preds.values.ravel()})
    preds_df["input"] = np.array([[i] * 5 for i in input_texts]).ravel()
    preds_df["pred_len"] = preds_df["pred"].apply(
        lambda x: len(get_word_tokens(x)))
    preds_df["input_len"] = preds_df["input"].apply(
        lambda x: len(get_word_tokens(x)))
    # get_similarities returns a 1x1 similarity matrix; keep the scalar.
    preds_df["cosine_sim"] = preds_df.apply(
        lambda x: get_similarities(model, tokenizer, x["pred"], x["input"]),
        axis=1)
    preds_df["cosine_sim"] = preds_df["cosine_sim"].apply(lambda x: x[0][0])
    preds_df["rouge_l"] = preds_df.apply(
        lambda x: get_rougel(x["pred"], x["input"]), axis=1)
    return preds_df
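# Hypothetical usage sketch for get_preds_df, assuming `preds` holds 5
# generated candidates per input (one column per beam), so that ravel()
# lines the candidates up with each input repeated 5 times:
#
#   preds = pd.DataFrame(generations, columns=[f"beam_{i}" for i in range(5)])
#   preds_df = get_preds_df(preds, dev_df["input"].tolist())
#   preds_df[["input", "pred", "cosine_sim", "rouge_l"]].head()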
def test_fit_known_tokens_create_token_to_the_left(self):
    # Create a known token positioned inside the first word: 'Tex-to'.
    text = 'to'
    init = self._text.find(text)
    end = init + len(text) - 1  # inclusive end index
    known_token = Token(text, init, end, "teste")

    # Only the first sentence will be used.
    text = self._st1._text
    pipeline = NerCorpusPipeline(text, [known_token])
    pipeline.apply_processing_rules()
    word_tokens_after = pipeline.word_tokens

    # Ensure the word tokens have a valid structure: each span must
    # reproduce its surface form when sliced out of the original text.
    for token in word_tokens_after:
        self.assertEqual(self._text[token._init_index:token._end_index + 1],
                         token._text)
    # Ensure the known tokens have a valid structure as well.
    for token in pipeline.known_tokens:
        self.assertEqual(self._text[token._init_index:token._end_index + 1],
                         token._text)

    self.assertEqual(len(word_tokens_after), 4)
    self.assertEqual(
        set(['Tex', 'to', 'para', 'teste', '.']),
        set(t._text for t in utils.sort_tokens(pipeline.known_tokens +
                                               word_tokens_after)))
from easse.sari import corpus_sari
from sklearn.ensemble import RandomForestRegressor


def get_rf_from_dev(dev_df, preds_dev, max_depth=None, random_state=19):
    preds_df = preds_dev.copy()

    # One row per input, with the (possibly multiple) references as lists.
    dev_df_grouped = dev_df.groupby("input").agg({
        "output": list,
        "cosine_sim": list,
        "rouge_l": list,
        "input_len": max,
        "output_len": list,
    }).reset_index()

    # Repeat each reference set 5 times, once per candidate of that input.
    preds_df["ref"] = [
        ref for refs in dev_df_grouped["output"].apply(
            lambda x: [x] * 5).tolist() for ref in refs
    ]
    # corpus_sari expects refs_sents as a list of reference corpora.
    preds_df["ref"] = preds_df["ref"].apply(lambda x: [[i] for i in x])
    preds_df["pred_len"] = preds_df["pred"].apply(
        lambda x: len(get_word_tokens(x)))
    preds_df["input_len"] = preds_df["input"].apply(
        lambda x: len(get_word_tokens(x)))
    preds_df["sari"] = preds_df.apply(
        lambda x: corpus_sari(orig_sents=[x["input"]],
                              sys_sents=[x["pred"]],
                              refs_sents=x["ref"]),
        axis=1)

    # Fit a forest that predicts SARI from cheap, reference-free features.
    rf = RandomForestRegressor(n_estimators=1000,
                               max_depth=max_depth,
                               n_jobs=-1,
                               random_state=random_state)
    X_train = preds_df[["cosine_sim", "rouge_l", "input_len", "pred_len"]]
    y_train = preds_df["sari"]
    rf.fit(X_train, y_train)
    return rf, preds_df
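# Sketch of how the fitted forest could rerank candidates at inference
# time (hypothetical names; the feature columns must match X_train's
# order): predict SARI for every candidate, then keep the best-scoring
# candidate per input.
#
#   rf, dev_preds_df = get_rf_from_dev(dev_df, preds_dev)
#   features = preds_test[["cosine_sim", "rouge_l", "input_len", "pred_len"]]
#   preds_test["sari_hat"] = rf.predict(features)
#   best = preds_test.loc[preds_test.groupby("input")["sari_hat"].idxmax()]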
import pandas as pd
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("sberbank-ai/sbert_large_nlu_ru")
model = AutoModel.from_pretrained("sberbank-ai/sbert_large_nlu_ru")
model.to("cuda")

# Validation dataset.
dev_df = pd.read_csv(DEV_PATH, index_col=0)
dev_df.columns = ["input", "output"]
dev_df["cosine_sim"] = dev_df.apply(
    lambda x: get_similarities(model, tokenizer, x["input"], x["output"]),
    axis=1)
dev_df["cosine_sim"] = dev_df["cosine_sim"].apply(lambda x: x[0][0])
dev_df["rouge_l"] = dev_df.apply(
    lambda x: get_rougel(x["input"], x["output"]), axis=1)
dev_df["input_len"] = dev_df["input"].apply(
    lambda x: len(get_word_tokens(x)))
dev_df["output_len"] = dev_df["output"].apply(
    lambda x: len(get_word_tokens(x)))
dev_df.to_csv(OUTPUT_DIR / "dev_df_metrics.csv", index=False)

# Train dataset.
dfs = [
    pd.read_csv(path, usecols=["target_x", "target_y"])
    for path in WIKI_DIR.glob("*")
]
wiki_df = pd.concat(dfs).reset_index(drop=True)
wiki_df.columns = ["input", "output"]
wiki_df["cosine_sim"] = wiki_df.apply(
    lambda x: get_similarities(model, tokenizer, x["input"], x["output"]),
    axis=1)
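# The scripts above rely on a get_similarities helper that is not shown.
# A plausible sketch, assuming mean-pooled SBERT embeddings compared with
# sklearn's cosine_similarity, which would explain the `x[0][0]` indexing
# on a 1x1 similarity matrix; the real helper may differ:
import torch
from sklearn.metrics.pairwise import cosine_similarity


def get_similarities(model, tokenizer, text_a, text_b):
    encoded = tokenizer([text_a, text_b], padding=True, truncation=True,
                        return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model(**encoded)
    # Mean-pool token embeddings, ignoring padding positions.
    mask = encoded["attention_mask"].unsqueeze(-1)
    emb = (output.last_hidden_state * mask).sum(1) / mask.sum(1)
    emb = emb.cpu().numpy()
    return cosine_similarity(emb[:1], emb[1:])  # 1x1 matrix -> use [0][0]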
def __init__(self, text, known_tokens):
    self._text = text
    # TODO: Guarantee that all known tokens do not intersect
    self.known_tokens = known_tokens
    self.sentences_tokens = get_sentence_tokens(text)
    # Plain word tokens start with the 'O' (outside) CoNLL label.
    self.word_tokens = get_word_tokens(text, 'O')
def test_word_tokenizer(self):
    word_tokens = utils.get_word_tokens(self._text)
    for token in word_tokens:
        self.assertEqual(self._text[token._init_index:token._end_index + 1],
                         token._text)
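# A minimal sketch of the tokenizer contract this test checks: every
# returned Token must reproduce its surface form when its inclusive
# character span is sliced back out of the original text. Hypothetical
# regex-based implementation (the real utils.get_word_tokens may differ,
# e.g. it also accepts a default label argument, as used above):
import re


def get_word_tokens(text, label=None):
    tokens = []
    for match in re.finditer(r"\w+|[^\w\s]", text):
        init, end = match.start(), match.end() - 1  # inclusive end index
        tokens.append(Token(match.group(), init, end, label))
    return tokens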