def _fix_merge_to(fixed_sentences, border=0.6,
                  model_path="models/to_merge_predictor.bin",
                  bpe_model_path="models/opencorpora_bpe.model"):
    """Merge erroneously split words in place ("что бы" -> "чтобы", etc.).

    A sentence is touched only when it contains exactly one candidate
    phrase and the fastText classifier votes label 0 with confidence
    at or above ``border``.
    """
    predictor = fasttext.load_model(model_path)
    bpe = sp_processor()
    bpe.load(bpe_model_path)
    # Split phrase -> merged replacement.
    merges = {"что бы": "чтобы", "то же": "тоже", "так же": "также"}
    for idx, sent in enumerate(fixed_sentences):
        counts = {phrase: sent.count(phrase) for phrase in merges}
        # Only act on unambiguous sentences: exactly one candidate occurrence.
        if sum(counts.values()) != 1:
            continue
        encoded = " ".join(bpe.EncodeAsPieces(sent.lower()))
        prediction = predictor.predict(encoded)
        confidence = float(prediction[1][0])
        # Label is the last character of the "__label__N" tag.
        label = int(prediction[0][0][-1])
        # Proceed only when the model did not vote label 1 and is confident.
        if label != 1 and confidence >= border:
            for phrase, merged in merges.items():
                if counts[phrase] == 1:
                    fixed_sentences[idx] = sent.replace(phrase, merged)
                    break
def _fix_pronouns(fixed_sentences, nn_border=0.65,
                  nn_model_path="models/pronoun_model.bin",
                  bpe_model_path="models/opencorpora_bpe.model"):
    """Swap confusable pronoun forms listed in ``_HARD_PRONOUNS`` in place.

    For each (from_text, to_text) pair, whichever form is present is
    replaced by its counterpart, but only when the fastText model votes
    label 0 with confidence strictly above ``nn_border``.
    """
    predictor = fasttext.load_model(nn_model_path)
    bpe = sp_processor()
    bpe.load(bpe_model_path)
    for idx, sent in enumerate(fixed_sentences):
        for wrong, right in _HARD_PRONOUNS:
            variants = (wrong, right, wrong.capitalize(), right.capitalize())
            # Skip pairs that do not occur in the sentence in any casing.
            if not any(v in sent for v in variants):
                continue
            encoded = " ".join(bpe.EncodeAsPieces(sent.lower()))
            prediction = predictor.predict(encoded)
            confidence = float(prediction[1][0])
            label = int(prediction[0][0][-1])
            # Only rewrite when the classifier is confident about label 0.
            if label != 0 or confidence <= nn_border:
                continue
            if wrong in sent:
                fixed_sentences[idx] = sent.replace(wrong, right)
            elif wrong.capitalize() in sent:
                fixed_sentences[idx] = sent.replace(wrong.capitalize(), right)
            elif right in sent:
                fixed_sentences[idx] = sent.replace(right, wrong)
            else:
                fixed_sentences[idx] = sent.replace(right.capitalize(), wrong)
def __init__(self, model_path: str = None, nbest_size: int = None, alpha: float = None):
    """Load a sentencepiece model and store sampling parameters.

    Args:
        model_path: Path to a trained sentencepiece model file.
        nbest_size: N-best size for subword sampling; semantics depend on
            the sentencepiece encode call — TODO confirm against caller.
        alpha: Smoothing parameter for subword sampling — presumably the
            sentencepiece sampling temperature; verify against usage.
    """
    self._model_path = model_path
    self._processor = sp_processor()
    # NOTE(review): Load() will fail if model_path is None — callers are
    # expected to always pass a valid path despite the None default.
    self._processor.Load(model_path)
    self._nbest_size = nbest_size
    self._alpha = alpha
def train(input_file, opencorpora_file, train_subwords=False, model_path="subword_model"):
    """Build train/val files (fastText format) for the "нн" classifier.

    Collects sentences with exactly one "нн" from the CSV ``input_file``
    (label 0) and lines containing "нн" from ``opencorpora_file``
    (label 1), shuffles them, splits 80/20, BPE-tokenizes both splits and
    writes them out via ``to_ft_format``.

    Args:
        input_file: CSV file with a header row; text is in column 3 of 4.
        opencorpora_file: Plain-text file, one sentence per line.
        train_subwords: When True, train the BPE model before tokenizing.
            Defaults to False (the original code had this step disabled
            behind a dead ``if False:`` toggle), so an existing
            ``<model_path>.model`` file is required.
        model_path: Prefix for the sentencepiece model files.
    """
    records = []
    with open(input_file, "r") as r:
        next(r)  # skip the CSV header row
        reader = csv.reader(r)
        for row in reader:
            _, _, text, _ = row
            text = text.replace("\n", " ").lower()
            # Only unambiguous sentences: exactly one "нн" occurrence.
            if text.count("нн") == 1:
                records.append((text, 0))
    with open(opencorpora_file, "r") as r:
        for line in r:
            text = line.strip().lower()
            if "нн" in text:
                records.append((text, 1))
    random.shuffle(records)
    border = int(0.8 * len(records))
    train_records = records[:border]
    val_records = records[border:]
    if train_subwords:
        # Train a BPE model on the training texts; sentencepiece reads
        # from a file, so dump the texts to a temp file first.
        temp = tempfile.NamedTemporaryFile(mode="w", delete=False)
        for text, _ in train_records:
            temp.write(text + "\n")
        temp.close()
        cmd = "--input={} --model_prefix={} --vocab_size={} --model_type={}".format(
            temp.name, model_path, 30000, "bpe")
        sp_trainer.Train(cmd)
        os.unlink(temp.name)
    processor = sp_processor()
    processor.load(model_path + ".model")
    fixed_train = [(" ".join(tokenize(processor, text)), label)
                   for text, label in train_records]
    fixed_val = [(" ".join(tokenize(processor, text)), label)
                 for text, label in val_records]
    to_ft_format(fixed_train, "nn_train.txt")
    to_ft_format(fixed_val, "nn_val.txt")
def _fix_nn(fixed_sentences, nn_border=0.6,
            nn_model_path="models/nn_predictor.bin",
            bpe_model_path="models/opencorpora_bpe.model"):
    """Replace a doubled "нн" with a single "н" in place where warranted.

    A sentence is touched only when it contains exactly one "нн" and the
    fastText classifier votes label 0 with confidence strictly above
    ``nn_border``.

    Fix: dropped the redundant ``nn_count == 1`` re-check in the final
    condition — it is already guaranteed by the ``continue`` guard above.
    """
    nn_predictor = fasttext.load_model(nn_model_path)
    bpe_model = sp_processor()
    bpe_model.load(bpe_model_path)
    for i, sentence in enumerate(fixed_sentences):
        # Only unambiguous sentences: exactly one "нн" occurrence.
        if sentence.count("нн") != 1:
            continue
        processed_sentence = " ".join(bpe_model.EncodeAsPieces(sentence.lower()))
        nn_predictions = nn_predictor.predict(processed_sentence)
        nn_proba = float(nn_predictions[1][0])
        # Label is the last character of the "__label__N" tag.
        nn_label = int(nn_predictions[0][0][-1])
        if nn_label == 0 and nn_proba > nn_border:
            fixed_sentences[i] = sentence.replace("нн", "н")
def train(input_file, train_subwords=True, model_path="subword_model"):
    """Build train/val files (fastText format) for the -тся/-ться classifier.

    Reads labeled rows from the CSV ``input_file``, keeps only sentences
    containing exactly one of the two endings (never both), shuffles,
    splits 80/20, BPE-tokenizes both splits and writes them out via
    ``to_ft_format``.

    Args:
        input_file: CSV file with a header row; columns are (_, text, label).
        train_subwords: When True (default, matching the original
            always-on ``if True:`` toggle), train the BPE model on the
            training texts before tokenizing.
        model_path: Prefix for the sentencepiece model files.
    """
    records = []
    with open(input_file, "r") as r:
        next(r)  # skip the CSV header row
        reader = csv.reader(r)
        for row in reader:
            _, text, label = row
            text = text.replace("\n", " ").lower()
            tjsya_count = text.count("ться")
            tsya_count = text.count("тся")
            # Keep sentences with one kind of ending but not the other.
            if (tjsya_count != 0 and tsya_count == 0) or (tjsya_count == 0 and tsya_count != 0):
                records.append((text, label))
    random.shuffle(records)
    border = int(0.8 * len(records))
    train_records = records[:border]
    val_records = records[border:]
    if train_subwords:
        # sentencepiece trains from a file, so dump the texts to a temp file.
        temp = tempfile.NamedTemporaryFile(mode="w", delete=False)
        for text, _ in train_records:
            temp.write(text + "\n")
        temp.close()
        cmd = "--input={} --model_prefix={} --vocab_size={} --model_type={}".format(
            temp.name, model_path, 30000, "bpe")
        sp_trainer.Train(cmd)
        os.unlink(temp.name)
    processor = sp_processor()
    processor.load(model_path + ".model")
    fixed_train = [(" ".join(tokenize(processor, text)), label)
                   for text, label in train_records]
    fixed_val = [(" ".join(tokenize(processor, text)), label)
                 for text, label in val_records]
    to_ft_format(fixed_train, "grammar_endings_train.txt")
    to_ft_format(fixed_val, "grammar_endings_val.txt")
def _fix_tsya(fixed_sentences, tsya_border=0.55,
              tsya_model_path="models/tsya_predictor.bin",
              bpe_model_path="models/grammar_bpe.model"):
    """Toggle the -тся/-ться verb ending in place where warranted.

    A sentence is touched only when it contains exactly one occurrence of
    either ending and the fastText classifier votes label 0 with
    confidence strictly above ``tsya_border``. Whichever ending is present
    is replaced by the other.
    """
    predictor = fasttext.load_model(tsya_model_path)
    bpe = sp_processor()
    bpe.load(bpe_model_path)
    for idx, sent in enumerate(fixed_sentences):
        plain_count = sent.count("тся")
        soft_count = sent.count("ться")
        # Only unambiguous sentences: exactly one ending occurrence overall.
        if plain_count + soft_count != 1:
            continue
        encoded = " ".join(bpe.EncodeAsPieces(sent.lower()))
        prediction = predictor.predict(encoded)
        confidence = float(prediction[1][0])
        # Label is the last character of the "__label__N" tag.
        label = int(prediction[0][0][-1])
        if label != 0 or confidence <= tsya_border:
            continue
        # Under the sum == 1 guard, exactly one of the counts is 1.
        if plain_count == 1 and soft_count == 0:
            fixed_sentences[idx] = sent.replace("тся", "ться")
        elif soft_count == 1 and plain_count == 0:
            fixed_sentences[idx] = sent.replace("ться", "тся")
def __init__(self, model_path: str = None):
    """Load a sentencepiece model from ``model_path``.

    Args:
        model_path: Path to a trained sentencepiece model file.
    """
    self._model_path = model_path
    self._processor = sp_processor()
    # NOTE(review): Load() will fail if model_path is None — callers are
    # expected to always pass a valid path despite the None default.
    self._processor.Load(model_path)