Example #1
# sp_processor is assumed to be sentencepiece.SentencePieceProcessor.
import fasttext
from sentencepiece import SentencePieceProcessor as sp_processor


def _fix_merge_to(fixed_sentences,
                  border=0.6,
                  model_path="models/to_merge_predictor.bin",
                  bpe_model_path="models/opencorpora_bpe.model"):
    predictor = fasttext.load_model(model_path)
    bpe_model = sp_processor()
    bpe_model.load(bpe_model_path)

    for i, sentence in enumerate(fixed_sentences):
        # Only touch sentences with exactly one candidate span.
        chto_bi_count = sentence.count("что бы")
        to_je_count = sentence.count("то же")
        tak_je_count = sentence.count("так же")
        if chto_bi_count + to_je_count + tak_je_count != 1:
            continue
        processed_sentence = " ".join(bpe_model.EncodeAsPieces(sentence.lower()))
        # fastText's predict returns (labels, probabilities), e.g.
        # (("__label__1",), array([0.9])); the class id is the last
        # character of the label string.
        predictions = predictor.predict(processed_sentence)
        proba = float(predictions[1][0])
        label = int(predictions[0][0][-1])
        # Label 1 means "leave as is"; also skip low-confidence predictions.
        if label == 1 or proba < border:
            continue
        if chto_bi_count == 1:
            fixed_sentences[i] = sentence.replace("что бы", "чтобы")
        elif to_je_count == 1:
            fixed_sentences[i] = sentence.replace("то же", "тоже")
        elif tak_je_count == 1:
            fixed_sentences[i] = sentence.replace("так же", "также")
Example #2
# Imports as in Example #1 (fasttext, sentencepiece).
def _fix_pronouns(fixed_sentences,
                  nn_border=0.65,
                  nn_model_path="models/pronoun_model.bin",
                  bpe_model_path="models/opencorpora_bpe.model"):
    nn_predictor = fasttext.load_model(nn_model_path)
    bpe_model = sp_processor()
    bpe_model.load(bpe_model_path)

    for i, sentence in enumerate(fixed_sentences):
        for from_text, to_text in _HARD_PRONOUNS:
            if (from_text in sentence or to_text in sentence
                    or from_text.capitalize() in sentence
                    or to_text.capitalize() in sentence):
                processed_sentence = " ".join(bpe_model.EncodeAsPieces(sentence.lower()))
                nn_predictions = nn_predictor.predict(processed_sentence)
                nn_proba = float(nn_predictions[1][0])
                nn_label = int(nn_predictions[0][0][-1])
                # Label 0 with high confidence: the pronoun in this
                # sentence is misused, so swap whichever variant occurs.
                if nn_label == 0 and nn_proba > nn_border:
                    if from_text in sentence:
                        fixed_sentences[i] = sentence.replace(from_text, to_text)
                    elif from_text.capitalize() in sentence:
                        # Keep the replacement capitalized as well.
                        fixed_sentences[i] = sentence.replace(from_text.capitalize(),
                                                              to_text.capitalize())
                    elif to_text in sentence:
                        fixed_sentences[i] = sentence.replace(to_text, from_text)
                    else:
                        fixed_sentences[i] = sentence.replace(to_text.capitalize(),
                                                              from_text.capitalize())
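_HARD_PRONOUNS is defined elsewhere in the repository; a plausible shape, shown purely for illustration, is a list of (colloquial form, normative form) pairs:

# Hypothetical entries; the real list lives elsewhere in the repo.
_HARD_PRONOUNS = [
    ("ихний", "их"),
    ("евоный", "его"),
]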
Example #3
# Constructor fragment of a SentencePiece-based tokenizer class;
# needs `from typing import Optional` at module level.
def __init__(self,
             model_path: Optional[str] = None,
             nbest_size: Optional[int] = None,
             alpha: Optional[float] = None):
    self._model_path = model_path
    self._processor = sp_processor()
    self._processor.Load(model_path)
    self._nbest_size = nbest_size
    self._alpha = alpha
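nbest_size and alpha are the standard SentencePiece subword-regularization parameters; a sketch of how a tokenize method of this class might use them (the method itself is an assumption; SampleEncodeAsPieces and EncodeAsPieces are real SentencePiece calls):

def tokenize(self, text: str):
    # Hypothetical method: sample a segmentation when the
    # regularization parameters are set, otherwise be deterministic.
    if self._nbest_size is not None and self._alpha is not None:
        return self._processor.SampleEncodeAsPieces(text, self._nbest_size, self._alpha)
    return self._processor.EncodeAsPieces(text)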
Example #4
import csv
import os
import random
import tempfile

# sp_processor / sp_trainer are assumed to be SentencePieceProcessor /
# SentencePieceTrainer; tokenize and to_ft_format are repo helpers.
from sentencepiece import SentencePieceProcessor as sp_processor
from sentencepiece import SentencePieceTrainer as sp_trainer


def train(input_file, opencorpora_file, train_bpe=False):
    records = []
    # Class 0: CSV sentences containing exactly one "нн".
    with open(input_file, "r") as r:
        next(r)  # skip the header line
        reader = csv.reader(r)
        for row in reader:
            _, _, text, _ = row
            text = text.replace("\n", " ").lower()
            if text.count("нн") == 1:
                records.append((text, 0))
    # Class 1: OpenCorpora sentences containing "нн".
    with open(opencorpora_file, "r") as r:
        for line in r:
            text = line.strip().lower()
            if "нн" in text:
                records.append((text, 1))
    random.shuffle(records)
    border = int(0.8 * len(records))
    train_records = records[:border]
    val_records = records[border:]

    model_path = "subword_model"
    # Optionally retrain the BPE model on the training texts; by default
    # the existing model on disk is reused.
    if train_bpe:
        temp = tempfile.NamedTemporaryFile(mode="w", delete=False)
        for text, _ in train_records:
            temp.write(text + "\n")
        temp.close()
        cmd = "--input={} --model_prefix={} --vocab_size={} --model_type={}".format(
            temp.name, model_path, 30000, "bpe")
        sp_trainer.Train(cmd)
        os.unlink(temp.name)

    processor = sp_processor()
    processor.load(model_path + ".model")
    fixed_train = [(" ".join(tokenize(processor, text)), label)
                   for text, label in train_records]
    fixed_val = [(" ".join(tokenize(processor, text)), label)
                 for text, label in val_records]

    to_ft_format(fixed_train, "nn_train.txt")
    to_ft_format(fixed_val, "nn_val.txt")
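to_ft_format and tokenize are helpers defined elsewhere in the repository. fastText's supervised trainer expects one "__label__<y> <text>" line per example, so a plausible sketch of the writer is:

def to_ft_format(records, path):
    # Hypothetical reimplementation of the repo helper: write each
    # (text, label) pair in fastText's supervised format.
    with open(path, "w") as w:
        for text, label in records:
            w.write("__label__{} {}\n".format(label, text))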
Example #5
# Imports as in Example #1 (fasttext, sentencepiece).
def _fix_nn(fixed_sentences,
            nn_border=0.6,
            nn_model_path="models/nn_predictor.bin",
            bpe_model_path="models/opencorpora_bpe.model"):
    nn_predictor = fasttext.load_model(nn_model_path)
    bpe_model = sp_processor()
    bpe_model.load(bpe_model_path)

    for i, sentence in enumerate(fixed_sentences):
        # Only touch sentences with exactly one "нн".
        if sentence.count("нн") != 1:
            continue
        processed_sentence = " ".join(bpe_model.EncodeAsPieces(sentence.lower()))
        nn_predictions = nn_predictor.predict(processed_sentence)
        nn_proba = float(nn_predictions[1][0])
        nn_label = int(nn_predictions[0][0][-1])
        # Label 0 with high confidence: collapse the double "н".
        if nn_label == 0 and nn_proba > nn_border:
            fixed_sentences[i] = sentence.replace("нн", "н")
Example #6
# Imports as in Example #4 (csv, os, random, tempfile, sentencepiece).
def train(input_file, train_bpe=True):
    records = []
    with open(input_file, "r") as r:
        next(r)  # skip the header line
        reader = csv.reader(r)
        for row in reader:
            _, text, label = row
            text = text.replace("\n", " ").lower()
            tjsya_count = text.count("ться")
            tsya_count = text.count("тся")
            # Keep sentences that contain one ending but not the other.
            if (tjsya_count > 0) != (tsya_count > 0):
                records.append((text, label))
    random.shuffle(records)
    border = int(0.8 * len(records))
    train_records = records[:border]
    val_records = records[border:]

    model_path = "subword_model"
    if train_bpe:
        temp = tempfile.NamedTemporaryFile(mode="w", delete=False)
        for text, _ in train_records:
            temp.write(text + "\n")
        temp.close()
        cmd = "--input={} --model_prefix={} --vocab_size={} --model_type={}".format(
            temp.name, model_path, 30000, "bpe")
        sp_trainer.Train(cmd)
        os.unlink(temp.name)

    processor = sp_processor()
    processor.load(model_path + ".model")
    fixed_train = [(" ".join(tokenize(processor, text)), label)
                   for text, label in train_records]
    fixed_val = [(" ".join(tokenize(processor, text)), label)
                 for text, label in val_records]

    to_ft_format(fixed_train, "grammar_endings_train.txt")
    to_ft_format(fixed_val, "grammar_endings_val.txt")
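The resulting files are in fastText's supervised format, so the .bin classifiers loaded by the fixer functions are presumably trained along these lines (default hyperparameters shown; train_supervised, test and save_model are real fastText APIs):

model = fasttext.train_supervised(input="grammar_endings_train.txt")
# test() returns (number of examples, precision@1, recall@1).
print(model.test("grammar_endings_val.txt"))
model.save_model("models/tsya_predictor.bin")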
Example #7
# Imports as in Example #1 (fasttext, sentencepiece).
def _fix_tsya(fixed_sentences,
              tsya_border=0.55,
              tsya_model_path="models/tsya_predictor.bin",
              bpe_model_path="models/grammar_bpe.model"):
    tsya_predictor = fasttext.load_model(tsya_model_path)
    bpe_model = sp_processor()
    bpe_model.load(bpe_model_path)

    for i, sentence in enumerate(fixed_sentences):
        tsya_count = sentence.count("тся")
        tsjya_count = sentence.count("ться")
        # Only touch sentences with exactly one candidate ending.
        if tsya_count + tsjya_count != 1:
            continue
        processed_sentence = " ".join(bpe_model.EncodeAsPieces(sentence.lower()))
        tsya_predictions = tsya_predictor.predict(processed_sentence)
        tsya_proba = float(tsya_predictions[1][0])
        tsya_label = int(tsya_predictions[0][0][-1])
        # Label 0 with high confidence: swap whichever ending is present
        # (exactly one of the two counts is 1 at this point).
        if tsya_label == 0 and tsya_proba > tsya_border:
            if tsya_count == 1:
                fixed_sentences[i] = sentence.replace("тся", "ться")
            else:
                fixed_sentences[i] = sentence.replace("ться", "тся")
Example #8
# Constructor fragment of a simpler tokenizer wrapper; as in Example #3,
# needs `from typing import Optional`.
def __init__(self, model_path: Optional[str] = None):
    self._model_path = model_path
    self._processor = sp_processor()
    self._processor.Load(model_path)