def transform(input_text,
              LengthRatioProcessor=0.95,
              LevenshteinPreprocessor=0.75,
              WordRankRatioPreprocessor=0.75,
              SentencePiecePreprocessor=10000):
    """Run the preprocessed fairseq simplifier over ``input_text``.

    The text is split on ``"\\n"``, every piece is passed through
    ``word_tokenize`` and written to a temporary source file. The simplifier
    (built with ``beam=1``) then writes its predictions to a second temporary
    file, whose lines are returned.

    Args:
        input_text: Text to process; one entry per ``"\\n"``-separated line.
        LengthRatioProcessor: ``target_ratio`` passed to the
            ``'LengthRatioPreprocessor'`` entry.
        LevenshteinPreprocessor: ``target_ratio`` passed to the
            ``'LevenshteinPreprocessor'`` entry.
        WordRankRatioPreprocessor: ``target_ratio`` passed to the
            ``'WordRankRatioPreprocessor'`` entry.
        SentencePiecePreprocessor: ``vocab_size`` passed to the
            ``'SentencePiecePreprocessor'`` entry.

    Returns:
        A list with every line yielded from the prediction file.
    """
    raw_lines = input_text.split("\n")

    # Dump the tokenized input to a temporary file for the simplifier to read.
    complex_path = get_temp_filepath()
    tokenized_lines = list(map(word_tokenize, raw_lines))
    write_lines(tokenized_lines, complex_path)

    # Resolve the model directory before assembling the preprocessor config.
    model_dir = prepare_models()

    # The three ratio-based preprocessors share the same kwarg name; the
    # SentencePiece one takes a vocabulary size instead.
    ratio_settings = (
        ('LengthRatioPreprocessor', LengthRatioProcessor),
        ('LevenshteinPreprocessor', LevenshteinPreprocessor),
        ('WordRankRatioPreprocessor', WordRankRatioPreprocessor),
    )
    preprocessor_config = {
        name: {'target_ratio': ratio} for name, ratio in ratio_settings
    }
    preprocessor_config['SentencePiecePreprocessor'] = {
        'vocab_size': SentencePiecePreprocessor,
    }

    pipeline = get_preprocessors(preprocessor_config)
    base_simplifier = get_fairseq_simplifier(model_dir, beam=1)
    run_simplifier = get_preprocessed_simplifier(base_simplifier,
                                                 preprocessors=pipeline)

    # Generate predictions into a fresh temporary file, inside mute().
    output_path = get_temp_filepath()
    with mute():
        run_simplifier(complex_path, output_path)

    return [line for line in yield_lines(output_path)]
# Example no. 2
# 0
from access.preprocessors import get_preprocessors
from access.resources.prepare import prepare_models
from access.simplifiers import get_fairseq_simplifier, get_preprocessed_simplifier
from access.text import word_tokenize
from access.utils.helpers import yield_lines, write_lines, get_temp_filepath, mute


if __name__ == '__main__':
    # Usage: python generate.py < my_file.complex
    #
    # Tokenizes every input line, runs the preprocessed fairseq simplifier
    # (beam=8) over them and prints each predicted line to stdout.

    # Local import so the script entry point is self-contained: fileinput is
    # only needed here. fileinput.input() reads the files named in
    # sys.argv[1:], falling back to stdin when none are given.
    import fileinput

    # Read the input and store its tokenized form in a temporary file
    source_filepath = get_temp_filepath()
    write_lines([word_tokenize(line) for line in fileinput.input()], source_filepath)
    # Load best model
    best_model_dir = prepare_models()
    # Settings handed to get_preprocessors, keyed by preprocessor name
    recommended_preprocessors_kwargs = {
        'LengthRatioPreprocessor': {'target_ratio': 0.95},
        'LevenshteinPreprocessor': {'target_ratio': 0.75},
        'WordRankRatioPreprocessor': {'target_ratio': 0.75},
        'SentencePiecePreprocessor': {'vocab_size': 10000},
    }
    preprocessors = get_preprocessors(recommended_preprocessors_kwargs)
    simplifier = get_fairseq_simplifier(best_model_dir, beam=8)
    simplifier = get_preprocessed_simplifier(simplifier, preprocessors=preprocessors)
    # Simplify: predictions are written to a second temporary file
    pred_filepath = get_temp_filepath()
    with mute():
        simplifier(source_filepath, pred_filepath)
    # Emit the predictions, one per line
    for line in yield_lines(pred_filepath):
        print(line)
# Example no. 3
# 0
def get_simplifier(exp_dir, preprocessors_kwargs, generate_kwargs):
    """Build a fairseq simplifier wrapped with the requested preprocessors.

    Args:
        exp_dir: First positional argument passed to
            ``get_fairseq_simplifier``.
        preprocessors_kwargs: Argument passed to ``get_preprocessors``.
        generate_kwargs: Mapping expanded as keyword arguments to
            ``get_fairseq_simplifier``.

    Returns:
        Whatever ``get_preprocessed_simplifier`` returns for the fairseq
        simplifier and the constructed preprocessors.
    """
    # TODO: Take kwargs as input and separate between get_preprocessors kwargs and generate_kwargs
    pipeline = get_preprocessors(preprocessors_kwargs)
    base_simplifier = get_fairseq_simplifier(exp_dir, **generate_kwargs)
    wrapped_simplifier = get_preprocessed_simplifier(
        base_simplifier,
        preprocessors=pipeline,
    )
    return wrapped_simplifier