# Esempio n. 1
# 0
def fairseq_train_and_evaluate(dataset,
                               metrics_coefs=None,
                               parametrization_budget=64,
                               **kwargs):
    """Train a fairseq model on *dataset*, tune its preprocessors and score it.

    Args:
        dataset: Name of the dataset to train on (validated by check_dataset).
        metrics_coefs: Sequence of three coefficients weighting BLEU, SARI and
            FKGL in the combined score; defaults to [1, 1, 1] (equal weight).
        parametrization_budget: Evaluation budget passed to
            find_best_parametrization.
        **kwargs: Extra options. 'preprocessors_kwargs' is read here; the rest
            is filtered by get_allowed_kwargs and forwarded to fairseq_train
            and fairseq_generate respectively.

    Returns:
        The combined metric score of the simplifier on the turkcorpus
        validation set.
    """
    # Fix for the mutable-default-argument pitfall: the previous [1, 1, 1]
    # default was a single list shared across calls and could leak mutations.
    if metrics_coefs is None:
        metrics_coefs = [1, 1, 1]
    check_dataset(dataset)
    kwargs = check_and_resolve_args(kwargs)
    exp_dir = prepare_exp_dir()
    preprocessors_kwargs = kwargs.get('preprocessors_kwargs', {})
    preprocessors = get_preprocessors(preprocessors_kwargs)
    if len(preprocessors) > 0:
        dataset = create_preprocessed_dataset(dataset, preprocessors, n_jobs=1)
        # Keep the fitted preprocessors next to the experiment so the trained
        # model can later be applied with the exact same preprocessing.
        shutil.copy(get_dataset_dir(dataset) / 'preprocessors.pickle', exp_dir)
    preprocessed_dir = fairseq_preprocess(dataset)
    train_kwargs = get_allowed_kwargs(fairseq_train, preprocessed_dir, exp_dir,
                                      **kwargs)
    fairseq_train(preprocessed_dir, exp_dir=exp_dir, **train_kwargs)
    # Evaluation
    generate_kwargs = get_allowed_kwargs(fairseq_generate, 'complex_filepath',
                                         'pred_filepath', exp_dir, **kwargs)
    recommended_preprocessors_kwargs = find_best_parametrization(
        exp_dir, metrics_coefs, preprocessors_kwargs, parametrization_budget)
    print(
        f'recommended_preprocessors_kwargs={recommended_preprocessors_kwargs}')
    simplifier = get_simplifier(exp_dir, recommended_preprocessors_kwargs,
                                generate_kwargs)
    scores = evaluate_simplifier_on_turkcorpus(simplifier, phase='valid')
    print(f'scores={scores}')
    score = combine_metrics(scores['BLEU'], scores['SARI'], scores['FKGL'],
                            metrics_coefs)
    return score
def transform(input_text,
              LengthRatioProcessor=0.95,
              LevenshteinPreprocessor=0.75,
              WordRankRatioPreprocessor=0.75,
              SentencePiecePreprocessor=10000):
    """Simplify *input_text* with the pretrained ACCESS model.

    Args:
        input_text: Text to simplify; processed one line at a time.
        LengthRatioProcessor: Target length ratio for the length preprocessor.
        LevenshteinPreprocessor: Target Levenshtein similarity ratio.
        WordRankRatioPreprocessor: Target word-rank (lexical complexity) ratio.
        SentencePiecePreprocessor: SentencePiece vocabulary size.

    Returns:
        List of simplified lines, one per input line.
    """
    # Tokenize every line and stage the result in a temporary source file.
    source_filepath = get_temp_filepath()
    tokenized_lines = [word_tokenize(line) for line in input_text.split("\n")]
    write_lines(tokenized_lines, source_filepath)

    # Fetch the best pretrained model and wire up its preprocessing chain.
    best_model_dir = prepare_models()
    preprocessors_kwargs = {
        'LengthRatioPreprocessor': {'target_ratio': LengthRatioProcessor},
        'LevenshteinPreprocessor': {'target_ratio': LevenshteinPreprocessor},
        'WordRankRatioPreprocessor': {'target_ratio': WordRankRatioPreprocessor},
        'SentencePiecePreprocessor': {'vocab_size': SentencePiecePreprocessor},
    }
    simplifier = get_preprocessed_simplifier(
        get_fairseq_simplifier(best_model_dir, beam=1),
        preprocessors=get_preprocessors(preprocessors_kwargs))

    # Run the simplifier (muted to suppress verbose logging) and collect output.
    pred_filepath = get_temp_filepath()
    with mute():
        simplifier(source_filepath, pred_filepath)
    return list(yield_lines(pred_filepath))
# Esempio n. 3
# 0
import fileinput

from access.preprocessors import get_preprocessors
from access.resources.prepare import prepare_models
from access.simplifiers import get_fairseq_simplifier, get_preprocessed_simplifier
from access.text import word_tokenize
from access.utils.helpers import yield_lines, write_lines, get_temp_filepath, mute


if __name__ == '__main__':
    # Usage: python generate.py < my_file.complex
    # Tokenize each stdin line and stage it in a temporary source file.
    complex_filepath = get_temp_filepath()
    write_lines([word_tokenize(line) for line in fileinput.input()], complex_filepath)
    # Download/locate the best pretrained ACCESS model.
    model_dir = prepare_models()
    preprocessors = get_preprocessors({
        'LengthRatioPreprocessor': {'target_ratio': 0.95},
        'LevenshteinPreprocessor': {'target_ratio': 0.75},
        'WordRankRatioPreprocessor': {'target_ratio': 0.75},
        'SentencePiecePreprocessor': {'vocab_size': 10000},
    })
    simplifier = get_preprocessed_simplifier(
        get_fairseq_simplifier(model_dir, beam=8), preprocessors=preprocessors)
    # Simplify (muted to suppress the underlying fairseq logging), then print.
    simple_filepath = get_temp_filepath()
    with mute():
        simplifier(complex_filepath, simple_filepath)
    for line in yield_lines(simple_filepath):
        print(line)
# Esempio n. 4
# 0
def get_simplifier(exp_dir, preprocessors_kwargs, generate_kwargs):
    """Build a preprocessed fairseq simplifier from an experiment directory.

    Args:
        exp_dir: Experiment directory holding the trained fairseq model.
        preprocessors_kwargs: Mapping forwarded to get_preprocessors.
        generate_kwargs: Keyword arguments forwarded to get_fairseq_simplifier.

    Returns:
        A simplifier callable wrapping the fairseq model with its preprocessors.
    """
    # TODO: Take kwargs as input and separate between get_preprocessors kwargs and generate_kwargs
    fairseq_simplifier = get_fairseq_simplifier(exp_dir, **generate_kwargs)
    return get_preprocessed_simplifier(
        fairseq_simplifier,
        preprocessors=get_preprocessors(preprocessors_kwargs))