def fairseq_train_and_evaluate(dataset, metrics_coefs=[1, 1, 1], parametrization_budget=64, **kwargs):
    check_dataset(dataset)
    kwargs = check_and_resolve_args(kwargs)
    exp_dir = prepare_exp_dir()
    # Apply the requested preprocessors to the dataset before fairseq preprocessing
    preprocessors_kwargs = kwargs.get('preprocessors_kwargs', {})
    preprocessors = get_preprocessors(preprocessors_kwargs)
    if len(preprocessors) > 0:
        dataset = create_preprocessed_dataset(dataset, preprocessors, n_jobs=1)
        shutil.copy(get_dataset_dir(dataset) / 'preprocessors.pickle', exp_dir)
    # Binarize the data and train with fairseq
    preprocessed_dir = fairseq_preprocess(dataset)
    train_kwargs = get_allowed_kwargs(fairseq_train, preprocessed_dir, exp_dir, **kwargs)
    fairseq_train(preprocessed_dir, exp_dir=exp_dir, **train_kwargs)
    # Evaluation
    generate_kwargs = get_allowed_kwargs(fairseq_generate, 'complex_filepath', 'pred_filepath', exp_dir, **kwargs)
    # Search for the preprocessor parametrization that maximizes the combined metric
    recommended_preprocessors_kwargs = find_best_parametrization(exp_dir, metrics_coefs, preprocessors_kwargs,
                                                                 parametrization_budget)
    print(f'recommended_preprocessors_kwargs={recommended_preprocessors_kwargs}')
    simplifier = get_simplifier(exp_dir, recommended_preprocessors_kwargs, generate_kwargs)
    scores = evaluate_simplifier_on_turkcorpus(simplifier, phase='valid')
    print(f'scores={scores}')
    score = combine_metrics(scores['BLEU'], scores['SARI'], scores['FKGL'], metrics_coefs)
    return score
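# Illustrative usage sketch (not part of the original code): 'wikilarge' is assumed to
# be a dataset name that check_dataset() can resolve, and the preprocessors_kwargs
# values below are assumptions, not recommended settings.
score = fairseq_train_and_evaluate(
    'wikilarge',
    metrics_coefs=[1, 1, 1],
    parametrization_budget=64,
    preprocessors_kwargs={
        'LengthRatioPreprocessor': {'target_ratio': 0.95},
        'LevenshteinPreprocessor': {'target_ratio': 0.75},
        'WordRankRatioPreprocessor': {'target_ratio': 0.75},
        'SentencePiecePreprocessor': {'vocab_size': 10000},
    },
)
print(f'combined validation score: {score}')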
def transform(input_text,
              LengthRatioProcessor=0.95,
              LevenshteinPreprocessor=0.75,
              WordRankRatioPreprocessor=0.75,
              SentencePiecePreprocessor=10000):
    input_lines = input_text.split("\n")
    # Read from input
    source_filepath = get_temp_filepath()
    write_lines([word_tokenize(line) for line in input_lines], source_filepath)
    # Load best model
    best_model_dir = prepare_models()
    recommended_preprocessors_kwargs = {
        'LengthRatioPreprocessor': {'target_ratio': LengthRatioProcessor},
        'LevenshteinPreprocessor': {'target_ratio': LevenshteinPreprocessor},
        'WordRankRatioPreprocessor': {'target_ratio': WordRankRatioPreprocessor},
        'SentencePiecePreprocessor': {'vocab_size': SentencePiecePreprocessor},
    }
    preprocessors = get_preprocessors(recommended_preprocessors_kwargs)
    simplifier = get_fairseq_simplifier(best_model_dir, beam=1)
    simplifier = get_preprocessed_simplifier(simplifier, preprocessors=preprocessors)
    # Simplify
    pred_filepath = get_temp_filepath()
    with mute():
        simplifier(source_filepath, pred_filepath)
    return list(yield_lines(pred_filepath))
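# Illustrative call (not from the original snippet): transform() returns the simplified
# sentences as a list of strings; the keyword arguments override the default
# preprocessor targets. The input sentence and override values here are made up.
simplified_lines = transform(
    'The committee reached a unanimous decision after lengthy deliberations.',
    LengthRatioProcessor=0.9,
    LevenshteinPreprocessor=0.7,
)
print('\n'.join(simplified_lines))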
import fileinput

from access.preprocessors import get_preprocessors
from access.resources.prepare import prepare_models
from access.simplifiers import get_fairseq_simplifier, get_preprocessed_simplifier
from access.text import word_tokenize
from access.utils.helpers import yield_lines, write_lines, get_temp_filepath, mute


if __name__ == '__main__':
    # Usage: python generate.py < my_file.complex
    # Read from stdin
    source_filepath = get_temp_filepath()
    write_lines([word_tokenize(line) for line in fileinput.input()], source_filepath)
    # Load best model
    best_model_dir = prepare_models()
    recommended_preprocessors_kwargs = {
        'LengthRatioPreprocessor': {'target_ratio': 0.95},
        'LevenshteinPreprocessor': {'target_ratio': 0.75},
        'WordRankRatioPreprocessor': {'target_ratio': 0.75},
        'SentencePiecePreprocessor': {'vocab_size': 10000},
    }
    preprocessors = get_preprocessors(recommended_preprocessors_kwargs)
    simplifier = get_fairseq_simplifier(best_model_dir, beam=8)
    simplifier = get_preprocessed_simplifier(simplifier, preprocessors=preprocessors)
    # Simplify
    pred_filepath = get_temp_filepath()
    with mute():
        simplifier(source_filepath, pred_filepath)
    for line in yield_lines(pred_filepath):
        print(line)
def get_simplifier(exp_dir, preprocessors_kwargs, generate_kwargs):
    # TODO: Take kwargs as input and separate between get_preprocessors kwargs and generate_kwargs
    preprocessors = get_preprocessors(preprocessors_kwargs)
    simplifier = get_fairseq_simplifier(exp_dir, **generate_kwargs)
    return get_preprocessed_simplifier(simplifier, preprocessors=preprocessors)
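# Sketch of how get_simplifier() composes with the callable-simplifier interface used
# in the snippets above (the exp_dir path and settings below are assumptions):
exp_dir = 'experiments/fairseq/local_0001'  # hypothetical experiment directory
simplifier = get_simplifier(
    exp_dir,
    preprocessors_kwargs={'LengthRatioPreprocessor': {'target_ratio': 0.95}},
    generate_kwargs={'beam': 8},
)
# The returned simplifier reads complex sentences from one file and writes
# predictions to another, as in the generate script above.
simplifier('complex.txt', 'simplified.txt')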