# --- Example #1 ---
 def encode_file_pair(self, complex_filepath, simple_filepath,
                      output_complex_filepath, output_simple_filepath):
     """Run every preprocessor's encoding over a (complex, simple) file pair.

     Each preprocessor reads the current pair and writes its encoded result
     into fresh temporary files, which then become the input of the next
     preprocessor.  The pair produced by the last preprocessor is copied to
     the requested output paths (with no preprocessors, the inputs are
     copied unchanged).
     """
     current_complex, current_simple = complex_filepath, simple_filepath
     for preprocessor in self.preprocessors:
         next_complex = get_temp_filepath()
         next_simple = get_temp_filepath()
         preprocessor.encode_file_pair(current_complex, current_simple,
                                       next_complex, next_simple)
         current_complex, current_simple = next_complex, next_simple
     shutil.copyfile(current_complex, output_complex_filepath)
     shutil.copyfile(current_simple, output_simple_filepath)
def get_prediction_on_turkcorpus(simplifier, phase):
    """Run *simplifier* over the turkcorpus complex file for *phase*.

    The predictions are written to a temporary file whose path is printed
    and returned.
    """
    complex_filepath = get_data_filepath('turkcorpus', phase, 'complex')
    prediction_filepath = get_temp_filepath()
    print(prediction_filepath)
    # Suppress the simplifier's console output while it runs.
    with mute():
        simplifier(complex_filepath, prediction_filepath)
    return prediction_filepath
# --- Example #3 ---
def apply_line_method_to_file(line_method, input_filepath, output_filepath=None):
    """Apply *line_method* to every line of *input_filepath* and return the
    path of the file holding the transformed lines.

    Args:
        line_method: callable taking one line (trailing newline stripped)
            and returning the transformed line, or None to drop the line.
        input_filepath: path of the text file to read.
        output_filepath: where to write the result.  When omitted, a
            temporary file is created (the original behavior).

    Returns:
        The path of the written output file.
    """
    if output_filepath is None:
        output_filepath = get_temp_filepath()
    with open(input_filepath, 'r') as input_file, \
            open(output_filepath, 'w') as output_file:
        for line in input_file:
            transformed_line = line_method(line.rstrip('\n'))
            # A None result filters the line out of the output entirely.
            if transformed_line is not None:
                output_file.write(transformed_line + '\n')
    return output_filepath
# --- Example #4 ---
 def decode_file(self,
                 input_filepath,
                 output_filepath,
                 encoder_filepath=None):
     """Chain every preprocessor's decoding step over *input_filepath*.

     Each preprocessor decodes the current file into a fresh temporary
     file that feeds the next preprocessor; the final result is copied to
     *output_filepath* (with no preprocessors, the input is copied as-is).
     """
     current_filepath = input_filepath
     for preprocessor in self.preprocessors:
         decoded_filepath = get_temp_filepath()
         preprocessor.decode_file(current_filepath, decoded_filepath,
                                  encoder_filepath)
         current_filepath = decoded_filepath
     shutil.copyfile(current_filepath, output_filepath)
# --- Example #5 ---
 def encode_file(self,
                 input_filepath,
                 output_filepath,
                 encoder_filepath=None):
     """Encode every line of *input_filepath* into *output_filepath*.

     Lines of *encoder_filepath* are consumed in lockstep with the input
     lines and handed to encode_sentence.  When no encoder file is given,
     an empty temporary file is substituted so every encoder line is None.
     """
     if encoder_filepath is None:
         # An empty temporary file yields None for each encoder line.
         encoder_filepath = get_temp_filepath(create=True)
     line_pairs = yield_lines_in_parallel(
         [input_filepath, encoder_filepath], strict=False)
     with open(output_filepath, 'w') as output_file:
         for input_line, encoder_line in line_pairs:
             output_file.write(
                 self.encode_sentence(input_line, encoder_line) + '\n')
def transform(input_text,
              LengthRatioProcessor=0.95,
              LevenshteinPreprocessor=0.75,
              WordRankRatioPreprocessor=0.75,
              SentencePiecePreprocessor=10000):
    """Simplify *input_text* (one sentence per line) with the best ACCESS
    model and return the simplified sentences as a list of strings.

    The keyword arguments tune the preprocessing pipeline: target ratios
    for the length / Levenshtein / word-rank preprocessors, and the
    SentencePiece vocabulary size.  (NOTE(review): the first parameter is
    named `LengthRatioProcessor`, inconsistently with the others — kept
    for backward compatibility with existing callers.)
    """
    # Tokenize the input and persist it so the simplifier can read it back.
    source_filepath = get_temp_filepath()
    tokenized_lines = [word_tokenize(line) for line in input_text.split("\n")]
    write_lines(tokenized_lines, source_filepath)

    # Fetch the pretrained model and build the preprocessing pipeline.
    best_model_dir = prepare_models()
    preprocessors_kwargs = {
        'LengthRatioPreprocessor': {'target_ratio': LengthRatioProcessor},
        'LevenshteinPreprocessor': {'target_ratio': LevenshteinPreprocessor},
        'WordRankRatioPreprocessor': {'target_ratio': WordRankRatioPreprocessor},
        'SentencePiecePreprocessor': {'vocab_size': SentencePiecePreprocessor},
    }
    preprocessors = get_preprocessors(preprocessors_kwargs)
    simplifier = get_preprocessed_simplifier(
        get_fairseq_simplifier(best_model_dir, beam=1),
        preprocessors=preprocessors)

    # Run the simplifier (silencing its console output) and collect results.
    pred_filepath = get_temp_filepath()
    with mute():
        simplifier(source_filepath, pred_filepath)
    return list(yield_lines(pred_filepath))
# --- Example #7 ---
sys.path.append(str(REPO_DIR))
# -- end fix path --

import fileinput

from access.preprocessors import get_preprocessors
from access.resources.prepare import prepare_models
from access.simplifiers import get_fairseq_simplifier, get_preprocessed_simplifier
from access.text import word_tokenize
from access.utils.helpers import yield_lines, write_lines, get_temp_filepath, mute


if __name__ == '__main__':
    # Usage: python generate.py < my_file.complex
    # Read from stdin
    source_filepath = get_temp_filepath()
    write_lines([word_tokenize(line) for line in fileinput.input()], source_filepath)
    # Load best model
    best_model_dir = prepare_models()
    recommended_preprocessors_kwargs = {
        'LengthRatioPreprocessor': {'target_ratio': 0.95},
        'LevenshteinPreprocessor': {'target_ratio': 0.75},
        'WordRankRatioPreprocessor': {'target_ratio': 0.75},
        'SentencePiecePreprocessor': {'vocab_size': 10000},
    }
    preprocessors = get_preprocessors(recommended_preprocessors_kwargs)
    simplifier = get_fairseq_simplifier(best_model_dir, beam=8)
    simplifier = get_preprocessed_simplifier(simplifier, preprocessors=preprocessors)
    # Simplify
    pred_filepath = get_temp_filepath()
    with mute():