def encode_file_pair(self, complex_filepath, simple_filepath, output_complex_filepath, output_simple_filepath):
    """Encode a (complex, simple) file pair through every preprocessor in turn.

    Each preprocessor in ``self.preprocessors`` reads the current pair and
    writes its result to a fresh pair of paths obtained from
    ``get_temp_filepath()``; that result becomes the input of the next
    preprocessor. The last pair produced is copied to the two output paths.
    When ``self.preprocessors`` is empty the inputs are copied unchanged.

    Args:
        complex_filepath: Path of the complex-side input file.
        simple_filepath: Path of the simple-side input file.
        output_complex_filepath: Destination for the encoded complex file.
        output_simple_filepath: Destination for the encoded simple file.
    """
    current_complex, current_simple = complex_filepath, simple_filepath
    for step in self.preprocessors:
        # Fresh intermediate paths for this step (complex first, then simple).
        next_complex = get_temp_filepath()
        next_simple = get_temp_filepath()
        step.encode_file_pair(current_complex, current_simple, next_complex, next_simple)
        current_complex, current_simple = next_complex, next_simple
    # Publish whatever the last step produced (or the untouched inputs).
    for staged, destination in (
        (current_complex, output_complex_filepath),
        (current_simple, output_simple_filepath),
    ):
        shutil.copyfile(staged, destination)
def get_prediction_on_turkcorpus(simplifier, phase):
    """Run ``simplifier`` on the TurkCorpus complex file for ``phase``.

    Args:
        simplifier: Callable invoked as ``simplifier(source_path, output_path)``.
        phase: Passed to ``get_data_filepath('turkcorpus', phase, 'complex')``
            to select the source file.

    Returns:
        The temporary path that ``simplifier`` was asked to write to.
    """
    complex_sentences_path = get_data_filepath('turkcorpus', phase, 'complex')
    predictions_path = get_temp_filepath()
    # The prediction path is printed before the simplifier runs.
    print(predictions_path)
    # NOTE(review): mute() presumably silences the simplifier's output - confirm.
    with mute():
        simplifier(complex_sentences_path, predictions_path)
    return predictions_path
def apply_line_method_to_file(line_method, input_filepath):
    """Apply ``line_method`` to each line of a file and write the results to a new file.

    Every input line has its trailing newline characters stripped before
    being passed to ``line_method``. Results that are ``None`` are dropped;
    all other results are written followed by a newline.

    Args:
        line_method: Callable taking one line (without trailing newline) and
            returning the transformed string, or ``None`` to skip the line.
        input_filepath: Path of the file to read.

    Returns:
        The path (from ``get_temp_filepath()``) of the file that was written.
    """
    output_filepath = get_temp_filepath()
    with open(input_filepath, 'r') as source:
        with open(output_filepath, 'w') as sink:
            for raw_line in source:
                result = line_method(raw_line.rstrip('\n'))
                if result is None:
                    # The line method asked for this line to be omitted.
                    continue
                sink.write(result + '\n')
    return output_filepath
def decode_file(self, input_filepath, output_filepath, encoder_filepath=None):
    """Decode a file by passing it through every preprocessor's ``decode_file``.

    Each preprocessor in ``self.preprocessors`` reads the current file and
    writes to a fresh path from ``get_temp_filepath()``, which then feeds the
    next preprocessor. The final file is copied to ``output_filepath``. With
    no preprocessors the input is copied unchanged.

    Args:
        input_filepath: Path of the file to decode.
        output_filepath: Destination for the decoded file.
        encoder_filepath: Forwarded unchanged to every preprocessor's
            ``decode_file``; defaults to ``None``.
    """
    # NOTE(review): preprocessors are applied in listed order, not reversed,
    # when decoding - confirm this is intended.
    current_filepath = input_filepath
    for step in self.preprocessors:
        decoded_filepath = get_temp_filepath()
        step.decode_file(current_filepath, decoded_filepath, encoder_filepath)
        current_filepath = decoded_filepath
    shutil.copyfile(current_filepath, output_filepath)
def encode_file(self, input_filepath, output_filepath, encoder_filepath=None):
    """Encode ``input_filepath`` line by line into ``output_filepath``.

    Lines of the input file and the encoder file are read in parallel via
    ``yield_lines_in_parallel(..., strict=False)``; each pair is passed to
    ``self.encode_sentence`` and the result is written followed by a newline.

    Args:
        input_filepath: Path of the file whose lines are encoded.
        output_filepath: Path of the file to write (opened in ``'w'`` mode).
        encoder_filepath: Optional path of a file read alongside the input.
            When ``None``, a path from ``get_temp_filepath(create=True)`` is
            used instead.
    """
    if encoder_filepath is None:
        # We will use an empty temporary file which will yield None for each line
        encoder_filepath = get_temp_filepath(create=True)
    parallel_paths = [input_filepath, encoder_filepath]
    with open(output_filepath, 'w') as sink:
        # The generator is consumed lazily, so lines are encoded and written
        # one at a time, in order.
        sink.writelines(
            self.encode_sentence(sentence, encoder_sentence) + '\n'
            for sentence, encoder_sentence in yield_lines_in_parallel(parallel_paths, strict=False)
        )
def transform(input_text, LengthRatioProcessor=0.95, LevenshteinPreprocessor=0.75, WordRankRatioPreprocessor=0.75, SentencePiecePreprocessor=10000):
    """Simplify ``input_text`` and return the predicted lines.

    The text is split on ``"\\n"``, each line is tokenized with
    ``word_tokenize`` and written to a temporary file. A fairseq simplifier
    (``beam=1``) is built from the directory returned by ``prepare_models()``,
    wrapped with the configured preprocessors, and run on that file.

    Args:
        input_text: Text to simplify; one sentence per ``"\\n"``-separated line.
        LengthRatioProcessor: ``target_ratio`` for ``'LengthRatioPreprocessor'``.
        LevenshteinPreprocessor: ``target_ratio`` for ``'LevenshteinPreprocessor'``.
        WordRankRatioPreprocessor: ``target_ratio`` for ``'WordRankRatioPreprocessor'``.
        SentencePiecePreprocessor: ``vocab_size`` for ``'SentencePiecePreprocessor'``.

    Returns:
        A list of the lines yielded by ``yield_lines`` on the prediction file.
    """
    # Read from input
    source_filepath = get_temp_filepath()
    tokenized_lines = list(map(word_tokenize, input_text.split("\n")))
    write_lines(tokenized_lines, source_filepath)

    # Load best model
    best_model_dir = prepare_models()

    # Build the preprocessor configuration in the same key order as before.
    ratio_settings = (
        ('LengthRatioPreprocessor', LengthRatioProcessor),
        ('LevenshteinPreprocessor', LevenshteinPreprocessor),
        ('WordRankRatioPreprocessor', WordRankRatioPreprocessor),
    )
    preprocessors_kwargs = {name: {'target_ratio': ratio} for name, ratio in ratio_settings}
    preprocessors_kwargs['SentencePiecePreprocessor'] = {'vocab_size': SentencePiecePreprocessor}
    preprocessors = get_preprocessors(preprocessors_kwargs)

    base_simplifier = get_fairseq_simplifier(best_model_dir, beam=1)
    simplifier = get_preprocessed_simplifier(base_simplifier, preprocessors=preprocessors)

    # Simplify
    pred_filepath = get_temp_filepath()
    with mute():
        simplifier(source_filepath, pred_filepath)
    predicted_lines = list(yield_lines(pred_filepath))
    return predicted_lines
sys.path.append(str(REPO_DIR)) # -- end fix path -- import fileinput from access.preprocessors import get_preprocessors from access.resources.prepare import prepare_models from access.simplifiers import get_fairseq_simplifier, get_preprocessed_simplifier from access.text import word_tokenize from access.utils.helpers import yield_lines, write_lines, get_temp_filepath, mute if __name__ == '__main__': # Usage: python generate.py < my_file.complex # Read from stdin source_filepath = get_temp_filepath() write_lines([word_tokenize(line) for line in fileinput.input()], source_filepath) # Load best model best_model_dir = prepare_models() recommended_preprocessors_kwargs = { 'LengthRatioPreprocessor': {'target_ratio': 0.95}, 'LevenshteinPreprocessor': {'target_ratio': 0.75}, 'WordRankRatioPreprocessor': {'target_ratio': 0.75}, 'SentencePiecePreprocessor': {'vocab_size': 10000}, } preprocessors = get_preprocessors(recommended_preprocessors_kwargs) simplifier = get_fairseq_simplifier(best_model_dir, beam=8) simplifier = get_preprocessed_simplifier(simplifier, preprocessors=preprocessors) # Simplify pred_filepath = get_temp_filepath() with mute():