def get_laser_embeddings(
    sentences,
    bpe_codes_path=BPE_CODES_PATH,
    encoder_path=ENCODER_PATH,
    language='en',
    max_tokens=12000,
    normalize_l2=False,
    n_encoding_jobs=10,
):
    prepare_laser()
    # The LASER modules only become importable after prepare_laser() has run,
    # hence the deferred imports.
    from embed import SentenceEncoder  # noqa: E402
    from text_processing import Token, BPEfastApply  # noqa: E402

    def get_laser_encoder(encoder_path, max_tokens=12000):
        return SentenceEncoder(encoder_path, max_sentences=None, max_tokens=max_tokens, cpu=False)

    def encode_file(input_filepath, output_filepath, language, bpe_codes_path):
        tokenized_filepath = get_temp_filepath()
        # Greek is romanized before tokenization
        Token(str(input_filepath), str(tokenized_filepath), lang=language, romanize=(language == 'el'))
        BPEfastApply(str(tokenized_filepath), str(output_filepath), str(bpe_codes_path))
        tokenized_filepath.unlink()

    input_filepath = get_temp_filepath()
    write_lines(sentences, input_filepath)
    with mute():
        with log_action('Tokenizing and applying BPE'):
            parallel_file_encoder = get_parallel_file_preprocessor(
                lambda input_filepath, output_filepath: encode_file(
                    input_filepath, output_filepath, language, bpe_codes_path
                ),
                n_jobs=n_encoding_jobs,
            )
            bpe_filepath = get_temp_filepath()
            parallel_file_encoder(input_filepath, bpe_filepath)
        with log_action('Getting LASER embeddings'):
            encoder = get_laser_encoder(encoder_path, max_tokens=max_tokens)
            embeddings = encoder.encode_sentences(read_lines(bpe_filepath))
    input_filepath.unlink()
    bpe_filepath.unlink()
    assert embeddings.shape[0] == len(sentences)
    del encoder
    if normalize_l2:
        embeddings = embeddings / np.expand_dims(np.linalg.norm(embeddings, axis=1), axis=1)
    return embeddings
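# A minimal usage sketch (the sentences and the helper name are illustrative;
# assumes the default BPE_CODES_PATH/ENCODER_PATH point at downloaded LASER
# artifacts):
def _laser_similarity_example():
    sentences = [
        'The quick brown fox jumps over the lazy dog.',
        'A fast fox leaps over a sleepy dog.',
    ]
    embeddings = get_laser_embeddings(sentences, language='en', normalize_l2=True)
    # With normalize_l2=True every row is a unit vector, so cosine similarity
    # reduces to a dot product.
    return float(embeddings[0] @ embeddings[1])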
def encode_file_pair(self, complex_filepath, simple_filepath, output_complex_filepath, output_simple_filepath):
    for preprocessor in self.preprocessors:
        intermediary_output_complex_filepath = get_temp_filepath()
        intermediary_output_simple_filepath = get_temp_filepath()
        preprocessor.encode_file_pair(
            complex_filepath,
            simple_filepath,
            intermediary_output_complex_filepath,
            intermediary_output_simple_filepath,
        )
        complex_filepath = intermediary_output_complex_filepath
        simple_filepath = intermediary_output_simple_filepath
    shutil.copyfile(complex_filepath, output_complex_filepath)
    shutil.copyfile(simple_filepath, output_simple_filepath)
def get_easse_report(simplifier, test_set, orig_sents_path=None, refs_sents_paths=None):
    orig_sents, _ = get_orig_and_refs_sents(test_set, orig_sents_path, refs_sents_paths)
    orig_sents_path = get_temp_filepath()
    write_lines(orig_sents, orig_sents_path)
    sys_sents_path = simplifier(orig_sents_path)
    report_path = get_temp_filepath()
    report(
        test_set,
        sys_sents_path=sys_sents_path,
        orig_sents_path=orig_sents_path,
        refs_sents_paths=refs_sents_paths,
        report_path=report_path,
    )
    return report_path
def write_sentencepiece_vocab_as_fairseq_dict(sentencepiece_model, fairseq_dict_path=None):
    if fairseq_dict_path is None:
        fairseq_dict_path = get_temp_filepath()
    with open(fairseq_dict_path, 'w') as f:
        for i in range(len(sentencepiece_model)):
            piece = sentencepiece_model.id_to_piece(i)
            if piece.startswith('<') and piece.endswith('>'):
                # Skip special tokens such as <unk>, <s> and </s>
                continue
            f.write(f'{piece} 999\n')  # Use 999 as a dummy count
    return fairseq_dict_path
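# Illustrative usage (assumes the `sentencepiece` package and a trained model;
# the filename 'spm.model' is a placeholder). Each kept piece is written as
# '<piece> 999', the '<token> <count>' line format fairseq dictionaries use:
def _fairseq_dict_example():
    import sentencepiece as spm

    sp_model = spm.SentencePieceProcessor(model_file='spm.model')
    return write_sentencepiece_vocab_as_fairseq_dict(sp_model)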
def decode_file(self, input_filepath, output_filepath, encoder_filepath=None):
    # Decoders must undo the encoders in reverse order of application
    for preprocessor in self.preprocessors[::-1]:
        intermediary_output_filepath = get_temp_filepath()
        preprocessor.decode_file(input_filepath, intermediary_output_filepath, encoder_filepath)
        input_filepath = intermediary_output_filepath
    shutil.copyfile(input_filepath, output_filepath)
def encode_file(self, input_filepath, output_filepath, encoder_filepath=None):
    if encoder_filepath is None:
        # Use an empty temporary file, which yields None for each line
        encoder_filepath = get_temp_filepath(create=True)
    with open(output_filepath, 'w', encoding='utf-8') as f:
        for input_line, encoder_line in yield_lines_in_parallel([input_filepath, encoder_filepath], strict=False):
            f.write(self.encode_sentence(input_line, encoder_line) + '\n')
def apply_line_function_to_file(line_function, input_filepath, output_filepath=None):
    if output_filepath is None:
        output_filepath = get_temp_filepath()
    with open(input_filepath, 'r') as input_file, open(output_filepath, 'w') as output_file:
        for line in input_file:
            transformed_line = line_function(line.rstrip('\n'))
            if transformed_line is not None:
                output_file.write(transformed_line + '\n')
    return output_filepath
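# A short usage sketch (the input path is a placeholder). Returning None from
# line_function drops the line, so the helper doubles as a filter; here empty
# lines are removed and the rest lowercased:
def _clean_file_example(input_filepath='input.txt'):
    return apply_line_function_to_file(
        lambda line: line.lower() if line.strip() else None,
        input_filepath,
    )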
def download(url, destination_path=None, overwrite=True):
    if destination_path is None:
        destination_path = get_temp_filepath()
    if not overwrite and destination_path.exists():
        return destination_path
    print('Downloading...')
    try:
        urlretrieve(url, destination_path, reporthook)
        sys.stdout.write('\n')
    except (Exception, KeyboardInterrupt, SystemExit):
        print('Rolling back: remove partially downloaded file')
        os.remove(destination_path)
        raise
    return destination_path
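# Usage sketch (the URL and paths are illustrative). destination_path is
# treated as a pathlib.Path, since .exists() is called on it; with
# overwrite=False a previously downloaded file is reused instead of being
# fetched again:
def _download_example():
    from pathlib import Path

    return download('https://example.com/checkpoint.tar.gz', Path('/tmp/checkpoint.tar.gz'), overwrite=False)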
def simplify_sentences(source_sentences, model_name='muss_en_wikilarge_mined'):
    # The preprocessors carry the best ACCESS parameter values for the
    # en_bart_access_wikilarge_mined model; ideally, other models would use
    # their own set of parameters.
    exp_dir = get_model_path(model_name)
    preprocessors = get_muss_preprocessors(model_name)
    generate_kwargs = {}
    if is_model_using_mbart(model_name):
        generate_kwargs['task'] = 'translation_from_pretrained_bart'
        generate_kwargs['langs'] = 'ar_AR,cs_CZ,de_DE,en_XX,es_XX,et_EE,fi_FI,fr_XX,gu_IN,hi_IN,it_IT,ja_XX,kk_KZ,ko_KR,lt_LT,lv_LV,my_MM,ne_NP,nl_XX,ro_RO,ru_RU,si_LK,tr_TR,vi_VN,zh_CN'  # noqa: E501
    simplifier = get_fairseq_simplifier(exp_dir, **generate_kwargs)
    simplifier = get_preprocessed_simplifier(simplifier, preprocessors=preprocessors)
    source_path = get_temp_filepath()
    write_lines(source_sentences, source_path)
    pred_path = simplifier(source_path)
    return read_lines(pred_path)
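# End-to-end usage sketch (the sentence is illustrative; assumes the
# 'muss_en_wikilarge_mined' model files have been downloaded):
def _simplification_example():
    return simplify_sentences(
        ['The incumbent was defeated in a landslide electoral result.'],
        model_name='muss_en_wikilarge_mined',
    )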
def evaluate_simplifier(simplifier, test_set, orig_sents_path=None, refs_sents_paths=None, quality_estimation=False):
    orig_sents, _ = get_orig_and_refs_sents(test_set, orig_sents_path=orig_sents_path, refs_sents_paths=refs_sents_paths)
    orig_sents_path = get_temp_filepath()
    write_lines(orig_sents, orig_sents_path)
    sys_sents_path = simplifier(orig_sents_path)
    return evaluate_system_output(
        test_set,
        sys_sents_path=sys_sents_path,
        orig_sents_path=orig_sents_path,
        refs_sents_paths=refs_sents_paths,
        metrics=['sari', 'bleu', 'fkgl'],
        quality_estimation=quality_estimation,
    )
def preprocessed_simplifier(complex_filepath, pred_filepath):
    # Encode the input, simplify in the encoded space, then decode the
    # predictions, using the original complex file as the encoder reference
    preprocessed_complex_filepath = get_temp_filepath()
    composed_preprocessor.encode_file(complex_filepath, preprocessed_complex_filepath)
    preprocessed_pred_filepath = simplifier(preprocessed_complex_filepath)
    composed_preprocessor.decode_file(preprocessed_pred_filepath, pred_filepath, encoder_filepath=complex_filepath)
def wrapped(complex_filepath, pred_filepath=None):
    if pred_filepath is None:
        pred_filepath = get_temp_filepath()
    simplifier(complex_filepath, pred_filepath)
    return pred_filepath
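# The closures above share a convention: a simplifier is a callable that takes
# a complex file path (and optionally a prediction path) and returns the path
# holding its predictions. A minimal conforming sketch, purely for
# illustration (it "simplifies" by copying input to output):
def identity_simplifier(complex_filepath, pred_filepath=None):
    if pred_filepath is None:
        pred_filepath = get_temp_filepath()
    shutil.copyfile(complex_filepath, pred_filepath)
    return pred_filepath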