def has_lines_in_common(filepath1, filepath2):
    [smallest_filepath, largest_filepath] = sort_files_by_line_count([filepath1, filepath2])
    # Search the smaller file's lines in the larger file and stop at the first match.
    for idx in yield_indexes_of_lines(largest_filepath, read_lines(smallest_filepath)):
        return True
    return False
def get_sentences_from_ids_single_file(sentence_ids, sentences_path):
    sentences = read_lines(sentences_path)
    try:
        return [sentences[sentence_id] for sentence_id in sentence_ids]
    except IndexError:
        print(
            f'len(sentences)={len(sentences)}, max(sentence_ids)={max(sentence_ids)}, sentences_path={sentences_path}'
        )
        raise
def compute_and_save_embeddings(sentences_path, base_index_path, get_embeddings, indexes_dir):
    index_path = get_index_path(sentences_path, indexes_dir)
    if not index_path.exists():
        with log_action('Computing and saving embeddings'):
            sentences = read_lines(sentences_path)
            embeddings = get_embeddings(sentences)
            index = load_index(base_index_path)
            index.add(embeddings)
            faiss.write_index(index, str(index_path))
    return index_path
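
# Hypothetical usage sketch (not part of the original module): indexing the LASER
# embeddings of one text file. This assumes `get_laser_embeddings` (below) and
# `load_index` are importable here, that `base_index_path` points to a base FAISS
# index serialized with faiss.write_index, and that `indexes_dir` is writable.
def _example_index_sentences(sentences_path, base_index_path, indexes_dir):
    index_path = compute_and_save_embeddings(
        sentences_path,
        base_index_path,
        get_embeddings=lambda sentences: get_laser_embeddings(sentences, normalize_l2=True),
        indexes_dir=indexes_dir,
    )
    # Reload the populated index for nearest-neighbour search.
    return load_index(index_path)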
def get_laser_embeddings(
    sentences,
    bpe_codes_path=BPE_CODES_PATH,
    encoder_path=ENCODER_PATH,
    language='en',
    max_tokens=12000,
    normalize_l2=False,
    n_encoding_jobs=10,
):
    prepare_laser()
    from embed import SentenceEncoder  # noqa: E402
    from text_processing import Token, BPEfastApply  # noqa: E402

    def get_laser_encoder(encoder_path, max_tokens=12000):
        return SentenceEncoder(encoder_path, max_sentences=None, max_tokens=max_tokens, cpu=False)

    def encode_file(input_filepath, output_filepath, language, bpe_codes_path):
        tokenized_filepath = get_temp_filepath()
        # Romanization is only applied to Greek text.
        Token(str(input_filepath), str(tokenized_filepath), lang=language, romanize=(language == 'el'))
        BPEfastApply(str(tokenized_filepath), str(output_filepath), str(bpe_codes_path))
        tokenized_filepath.unlink()

    input_filepath = get_temp_filepath()
    write_lines(sentences, input_filepath)
    with mute():
        with log_action('Tokenizing and applying BPE'):
            parallel_file_encoder = get_parallel_file_preprocessor(
                lambda input_filepath, output_filepath: encode_file(
                    input_filepath, output_filepath, language, bpe_codes_path
                ),
                n_jobs=n_encoding_jobs,
            )
            bpe_filepath = get_temp_filepath()
            parallel_file_encoder(input_filepath, bpe_filepath)
        with log_action('Getting LASER embeddings'):
            encoder = get_laser_encoder(encoder_path, max_tokens=max_tokens)
            embeddings = encoder.encode_sentences(read_lines(bpe_filepath))
            input_filepath.unlink()
            bpe_filepath.unlink()
            assert embeddings.shape[0] == len(sentences)
    del encoder
    if normalize_l2:
        embeddings = embeddings / np.expand_dims(np.linalg.norm(embeddings, axis=1), axis=1)
    return embeddings
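
# Hypothetical usage sketch (not part of the original module): embedding a few
# sentences and checking the output shape. This assumes prepare_laser() has fetched
# the LASER encoder and BPE codes, and that a GPU is available (the encoder above is
# created with cpu=False). LASER sentence embeddings are 1024-dimensional.
def _example_embed_sentences():
    sentences = ['The cat sat on the mat.', 'He is a very clever boy.']
    embeddings = get_laser_embeddings(sentences, language='en', normalize_l2=True)
    assert embeddings.shape == (len(sentences), 1024)
    return embeddings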
def simplify_sentences(source_sentences, model_name='muss_en_wikilarge_mined'):
    # The ACCESS preprocessor parameters are tuned for the en_bart_access_wikilarge_mined model;
    # ideally other models would use their own set of parameters.
    exp_dir = get_model_path(model_name)
    preprocessors = get_muss_preprocessors(model_name)
    generate_kwargs = {}
    if is_model_using_mbart(model_name):
        generate_kwargs['task'] = 'translation_from_pretrained_bart'
        generate_kwargs['langs'] = (
            'ar_AR,cs_CZ,de_DE,en_XX,es_XX,et_EE,fi_FI,fr_XX,gu_IN,hi_IN,it_IT,ja_XX,kk_KZ,ko_KR,lt_LT,lv_LV,my_MM,ne_NP,nl_XX,ro_RO,ru_RU,si_LK,tr_TR,vi_VN,zh_CN'  # noqa: E501
        )
    simplifier = get_fairseq_simplifier(exp_dir, **generate_kwargs)
    simplifier = get_preprocessed_simplifier(simplifier, preprocessors=preprocessors)
    source_path = get_temp_filepath()
    write_lines(source_sentences, source_path)
    pred_path = simplifier(source_path)
    return read_lines(pred_path)
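
# Hypothetical usage sketch (not part of the original module): simplifying a couple of
# sentences in memory. The pretrained model is assumed to be downloaded by
# get_model_path() on first use, so the first call can take a while.
def _example_simplify():
    complex_sentences = [
        'The incident resulted in the prolonged hospitalization of the individual.',
        'He subsequently relinquished his position at the organization.',
    ]
    return simplify_sentences(complex_sentences, model_name='muss_en_wikilarge_mined')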
def train_kenlm_language_model(input_data_paths, output_model_dir):
    output_model_dir = Path(output_model_dir)
    output_model_dir.mkdir(exist_ok=True, parents=True)
    output_model_path = output_model_dir / 'kenlm_model.arpa'
    with log_action('Training tokenizer'):
        tokenizer = SentencePieceBPETokenizer()
        tokenizer.train([str(path) for path in input_data_paths], vocab_size=20000)
        tokenizer.save(str(output_model_dir), 'spm_tokenizer')
    with log_action('Tokenizing'):
        tokenized_data_paths = get_temp_filepaths(len(input_data_paths))
        for tokenized_data_path, input_data_path in zip(tokenized_data_paths, input_data_paths):
            encodings = tokenizer.encode_batch(read_lines(input_data_path))
            write_lines([' '.join(encoding.tokens) for encoding in encodings], tokenized_data_path)
    with log_action('Training language model'):
        kenlm_path = input('Please provide the path to the lmplz script (install at https://github.com/kpu/kenlm): ')
        command = (
            f'cat {" ".join([str(path) for path in tokenized_data_paths])} | {kenlm_path} -o 3 > {output_model_path}'
        )
        run_command(command, mute=False)
    # Clean up the temporary tokenized files.
    for path in tokenized_data_paths:
        path.unlink()
    return output_model_dir
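
# Hypothetical usage sketch (not part of the original module): training a small KenLM
# model and scoring a sentence with the `kenlm` Python package
# (https://github.com/kpu/kenlm). The sentence is assumed to be pre-tokenized with the
# same SentencePiece tokenizer trained above; higher (less negative) log10 scores mean
# the sentence is more fluent under the model.
def _example_train_and_score(input_data_paths, output_model_dir, tokenized_sentence):
    import kenlm

    model_dir = train_kenlm_language_model(input_data_paths, output_model_dir)
    model = kenlm.Model(str(model_dir / 'kenlm_model.arpa'))
    return model.score(tokenized_sentence)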
def get_original_dataset(dataset):
    filepath = get_dataset_dir(dataset) / 'original_dataset'
    if not filepath.exists():
        return None
    [original_dataset] = read_lines(filepath)
    return original_dataset
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import argparse

from muss.simplify import ALLOWED_MODEL_NAMES, simplify_sentences
from muss.utils.helpers import read_lines


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Simplify a file line by line.')
    parser.add_argument('filepath', type=str, help='File containing the source sentences, one sentence per line.')
    parser.add_argument(
        '--model-name',
        type=str,
        default=ALLOWED_MODEL_NAMES[0],
        choices=ALLOWED_MODEL_NAMES,
        help='Model name to generate from. Models were selected based on the highest validation SARI score.',
    )
    args = parser.parse_args()
    source_sentences = read_lines(args.filepath)
    pred_sentences = simplify_sentences(source_sentences, model_name=args.model_name)
    for c, s in zip(source_sentences, pred_sentences):
        print('-' * 80)
        print(f'Original: {c}')
        print(f'Simplified: {s}')