Example #1
def has_lines_in_common(filepath1, filepath2):
    # Look for the lines of the smaller file inside the larger file.
    [smallest_filepath,
     largest_filepath] = sort_files_by_line_count([filepath1, filepath2])
    # yield_indexes_of_lines is a generator: returning on the first yielded
    # index means the two files share at least one line.
    for idx in yield_indexes_of_lines(largest_filepath,
                                      read_lines(smallest_filepath)):
        return True
    return False
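A hedged usage sketch of the function above; the file paths and the overlap-check scenario are illustrative only.

# Hypothetical usage: warn if two sentence files share at least one line.
from pathlib import Path

if has_lines_in_common(Path('data/train.complex'), Path('data/test.complex')):
    print('Warning: the two files have at least one line in common.')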
Example #2
def get_sentences_from_ids_single_file(sentence_ids, sentences_path):
    sentences = read_lines(sentences_path)
    try:
        return [sentences[sentence_id] for sentence_id in sentence_ids]
    except IndexError:
        print(
            f'len(sentences)={len(sentences)}, max(sentence_ids)={max(sentence_ids)}, sentences_path={sentences_path}'
        )
        raise
Example #3
def compute_and_save_embeddings(sentences_path, base_index_path, get_embeddings, indexes_dir):
    index_path = get_index_path(sentences_path, indexes_dir)
    # Only compute and save the embeddings if the index does not already exist for this sentences file.
    if not index_path.exists():
        with log_action('Computing and saving embeddings'):
            sentences = read_lines(sentences_path)
            embeddings = get_embeddings(sentences)
            index = load_index(base_index_path)
            index.add(embeddings)
            faiss.write_index(index, str(index_path))
    return index_path
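A hedged usage sketch tying this helper to get_laser_embeddings from Example #4 below; the paths are placeholders and assume a Faiss base index has already been trained and written to disk.

# Hypothetical usage: embed a sentence file into a copy of a pre-trained Faiss index.
from pathlib import Path

index_path = compute_and_save_embeddings(
    sentences_path=Path('data/sentences.en.txt'),
    base_index_path=Path('indexes/base.faiss_index'),
    get_embeddings=lambda sentences: get_laser_embeddings(sentences, language='en'),
    indexes_dir=Path('indexes/'),
)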
Example #4
def get_laser_embeddings(
    sentences,
    bpe_codes_path=BPE_CODES_PATH,
    encoder_path=ENCODER_PATH,
    language='en',
    max_tokens=12000,
    normalize_l2=False,
    n_encoding_jobs=10,
):
    prepare_laser()
    from embed import SentenceEncoder  # noqa: E402
    from text_processing import Token, BPEfastApply  # noqa: E402

    def get_laser_encoder(encoder_path, max_tokens=12000):
        return SentenceEncoder(encoder_path,
                               max_sentences=None,
                               max_tokens=max_tokens,
                               cpu=False)

    def encode_file(input_filepath, output_filepath, language, bpe_codes_path):
        tokenized_filepath = get_temp_filepath()
        Token(str(input_filepath),
              str(tokenized_filepath),
              lang=language,
              romanize=(language == 'el'))
        BPEfastApply(str(tokenized_filepath), str(output_filepath),
                     str(bpe_codes_path))
        tokenized_filepath.unlink()

    input_filepath = get_temp_filepath()
    write_lines(sentences, input_filepath)
    with mute():
        with log_action('Tokenizing and applying BPE'):
            parallel_file_encoder = get_parallel_file_preprocessor(
                lambda input_filepath, output_filepath: encode_file(
                    input_filepath, output_filepath, language, bpe_codes_path),
                n_jobs=n_encoding_jobs,
            )
            bpe_filepath = get_temp_filepath()
            parallel_file_encoder(input_filepath, bpe_filepath)
        with log_action('Getting LASER embeddings'):
            encoder = get_laser_encoder(encoder_path, max_tokens=max_tokens)
            embeddings = encoder.encode_sentences(read_lines(bpe_filepath))
            input_filepath.unlink()
            bpe_filepath.unlink()
            assert embeddings.shape[0] == len(sentences)
    # Release the encoder (it can hold GPU memory) before post-processing.
    del encoder
    if normalize_l2:
        # L2-normalize each embedding row.
        embeddings = embeddings / np.expand_dims(
            np.linalg.norm(embeddings, axis=1), axis=1)
    return embeddings
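A minimal usage sketch, assuming the LASER encoder and BPE codes referenced by the default ENCODER_PATH and BPE_CODES_PATH constants have already been downloaded; the sentences are illustrative.

# Hypothetical usage: embed two English sentences with LASER.
sentences = ['The cat sat on the mat.', 'He was very tired after the long journey.']
embeddings = get_laser_embeddings(sentences, language='en', normalize_l2=True)
print(embeddings.shape)  # (2, embedding_dimension)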
Example #5
def simplify_sentences(source_sentences, model_name='muss_en_wikilarge_mined'):
    # Best ACCESS parameter values for the en_bart_access_wikilarge_mined model; ideally another set of parameters would be used for the other models.
    exp_dir = get_model_path(model_name)
    preprocessors = get_muss_preprocessors(model_name)
    generate_kwargs = {}
    if is_model_using_mbart(model_name):
        generate_kwargs['task'] = 'translation_from_pretrained_bart'
        generate_kwargs['langs'] = (
            'ar_AR,cs_CZ,de_DE,en_XX,es_XX,et_EE,fi_FI,fr_XX,gu_IN,hi_IN,it_IT,ja_XX,kk_KZ,ko_KR,lt_LT,lv_LV,my_MM,ne_NP,nl_XX,ro_RO,ru_RU,si_LK,tr_TR,vi_VN,zh_CN'  # noqa: E501
        )
    simplifier = get_fairseq_simplifier(exp_dir, **generate_kwargs)
    simplifier = get_preprocessed_simplifier(simplifier,
                                             preprocessors=preprocessors)
    source_path = get_temp_filepath()
    write_lines(source_sentences, source_path)
    pred_path = simplifier(source_path)
    return read_lines(pred_path)
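A hedged usage sketch, assuming the muss_en_wikilarge_mined model files are available to the surrounding package; the input sentences are illustrative.

# Hypothetical usage: simplify two English sentences with the default model.
source_sentences = [
    'The incumbent was reelected with an overwhelming majority of the votes.',
    'He was subsequently appointed to the committee on foreign affairs.',
]
for original, simplified in zip(source_sentences, simplify_sentences(source_sentences)):
    print(original, '->', simplified)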
Example #6
def train_kenlm_language_model(input_data_paths, output_model_dir):
    output_model_dir = Path(output_model_dir)
    output_model_dir.mkdir(exist_ok=True, parents=True)
    output_model_path = output_model_dir / 'kenlm_model.arpa'
    with log_action('Training tokenizer'):
        tokenizer = SentencePieceBPETokenizer()
        tokenizer.train([str(path) for path in input_data_paths], vocab_size=20000)
        tokenizer.save(str(output_model_dir), 'spm_tokenizer')
    with log_action('Tokenizing'):
        tokenized_data_paths = get_temp_filepaths(len(input_data_paths))
        for tokenized_data_path, input_data_path in zip(tokenized_data_paths, input_data_paths):
            encodings = tokenizer.encode_batch(read_lines(input_data_path))
            write_lines([' '.join(encoding.tokens) for encoding in encodings], tokenized_data_path)
    with log_action('Training language model'):
        kenlm_path = input('Please provide the path to the lmplz script (install at https://github.com/kpu/kenlm): ')
        # Pipe the tokenized text into lmplz; -o 3 trains a 3-gram language model.
        command = (
            f'cat {" ".join([str(path) for path in tokenized_data_paths])} | {kenlm_path} -o 3 > {output_model_path}'
        )
        run_command(command, mute=False)
    for path in tokenized_data_paths:
        path.unlink()
    return output_model_dir
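A minimal usage sketch; the corpus paths are placeholders, and lmplz must be built locally since the function prompts for its location.

# Hypothetical usage: train a 3-gram KenLM model on two text corpora.
from pathlib import Path

model_dir = train_kenlm_language_model(
    input_data_paths=[Path('data/corpus.part1.txt'), Path('data/corpus.part2.txt')],
    output_model_dir='models/kenlm_en',
)
print(model_dir / 'kenlm_model.arpa')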
Example #7
def get_original_dataset(dataset):
    filepath = get_dataset_dir(dataset) / 'original_dataset'
    if not filepath.exists():
        return None
    # The file is expected to contain exactly one line: the name of the original dataset.
    [original_dataset] = read_lines(filepath)
    return original_dataset
Example #8
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

import argparse

from muss.simplify import ALLOWED_MODEL_NAMES, simplify_sentences
from muss.utils.helpers import read_lines

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Simplify a file line by line.')
    parser.add_argument(
        'filepath',
        type=str,
        help='File containing the source sentences, one sentence per line.')
    parser.add_argument(
        '--model-name',
        type=str,
        default=ALLOWED_MODEL_NAMES[0],
        choices=ALLOWED_MODEL_NAMES,
        help='Model name to generate from. Models are selected based on the highest validation SARI score.',
    )
    args = parser.parse_args()
    source_sentences = read_lines(args.filepath)
    pred_sentences = simplify_sentences(source_sentences,
                                        model_name=args.model_name)
    for source, simplified in zip(source_sentences, pred_sentences):
        print('-' * 80)
        print(f'Original:   {source}')
        print(f'Simplified: {simplified}')
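Assuming this script is saved as scripts/simplify.py (its exact location in the repository is an assumption here), it could be run as:

python scripts/simplify.py my_sentences.txt --model-name muss_en_wikilarge_mined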