Code example #1
def create_base_index(sentences, index_name, get_embeddings, metric, output_dir):
    index_prefix = f'{index_name.replace(",", "_").lower()}_metric{metric}'
    index_path = output_dir / f'{index_prefix}.faiss_index'
    if not index_path.exists():
        with log_action('Computing embeddings'):
            embeddings = get_embeddings(sentences)
        with log_action('Training index'):
            index = faiss.index_factory(embeddings.shape[1], index_name, metric)
            index.train(embeddings)
        faiss.write_index(index, str(index_path))
    return index_path
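A quick usage sketch for the function above. This is hedged: dummy_get_embeddings and the /tmp output directory are made up for illustration, and log_action is assumed to be importable from the muss helpers; a 'Flat' index needs no real training, so random vectors are enough to exercise the code path.

import numpy as np
import faiss
from pathlib import Path

def dummy_get_embeddings(sentences):
    # Stand-in for a real embedding function (e.g. LASER): random 32-d float32 vectors.
    return np.random.rand(len(sentences), 32).astype('float32')

index_path = create_base_index(
    sentences=['A first sentence.', 'A second sentence.'],
    index_name='Flat',  # any faiss index_factory string works here
    get_embeddings=dummy_get_embeddings,
    metric=faiss.METRIC_INNER_PRODUCT,
    output_dir=Path('/tmp'),
)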
Code example #2
def get_laser_embeddings(
    sentences,
    bpe_codes_path=BPE_CODES_PATH,
    encoder_path=ENCODER_PATH,
    language='en',
    max_tokens=12000,
    normalize_l2=False,
    n_encoding_jobs=10,
):
    prepare_laser()
    from embed import SentenceEncoder  # noqa: E402
    from text_processing import Token, BPEfastApply  # noqa: E402

    def get_laser_encoder(encoder_path, max_tokens=12000):
        return SentenceEncoder(encoder_path,
                               max_sentences=None,
                               max_tokens=max_tokens,
                               cpu=False)

    def encode_file(input_filepath, output_filepath, language, bpe_codes_path):
        tokenized_filepath = get_temp_filepath()
        Token(str(input_filepath),
              str(tokenized_filepath),
              lang=language,
              romanize=True if language == 'el' else False)
        BPEfastApply(str(tokenized_filepath), str(output_filepath),
                     str(bpe_codes_path))
        tokenized_filepath.unlink()

    input_filepath = get_temp_filepath()
    write_lines(sentences, input_filepath)
    with mute():
        with log_action('Tokenizing and applying BPE'):
            parallel_file_encoder = get_parallel_file_preprocessor(
                lambda input_filepath, output_filepath: encode_file(
                    input_filepath, output_filepath, language, bpe_codes_path),
                n_jobs=n_encoding_jobs,
            )
            bpe_filepath = get_temp_filepath()
            parallel_file_encoder(input_filepath, bpe_filepath)
        with log_action('Getting LASER embeddings'):
            encoder = get_laser_encoder(encoder_path, max_tokens=max_tokens)
            embeddings = encoder.encode_sentences(read_lines(bpe_filepath))
            input_filepath.unlink()
            bpe_filepath.unlink()
            assert embeddings.shape[0] == len(sentences)
    del encoder
    if normalize_l2:
        embeddings = embeddings / np.expand_dims(
            np.linalg.norm(embeddings, axis=1), axis=1)
    return embeddings
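The normalize_l2 branch at the end is plain row-wise L2 normalization. As a standalone sketch (the helper name is hypothetical; faiss.normalize_L2 does the same thing in place on a float32 array):

import numpy as np

def l2_normalize(embeddings):
    # Row-wise L2 normalization, equivalent to the normalize_l2 branch above.
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    return embeddings / norms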
Code example #3
def sentence_tokenize_subshard(subshard_path, sentences_path, language):
    if not sentences_path.exists():
        with log_action('Sentence tokenization'):
            with gzip.open(sentences_path, 'wt', compresslevel=1) as f:
                for json_document in tqdm(yield_json_documents_from_compressed(subshard_path), desc='documents'):
                    sentences = sentence_tokenize_document(json_document.pop('raw_content'), language=language)
                    for sentence in sentences:
                        f.write(f'{sentence}\n')
        cached_count_lines(sentences_path)  # Cache line count
    return sentences_path
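A small companion sketch (hypothetical helper name) for streaming the sentences back out of the gzip file written above:

import gzip

def iter_subshard_sentences(sentences_path):
    # Yield sentences one per line from the gzip-compressed file.
    with gzip.open(sentences_path, 'rt') as f:
        for line in f:
            yield line.rstrip('\n')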
Code example #4
File: nn_search.py  Project: facebookresearch/muss
def compute_and_save_embeddings(sentences_path, base_index_path, get_embeddings, indexes_dir):
    index_path = get_index_path(sentences_path, indexes_dir)
    if not index_path.exists():
        with log_action('Computing and saving embeddings'):
            sentences = read_lines(sentences_path)
            embeddings = get_embeddings(sentences)
            index = load_index(base_index_path)
            index.add(embeddings)
            faiss.write_index(index, str(index_path))
    return index_path
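Once the embeddings have been added and the index written, it can be queried with standard faiss calls. A minimal sketch (the query text and k=5 are arbitrary; for IVF-style indexes you would also tune index.nprobe):

import faiss

index = faiss.read_index(str(index_path))
query_embeddings = get_embeddings(['How simple is this sentence?'])
distances, neighbor_ids = index.search(query_embeddings, 5)  # top-5 nearest neighbors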
Code example #5
File: kenlm.py  Project: facebookresearch/muss
def train_kenlm_language_model(input_data_paths, output_model_dir):
    output_model_dir = Path(output_model_dir)
    output_model_dir.mkdir(exist_ok=True, parents=True)
    output_model_path = output_model_dir / 'kenlm_model.arpa'
    with log_action('Training tokenizer'):
        tokenizer = SentencePieceBPETokenizer()
        tokenizer.train([str(path) for path in input_data_paths], vocab_size=20000)
        tokenizer.save(str(output_model_dir), 'spm_tokenizer')
    with log_action('Tokenizing'):
        tokenized_data_paths = get_temp_filepaths(len(input_data_paths))
        for tokenized_data_path, input_data_path in zip(tokenized_data_paths, input_data_paths):
            encodings = tokenizer.encode_batch(read_lines(input_data_path))
            write_lines([' '.join(encoding.tokens) for encoding in encodings], tokenized_data_path)
    with log_action('Training language model'):
        kenlm_path = input('Please provide the path to the lmplz script (install at https://github.com/kpu/kenlm): ')
        command = (
            f'cat {" ".join([str(path) for path in tokenized_data_paths])} | {kenlm_path} -o 3 > {output_model_path}'
        )
        run_command(command, mute=False)
    [path.unlink() for path in tokenized_data_paths]
    return output_model_dir
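A hedged follow-up sketch for scoring text with the resulting ARPA model through the kenlm Python module (assumes the model and SentencePiece tokenizer produced above, and that the kenlm package is installed):

import kenlm

model = kenlm.Model(str(output_model_dir / 'kenlm_model.arpa'))
# Tokenize with the same SentencePiece BPE tokenizer used for training.
tokens = ' '.join(tokenizer.encode('A sentence to score.').tokens)
log10_probability = model.score(tokens, bos=True, eos=True)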
Code example #6
def train_sentencepiece(input_filepaths,
                        vocab_size,
                        sentencepiece_model_path,
                        num_threads=64,
                        max_lines=10**7):
    with log_action('Training sentencepiece'):
        sentencepiece_model_path.parent.mkdir(parents=True, exist_ok=True)
        sentencepiece_model_prefix = sentencepiece_model_path.parent / sentencepiece_model_path.stem
        args_str = f'''
        --bos_id=-1 --eos_id=-1
        --input={",".join([str(path) for path in input_filepaths])} --model_prefix={sentencepiece_model_prefix}
        --vocab_size={vocab_size} --num_threads={num_threads} --character_coverage=0.9995
        '''
        if sum([count_lines(filepath)
                for filepath in input_filepaths]) > max_lines:
            args_str += f' --input_sentence_size={max_lines} --shuffle_input_sentence=true'
        args_str = remove_multiple_whitespaces(args_str.replace(
            '\n', ' ')).strip(' ')
        spm.SentencePieceTrainer.Train(args_str)
        return sentencepiece_model_path
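Loading and using the trained model afterwards; a minimal sketch assuming sentencepiece_model_path ends in '.model', as the model_prefix above implies:

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load(str(sentencepiece_model_path))
pieces = sp.encode_as_pieces('A sentence to segment into subword units.')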
Code example #7
File: nn_search.py  Project: facebookresearch/muss
def compute_and_save_simplification_pairs(
    query_sentences_path,
    db_sentences_paths,
    base_index_path,
    get_embeddings,
    cache_dir,
    pairs_dir,
    topk,
    nprobe,
    language,
    filter_kwargs,
    is_simpler,
):
    simplifications_path = get_pairs_path(
        query_sentences_path, db_sentences_paths, topk, nprobe, filter_kwargs, pairs_dir
    )
    if not simplifications_path.exists():
        paraphrase_pairs = get_paraphrase_pairs(
            query_sentences_path,
            db_sentences_paths,
            base_index_path,
            get_embeddings,
            cache_dir,
            topk,
            nprobe,
            filter_kwargs,
        )
        filters = {
            'is_simpler': is_simpler,
        }
        if filter_kwargs.get('filter_ne', True):
            filters['has_hallucinated_ne'] = lambda pair: not has_hallucinated_named_entities(*pair, language=language)
        with log_action('filtering'):
            simplification_pairs = filter_candidate_pairs(paraphrase_pairs, filters)
        write_pairs_to_file(simplification_pairs, simplifications_path)
    return simplifications_path
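For illustration only, the dict-of-named-predicates pattern could be approximated as below. This is not the repository's actual filter_candidate_pairs, which also supports list-level filters such as the deduplication step in the next example.

def apply_pair_filters(pairs, filters):
    # Apply each named predicate in turn, logging how many pairs survive.
    for name, predicate in filters.items():
        kept = [pair for pair in pairs if predicate(pair)]
        print(f'{name}: kept {len(kept)}/{len(pairs)} pairs')
        pairs = kept
    return pairs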
Code example #8
File: nn_search.py  Project: facebookresearch/muss
def get_paraphrase_pairs(
    query_sentences_path, db_sentences_paths, base_index_path, get_embeddings, cache_dir, topk, nprobe, filter_kwargs
):
    candidate_pairs = print_running_time(find_nearest_neighbors)(
        [query_sentences_path],
        db_sentences_paths,
        base_index_path,
        get_embeddings,
        cache_dir,
        topk=topk,
        nprobe=nprobe,
        distance_threshold=filter_kwargs['distance'],
        density_threshold=filter_kwargs['density'],
    )
    print(f'#candidates: {len(candidate_pairs)}')
    filters = {
        'macro-duplicates': lambda pairs: list(set(candidate_pairs)),
        'is_contained': lambda pair: not is_contained(*pair),
        'is_overlapping': lambda pair: not is_overlapping(*pair),
    }
    if filter_kwargs.get('levenshtein', 0) > 0:
        filters['is_different_enough'] = lambda pair: is_different_enough(*pair, threshold=filter_kwargs['levenshtein'])
    with log_action('filtering'):
        return filter_candidate_pairs(candidate_pairs, filters)
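A standard-library sketch in the spirit of the is_different_enough check (the real muss implementation may compute its Levenshtein-based ratio differently):

import difflib

def is_different_enough_sketch(source, target, threshold=0.2):
    # Keep a pair only if the two sentences are sufficiently dissimilar.
    similarity = difflib.SequenceMatcher(None, source, target).ratio()
    return (1 - similarity) >= threshold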
Code example #9
from muss.mining.filtering import SimplicityScorer

ccnet_dir = Path(
    input(
        'Please download the CCNet corpus from https://github.com/facebookresearch/cc_net and enter the path to the downloaded data: '
    ))
language = input('What language do you want to process? (en/fr/es): ')
cluster = 'local'
dataset_dir = get_dataset_dir('uts') / language
# For large jobs only
slurm_partition = 'dev,scavenge'
slurm_array_parallelism = 1024

# Split CCNet shards into subshards
with log_action('Splitting CCNet shards into smaller subshards'):
    # We need to split each shard even more for the LASER embeddings to fit in memory
    n_shards = {  # Number of shards to take for each language for ~1B sentences
        'en': 15,
        'fr': 25,
        'es': 13,  # We would need about 20 shards for 1B sentences, but there are only 13
    }[language]
    ccnet_filepaths = [
        ccnet_dir / f'{language}_head_{i:04d}.json.gz' for i in range(n_shards)
    ]
    raw_original_dir = dataset_dir / 'raw_original'
    raw_original_dir.mkdir(exist_ok=True, parents=True)
    output_dirs = [
        raw_original_dir / f'{language}_head_{i:04d}' for i in range(n_shards)
    ]