def create_base_index(sentences, index_name, get_embeddings, metric, output_dir):
    index_prefix = f'{index_name.replace(",", "_").lower()}_metric{metric}'
    index_path = output_dir / f'{index_prefix}.faiss_index'
    if not index_path.exists():
        with log_action('Computing embeddings'):
            embeddings = get_embeddings(sentences)
        with log_action('Training index'):
            index = faiss.index_factory(embeddings.shape[1], index_name, metric)
            index.train(embeddings)
        faiss.write_index(index, str(index_path))
    return index_path


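# Illustrative sketch (not part of the original code): how create_base_index might be called
# to train an OPQ+IVF+PQ base index on a sample of sentences with LASER embeddings. The
# factory string, metric choice and paths are assumptions, not prescribed values.
def example_train_base_index(train_sentences_path, output_dir):
    train_sentences = read_lines(train_sentences_path)  # hypothetical path to training sentences
    return create_base_index(
        sentences=train_sentences,
        index_name='OPQ16_64,IVF1024,PQ16',  # assumed FAISS index factory string
        get_embeddings=lambda sents: get_laser_embeddings(sents, normalize_l2=True),
        # Inner product on L2-normalized embeddings is equivalent to cosine similarity.
        metric=faiss.METRIC_INNER_PRODUCT,
        output_dir=Path(output_dir),
    )

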
def get_laser_embeddings(
    sentences,
    bpe_codes_path=BPE_CODES_PATH,
    encoder_path=ENCODER_PATH,
    language='en',
    max_tokens=12000,
    normalize_l2=False,
    n_encoding_jobs=10,
):
    prepare_laser()
    from embed import SentenceEncoder  # noqa: E402
    from text_processing import Token, BPEfastApply  # noqa: E402

    def get_laser_encoder(encoder_path, max_tokens=12000):
        return SentenceEncoder(encoder_path, max_sentences=None, max_tokens=max_tokens, cpu=False)

    def encode_file(input_filepath, output_filepath, language, bpe_codes_path):
        tokenized_filepath = get_temp_filepath()
        Token(str(input_filepath), str(tokenized_filepath), lang=language, romanize=(language == 'el'))
        BPEfastApply(str(tokenized_filepath), str(output_filepath), str(bpe_codes_path))
        tokenized_filepath.unlink()

    input_filepath = get_temp_filepath()
    write_lines(sentences, input_filepath)
    with mute():
        with log_action('Tokenizing and applying BPE'):
            parallel_file_encoder = get_parallel_file_preprocessor(
                lambda input_filepath, output_filepath: encode_file(
                    input_filepath, output_filepath, language, bpe_codes_path
                ),
                n_jobs=n_encoding_jobs,
            )
            bpe_filepath = get_temp_filepath()
            parallel_file_encoder(input_filepath, bpe_filepath)
        with log_action('Getting LASER embeddings'):
            encoder = get_laser_encoder(encoder_path, max_tokens=max_tokens)
            embeddings = encoder.encode_sentences(read_lines(bpe_filepath))
            input_filepath.unlink()
            bpe_filepath.unlink()
            assert embeddings.shape[0] == len(sentences)
    del encoder
    if normalize_l2:
        embeddings = embeddings / np.expand_dims(np.linalg.norm(embeddings, axis=1), axis=1)
    return embeddings


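# Illustrative sketch (assumption, not original code): why normalize_l2 matters when the FAISS
# index uses METRIC_INNER_PRODUCT -- the inner product of two L2-normalized vectors equals their
# cosine similarity, so the scores returned by the index are directly comparable across pairs.
def example_cosine_via_inner_product():
    rng = np.random.default_rng(0)
    a, b = rng.normal(size=1024), rng.normal(size=1024)  # stand-ins for two LASER embeddings
    cosine = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
    a_norm, b_norm = a / np.linalg.norm(a), b / np.linalg.norm(b)
    assert np.isclose(np.dot(a_norm, b_norm), cosine)
    return cosine

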
def sentence_tokenize_subshard(subshard_path, sentences_path, language):
    if not sentences_path.exists():
        with log_action('Sentence tokenization'):
            with gzip.open(sentences_path, 'wt', compresslevel=1) as f:
                for json_document in tqdm(yield_json_documents_from_compressed(subshard_path), desc='documents'):
                    sentences = sentence_tokenize_document(json_document.pop('raw_content'), language=language)
                    for sentence in sentences:
                        f.write(f'{sentence}\n')
    cached_count_lines(sentences_path)  # Cache line count
    return sentences_path


def compute_and_save_embeddings(sentences_path, base_index_path, get_embeddings, indexes_dir):
    index_path = get_index_path(sentences_path, indexes_dir)
    if not index_path.exists():
        with log_action('Computing and saving embeddings'):
            sentences = read_lines(sentences_path)
            embeddings = get_embeddings(sentences)
            index = load_index(base_index_path)
            index.add(embeddings)
            faiss.write_index(index, str(index_path))
    return index_path


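# Illustrative sketch (assumption): after the base index has been trained once, each sentence
# shard gets its own populated copy of it. The shard paths and the embedding function used here
# are hypothetical.
def example_index_all_shards(sentences_paths, base_index_path, indexes_dir):
    get_embeddings = lambda sents: get_laser_embeddings(sents, normalize_l2=True)  # noqa: E731
    return [
        compute_and_save_embeddings(sentences_path, base_index_path, get_embeddings, Path(indexes_dir))
        for sentences_path in sentences_paths
    ]

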
def train_kenlm_language_model(input_data_paths, output_model_dir):
    output_model_dir = Path(output_model_dir)
    output_model_dir.mkdir(exist_ok=True, parents=True)
    output_model_path = output_model_dir / 'kenlm_model.arpa'
    with log_action('Training tokenizer'):
        tokenizer = SentencePieceBPETokenizer()
        tokenizer.train([str(path) for path in input_data_paths], vocab_size=20000)
        tokenizer.save(str(output_model_dir), 'spm_tokenizer')
    with log_action('Tokenizing'):
        tokenized_data_paths = get_temp_filepaths(len(input_data_paths))
        for tokenized_data_path, input_data_path in zip(tokenized_data_paths, input_data_paths):
            encodings = tokenizer.encode_batch(read_lines(input_data_path))
            write_lines([' '.join(encoding.tokens) for encoding in encodings], tokenized_data_path)
    with log_action('Training language model'):
        kenlm_path = input('Please provide the path to the lmplz script (install at https://github.com/kpu/kenlm): ')
        command = (
            f'cat {" ".join([str(path) for path in tokenized_data_paths])} | {kenlm_path} -o 3 > {output_model_path}'
        )
        run_command(command, mute=False)
    for path in tokenized_data_paths:
        path.unlink()
    return output_model_dir


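# Illustrative sketch (assumption, not original code): scoring text with the trained ARPA model
# via the `kenlm` Python bindings. The input must be tokenized with the same SentencePiece
# tokenizer trained above (space-joined tokens), otherwise the scores are not meaningful.
def example_score_tokenized_sentence(model_dir, tokenized_sentence):
    import kenlm  # Python bindings from https://github.com/kpu/kenlm

    model = kenlm.Model(str(Path(model_dir) / 'kenlm_model.arpa'))
    return model.score(tokenized_sentence)  # log10 probability of the token sequence

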
def train_sentencepiece(input_filepaths, vocab_size, sentencepiece_model_path, num_threads=64, max_lines=10**7):
    with log_action('Training sentencepiece'):
        sentencepiece_model_path.parent.mkdir(parents=True, exist_ok=True)
        sentencepiece_model_prefix = sentencepiece_model_path.parent / sentencepiece_model_path.stem
        args_str = f'''
        --bos_id=-1 --eos_id=-1
        --input={",".join([str(path) for path in input_filepaths])} --model_prefix={sentencepiece_model_prefix}
        --vocab_size={vocab_size} --num_threads={num_threads} --character_coverage=0.9995
        '''
        if sum([count_lines(filepath) for filepath in input_filepaths]) > max_lines:
            args_str += f' --input_sentence_size={max_lines} --shuffle_input_sentence=true'
        args_str = remove_multiple_whitespaces(args_str.replace('\n', ' ')).strip(' ')
        spm.SentencePieceTrainer.Train(args_str)
        return sentencepiece_model_path


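# Illustrative sketch (assumption): loading the trained model and segmenting a sentence. Since
# the model prefix is derived from sentencepiece_model_path's stem, the returned path points at
# the trained .model file produced above.
def example_encode_with_sentencepiece(sentencepiece_model_path, sentence):
    sp = spm.SentencePieceProcessor()
    sp.Load(str(sentencepiece_model_path))
    return sp.EncodeAsPieces(sentence)

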
def compute_and_save_simplification_pairs(
    query_sentences_path,
    db_sentences_paths,
    base_index_path,
    get_embeddings,
    cache_dir,
    pairs_dir,
    topk,
    nprobe,
    language,
    filter_kwargs,
    is_simpler,
):
    simplifications_path = get_pairs_path(
        query_sentences_path, db_sentences_paths, topk, nprobe, filter_kwargs, pairs_dir
    )
    if not simplifications_path.exists():
        paraphrase_pairs = get_paraphrase_pairs(
            query_sentences_path,
            db_sentences_paths,
            base_index_path,
            get_embeddings,
            cache_dir,
            topk,
            nprobe,
            filter_kwargs,
        )
        filters = {
            'is_simpler': is_simpler,
        }
        if filter_kwargs.get('filter_ne', True):
            filters['has_hallucinated_ne'] = lambda pair: not has_hallucinated_named_entities(*pair, language=language)
        with log_action('filtering'):
            simplification_pairs = filter_candidate_pairs(paraphrase_pairs, filters)
        write_pairs_to_file(simplification_pairs, simplifications_path)
    return simplifications_path


def get_paraphrase_pairs(
    query_sentences_path, db_sentences_paths, base_index_path, get_embeddings, cache_dir, topk, nprobe, filter_kwargs
):
    candidate_pairs = print_running_time(find_nearest_neighbors)(
        [query_sentences_path],
        db_sentences_paths,
        base_index_path,
        get_embeddings,
        cache_dir,
        topk=topk,
        nprobe=nprobe,
        distance_threshold=filter_kwargs['distance'],
        density_threshold=filter_kwargs['density'],
    )
    print(f'#candidates: {len(candidate_pairs)}')
    filters = {
        # Macro filter: operates on the whole list of pairs at once to remove duplicates
        'macro-duplicates': lambda pairs: list(set(pairs)),
        'is_contained': lambda pair: not is_contained(*pair),
        'is_overlapping': lambda pair: not is_overlapping(*pair),
    }
    if filter_kwargs.get('levenshtein', 0) > 0:
        filters['is_different_enough'] = lambda pair: is_different_enough(*pair, threshold=filter_kwargs['levenshtein'])
    with log_action('filtering'):
        return filter_candidate_pairs(candidate_pairs, filters)


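# Illustrative sketch (assumption): the shape of the filter_kwargs dict consumed by the two
# functions above. The numeric values are placeholders, not the thresholds used to build the
# released data.
EXAMPLE_FILTER_KWARGS = {
    'density': 0.6,      # density_threshold passed to find_nearest_neighbors
    'distance': 0.05,    # distance_threshold passed to find_nearest_neighbors
    'levenshtein': 0.2,  # threshold passed to is_different_enough (0 disables that filter)
    'filter_ne': True,   # enable the hallucinated-named-entity filter
}

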
)
from muss.mining.filtering import SimplicityScorer


ccnet_dir = Path(
    input(
        'Please download the CCNet corpus from https://github.com/facebookresearch/cc_net and enter the path to the downloaded data: '
    )
)
language = input('What language do you want to process? (en/fr/es): ')
cluster = 'local'
dataset_dir = get_dataset_dir('uts') / language
# For large jobs only
slurm_partition = 'dev,scavenge'
slurm_array_parallelism = 1024

# Split CCNet shards into subshards
with log_action('Splitting CCNet shards into smaller subshards'):
    # We need to split each shard even more for the LASER embeddings to fit in memory
    n_shards = {  # Number of shards to take for each language for ~1B sentences
        'en': 15,
        'fr': 25,
        'es': 13,  # We would need about 20 shards for 1B sentences, but there are only 13
    }[language]
    ccnet_filepaths = [ccnet_dir / f'{language}_head_{i:04d}.json.gz' for i in range(n_shards)]
    raw_original_dir = dataset_dir / 'raw_original'
    raw_original_dir.mkdir(exist_ok=True, parents=True)
    output_dirs = [raw_original_dir / f'{language}_head_{i:04d}' for i in range(n_shards)]