def mix_datasets(datasets, props=None, new_dataset=None):
    if len(set(datasets)) == 1:
        return datasets[0]
    if props is None:
        props = [1 / len(datasets)] * len(datasets)
    assert len(props) == len(datasets)
    assert all([get_dataset_dir(dataset).exists() for dataset in datasets])
    # Sort in unison according to dataset names
    datasets, props = zip(*sorted(zip(datasets, props)))
    if new_dataset is None:
        new_dataset = 'mix-' + '-'.join([f'{dataset}_{prop:.2f}' for dataset, prop in zip(datasets, props)])
    with create_directory_or_skip(get_dataset_dir(new_dataset)):
        print('Mixing datasets...')
        for phase, language in product(PHASES, LANGUAGES):
            input_files = [get_data_filepath(dataset, phase, language) for dataset in datasets]
            # If one of the input files does not exist, remove it along with its prop and renormalize
            input_files, current_props = zip(
                *[(input_file, prop) for input_file, prop in zip(input_files, props) if input_file.exists()])
            current_props = np.array(current_props) / np.sum(current_props)
            output_file = get_data_filepath(new_dataset, phase, language)
            # TODO: Jointly mix files
            # The seed is set every time mix is called, so the files should be mixed in the same order
            mix_files(input_files, current_props, output_file)
            shuffle_file_lines(output_file)
    return new_dataset
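# Illustrative usage sketch (not from the original code): the dataset names and
# proportions below are placeholders and assume the corresponding dataset
# directories already exist on disk.
#   mixed_dataset = mix_datasets(['wikilarge', 'asset'], props=[0.8, 0.2])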
def create_preprocessed_dataset(dataset, preprocessors, n_jobs=1):
    for preprocessor in preprocessors:
        # Fit preprocessor on input dataset
        preprocessor.fit(get_data_filepath(dataset, 'train', 'complex'),
                         get_data_filepath(dataset, 'train', 'simple'))
        dataset = create_preprocessed_dataset_one_preprocessor(dataset, preprocessor, n_jobs)
    return dataset
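# Illustrative usage sketch (assumed names): preprocessors would typically be
# instances of the project's preprocessor classes; the constructor arguments and
# dataset name below are placeholders, not the project's actual defaults.
#   preprocessors = [SentencePiecePreprocessor(vocab_size=32000)]
#   preprocessed_dataset = create_preprocessed_dataset('wikilarge', preprocessors, n_jobs=4)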
def check_dataset(dataset):
    # Sanity check with evaluation dataset
    if has_lines_in_common(get_data_filepath(dataset, 'train', 'complex'),
                           get_data_filepath('asset', 'valid', 'complex')):
        warnings.warn('WARNING: Dataset has validation samples in training set!')
    if has_lines_in_common(get_data_filepath(dataset, 'train', 'complex'),
                           get_data_filepath('asset', 'test', 'complex')):
        warnings.warn('WARNING: Dataset has test samples in training set!')
def apply_line_function_to_dataset(line_function, dataset, new_dataset, languages=LANGUAGES):
    '''Provided function signature: line_function(line) -> line'''
    with create_directory_or_skip(get_dataset_dir(new_dataset)):
        for phase, language in product(PHASES, languages):
            source_filepath = get_data_filepath(dataset, phase, language)
            target_filepath = get_data_filepath(new_dataset, phase, language)
            if not source_filepath.exists():
                continue
            apply_line_function_to_file(line_function, source_filepath, target_filepath)
    return new_dataset
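# Illustrative usage sketch: any callable mapping a line to a line works, e.g.
# lowercasing every line. The target dataset name is a placeholder.
#   lowercased = apply_line_function_to_dataset(str.lower, 'wikilarge', 'wikilarge_lowercased')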
def mlm_fairseq_preprocess(dataset):
    '''Too specific for ts.fairseq.base.fairseq_preprocess'''
    dataset_dir = get_dataset_dir(dataset)
    with lock_directory(dataset_dir):
        preprocessed_dir = dataset_dir / 'fairseq_preprocessed'
        with create_directory_or_skip(preprocessed_dir):
            vocab_path = get_data_filepath(dataset, 'vocab', 'fr')
            assert vocab_path.exists()
            trainpref = get_data_filepath(dataset, 'train', 'fr')
            validpref = get_data_filepath(dataset, 'valid', 'fr')
            testpref = get_data_filepath(dataset, 'test', 'fr')
            command = (
                f'fairseq-preprocess --only-source --trainpref {trainpref} --validpref {validpref} '
                f'--testpref {testpref} --destdir {preprocessed_dir} --workers 64 --srcdict {vocab_path}'
            )
            print(command)
            run_command(command)
    return preprocessed_dir
def combine_simplifications_in_dataset(simplification_pairs, dataset):
    with create_directory_or_skip(get_dataset_dir(dataset)):
        assert len(simplification_pairs) > 30000, f'Not enough pairs: {len(simplification_pairs)}'
        indexes = np.random.permutation(len(simplification_pairs))
        for phase, start_index, end_index in [
            ('test', 10000, 20000),
            ('valid', 20000, 30000),
            ('train', 30000, len(indexes)),
        ]:
            with write_lines_in_parallel([get_data_filepath(dataset, phase, 'complex'),
                                          get_data_filepath(dataset, phase, 'simple')]) as files:
                for idx in tqdm(indexes[start_index:end_index]):
                    files.write(simplification_pairs[idx])
    return get_dataset_dir(dataset)
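# Illustrative usage sketch (placeholder data): simplification_pairs is expected
# to be an indexable collection of (complex, simple) line pairs, large enough to
# fill the 10k test / 10k valid splits plus a training split. read_lines and the
# paths below are assumed helpers/placeholders, not part of this module.
#   pairs = list(zip(read_lines(complex_path), read_lines(simple_path)))
#   combine_simplifications_in_dataset(pairs, 'my_combined_dataset')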
def finetune_and_predict_on_dataset(finetuning_dataset, exp_dir, **kwargs):
    kwargs['train_kwargs']['ngpus'] = 1
    prefix = 'finetune'
    if kwargs.get('fast_parametrization_search', False):
        prefix += '_fast'
    pred_filepaths = [
        exp_dir / f'{prefix}_{finetuning_dataset}_valid-test_{finetuning_dataset}_valid.pred',
        exp_dir / f'{prefix}_{finetuning_dataset}_valid-test_{finetuning_dataset}_test.pred',
    ]
    if all([path.exists() for path in pred_filepaths]):
        return
    for phase, pred_filepath in zip(['valid', 'test'], pred_filepaths):
        orig_sents_path = get_data_filepath(finetuning_dataset, phase, 'complex')
        refs_sents_paths = list(get_dataset_dir(finetuning_dataset).glob(f'{phase}.simple*'))
        kwargs['evaluate_kwargs'] = {
            'test_set': 'custom',
            'orig_sents_path': orig_sents_path,
            'refs_sents_paths': refs_sents_paths,
        }
        if phase == 'valid':
            # Finetune preprocessors_kwargs only on valid
            kwargs['preprocessors_kwargs'] = find_best_parametrization(exp_dir, **kwargs)
        shutil.copyfile(fairseq_get_simplifier(exp_dir, **kwargs)(orig_sents_path), pred_filepath)
def get_transformer_kwargs(dataset, language, use_access, use_short_name=False):
    kwargs = {
        'dataset': dataset,
        'parametrization_budget': 128,
        'predict_files': get_predict_files(language),
        'train_kwargs': {
            'ngpus': 8,
            'arch': 'bart_large',
            'max_tokens': 4096,
            'truncate_source': True,
            'layernorm_embedding': True,
            'share_all_embeddings': True,
            'share_decoder_input_output_embed': True,
            'required_batch_size_multiple': 1,
            'criterion': 'label_smoothed_cross_entropy',
            'lr': 3e-04,
            'label_smoothing': 0.1,
            'dropout': 0.1,
            'attention_dropout': 0.1,
            'weight_decay': 0.01,
            'optimizer': 'adam',
            'adam_betas': '(0.9, 0.999)',
            'adam_eps': 1e-08,
            'clip_norm': 0.1,
        },
        'preprocessors_kwargs': {
            'SentencePiecePreprocessor': {
                'vocab_size': 32000,
                'input_filepaths': [
                    get_data_filepath(dataset, 'train', 'complex'),
                    get_data_filepath(dataset, 'train', 'simple'),
                ],
            }
            # 'SentencePiecePreprocessor': {'vocab_size': 32000, 'input_filepaths': [get_dataset_dir('enwiki') / 'all_sentences']}
        },
        'evaluate_kwargs': get_evaluate_kwargs(language),
    }
    if use_access:
        kwargs['preprocessors_kwargs'] = add_dicts(
            get_access_preprocessors_kwargs(language, use_short_name=use_short_name),
            kwargs['preprocessors_kwargs'])
    return kwargs
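# Illustrative usage sketch: builds the training configuration for an English
# run with the ACCESS preprocessors enabled; the dataset name is a placeholder
# and overriding ngpus is only an example of tweaking the returned dict.
#   kwargs = get_transformer_kwargs('wikilarge', 'en', use_access=True)
#   kwargs['train_kwargs']['ngpus'] = 1  # e.g. scale down for a single-GPU run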
def fairseq_evaluate_and_save(exp_dir, **kwargs):
    scores = fairseq_evaluate(exp_dir, **kwargs)
    print(f'scores={scores}')
    report_path = exp_dir / 'easse_report.html'
    shutil.move(get_easse_report_from_exp_dir(exp_dir, **kwargs), report_path)
    print(f'report_path={report_path}')
    predict_files = kwargs.get('predict_files', [
        get_data_filepath('asset', 'valid', 'complex'),
        get_data_filepath('asset', 'test', 'complex'),
    ])
    for source_path in predict_files:
        pred_path = get_predictions(source_path, exp_dir, **kwargs)
        shutil.copyfile(source_path, exp_dir / source_path.name)
        new_pred_path = exp_dir / source_path.with_suffix('.pred').name
        shutil.move(pred_path, new_pred_path)
        print(f'source_path={source_path}')
        print(f'pred_path={new_pred_path}')
    return scores
def get_scores_on_dataset(pred_path, dataset, phase):
    orig_sents_path = get_data_filepath(dataset, phase, 'complex')
    refs_sents_paths = list(get_dataset_dir(dataset).glob(f'{phase}.simple*'))
    return evaluate_system_output(
        'custom',
        sys_sents_path=pred_path,
        orig_sents_path=orig_sents_path,
        refs_sents_paths=refs_sents_paths,
        metrics=['sari', 'bleu', 'fkgl', 'sari_by_operation'],
        quality_estimation=False,
    )
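# Illustrative usage sketch (placeholder path): scores a system output file
# against the references of a given dataset split.
#   scores = get_scores_on_dataset('system_output.pred', 'asset', 'test')
#   print(scores)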
def fairseq_preprocess(dataset, dict_path=None, source_lang='complex', target_lang='simple'):
    dataset_dir = get_dataset_dir(dataset)
    with lock_directory(dataset_dir):
        preprocessed_dir = dataset_dir / f'fairseq_preprocessed_{source_lang}-{target_lang}'
        with create_directory_or_skip(preprocessed_dir):
            # HACK: symlink the standard 'complex'/'simple' files to the requested source/target language names
            for phase in PHASES:
                for language, new_language in zip(LANGUAGES, [source_lang, target_lang]):
                    symlink_path = get_data_filepath(dataset, phase, new_language)
                    if not symlink_path.exists():
                        symlink_path.symlink_to(get_data_filepath(dataset, phase, language))
            trainpref = str(get_data_filepath(dataset, 'train', 'dummy')).replace('.dummy', '')
            validpref = str(get_data_filepath(dataset, 'valid', 'dummy')).replace('.dummy', '')
            testpref = str(get_data_filepath(dataset, 'test', 'dummy')).replace('.dummy', '')
            args = f'''
            --source-lang {source_lang} --target-lang {target_lang}
            --trainpref {trainpref} --validpref {validpref} --testpref {testpref}
            --destdir {preprocessed_dir} --bpe sentencepiece
            --joined-dictionary --workers 32
            '''
            if dict_path is not None:
                args = f'{args} --srcdict {dict_path}'
            args = remove_multiple_whitespaces(args.replace('\n', ' ')).strip(' ')
            print(f'fairseq-preprocess {args}')
            args = shlex.split(args)
            with mock_cli_args(args):
                preprocess.cli_main()
    return preprocessed_dir
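# Illustrative usage sketch: binarizes a dataset with fairseq in the default
# complex->simple direction; the dataset name and dictionary path are placeholders.
#   preprocessed_dir = fairseq_preprocess('wikilarge')
#   preprocessed_dir = fairseq_preprocess('wikilarge', dict_path='path/to/dict.txt')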
def prepare_asset():
    print('ASSET')
    dataset = 'asset'
    with create_directory_or_skip(get_dataset_dir(dataset)):
        for phase in ('valid', 'test'):
            for i in range(10):
                for (old_language_name, new_language_name) in [('orig', 'complex'), (f'simp.{i}', f'simple.{i}')]:
                    url = f'https://raw.githubusercontent.com/facebookresearch/asset/master/dataset/asset.{phase}.{old_language_name}'
                    old_path = download(url)
                    new_path = get_data_filepath(dataset, phase, new_language_name)
                    shutil.copyfile(old_path, new_path)
                    add_newline_at_end_of_file(new_path)
    print('Done.')
def get_predict_files(language):
    return {
        'en': [
            get_data_filepath('asset', 'valid', 'complex'),
            get_data_filepath('asset', 'test', 'complex'),
        ],
        'fr': [
            get_data_filepath('alector', 'valid', 'complex'),
            get_data_filepath('alector', 'test', 'complex'),
        ],
        'es': [
            get_data_filepath('simplext_corpus', 'valid', 'complex'),
            get_data_filepath('simplext_corpus', 'test', 'complex'),
        ],
    }[language]
def get_evaluate_kwargs(language, phase='valid'):
    return {
        ('en', 'valid'): {'test_set': 'asset_valid'},
        ('en', 'test'): {'test_set': 'asset_test'},
        ('fr', 'valid'): {
            'test_set': 'custom',
            'orig_sents_path': get_data_filepath('alector', 'valid', 'complex'),
            'refs_sents_paths': [get_data_filepath('alector', 'valid', 'simple')],
        },
        ('fr', 'test'): {
            'test_set': 'custom',
            'orig_sents_path': get_data_filepath('alector', 'test', 'complex'),
            'refs_sents_paths': [get_data_filepath('alector', 'test', 'simple')],
        },
        ('es', 'valid'): {
            'test_set': 'custom',
            'orig_sents_path': get_data_filepath('simplext_corpus', 'valid', 'complex'),
            'refs_sents_paths': [get_data_filepath('simplext_corpus', 'valid', 'simple')],
        },
        ('es', 'test'): {
            'test_set': 'custom',
            'orig_sents_path': get_data_filepath('simplext_corpus', 'test', 'complex'),
            'refs_sents_paths': [get_data_filepath('simplext_corpus', 'test', 'simple')],
        },
    }[(language, phase)]
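# Illustrative usage sketch: look up the evaluation settings for a given
# language and split.
#   evaluate_kwargs = get_evaluate_kwargs('fr', phase='test')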
def prepare_wikilarge():
    print('WikiLarge')
    dataset = 'wikilarge'  # dataset = 'wikismall' works as well
    with create_directory_or_skip(get_dataset_dir(dataset)):
        url = 'https://github.com/louismartin/dress-data/raw/master/data-simplification.tar.bz2'
        extracted_path = download_and_extract(url)[0]
        # Process
        print('Processing...')
        # Only rename the files and put them in the local directory structure
        # FIXME: The WikiLarge validation set only has 992 sentences
        for phase in PHASES:
            for (old_language_name, new_language_name) in [('src', 'complex'), ('dst', 'simple')]:
                old_path_glob = os.path.join(extracted_path, dataset, f'*.ori.{phase}.{old_language_name}')
                globs = glob(old_path_glob)
                assert len(globs) == 1
                old_path = globs[0]
                new_path = get_data_filepath(dataset, phase, new_language_name)
                shutil.copyfile(old_path, new_path)
                shutil.move(replace_lrb_rrb_file(new_path), new_path)
                add_newline_at_end_of_file(new_path)
    print('Done.')
def get_all_baseline_rows():
    paths = {
        ('asset', 'test'): ('en', TEST_SETS_PATHS[('asset_test', 'orig')], TEST_SETS_PATHS[('asset_test', 'refs')]),
        ('asset', 'valid'): ('en', TEST_SETS_PATHS[('asset_valid', 'orig')], TEST_SETS_PATHS[('asset_valid', 'refs')]),
        ('turkcorpus_detokenized', 'test'): (
            'en',
            TEST_SETS_PATHS[('turkcorpus_test', 'orig')],
            TEST_SETS_PATHS[('turkcorpus_test', 'refs')],
        ),
        ('turkcorpus_detokenized', 'valid'): (
            'en',
            TEST_SETS_PATHS[('turkcorpus_valid', 'orig')],
            TEST_SETS_PATHS[('turkcorpus_valid', 'refs')],
        ),
        ('alector', 'test'): (
            'fr',
            get_data_filepath('alector', 'test', 'complex'),
            [get_data_filepath('alector', 'test', 'simple')],
        ),
        ('alector', 'valid'): (
            'fr',
            get_data_filepath('alector', 'valid', 'complex'),
            [get_data_filepath('alector', 'valid', 'simple')],
        ),
        # Old dataset with problems
        ('simplext_corpus_all', 'test'): (
            'es',
            get_data_filepath('simplext_corpus_all', 'test', 'complex'),
            [get_data_filepath('simplext_corpus_all', 'test', 'simple')],
        ),
        ('simplext_corpus_all', 'valid'): (
            'es',
            get_data_filepath('simplext_corpus_all', 'valid', 'complex'),
            [get_data_filepath('simplext_corpus_all', 'valid', 'simple')],
        ),
        ('simplext_corpus_all_fixed', 'test'): (
            'es',
            get_data_filepath('simplext_corpus_all_fixed', 'test', 'complex'),
            [get_data_filepath('simplext_corpus_all_fixed', 'test', 'simple')],
        ),
        ('simplext_corpus_all_fixed', 'valid'): (
            'es',
            get_data_filepath('simplext_corpus_all_fixed', 'valid', 'complex'),
            [get_data_filepath('simplext_corpus_all_fixed', 'valid', 'simple')],
        ),
        ('simpitiki', 'test'): (
            'it',
            get_data_filepath('simpitiki', 'test', 'complex'),
            [get_data_filepath('simpitiki', 'test', 'simple')],
        ),
        ('simpitiki', 'valid'): (
            'it',
            get_data_filepath('simpitiki', 'valid', 'complex'),
            [get_data_filepath('simpitiki', 'valid', 'simple')],
        ),
    }
    rows = []
    for (dataset, phase), (language, orig_sents_path, refs_sents_paths) in tqdm(paths.items()):
        dataset_rows = get_baseline_rows(orig_sents_path, tuple(refs_sents_paths), language)
        for row in dataset_rows:
            row['dataset'] = dataset
            row['phase'] = phase
        rows.extend(dataset_rows)
    return rows