Example #1
def get_baseline_rows(orig_sents_path, refs_sents_paths, language):
    refs_sents_paths = list(refs_sents_paths)
    rows = []
    # Identity baseline: score the unmodified source sentences against the references.
    scores = evaluate_system_output(
        'custom',
        sys_sents_path=orig_sents_path,
        orig_sents_path=orig_sents_path,
        refs_sents_paths=refs_sents_paths,
        metrics=['sari', 'bleu', 'fkgl', 'sari_by_operation'],
        quality_estimation=False,
    )
    row = {
        'exp_name': 'Identity',
        'language': language,
    }
    rows.append(add_dicts(row, scores))

    # Truncate baseline: drop the trailing 20% of each source sentence.
    scores = evaluate_system_output(
        'custom',
        sys_sents_path=apply_line_function_to_file(
            lambda sentence: truncate(
                sentence, truncate_prop=0.2, language=language),
            orig_sents_path),
        orig_sents_path=orig_sents_path,
        refs_sents_paths=refs_sents_paths,
        metrics=['sari', 'bleu', 'fkgl', 'sari_by_operation'],
        quality_estimation=False,
    )
    row = {
        'exp_name': 'Truncate',
        'language': language,
    }
    rows.append(add_dicts(row, scores))

    if len(refs_sents_paths) > 1:
        # Reference baseline: use each reference in turn as the system output
        # and score it against the other references. The previous reference
        # (wrapping around for i == 0) is prepended as a duplicate so the
        # number of references stays constant across evaluations.
        for i in range(len(refs_sents_paths)):
            scores = evaluate_system_output(
                'custom',
                sys_sents_path=refs_sents_paths[i],
                orig_sents_path=orig_sents_path,
                refs_sents_paths=[refs_sents_paths[i - 1]] +
                refs_sents_paths[:i] + refs_sents_paths[i + 1:],
                metrics=['sari', 'bleu', 'fkgl', 'sari_by_operation'],
                quality_estimation=False,
            )
            row = {
                'exp_name': 'Reference',
                'language': language,
                'job_id': f'ref_{i}',
            }
            rows.append(add_dicts(row, scores))
    return rows
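
A note on add_dicts: every example on this page merges a metadata row with a dict of metric scores through it. The helper itself is not shown here; a minimal sketch of its assumed semantics (merge left to right, later dicts winning on key collisions, inputs left unmodified):

def add_dicts(*dicts):
    # Assumed behaviour: later dicts override earlier ones; a new dict is
    # returned so none of the inputs is mutated.
    merged = {}
    for d in dicts:
        merged.update(d)
    return merged
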
Example #2
def get_score_rows(exp_dir, kwargs, additional_fields=None):
    rows = []
    language = get_language_from_dataset(kwargs['dataset'])
    for pred_path in exp_dir.glob('finetune_*.pred'):
        dataset, phase = re.match(r'finetune_.+?_valid-test_(.+)_(.+?)\.pred$',
                                  pred_path.name).groups()
        scores = get_scores_on_dataset(pred_path, dataset, phase)
        row = {
            'language': language,
            'dataset': dataset,
            'phase': phase,
        }
        if additional_fields is not None:
            row = add_dicts(row, additional_fields)
        rows.append(add_dicts(row, scores))
    return rows
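
For illustration, the pattern above extracts the dataset and phase from a prediction filename; the filename here is hypothetical:

import re

match = re.match(r'finetune_.+?_valid-test_(.+)_(.+?)\.pred$',
                 'finetune_bart_valid-test_asset_test.pred')
print(match.groups())  # ('asset', 'test')
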
Example #3
def get_bart_kwargs(dataset,
                    language,
                    use_access,
                    use_short_name=False,
                    bart_model='bart.large'):
    assert language == 'en'
    bart_path = prepare_bart_model(bart_model) / 'model.pt'
    arch = {
        'bart.base': 'bart_base',
        'bart.large': 'bart_large',
        'bart.large.cnn': 'bart_large',
    }[bart_model]
    kwargs = {
        'dataset': dataset,
        'metrics_coefs': [0, 1, 0],
        'parametrization_budget': 128,
        'predict_files': get_predict_files(language),
        'preprocessors_kwargs': {
            'GPT2BPEPreprocessor': {},
        },
        'preprocess_kwargs': {
            'dict_path': GPT2BPEPreprocessor().dict_path
        },
        'train_kwargs': {
            'ngpus': 8,
            'arch': arch,
            'restore_file': bart_path,
            'max_tokens': 4096,
            'lr': 3e-05,
            'warmup_updates': 500,
            'truncate_source': True,
            'layernorm_embedding': True,
            'share_all_embeddings': True,
            'share_decoder_input_output_embed': True,
            'reset_optimizer': True,
            'reset_dataloader': True,
            'reset_meters': True,
            'required_batch_size_multiple': 1,
            'criterion': 'label_smoothed_cross_entropy',
            'label_smoothing': 0.1,
            'dropout': 0.1,
            'attention_dropout': 0.1,
            'weight_decay': 0.01,
            'optimizer': 'adam',
            'adam_betas': '(0.9, 0.999)',
            'adam_eps': 1e-08,
            'clip_norm': 0.1,
            'lr_scheduler': 'polynomial_decay',
            'max_update': 20000,
            'skip_invalid_size_inputs_valid_test': True,
            'find_unused_parameters': True,
        },
        'evaluate_kwargs': get_evaluate_kwargs(language),
    }
    if use_access:
        kwargs['preprocessors_kwargs'] = add_dicts(
            get_access_preprocessors_kwargs(language,
                                            use_short_name=use_short_name),
            kwargs['preprocessors_kwargs'])
    return kwargs
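
A hypothetical call, with 'asset' as a stand-in dataset name. With use_access=True, the ACCESS preprocessor kwargs are merged in first, so (given the add_dicts semantics sketched earlier) any preprocessor already present in preprocessors_kwargs keeps its own settings on a key collision:

kwargs = get_bart_kwargs(dataset='asset', language='en', use_access=True)
print(kwargs['train_kwargs']['arch'])  # 'bart_large' (the default bart.large checkpoint)
print('GPT2BPEPreprocessor' in kwargs['preprocessors_kwargs'])  # True
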
Example #4
def get_mbart_kwargs(dataset, language, use_access, use_short_name=False):
    mbart_dir = prepare_mbart_model()
    mbart_path = mbart_dir / 'model.pt'
    # source_lang = f'{language}_XX'
    # target_lang = f'{language}_XX'
    source_lang = 'complex'
    target_lang = 'simple'
    kwargs = {
        'dataset': dataset,
        'metrics_coefs': [0, 1, 0],
        'parametrization_budget': 128,
        'predict_files': get_predict_files(language),
        'preprocessors_kwargs': {
            'SentencePiecePreprocessor': {
                'sentencepiece_model_path': mbart_dir / 'sentence.bpe.model',
                'tokenize_special_tokens': True,
            },
        },
        'preprocess_kwargs': {
            'dict_path': mbart_dir / 'dict.txt',
            'source_lang': source_lang,
            'target_lang': target_lang,
        },
        'train_kwargs': add_dicts(
            {'ngpus': 8},
            args_str_to_dict(
                f'''--restore-file {mbart_path} --arch mbart_large --task translation_from_pretrained_bart --source-lang {source_lang} --target-lang {target_lang} --encoder-normalize-before --decoder-normalize-before --criterion label_smoothed_cross_entropy --label-smoothing 0.2 --dataset-impl mmap --optimizer adam --adam-eps 1e-06 --adam-betas '(0.9, 0.98)' --lr-scheduler polynomial_decay --lr 3e-05 --min-lr -1 --warmup-updates 2500 --total-num-update 40000 --dropout 0.3 --attention-dropout 0.1 --weight-decay 0.0 --max-tokens 1024 --update-freq 2 --log-format simple --log-interval 2 --reset-optimizer --reset-meters --reset-dataloader --reset-lr-scheduler --langs ar_AR,cs_CZ,de_DE,en_XX,es_XX,et_EE,fi_FI,fr_XX,gu_IN,hi_IN,it_IT,ja_XX,kk_KZ,ko_KR,lt_LT,lv_LV,my_MM,ne_NP,nl_XX,ro_RO,ru_RU,si_LK,tr_TR,vi_VN,zh_CN --layernorm-embedding --ddp-backend no_c10d'''  # noqa: E501
            ),
        ),
        'generate_kwargs': args_str_to_dict(
            f'''--task translation_from_pretrained_bart --source-lang {source_lang} --target-lang {target_lang} --batch-size 32 --langs ar_AR,cs_CZ,de_DE,en_XX,es_XX,et_EE,fi_FI,fr_XX,gu_IN,hi_IN,it_IT,ja_XX,kk_KZ,ko_KR,lt_LT,lv_LV,my_MM,ne_NP,nl_XX,ro_RO,ru_RU,si_LK,tr_TR,vi_VN,zh_CN'''  # noqa: E501
        ),
        'evaluate_kwargs': get_evaluate_kwargs(language),
    }
    if use_access:
        kwargs['preprocessors_kwargs'] = add_dicts(
            get_access_preprocessors_kwargs(language,
                                            use_short_name=use_short_name),
            kwargs['preprocessors_kwargs'])
    return kwargs
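
Both argument strings above are parsed by args_str_to_dict, whose implementation is not shown. A minimal sketch of its assumed behaviour: dashes in flag names become underscores, quoted values are kept whole, and valueless flags are treated as booleans. The real helper may differ in details:

import shlex

def args_str_to_dict(args_str):
    tokens = shlex.split(args_str)  # keeps quoted values like '(0.9, 0.98)' intact
    args_dict = {}
    i = 0
    while i < len(tokens):
        key = tokens[i].lstrip('-').replace('-', '_')
        if i + 1 < len(tokens) and not tokens[i + 1].startswith('--'):
            args_dict[key] = tokens[i + 1]  # flag with a value (e.g. --lr 3e-05)
            i += 2
        else:
            args_dict[key] = True  # bare flag (e.g. --reset-optimizer)
            i += 1
    return args_dict
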
Example #5
import nevergrad as ng


def get_parametrization(preprocessors_kwargs):
    parametrization_kwargs = {}
    for preprocessor_name, preprocessor_kwargs in preprocessors_kwargs.items():
        assert '_' not in preprocessor_name
        nevergrad_variables = add_dicts(
            preprocessor_kwargs,
            get_preprocessor_by_name(preprocessor_name).get_nevergrad_variables())
        parametrization_kwargs[preprocessor_name] = ng.p.Dict(**nevergrad_variables)
    return ng.p.Instrumentation(**parametrization_kwargs)
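
A hypothetical illustration of the structure get_parametrization builds: one ng.p.Dict of tunable variables per preprocessor, all wrapped in a single ng.p.Instrumentation. The preprocessor name and its variable below are stand-ins:

import nevergrad as ng

parametrization = ng.p.Instrumentation(
    LengthRatioPreprocessor=ng.p.Dict(
        target_ratio=ng.p.Scalar(lower=0.2, upper=1.5)),
)
print(parametrization.value)
# e.g. ((), {'LengthRatioPreprocessor': {'target_ratio': 0.85}})
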
Example #6
def get_transformer_kwargs(dataset,
                           language,
                           use_access,
                           use_short_name=False):
    kwargs = {
        'dataset': dataset,
        'parametrization_budget': 128,
        'predict_files': get_predict_files(language),
        'train_kwargs': {
            'ngpus': 8,
            'arch': 'bart_large',
            'max_tokens': 4096,
            'truncate_source': True,
            'layernorm_embedding': True,
            'share_all_embeddings': True,
            'share_decoder_input_output_embed': True,
            'required_batch_size_multiple': 1,
            'criterion': 'label_smoothed_cross_entropy',
            'lr': 3e-04,
            'label_smoothing': 0.1,
            'dropout': 0.1,
            'attention_dropout': 0.1,
            'weight_decay': 0.01,
            'optimizer': 'adam',
            'adam_betas': '(0.9, 0.999)',
            'adam_eps': 1e-08,
            'clip_norm': 0.1,
        },
        'preprocessors_kwargs': {
            'SentencePiecePreprocessor': {
                'vocab_size': 32000,
                'input_filepaths': [
                    get_data_filepath(dataset, 'train', 'complex'),
                    get_data_filepath(dataset, 'train', 'simple'),
                ],
            }
            # 'SentencePiecePreprocessor': {'vocab_size': 32000, 'input_filepaths':  [get_dataset_dir('enwiki') / 'all_sentences']}
        },
        'evaluate_kwargs': get_evaluate_kwargs(language),
    }
    if use_access:
        kwargs['preprocessors_kwargs'] = add_dicts(
            get_access_preprocessors_kwargs(language,
                                            use_short_name=use_short_name),
            kwargs['preprocessors_kwargs'])
    return kwargs
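
A hypothetical call, again with 'asset' as a stand-in dataset name. Unlike the BART kwargs above, train_kwargs contains no restore_file and uses a ten times higher learning rate (3e-04 vs 3e-05): this configuration trains the bart_large architecture from scratch instead of fine-tuning a pretrained checkpoint:

kwargs = get_transformer_kwargs(dataset='asset', language='en', use_access=False)
assert 'restore_file' not in kwargs['train_kwargs']  # no pretrained checkpoint
print(kwargs['train_kwargs']['lr'])  # 0.0003
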
Example #7
def wrapped(self, *args, **kwargs):
    if not hasattr(self, 'args') or not hasattr(self, 'kwargs'):
        # TODO: Default args are not overwritten if provided as args
        self.args = args
        self.kwargs = add_dicts(get_default_args(constructor), kwargs)
    return constructor(self, *args, **kwargs)
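
This wrapper only makes sense inside a decorator that closes over constructor. A minimal sketch of the assumed surrounding code; the decorator name and get_default_args are reconstructions, not the original implementation:

import functools
import inspect

def get_default_args(func):
    # Assumed helper: collect the default values from func's signature.
    signature = inspect.signature(func)
    return {name: param.default
            for name, param in signature.parameters.items()
            if param.default is not inspect.Parameter.empty}

def store_constructor_args(constructor):  # hypothetical decorator name
    # Records the arguments an instance was constructed with (self.args /
    # self.kwargs) so it can later be re-created; defaults from the signature
    # are filled in via add_dicts.
    @functools.wraps(constructor)
    def wrapped(self, *args, **kwargs):
        if not hasattr(self, 'args') or not hasattr(self, 'kwargs'):
            # TODO: Default args are not overwritten if provided as args
            self.args = args
            self.kwargs = add_dicts(get_default_args(constructor), kwargs)
        return constructor(self, *args, **kwargs)
    return wrapped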