Example 1
def test_other_clis(train_params: str, translate_params: str):
    """
    Task: test CLIs and core features other than train & translate.
    """
    with tmp_digits_dataset(prefix="test_other_clis",
                            train_line_count=_TRAIN_LINE_COUNT,
                            train_line_count_empty=_TRAIN_LINE_COUNT_EMPTY,
                            train_max_length=_LINE_MAX_LENGTH,
                            dev_line_count=_DEV_LINE_COUNT,
                            dev_max_length=_LINE_MAX_LENGTH,
                            test_line_count=_TEST_LINE_COUNT,
                            test_line_count_empty=0,
                            test_max_length=_TEST_MAX_LENGTH) as data:
        # train a minimal default model
        data = run_train_translate(train_params=train_params,
                                   translate_params=translate_params,
                                   data=data,
                                   max_seq_len=_LINE_MAX_LENGTH,
                                   use_pytorch=True)

        _test_checkpoint_decoder(data['dev_source'], data['dev_target'],
                                 data['model'])
        _test_mc_dropout(data['model'])
        _test_parameter_averaging(data['model'])
        _test_evaluate_cli(data['test_outputs'], data['test_target'])
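
In the Sockeye test suite, functions like this are typically driven by pytest parametrization, with each case supplying a full CLI string for training and one for translation. The sketch below shows only the general shape; the flag combinations are illustrative placeholders, not the project's actual integration-test settings.

import pytest

# Hypothetical parameter sets: each entry pairs a sockeye-train CLI string with a
# sockeye-translate CLI string. The concrete flags here are placeholders only.
_EXAMPLE_SETTINGS = [
    ("--num-layers 1 --batch-size 8 --max-updates 10",
     "--beam-size 2"),
]

@pytest.mark.parametrize("train_params, translate_params", _EXAMPLE_SETTINGS)
def test_other_clis(train_params: str, translate_params: str):
    ...  # body as shown in the example above
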
Example 2
def check_train_translate(train_params: str,
                          translate_params: str,
                          data: Dict[str, Any],
                          use_prepared_data: bool,
                          max_seq_len: int,
                          compare_output: bool = True,
                          seed: int = 13) -> Dict[str, Any]:
    """
    Tests core features (training, inference).
    """
    # train model and translate test set
    data = run_train_translate(train_params=train_params,
                               translate_params=translate_params,
                               data=data,
                               use_prepared_data=use_prepared_data,
                               max_seq_len=max_seq_len,
                               seed=seed)

    # Test equivalence of batch decoding
    # With neural-vocab-selection, the vocabulary is determined at the batch level, so batch and non-batch
    # outputs may differ.
    if 'greedy' not in translate_params and 'neural-vocab-selection' not in train_params:
        translate_params_batch = translate_params + " --batch-size 2"
        test_translate_equivalence(data,
                                   translate_params_batch,
                                   compare_output=True)

    # Run translate with restrict-lexicon
    if 'neural-vocab-selection' not in train_params:
        data = run_translate_restrict(data, translate_params)

    test_translate_equivalence(data, translate_params, compare_output=True)

    # Test scoring by ensuring that the sockeye.scoring module produces the same scores when scoring the output
    # of sockeye.translate. However, since this training is on very small datasets, the output of sockeye.translate
    # is often pure garbage or empty and cannot be scored. So we only try to score if we have some valid output
    # to work with.
    # Only run scoring under these conditions. Why?
    # - translate splits up too-long sentences and translates them in sequence, invalidating the score, so skip that
    # - scoring requires valid translation output to compare against
    if '--max-input-length' not in translate_params and _translate_output_is_valid(data['test_outputs']) \
            and 'greedy' not in translate_params and 'neural-vocab-selection' not in train_params \
            and _translate_output_is_valid(data['test_with_target_prefix_outputs']):
        test_scoring(data, translate_params, compare_output)

    # Test correct prediction of target factors if enabled
    if compare_output and 'train_target_factors' in data:
        test_odd_even_target_factors(data)

    return data
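
The scoring gate above relies on _translate_output_is_valid, which is not shown in these examples. A plausible minimal implementation, assuming the check only needs to reject empty lines and lines containing reserved vocabulary symbols, could look like the sketch below; the symbol set is an assumption, not Sockeye's actual vocabulary constant.

from typing import List

# Assumed set of reserved symbols; the real check would use the project's own vocabulary constants.
_BAD_TOKENS = {"<unk>", "<s>", "</s>", "<pad>"}

def _translate_output_is_valid(translate_outputs: List[str]) -> bool:
    """Sketch: require at least one non-empty output and no reserved symbols anywhere."""
    found_valid_output = False
    for output in translate_outputs:
        if output.strip():
            found_valid_output = True
        if any(token in _BAD_TOKENS for token in output.split()):
            return False
    return found_valid_output
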
Example 3
def check_train_translate(train_params: str,
                          translate_params: str,
                          data: Dict[str, Any],
                          use_prepared_data: bool,
                          max_seq_len: int,
                          compare_output: bool = True,
                          seed: int = 13) -> Dict[str, Any]:
    """
    Tests core features (training, inference).
    """
    # train model and translate test set
    data = run_train_translate(train_params=train_params,
                               translate_params=translate_params,
                               data=data,
                               use_prepared_data=use_prepared_data,
                               max_seq_len=max_seq_len,
                               seed=seed)

    # Test equivalence of batch decoding
    translate_params_batch = translate_params + " --batch-size 2"
    test_translate_equivalence(data, translate_params_batch, compare_output)

    # Run translate with restrict-lexicon
    data = run_translate_restrict(data, translate_params)

    # Test scoring by ensuring that the sockeye.scoring module produces the same scores when scoring the output
    # of sockeye.translate. However, since this training is on very small datasets, the output of sockeye.translate
    # is often pure garbage or empty and cannot be scored. So we only try to score if we have some valid output
    # to work with.
    # Only run scoring under these conditions. Why?
    # - translate splits up too-long sentences and translates them in sequence, invalidating the score, so skip that
    # - scoring requires valid translation output to compare against
    if '--max-input-length' not in translate_params and _translate_output_is_valid(
            data['test_outputs']):
        test_scoring(data, translate_params, compare_output)

    return data
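
test_translate_equivalence is also assumed rather than shown: conceptually it re-runs translation with the modified parameters (here, --batch-size 2) and checks that the result matches the stored non-batched run. A minimal, hypothetical comparison helper in that spirit:

from typing import List

def _outputs_equivalent(baseline: List[str], rerun: List[str]) -> bool:
    # Hypothetical helper: batched decoding counts as equivalent when it reproduces the
    # non-batched translations line by line; the project's test likely compares scores as well.
    return len(baseline) == len(rerun) and all(a == b for a, b in zip(baseline, rerun))
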
Example 4
def check_train_translate(train_params: str,
                          translate_params: str,
                          data: Dict[str, Any],
                          use_prepared_data: bool,
                          max_seq_len: int,
                          compare_output: bool = True,
                          seed: int = 13,
                          use_pytorch: bool = False) -> Dict[str, Any]:
    """
    Tests core features (training, inference).
    """
    # train model and translate test set
    data = run_train_translate(train_params=train_params,
                               translate_params=translate_params,
                               data=data,
                               use_prepared_data=use_prepared_data,
                               max_seq_len=max_seq_len,
                               seed=seed,
                               use_pytorch=use_pytorch)

    # Test equivalence of batch decoding
    if 'greedy' not in translate_params:
        translate_params_batch = translate_params + " --batch-size 2"
        # NOTE: --beam-search-stop first seems to behave strangely with the latest MXNet 2.x nightly build, only on
        # Linux. Disable output comparison here if this is an MXNet-based test using --beam-search-stop first
        # (PyTorch works).
        _compare_output = use_pytorch or '--beam-search-stop first' not in translate_params
        test_translate_equivalence(data,
                                   translate_params_batch,
                                   compare_output=_compare_output,
                                   use_pytorch=use_pytorch)
        if not use_pytorch and mxnet_installed:
            # convert model to pytorch
            import sockeye.mx_to_pt
            convert_params = f"{sockeye.mx_to_pt.__file__} -m {data['model']}"
            with patch.object(sys, "argv", convert_params.split()):
                sockeye.mx_to_pt.main()
            # check for equivalence with PyTorch decoding
            test_translate_equivalence(data,
                                       translate_params_batch,
                                       compare_output=_compare_output,
                                       use_pytorch=True)

    # Run translate with restrict-lexicon
    data = run_translate_restrict(data,
                                  translate_params,
                                  use_pytorch=use_pytorch)

    test_translate_equivalence(data,
                               translate_params,
                               compare_output=True,
                               use_pytorch=use_pytorch)

    # Test scoring by ensuring that the sockeye.scoring module produces the same scores when scoring the output
    # of sockeye.translate. However, since this training is on very small datasets, the output of sockeye.translate
    # is often pure garbage or empty and cannot be scored. So we only try to score if we have some valid output
    # to work with.
    # Only run scoring under these conditions. Why?
    # - translate splits up too-long sentences and translates them in sequence, invalidating the score, so skip that
    # - scoring requires valid translation output to compare against
    if '--max-input-length' not in translate_params and _translate_output_is_valid(data['test_outputs']) \
            and 'greedy' not in translate_params:
        test_scoring(data,
                     translate_params,
                     compare_output,
                     use_pytorch=use_pytorch)

    # Test correct prediction of target factors if enabled
    if compare_output and 'train_target_factors' in data:
        test_odd_even_target_factors(data)

    return data
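
The MXNet-to-PyTorch conversion above invokes a CLI entry point in-process by patching sys.argv rather than spawning a subprocess. The standalone sketch below isolates that pattern with a toy main(); the toy program and its arguments are purely illustrative, while the example above applies the same technique to sockeye.mx_to_pt.main().

import sys
from unittest.mock import patch

def _toy_main():
    # Stand-in for an argparse-based entry point such as sockeye.mx_to_pt.main():
    # it reads its command line from sys.argv like any CLI would.
    prog, *args = sys.argv
    print(f"{prog} invoked with arguments: {args}")

# Patch argv for the duration of the call so the entry point sees exactly the
# command line we want, without launching a separate process.
cli_string = "toy_converter.py -m /tmp/example_model"  # hypothetical arguments
with patch.object(sys, "argv", cli_string.split()):
    _toy_main()
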