Example 1
0
def test_predict_on_subset_with_existing_model():
    """
    Test generating predictions on subset with existing model
    """
    # Build the jsonlines data files used by this test.
    make_single_file_featureset_data()

    # Train a classifier on the single training file and persist it under the
    # exact filename the experiment runner will look for.
    train_file = join(_my_dir, 'train', 'train_single_file.jsonlines')
    train_fs = NDJReader.for_path(train_file).read()
    learner = Learner('RandomForestClassifier')
    learner.train(train_fs, grid_search=True, grid_objective="accuracy")

    output_stem = ('train_test_single_file_train_train_'
                   'single_file.jsonlines_test_test_single'
                   '_file_subset.jsonlines_RandomForestClassifier')
    learner.save(join(_my_dir, 'output', output_stem + '.model'))

    # Run the experiment with overwrite=False so the saved model is reused
    # instead of being retrained.
    template_path = join(_my_dir, "configs",
                         "test_single_file_saved_subset.template.cfg")
    test_file = join(_my_dir, 'test', 'test_single_file_subset.jsonlines')
    config_path = fill_in_config_paths_for_single_file(template_path,
                                                       train_file,
                                                       test_file)
    run_configuration(config_path, quiet=True, overwrite=False)

    # Verify the score recorded in the results JSON.
    with open(join(_my_dir, 'output', output_stem + '.results.json')) as f:
        result_dict = json.load(f)[0]
    assert_almost_equal(result_dict['score'], 0.7333333)
Example 2
0
def test_folds_file_logging_num_folds():
    """
    Test that, when `folds_file` is used, the log prints the number of folds,
     instead of the entire cv_folds data. And that the folds file warning shows up
     in the log file.
    """
    # Run experiment
    suffix = '.jsonlines'
    train_path = join(_my_dir, 'train', 'f0{}'.format(suffix))

    config_path = fill_in_config_paths_for_single_file(
        join(_my_dir, "configs", "test_folds_file"
             ".template.cfg"), train_path, None)
    run_configuration(config_path, quiet=True)

    # Check experiment log output: the "folds_file overrides num_cv_folds"
    # warning must appear exactly once.
    with open(join(_my_dir, 'output', 'test_folds_file_logging.log')) as f:
        # Raw string with the trailing '.' escaped so the pattern matches the
        # literal log message instead of "any character".
        cv_file_pattern = re.compile(
            r'Specifying "folds_file" overrides both explicit and default "num_cv_folds"\.'
        )
        matches = re.findall(cv_file_pattern, f.read())
        assert_equal(len(matches), 1)

    # Check job log output: the task header must be followed by the
    # "Cross-validating (N folds)" line rather than a dump of the fold data.
    with open(
            join(
                _my_dir, 'output', 'test_folds_file_logging_train_f0.'
                'jsonlines_LogisticRegression.log')) as f:
        # Raw string: '\(' / '\)' are regex escapes, which are invalid string
        # escapes in a non-raw literal (SyntaxWarning on modern Python).
        cv_folds_pattern = re.compile(
            r"(Task: cross_validate\n)(.+)(Cross-validating \([0-9]+ folds\))")
        matches = re.findall(cv_folds_pattern, f.read())
        assert_equal(len(matches), 1)
Example 3
0
def test_train_file_test_file_ablation():
    """
    Test that specifying ablation with train and test file is ignored
    """
    # Generate the single-file featureset data.
    make_single_file_featureset_data()

    # Fill in the config template and run the experiment with ablation
    # requested.
    template_path = join(_my_dir, "configs", "test_single_file.template.cfg")
    train_file = join(_my_dir, 'train', 'train_single_file.jsonlines')
    test_file = join(_my_dir, 'test', 'test_single_file.jsonlines')
    config_path = fill_in_config_paths_for_single_file(template_path,
                                                       train_file,
                                                       test_file)
    run_configuration(config_path, quiet=True, ablation=None)

    # The experiment log should report exactly once that ablation was ignored.
    log_path = join(_my_dir, 'output', 'train_test_single_file.log')
    with open(log_path) as log_file:
        cv_file_pattern = re.compile('Not enough featuresets for ablation. Ignoring.')
        eq_(len(re.findall(cv_file_pattern, log_file.read())), 1)
def test_train_file_test_file_ablation():
    """
    Test that specifying ablation with train and test file is ignored
    """
    # Create the data files the experiment reads.
    make_single_file_featureset_data()

    # Run the experiment, explicitly asking for ablation.
    config_path = fill_in_config_paths_for_single_file(
        join(_my_dir, "configs", "test_single_file.template.cfg"),
        join(_my_dir, 'train', 'train_single_file.jsonlines'),
        join(_my_dir, 'test', 'test_single_file.jsonlines'))
    run_configuration(config_path, quiet=True, ablation=None)

    # Ablation cannot run with a single featureset, so the experiment log
    # must contain (exactly one) "ignored" message.
    with open(join(_my_dir, 'output', 'train_test_single_file.log')) as log_file:
        log_contents = log_file.read()
    ignored_pattern = re.compile('Not enough featuresets for ablation. Ignoring.')
    eq_(len(ignored_pattern.findall(log_contents)), 1)
Example 5
0
def test_folds_file_logging_num_folds():
    """
    Test when using `folds_file`, log shows number of folds and appropriate warning.
    """
    # Run experiment
    suffix = '.jsonlines'
    train_path = join(_my_dir, 'train', 'f0{}'.format(suffix))

    config_path = fill_in_config_paths_for_single_file(join(_my_dir,
                                                            "configs",
                                                            "test_folds_file"
                                                            ".template.cfg"),
                                                       train_path,
                                                       None)
    run_configuration(config_path, quiet=True)

    # Check experiment log output: the folds-file warning appears exactly once.
    with open(join(_my_dir,
                   'output',
                   'test_folds_file_logging.log')) as f:
        # Raw string with the final '.' escaped so it matches the literal
        # message rather than any character.
        cv_file_pattern = re.compile(r'Specifying "folds_file" overrides both explicit and default "num_cv_folds"\.')
        matches = re.findall(cv_file_pattern, f.read())
        assert_equal(len(matches), 1)

    # Check job log output: the fold *count* is logged, not the fold data.
    with open(join(_my_dir,
                   'output',
                   'test_folds_file_logging_train_f0.'
                   'jsonlines_LogisticRegression.log')) as f:
        # Raw string: '\(' / '\)' are invalid escapes in a non-raw literal.
        cv_folds_pattern = re.compile(r"(Task: cross_validate\n)(.+)(Cross-validating \([0-9]+ folds\))")
        matches = re.findall(cv_folds_pattern, f.read())
        assert_equal(len(matches), 1)
def test_predict_on_subset_with_existing_model():
    """
    Test generating predictions on subset with existing model
    """
    # Create the featureset data files.
    make_single_file_featureset_data()

    # Train a RandomForest on the single training file and save it under the
    # filename the experiment runner expects to find.
    train_path = join(_my_dir, 'train', 'train_single_file.jsonlines')
    learner = Learner('RandomForestClassifier')
    learner.train(NDJReader.for_path(train_path).read(),
                  grid_search=True,
                  grid_objective="accuracy")
    base_name = ('train_test_single_file_train_train_'
                 'single_file.jsonlines_test_test_single'
                 '_file_subset.jsonlines_RandomForestClassifier')
    learner.save(join(_my_dir, 'output', base_name + '.model'))

    # Run the experiment with overwrite=False so the saved model is reused.
    config_path = fill_in_config_paths_for_single_file(
        join(_my_dir, "configs",
             "test_single_file_saved_subset.template.cfg"),
        train_path,
        join(_my_dir, 'test', 'test_single_file_subset.jsonlines'))
    run_configuration(config_path, quiet=True, overwrite=False)

    # Check the accuracy value recorded in the results JSON.
    with open(join(_my_dir, 'output', base_name + '.results.json')) as f:
        result_dict = json.load(f)[0]
    assert_almost_equal(result_dict['accuracy'], 0.7333333)
Example 7
0
def test_train_file_test_file():
    """
    Test that train_file and test_file experiments work
    """
    # Build the single-file featuresets.
    make_single_file_featureset_data()

    # Fill in the config template and run the train/test experiment.
    template = join(_my_dir, "configs", "test_single_file.template.cfg")
    config_path = fill_in_config_paths_for_single_file(
        template,
        join(_my_dir, 'train', 'train_single_file.jsonlines'),
        join(_my_dir, 'test', 'test_single_file.jsonlines'))
    run_configuration(config_path, quiet=True)

    # Verify the score stored in the first entry of the results JSON.
    results_name = ('train_test_single_file_train_train_'
                    'single_file.jsonlines_test_test_single'
                    '_file.jsonlines_RandomForestClassifier'
                    '.results.json')
    with open(join(_my_dir, 'output', results_name)) as f:
        result_dict = json.load(f)[0]

    assert_almost_equal(result_dict['score'], 0.925)
Example 8
0
def test_train_file_test_file():
    """
    Test that train_file and test_file experiments work
    """
    # Create the jsonlines data files.
    make_single_file_featureset_data()

    # Run the experiment from the filled-in single-file template.
    train_file = join(_my_dir, 'train', 'train_single_file.jsonlines')
    test_file = join(_my_dir, 'test', 'test_single_file.jsonlines')
    config_path = fill_in_config_paths_for_single_file(
        join(_my_dir, "configs", "test_single_file.template.cfg"),
        train_file,
        test_file)
    run_configuration(config_path, quiet=True)

    # The first entry of the results JSON holds the evaluation score.
    results_path = join(
        _my_dir, 'output',
        'train_test_single_file_train_train_single_file.jsonlines'
        '_test_test_single_file.jsonlines_RandomForestClassifier.results.json')
    with open(results_path) as results_file:
        result_dict = json.load(results_file)[0]

    assert_almost_equal(result_dict['score'], 0.925)
Example 9
0
def test_test_file_and_test_directory():
    """
    Test that test_file + test_directory = ValueError
    """
    # Fill in the template with BOTH a test_file and a test_directory,
    # which is an invalid combination.
    config_path = fill_in_config_paths_for_single_file(
        join(_my_dir, "configs", "test_single_file"
             ".template.cfg"),
        join(_my_dir, 'train', 'train_single_file'
             '.jsonlines'),
        join(_my_dir, 'test', 'test_single_file.'
             'jsonlines'),
        test_directory='foo')
    # Make the expected ValueError explicit instead of relying on a
    # @raises(ValueError) decorator (absent here): as originally written,
    # the raised exception would simply error the test out.
    try:
        _parse_config_file(config_path)
    except ValueError:
        pass
    else:
        raise AssertionError('_parse_config_file should have raised ValueError')
Example 10
0
def test_cross_validate_task():
    """
    Test that 10-fold cross_validate experiments work.
    Test that fold ids get correctly saved.
    """

    # Run the cross-validation experiment.
    suffix = '.jsonlines'
    train_path = join(_my_dir, 'train', 'f0{}'.format(suffix))

    config_path = fill_in_config_paths_for_single_file(
        join(_my_dir, "configs", "test_save_cv_folds.template.cfg"),
        train_path,
        None)
    run_configuration(config_path, quiet=True)

    # Entry 10 of the results JSON holds the fold-averaged result.
    results_path = join(_my_dir, 'output',
                        'test_save_cv_folds_train_f0.'
                        'jsonlines_LogisticRegression.results.json')
    with open(results_path) as f:
        result_dict = json.load(f)[10]

    assert_almost_equal(result_dict['score'], 0.517)

    # Recompute the expected fold assignment for every example id with the
    # same stratified 10-fold split the experiment should have used.
    examples = _load_featureset(train_path, '', suffix, quiet=True)
    splitter = StratifiedKFold(n_splits=10)
    expected_skll_ids = {}
    fold_iter = enumerate(splitter.split(examples.features, examples.labels))
    for fold_num, (_, test_indices) in fold_iter:
        for index in test_indices:
            expected_skll_ids[examples.ids[index]] = fold_num

    # Read back the fold ids the experiment saved to CSV.
    skll_fold_ids = {}
    with open(join(_my_dir, 'output',
                   'test_save_cv_folds_skll_fold_ids.csv')) as f:
        for row in csv.DictReader(f):
            skll_fold_ids[row['id']] = row['cv_test_fold']

    # Compare the two mappings via a canonical sorted-string serialization.
    def _serialize(mapping):
        # One '<key><fold>' chunk per entry, ordered by key.
        return ''.join('{}{}'.format(key, val)
                       for key, val in sorted(mapping.items()))

    assert_equal(_serialize(skll_fold_ids), _serialize(expected_skll_ids))
def test_test_file_and_test_directory():
    """
    Test that test_file + test_directory = ValueError
    """
    # Fill in the template with BOTH a test_file and a test_directory,
    # which is an invalid combination.
    config_path = fill_in_config_paths_for_single_file(join(_my_dir, "configs",
                                                            "test_single_file"
                                                            ".template.cfg"),
                                                       join(_my_dir, 'train',
                                                            'train_single_file'
                                                            '.jsonlines'),
                                                       join(_my_dir, 'test',
                                                            'test_single_file.'
                                                            'jsonlines'),
                                                       test_directory='foo')
    # Make the expected ValueError explicit instead of relying on a
    # @raises(ValueError) decorator (absent here): as originally written,
    # the raised exception would simply error the test out.
    try:
        _parse_config_file(config_path)
    except ValueError:
        pass
    else:
        raise AssertionError('_parse_config_file should have raised ValueError')
Example 12
0
def test_cross_validate_task():
    """
    Test that 10-fold cross_validate experiments work.
    Test that fold ids get correctly saved.
    """

    # Run experiment
    suffix = '.jsonlines'
    train_path = join(_my_dir, 'train', 'f0{}'.format(suffix))

    config_path = fill_in_config_paths_for_single_file(join(_my_dir, "configs",
                                                            "test_save_cv_folds"
                                                            ".template.cfg"),
                                                       train_path,
                                                       None)
    run_configuration(config_path, quiet=True)

    # Check final average results (index 10 = the fold-averaged entry)
    with open(join(_my_dir, 'output', 'test_save_cv_folds_train_f0.' +
                                      'jsonlines_LogisticRegression.results.json')) as f:
        result_dict = json.load(f)[10]

    assert_almost_equal(result_dict['score'], 0.517)

    # Check that the fold ids were saved correctly.
    # NOTE: use the modern scikit-learn API (n_splits= + split()); the old
    # StratifiedKFold(labels, n_folds=...) signature was removed in
    # scikit-learn 0.20 and no longer runs.
    expected_skll_ids = {}
    examples = _load_featureset(train_path, '', suffix, quiet=True)
    kfold = StratifiedKFold(n_splits=10)
    for fold_num, (_, test_indices) in enumerate(
            kfold.split(examples.features, examples.labels)):
        for index in test_indices:
            expected_skll_ids[examples.ids[index]] = fold_num

    skll_fold_ids = {}
    with open(join(_my_dir, 'output', 'test_save_cv_folds_skll_fold_ids.csv')) as f:
        reader = csv.DictReader(f)
        for row in reader:
            skll_fold_ids[row['id']] = row['cv_test_fold']

    # convert the dictionary to strings (sorted by key) for quick comparison
    skll_fold_ids_str = ''.join('{}{}'.format(key, val) for key, val in sorted(skll_fold_ids.items()))
    expected_skll_ids_str = ''.join('{}{}'.format(key, val) for key, val in sorted(expected_skll_ids.items()))

    assert_equal(skll_fold_ids_str, expected_skll_ids_str)
Example 13
0
def test_folds_file_logging_grid_search():
    """
    Test that, when `folds_file` is used but `use_folds_file` for grid search
    is specified, that we get an appropriate message in the log.
    """
    # Run the folds-file grid-search experiment.
    suffix = '.jsonlines'
    train_path = join(_my_dir, 'train', 'f0{}'.format(suffix))

    config_path = fill_in_config_paths_for_single_file(
        join(_my_dir, "configs", "test_folds_file_grid.template.cfg"),
        train_path,
        None)
    run_configuration(config_path, quiet=True)

    # Both messages must appear together, exactly once, in the experiment log.
    expected_pattern = ('Specifying "folds_file" overrides both explicit and '
                        'default "num_cv_folds".\n(.+)The specified '
                        '"folds_file" will not be used for inner grid search.')
    with open(join(_my_dir, 'output', 'test_folds_file_logging.log')) as f:
        log_text = f.read()
    assert_equal(len(re.findall(expected_pattern, log_text)), 1)
Example 14
0
def test_cv_folds_file_logging():
    """
    Test that, when `cv_folds_file` is used, the log prints the number of folds,
     instead of the entire cv_folds data.
    """
    # Run experiment
    suffix = '.jsonlines'
    train_path = join(_my_dir, 'train', 'f0{}'.format(suffix))

    config_path = fill_in_config_paths_for_single_file(
        join(_my_dir, "configs", "test_cv_folds_file"
             ".template.cfg"), train_path, None)
    run_configuration(config_path, quiet=True)

    # Check log output: the fold *count* should be logged, not the fold data.
    with open(
            join(
                _my_dir, 'output', 'test_cv_folds_file_logging_train_f0.' +
                'jsonlines_LogisticRegression.log')) as f:
        # Raw string: '\(' / '\)' are regex escapes, which are invalid string
        # escapes in a non-raw literal (SyntaxWarning on modern Python).
        cv_folds_pattern = re.compile(
            r"Task: cross_validate\nCross-validating \([0-9]+ folds\)")
        matches = re.findall(cv_folds_pattern, f.read())
        assert_equal(len(matches), 1)
Example 15
0
def test_folds_file_logging_grid_search():
    """
    Test that, when `folds_file` is used but `use_folds_file` for grid search
    is specified, that we get an appropriate message in the log.
    """
    # Fill in and run the folds-file grid-search configuration.
    suffix = '.jsonlines'
    train_path = join(_my_dir, 'train', 'f0{}'.format(suffix))

    template = join(_my_dir, "configs",
                    "test_folds_file_grid.template.cfg")
    config_path = fill_in_config_paths_for_single_file(template,
                                                       train_path,
                                                       None)
    run_configuration(config_path, quiet=True)

    # The experiment log must contain the override warning immediately
    # followed by the "not used for inner grid search" message, exactly once.
    log_path = join(_my_dir, 'output', 'test_folds_file_logging.log')
    pattern = re.compile('Specifying "folds_file" overrides both explicit and default "num_cv_folds".\n(.+)The specified "folds_file" will not be used for inner grid search.')
    with open(log_path) as log_file:
        assert_equal(len(pattern.findall(log_file.read())), 1)