Python Learner.train Examples, skll.learner.Learner.train Python Examples

Example #1

0

Show file

File: test_classification.py Project: yiyiwang515/skll

def test_learner_api_load_into_existing_instance():
    """
    Check that `Learner.load()` works as expected
    """

    # create a LinearSVC instance and train it on some data
    learner1 = Learner('LinearSVC')
    (train_fs, test_fs) = make_classification_data(num_examples=200,
                                                   num_features=5,
                                                   use_feature_hashing=False,
                                                   non_negative=True)
    learner1.train(train_fs, grid_search=False)

    # now use `load()` to replace the existing instance with a
    # different saved learner
    other_model_file = join(
        _my_dir, 'other',
        'test_load_saved_model.{}.model'.format(sys.version_info[0]))
    learner1.load(other_model_file)

    # now load the saved model into another instance using the class method
    # `from_file()`
    learner2 = Learner.from_file(other_model_file)

    # check that the two instances are now basically the same
    eq_(learner1.model_type, learner2.model_type)
    eq_(learner1.model_params, learner2.model_params)
    eq_(learner1.model_kwargs, learner2.model_kwargs)

Example #2

0

Show file

File: test_classification.py Project: yiyiwang515/skll

def check_train_and_score_function(model_type):
    """
    Check that the _train_and_score() function works as expected
    """

    # create train and test data
    (train_fs, test_fs) = make_classification_data(num_examples=500,
                                                   train_test_ratio=0.7,
                                                   num_features=5,
                                                   use_feature_hashing=False,
                                                   non_negative=True)

    # call _train_and_score() on this data
    estimator_name = 'LogisticRegression' if model_type == 'classifier' else 'Ridge'
    metric = 'accuracy' if model_type == 'classifier' else 'pearson'
    learner1 = Learner(estimator_name)
    train_score1, test_score1 = _train_and_score(learner1, train_fs, test_fs,
                                                 metric)

    # this should yield identical results when training another instance
    # of the same learner without grid search and shuffling and evaluating
    # that instance on the train and the test set
    learner2 = Learner(estimator_name)
    learner2.train(train_fs, grid_search=False, shuffle=False)
    train_score2 = learner2.evaluate(train_fs,
                                     output_metrics=[metric])[-1][metric]
    test_score2 = learner2.evaluate(test_fs,
                                    output_metrics=[metric])[-1][metric]

    eq_(train_score1, train_score2)
    eq_(test_score1, test_score2)

Example #3

0

Show file

File: test_classification.py Project: EducationalTestingService/skll

def test_learner_api_load_into_existing_instance():
    """
    Check that `Learner.load()` works as expected
    """

    # create a LinearSVC instance and train it on some data
    learner1 = Learner('LinearSVC')
    (train_fs,
     test_fs) = make_classification_data(num_examples=200,
                                         num_features=5,
                                         use_feature_hashing=False,
                                         non_negative=True)
    learner1.train(train_fs, grid_search=False)

    # now use `load()` to replace the existing instance with a
    # different saved learner
    other_model_file = join(_my_dir, 'other', 'test_load_saved_model.{}.model'.format(sys.version_info[0]))
    learner1.load(other_model_file)

    # now load the saved model into another instance using the class method
    # `from_file()`
    learner2 = Learner.from_file(other_model_file)

    # check that the two instances are now basically the same
    eq_(learner1.model_type, learner2.model_type)
    eq_(learner1.model_params, learner2.model_params)
    eq_(learner1.model_kwargs, learner2.model_kwargs)

Example #4

0

Show file

def check_scaling_features(use_feature_hashing=False, use_scaling=False):
    train_fs, test_fs = make_scaling_data(use_feature_hashing=use_feature_hashing)

    # create a Linear SVM with the value of scaling as specified
    feature_scaling = 'both' if use_scaling else 'none'
    learner = Learner('SGDClassifier', feature_scaling=feature_scaling,
                      pos_label_str=1)

    # train the learner on the training set and test on the testing set
    learner.train(train_fs)
    test_output = learner.evaluate(test_fs)
    fmeasures = [test_output[2][0]['F-measure'],
                 test_output[2][1]['F-measure']]

    # these are the expected values of the f-measures, sorted
    if not use_feature_hashing:
        expected_fmeasures = ([0.77319587628865982, 0.78640776699029125] if
                              not use_scaling else
                              [0.94930875576036866, 0.93989071038251359])
    else:
        expected_fmeasures = ([0.42774566473988435, 0.5638766519823788] if
                              not use_scaling else
                              [0.87323943661971837, 0.85561497326203206])

    assert_almost_equal(expected_fmeasures, fmeasures)

Example #5

0

Show file

File: test_classification.py Project: EducationalTestingService/skll

def check_train_and_score_function(model_type):
    """
    Check that the _train_and_score() function works as expected
    """

    # create train and test data
    (train_fs,
     test_fs) = make_classification_data(num_examples=500,
                                         train_test_ratio=0.7,
                                         num_features=5,
                                         use_feature_hashing=False,
                                         non_negative=True)

    # call _train_and_score() on this data
    estimator_name = 'LogisticRegression' if model_type == 'classifier' else 'Ridge'
    metric = 'accuracy' if model_type == 'classifier' else 'pearson'
    learner1 = Learner(estimator_name)
    train_score1, test_score1 = _train_and_score(learner1, train_fs, test_fs, metric)

    # this should yield identical results when training another instance
    # of the same learner without grid search and shuffling and evaluating
    # that instance on the train and the test set
    learner2 = Learner(estimator_name)
    learner2.train(train_fs, grid_search=False, shuffle=False)
    train_score2 = learner2.evaluate(train_fs, output_metrics=[metric])[-1][metric]
    test_score2 = learner2.evaluate(test_fs, output_metrics=[metric])[-1][metric]

    eq_(train_score1, train_score2)
    eq_(test_score1, test_score2)

Example #6

0

Show file

def check_specified_cv_folds(numeric_ids):
    make_cv_folds_data(numeric_ids)

    # test_cv_folds1.cfg has prespecified folds and should have ~50% accuracy
    # test_cv_folds2.cfg doesn't have prespecified folds and >95% accuracy
    for experiment_name, test_func, grid_size in [('test_cv_folds1',
                                                   lambda x: x < 0.6,
                                                   3),
                                                  ('test_cv_folds2',
                                                   lambda x: x > 0.95,
                                                   10)]:
        config_template_file = '{}.template.cfg'.format(experiment_name)
        config_template_path = os.path.join(_my_dir, 'configs',
                                            config_template_file)
        config_path = os.path.join(_my_dir,
                                   fill_in_config_paths(config_template_path))

        # Modify config file to change ids_to_floats depending on numeric_ids
        # setting
        with open(config_path, 'r+') as config_template_file:
            lines = config_template_file.readlines()
            config_template_file.seek(0)
            config_template_file.truncate()
            for line in lines:
                if line.startswith('ids_to_floats='):
                    if numeric_ids:
                        line = 'ids_to_floats=true\n'
                    else:
                        line = 'ids_to_floats=false\n'
                config_template_file.write(line)

        run_configuration(config_path, quiet=True)
        result_filename = ('{}_test_cv_folds_LogisticRegression.' +
                           'results').format(experiment_name)
        with open(os.path.join(_my_dir, 'output', result_filename)) as f:
            # check held out scores
            outstr = f.read()
            score = float(SCORE_OUTPUT_RE.search(outstr).groups()[-1])
            assert test_func(score)

            grid_score_matches = GRID_RE.findall(outstr)
            assert len(grid_score_matches) == grid_size
            for match_str in grid_score_matches:
                assert test_func(float(match_str))

    # try the same tests for just training (and specifying the folds for the
    # grid search)
    dirpath = os.path.join(_my_dir, 'train')
    suffix = '.jsonlines'
    featureset = ['test_cv_folds']
    examples = _load_featureset(dirpath, featureset, suffix, quiet=True)
    clf = Learner('LogisticRegression', probability=True)
    cv_folds = _load_cv_folds(os.path.join(_my_dir, 'train',
                                           'test_cv_folds.csv'))
    grid_search_score = clf.train(examples, grid_search_folds=cv_folds,
                                  grid_objective='accuracy', grid_jobs=1)
    assert grid_search_score < 0.6
    grid_search_score = clf.train(examples, grid_search_folds=5,
                                  grid_objective='accuracy', grid_jobs=1)
    assert grid_search_score > 0.95

Example #7

0

Show file

File: test_utilities.py Project: MechCoder/skll

def check_print_model_weights(task='classification'):

    # create some simple classification or regression data
    if task == 'classification':
        train_fs, _ = make_classification_data(train_test_ratio=0.8)
    else:
        train_fs, _, _ = make_regression_data(num_features=4,
                                              train_test_ratio=0.8)

    # now train the appropriate model
    if task == 'classification':
        learner = Learner('LogisticRegression')
        learner.train(train_fs)
    else:
        learner = Learner('LinearRegression')
        learner.train(train_fs, grid_objective='pearson')

    # now save the model to disk
    model_file = join(_my_dir, 'output',
                      'test_print_model_weights.model')
    learner.save(model_file)

    # now call print_model_weights main() and capture the output
    print_model_weights_cmd = [model_file]
    err = ''
    try:
        old_stderr = sys.stderr
        old_stdout = sys.stdout
        sys.stderr = mystderr = StringIO()
        sys.stdout = mystdout = StringIO()
        pmw.main(print_model_weights_cmd)
        out = mystdout.getvalue()
        err = mystderr.getvalue()
    finally:
        sys.stderr = old_stderr
        sys.stdout = old_stdout
        print(err)

    # now parse the output of the print_model_weight command
    # and get the intercept and the feature values
    if task == 'classification':
        lines_to_parse = [l for l in out.split('\n')[1:] if l]
        intercept = safe_float(lines_to_parse[0].split('\t')[0])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[2], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_[0])
        assert_allclose(learner.model.coef_[0], feature_values)
    else:
        lines_to_parse = [l for l in out.split('\n') if l]
        intercept = safe_float(lines_to_parse[0].split('=')[1])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[1], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_)
        assert_allclose(learner.model.coef_, feature_values)

Example #8

0

Show file

def test_predict_on_subset_with_existing_model():
    """
    Test generating predictions on subset with existing model
    """
    # Create data files
    make_single_file_featureset_data()

    # train and save a model on the training file
    train_fs = NDJReader.for_path(join(_my_dir, 'train', 'train_single_file.jsonlines')).read()
    learner = Learner('RandomForestClassifier')
    learner.train(train_fs, grid_search=True, grid_objective="accuracy")
    model_filename = join(_my_dir, 'output', ('train_test_single_file_train_train_'
                                              'single_file.jsonlines_test_test_single'
                                              '_file_subset.jsonlines_RandomForestClassifier'
                                              '.model'))

    learner.save(model_filename)

    # Run experiment
    config_path = fill_in_config_paths_for_single_file(join(_my_dir, "configs",
                                                            "test_single_file_saved_subset"
                                                            ".template.cfg"),
                                                       join(_my_dir, 'train', 'train_single_file.jsonlines'),
                                                       join(_my_dir, 'test',
                                                            'test_single_file_subset.'
                                                            'jsonlines'))
    run_configuration(config_path, quiet=True, overwrite=False)

    # Check results
    with open(join(_my_dir, 'output', ('train_test_single_file_train_train_'
                                       'single_file.jsonlines_test_test_single'
                                       '_file_subset.jsonlines_RandomForestClassifier'
                                       '.results.json'))) as f:
        result_dict = json.load(f)[0]
    assert_almost_equal(result_dict['score'], 0.7333333)

Example #9

0

Show file

File: test_classification.py Project: EducationalTestingService/skll

def test_predict_on_subset_with_existing_model():
    """
    Test generating predictions on subset with existing model
    """
    # Create data files
    make_single_file_featureset_data()

    # train and save a model on the training file
    train_fs = NDJReader.for_path(join(_my_dir, 'train', 'train_single_file.jsonlines')).read()
    learner = Learner('RandomForestClassifier')
    learner.train(train_fs, grid_search=True, grid_objective="accuracy")
    model_filename = join(_my_dir, 'output', ('train_test_single_file_train_train_'
                                              'single_file.jsonlines_test_test_single'
                                              '_file_subset.jsonlines_RandomForestClassifier'
                                              '.model'))

    learner.save(model_filename)

    # Run experiment
    config_path = fill_in_config_paths_for_single_file(join(_my_dir, "configs",
                                                            "test_single_file_saved_subset"
                                                            ".template.cfg"),
                                                       join(_my_dir, 'train', 'train_single_file.jsonlines'),
                                                       join(_my_dir, 'test',
                                                            'test_single_file_subset.'
                                                            'jsonlines'))
    run_configuration(config_path, quiet=True, overwrite=False)

    # Check results
    with open(join(_my_dir, 'output', ('train_test_single_file_train_train_'
                                       'single_file.jsonlines_test_test_single'
                                       '_file_subset.jsonlines_RandomForestClassifier'
                                       '.results.json'))) as f:
        result_dict = json.load(f)[0]
    assert_almost_equal(result_dict['accuracy'], 0.7333333)

Example #10

0

Show file

File: test_preprocessing.py Project: srhrshr/skll

def check_scaling_features(use_feature_hashing=False, use_scaling=False):
    train_fs, test_fs = make_scaling_data(
        use_feature_hashing=use_feature_hashing)

    # create a Linear SVM with the value of scaling as specified
    feature_scaling = 'both' if use_scaling else 'none'
    learner = Learner('SGDClassifier',
                      feature_scaling=feature_scaling,
                      pos_label_str=1)

    # train the learner on the training set and test on the testing set
    learner.train(train_fs, grid_search=True, grid_objective='f1_score_micro')
    test_output = learner.evaluate(test_fs)
    fmeasures = [
        test_output[2][0]['F-measure'], test_output[2][1]['F-measure']
    ]

    # these are the expected values of the f-measures, sorted
    if not use_feature_hashing:
        expected_fmeasures = ([
            0.5333333333333333, 0.4842105263157895
        ] if not use_scaling else [0.7219512195121951, 0.7076923076923077])
    else:
        expected_fmeasures = ([
            0.5288461538461539, 0.4895833333333333
        ] if not use_scaling else [0.663157894736842, 0.6952380952380952])

    assert_almost_equal(expected_fmeasures, fmeasures)

Example #11

0

Show file

def check_scaling_features(use_feature_hashing=False, use_scaling=False):
    train_fs, test_fs = make_scaling_data(use_feature_hashing=use_feature_hashing)

    # create a Linear SVM with the value of scaling as specified
    feature_scaling = 'both' if use_scaling else 'none'
    learner = Learner('SGDClassifier', feature_scaling=feature_scaling,
                      pos_label_str=1)

    # train the learner on the training set and test on the testing set
    learner.train(train_fs)
    test_output = learner.evaluate(test_fs)
    fmeasures = [test_output[2][0]['F-measure'],
                 test_output[2][1]['F-measure']]

    # these are the expected values of the f-measures, sorted
    if not use_feature_hashing:
        expected_fmeasures = ([0.55276381909547745, 0.55721393034825872] if
                              not use_scaling else
                              [0.65217391304347827, 0.70370370370370372])
    else:
        expected_fmeasures = ([0.54255319148936176, 0.59433962264150941] if
                              not use_scaling else
                              [0.69950738916256161, 0.69035532994923865])

    assert_almost_equal(expected_fmeasures, fmeasures)

Example #12

0

Show file

File: test_preprocessing.py Project: nimmen/skll

def check_scaling_features(use_feature_hashing=False, use_scaling=False):
    train_fs, test_fs = make_scaling_data(
        use_feature_hashing=use_feature_hashing)

    # create a Linear SVM with the value of scaling as specified
    feature_scaling = 'both' if use_scaling else 'none'
    learner = Learner('SGDClassifier', feature_scaling=feature_scaling,
                      pos_label_str=1)

    # train the learner on the training set and test on the testing set
    learner.train(train_fs)
    test_output = learner.evaluate(test_fs)
    fmeasures = [test_output[2][0]['F-measure'],
                 test_output[2][1]['F-measure']]

    # these are the expected values of the f-measures, sorted
    if not use_feature_hashing:
        expected_fmeasures = ([0.7979797979797979, 0.80198019801980192] if
                              not use_scaling else
                              [0.94883720930232551, 0.94054054054054048])
    else:
        expected_fmeasures = ([0.83962264150943389, 0.81914893617021278] if
                              not use_scaling else
                              [0.88038277511961716, 0.86910994764397898])

    assert_almost_equal(expected_fmeasures, fmeasures)

Example #13

0

Show file

File: test_regression.py Project: MechCoder/skll

def check_tree_models(name,
                      use_feature_hashing=False,
                      use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_objective='pearson')

    # make sure that the feature importances are as expected.
    if name.endswith('DecisionTreeRegressor'):
        expected_feature_importances = ([0.37331461,
                                         0.08572699,
                                         0.2543484,
                                         0.1841172,
                                         0.1024928] if use_feature_hashing else
                                        [0.08931994,
                                         0.15545093,
                                         0.75522913])
        expected_cor_range = [0.5, 0.6] if use_feature_hashing else [0.9, 1.0]
    else:
        if use_feature_hashing:
            expected_feature_importances = [0.40195655,
                                            0.06702161,
                                            0.25814858,
                                            0.18183947,
                                            0.09103379]
        else:
            expected_feature_importances = [0.07975691, 0.16122862, 0.75901447]
        expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances, expected_feature_importances,
                    rtol=1e-2)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, expected_cor_range[0])
    assert_less(cor, expected_cor_range[1])

Example #14

0

Show file

File: test_regression.py Project: ChristianGeng/skll

def check_tree_models(name,
                      use_feature_hashing=False,
                      use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_objective='pearson')

    # make sure that the feature importances are as expected.
    if name.endswith('DecisionTreeRegressor'):
        expected_feature_importances = ([0.37483895,
                                         0.08816508,
                                         0.25379838,
                                         0.18337128,
                                         0.09982631] if use_feature_hashing else
                                        [0.08926899,
                                         0.15585068,
                                         0.75488033])
        expected_cor_range = [0.5, 0.6] if use_feature_hashing else [0.9, 1.0]
    else:
        expected_feature_importances = ([0.40195798,
                                         0.06702903,
                                         0.25816559,
                                         0.18185518,
                                         0.09099222] if use_feature_hashing else
                                        [0.07974267,
                                         0.16121895,
                                         0.75903838])
        expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances, expected_feature_importances,
                    atol=1e-2, rtol=0)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, expected_cor_range[0])
    assert_less(cor, expected_cor_range[1])

Example #15

0

Show file

File: test_metrics.py Project: yiyiwang515/skll

def check_invalid_regr_grid_obj_func(learner_name, grid_objective_function):
    """
    Checks whether the grid objective function is valid for this regression
    learner
    """
    (train_fs, _, _) = make_regression_data()
    clf = Learner(learner_name)
    clf.train(train_fs, grid_objective=grid_objective_function)

Example #16

0

Show file

File: test_metrics.py Project: BK-University/skll

def check_invalid_regr_grid_obj_func(learner_name, grid_objective_function):
    """
    Checks whether the grid objective function is valid for this regression
    learner
    """
    (train_fs, _, _) = make_regression_data()
    clf = Learner(learner_name)
    clf.train(train_fs, grid_objective=grid_objective_function)

Example #17

0

Show file

File: test_classification.py Project: EducationalTestingService/skll

def test_predict_dict_hasher():
    train_file = join(_my_dir, 'other', 'examples_train.jsonlines')
    test_file = join(_my_dir, 'other', 'examples_test.jsonlines')
    train_fs = NDJReader.for_path(train_file).read()
    test_fs = NDJReader.for_path(test_file, feature_hasher=True, num_features=3).read()
    learner = Learner('LogisticRegression')
    learner.train(train_fs, grid_search=False)
    _ = learner.predict(test_fs)

Example #18

0

Show file

File: test_regression.py Project: ChristianGeng/skll

def check_ensemble_models(name,
                          use_feature_hashing=False,
                          use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_objective='pearson')

    # make sure that the feature importances are as expected.
    if name.endswith('AdaBoostRegressor'):
        if use_feature_hashing:
            expected_feature_importances = [0.33718443,
                                            0.07810721,
                                            0.25621769,
                                            0.19489766,
                                            0.13359301]
        else:
            expected_feature_importances = [0.10266744, 0.18681777, 0.71051479]
    else:
        expected_feature_importances = ([0.204,
                                         0.172,
                                         0.178,
                                         0.212,
                                         0.234] if use_feature_hashing else
                                        [0.262,
                                         0.288,
                                         0.45])

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances, expected_feature_importances,
                    atol=1e-2, rtol=0)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]
    assert_greater(cor, expected_cor_range[0])
    assert_less(cor, expected_cor_range[1])

Example #19

0

Show file

File: test_classification.py Project: EducationalTestingService/skll

def test_predict_hasher_hasher_same_bins():
    train_file = join(_my_dir, 'other', 'examples_train.jsonlines')
    test_file = join(_my_dir, 'other', 'examples_test.jsonlines')
    train_fs = NDJReader.for_path(train_file, feature_hasher=True, num_features=3).read()
    test_fs = NDJReader.for_path(test_file, feature_hasher=True, num_features=3).read()
    learner = Learner('LogisticRegression')
    learner.train(train_fs, grid_search=False)
    predictions = learner.predict(test_fs)
    eq_(len(predictions), test_fs.features.shape[0])

Example #20

0

Show file

File: test_classification.py Project: yiyiwang515/skll

def test_predict_dict_dict():
    train_file = join(_my_dir, 'other', 'examples_train.jsonlines')
    test_file = join(_my_dir, 'other', 'examples_test.jsonlines')
    train_fs = NDJReader.for_path(train_file).read()
    test_fs = NDJReader.for_path(test_file).read()
    learner = Learner('LogisticRegression')
    learner.train(train_fs, grid_search=False)
    predictions = learner.predict(test_fs)
    eq_(len(predictions), test_fs.features.shape[0])

Example #21

0

Show file

File: test_utilities.py Project: MechCoder/skll

def check_generate_predictions_console(use_threshold=False):

    # create some simple classification data without feature hashing
    train_fs, test_fs = make_classification_data(num_examples=1000,
                                                 num_features=5)

    # save the test feature set to an NDJ file
    input_file = join(_my_dir, 'test',
                      'test_generate_predictions.jsonlines')
    writer = NDJWriter(input_file, test_fs)
    writer.write()

    # create a learner that uses an SGD classifier
    learner = Learner('SGDClassifier', probability=use_threshold)

    # train the learner with grid search
    learner.train(train_fs, grid_search=True)

    # get the predictions on the test featureset
    predictions = learner.predict(test_fs)

    # if we asked for probabilities, then use the threshold
    # to convert them into binary predictions
    if use_threshold:
        threshold = 0.6
        predictions = [int(p[1] >= threshold) for p in predictions]
    else:
        predictions = predictions.tolist()
        threshold = None

    # save the learner to a file
    model_file = join(_my_dir, 'output',
                      'test_generate_predictions_console.model')
    learner.save(model_file)

    # now call main() from generate_predictions.py
    generate_cmd = []
    if use_threshold:
        generate_cmd.append('-t {}'.format(threshold))
    generate_cmd.extend([model_file, input_file])

    # we need to capture stdout since that's what main() writes to
    err = ''
    try:
        old_stdout = sys.stdout
        old_stderr = sys.stderr
        sys.stdout = mystdout = StringIO()
        sys.stderr = mystderr = StringIO()
        gp.main(generate_cmd)
        out = mystdout.getvalue()
        err = mystderr.getvalue()
        predictions_after_saving = [int(x) for x in out.strip().split('\n')]
        eq_(predictions, predictions_after_saving)
    finally:
        sys.stdout = old_stdout
        sys.stderr = old_stderr
        print(err)

Example #22

0

Show file

File: test_regression.py Project: EducationalTestingService/skll

def check_linear_models(name,
                        use_feature_hashing=False,
                        use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        (train_fs,
         test_fs,
         weightdict) = make_regression_data(num_examples=5000,
                                            num_features=10,
                                            use_feature_hashing=True,
                                            feature_bins=5)
    else:
        train_fs, test_fs, weightdict = make_regression_data(num_examples=2000,
                                                             num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # make sure that the weights are close to the weights
    # that we got from make_regression_data. Take the
    # ceiling before comparing since just comparing
    # the ceilings should be enough to make sure nothing
    # catastrophic happened. However, sometimes with
    # feature hashing, the ceiling is not exactly identical
    # so when that fails we want to check that the rounded
    # feature values are the same. One of those two equalities
    # _must_ be satisified.

    # get the weights for this trained model
    learned_weights = learner.model_params[0]

    for feature_name in learned_weights:
        learned_w_ceil = math.ceil(learned_weights[feature_name])
        given_w_ceil = math.ceil(weightdict[feature_name])
        learned_w_round = round(learned_weights[feature_name], 0)
        given_w_round = round(weightdict[feature_name], 0)
        ceil_equal = learned_w_ceil == given_w_ceil
        round_equal = learned_w_round == given_w_round
        either_equal = ceil_equal or round_equal
        assert either_equal

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)

Example #23

0

Show file

File: test_regression.py Project: m7142yosuke/skll

def check_linear_models(name,
                        use_feature_hashing=False,
                        use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        (train_fs,
         test_fs,
         weightdict) = make_regression_data(num_examples=5000,
                                            num_features=10,
                                            use_feature_hashing=True,
                                            feature_bins=5)
    else:
        train_fs, test_fs, weightdict = make_regression_data(num_examples=2000,
                                                             num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # make sure that the weights are close to the weights
    # that we got from make_regression_data. Take the
    # ceiling before comparing since just comparing
    # the ceilings should be enough to make sure nothing
    # catastrophic happened. However, sometimes with
    # feature hashing, the ceiling is not exactly identical
    # so when that fails we want to check that the rounded
    # feature values are the same. One of those two equalities
    # _must_ be satisified.

    # get the weights for this trained model
    learned_weights = learner.model_params[0]

    for feature_name in learned_weights:
        learned_w_ceil = math.ceil(learned_weights[feature_name])
        given_w_ceil = math.ceil(weightdict[feature_name])
        learned_w_round = round(learned_weights[feature_name], 0)
        given_w_round = round(weightdict[feature_name], 0)
        ceil_equal = learned_w_ceil == given_w_ceil
        round_equal = learned_w_round == given_w_round
        either_equal = ceil_equal or round_equal
        assert either_equal

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)

Example #24

0

Show file

File: test_regression.py Project: EducationalTestingService/skll

def check_tree_models(name,
                      use_feature_hashing=False,
                      use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # make sure that the feature importances are as expected.
    if name.endswith('DecisionTreeRegressor'):
        expected_feature_importances = ([0.730811,
                                         0.001834,
                                         0.247603,
                                         0.015241,
                                         0.004511] if use_feature_hashing else
                                        [0.08926899,
                                         0.15585068,
                                         0.75488033])
    else:
        expected_feature_importances = ([0.733654,
                                         0.002528,
                                         0.245527,
                                         0.013664,
                                         0.004627] if use_feature_hashing else
                                        [0.07974267,
                                         0.16121895,
                                         0.75903838])

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances, expected_feature_importances,
                    atol=1e-2, rtol=0)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)

Example #25

0

Show file

File: test_regression.py Project: m7142yosuke/skll

def check_tree_models(name,
                      use_feature_hashing=False,
                      use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # make sure that the feature importances are as expected.
    if name.endswith('DecisionTreeRegressor'):
        expected_feature_importances = ([0.730811,
                                         0.001834,
                                         0.247603,
                                         0.015241,
                                         0.004511] if use_feature_hashing else
                                        [0.08926899,
                                         0.15585068,
                                         0.75488033])
    else:
        expected_feature_importances = ([0.733654,
                                         0.002528,
                                         0.245527,
                                         0.013664,
                                         0.004627] if use_feature_hashing else
                                        [0.07974267,
                                         0.16121895,
                                         0.75903838])

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances, expected_feature_importances,
                    atol=1e-2, rtol=0)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)

Example #26

0

Show file

File: test_regression.py Project: EducationalTestingService/skll

def check_ensemble_models(name,
                          use_feature_hashing=False,
                          use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # make sure that the feature importances are as expected.
    if name.endswith('AdaBoostRegressor'):
        if use_feature_hashing:
            expected_feature_importances = [0.749811,
                                            0.001373,
                                            0.23357,
                                            0.011691,
                                            0.003554]
        else:
            expected_feature_importances = [0.10266744, 0.18681777, 0.71051479]
    else:
        expected_feature_importances = ([0.735756,
                                         0.001034,
                                         0.242734,
                                         0.015836,
                                         0.00464] if use_feature_hashing else
                                        [0.082621,
                                         0.166652,
                                         0.750726])

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances, expected_feature_importances,
                    atol=1e-2, rtol=0)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)

Example #27

0

Show file

File: test_regression.py Project: m7142yosuke/skll

def check_ensemble_models(name,
                          use_feature_hashing=False,
                          use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # make sure that the feature importances are as expected.
    if name.endswith('AdaBoostRegressor'):
        if use_feature_hashing:
            expected_feature_importances = [0.749811,
                                            0.001373,
                                            0.23357,
                                            0.011691,
                                            0.003554]
        else:
            expected_feature_importances = [0.10266744, 0.18681777, 0.71051479]
    else:
        expected_feature_importances = ([0.735756,
                                         0.001034,
                                         0.242734,
                                         0.015836,
                                         0.00464] if use_feature_hashing else
                                        [0.082621,
                                         0.166652,
                                         0.750726])

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances, expected_feature_importances,
                    atol=1e-2, rtol=0)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)

Example #28

0

Show file

File: test_classification.py Project: EducationalTestingService/skll

def check_adaboost_predict(base_estimator, algorithm, expected_score):
    train_fs, test_fs = make_sparse_data()

    # train an AdaBoostClassifier on the training data and evalute on the
    # testing data
    learner = Learner('AdaBoostClassifier', model_kwargs={'base_estimator': base_estimator,
                                                          'algorithm': algorithm})
    learner.train(train_fs, grid_search=False)
    test_score = learner.evaluate(test_fs)[1]
    assert_almost_equal(test_score, expected_score)

Example #29

0

Show file

File: test_classification.py Project: EducationalTestingService/skll

def check_sparse_predict(learner_name, expected_score, use_feature_hashing=False):
    train_fs, test_fs = make_sparse_data(
        use_feature_hashing=use_feature_hashing)

    # train the given classifier on the training
    # data and evalute on the testing data
    learner = Learner(learner_name)
    learner.train(train_fs, grid_search=False)
    test_score = learner.evaluate(test_fs)[1]
    assert_almost_equal(test_score, expected_score)

Example #30

0

Show file

def check_sparse_predict(learner_name, expected_score, use_feature_hashing=False):
    train_fs, test_fs = make_sparse_data(
        use_feature_hashing=use_feature_hashing)

    # train the given classifier on the training
    # data and evalute on the testing data
    learner = Learner(learner_name)
    learner.train(train_fs, grid_search=False)
    test_score = learner.evaluate(test_fs)[1]
    assert_almost_equal(test_score, expected_score)

Example #31

0

Show file

File: test_classification.py Project: MechCoder/skll

def check_sparse_predict(use_feature_hashing=False):
    train_fs, test_fs = make_sparse_data(
        use_feature_hashing=use_feature_hashing)

    # train a linear SVM on the training data and evalute on the testing data
    learner = Learner('LogisticRegression')
    learner.train(train_fs, grid_search=False)
    test_score = learner.evaluate(test_fs)[1]
    expected_score = 0.51 if use_feature_hashing else 0.45
    assert_almost_equal(test_score, expected_score)

Example #32

0

Show file

File: test_classification.py Project: nimmen/skll

def check_sparse_predict(use_feature_hashing=False):
    train_fs, test_fs = make_sparse_data(
        use_feature_hashing=use_feature_hashing)

    # train a linear SVM on the training data and evalute on the testing data
    learner = Learner('LogisticRegression')
    learner.train(train_fs, grid_search=False)
    test_score = learner.evaluate(test_fs)[1]
    expected_score = 0.51 if use_feature_hashing else 0.45
    assert_almost_equal(test_score, expected_score)

Example #33

0

Show file

File: test_classification.py Project: yiyiwang515/skll

def test_predict_dict_hasher():
    train_file = join(_my_dir, 'other', 'examples_train.jsonlines')
    test_file = join(_my_dir, 'other', 'examples_test.jsonlines')
    train_fs = NDJReader.for_path(train_file).read()
    test_fs = NDJReader.for_path(test_file,
                                 feature_hasher=True,
                                 num_features=3).read()
    learner = Learner('LogisticRegression')
    learner.train(train_fs, grid_search=False)
    _ = learner.predict(test_fs)

Example #34

0

Show file

def check_ensemble_models(name,
                          use_feature_hashing=False,
                          use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_objective='pearson')

    # make sure that the feature importances are as expected.
    if name.endswith('AdaBoostRegressor'):
        if use_feature_hashing:
            expected_feature_importances = [
                0.33718443, 0.07810721, 0.25621769, 0.19489766, 0.13359301
            ]
        else:
            expected_feature_importances = [0.10266744, 0.18681777, 0.71051479]
    else:
        expected_feature_importances = ([
            0.204, 0.172, 0.178, 0.212, 0.234
        ] if use_feature_hashing else [0.262, 0.288, 0.45])

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances,
                    expected_feature_importances,
                    atol=1e-2,
                    rtol=0)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]
    assert_greater(cor, expected_cor_range[0])
    assert_less(cor, expected_cor_range[1])

Example #35

0

Show file

def check_tree_models(name, use_feature_hashing=False, use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_objective='pearson')

    # make sure that the feature importances are as expected.
    if name.endswith('DecisionTreeRegressor'):
        expected_feature_importances = ([
            0.37331461, 0.08572699, 0.2543484, 0.1841172, 0.1024928
        ] if use_feature_hashing else [0.08931994, 0.15545093, 0.75522913])
        expected_cor_range = [0.5, 0.6] if use_feature_hashing else [0.9, 1.0]
    else:
        if use_feature_hashing:
            expected_feature_importances = [
                0.40195655, 0.06702161, 0.25814858, 0.18183947, 0.09103379
            ]
        else:
            expected_feature_importances = [0.07975691, 0.16122862, 0.75901447]
        expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances,
                    expected_feature_importances,
                    rtol=1e-2)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, expected_cor_range[0])
    assert_less(cor, expected_cor_range[1])

Example #36

0

Show file

File: test_classification.py Project: EducationalTestingService/skll

def test_all_new_labels_in_test():
    """
    Test classification with all labels in test set unseen
    """
    train_fs, test_fs = make_classification_data(num_labels=3,
                                                 train_test_ratio=0.8)
    # change all test labels
    test_fs.labels = test_fs.labels + 3

    learner = Learner('SVC')
    learner.train(train_fs, grid_search=False)
    res = learner.evaluate(test_fs)
    yield check_results_with_unseen_labels, res, 6, [3, 4, 5]
    yield assert_almost_equal, res[1], 0

Example #37

0

Show file

def check_linear_models(name, use_feature_hashing=False, use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, weightdict = make_regression_data(
            num_examples=5000,
            num_features=10,
            use_feature_hashing=True,
            feature_bins=5)
    else:
        train_fs, test_fs, weightdict = make_regression_data(num_examples=2000,
                                                             num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_objective='pearson')

    # make sure that the weights are close to the weights
    # that we got from make_regression_data. Take the
    # ceiling before  comparing since just comparing
    # the ceilings should be enough to make sure nothing
    # catastrophic happened. Note though that we cannot
    # test feature weights if we are using feature hashing
    # since model_params is not defined with a featurehasher.
    if not use_feature_hashing:

        # get the weights for this trained model
        learned_weights = learner.model_params[0]

        for feature_name in learned_weights:
            learned_w = math.ceil(learned_weights[feature_name])
            given_w = math.ceil(weightdict[feature_name])
            eq_(learned_w, given_w)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]
    assert_greater(cor, expected_cor_range[0])
    assert_less(cor, expected_cor_range[1])

Example #38

0

Show file

File: test_classification.py Project: EducationalTestingService/skll

def test_new_labels_in_test_set():
    """
    Test classification experiment with an unseen label in the test set.
    """
    train_fs, test_fs = make_classification_data(num_labels=3,
                                                 train_test_ratio=0.8)
    # add new labels to the test set
    test_fs.labels[-3:] = 3

    learner = Learner('SVC')
    learner.train(train_fs, grid_search=False)
    res = learner.evaluate(test_fs)
    yield check_results_with_unseen_labels, res, 4, [3]
    yield assert_almost_equal, res[1], 0.3

Example #39

0

Show file

File: test_regression.py Project: ChristianGeng/skll

def check_linear_models(name,
                        use_feature_hashing=False,
                        use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, weightdict = make_regression_data(
            num_examples=5000, num_features=10, use_feature_hashing=True,
            feature_bins=5)
    else:
        train_fs, test_fs, weightdict = make_regression_data(num_examples=2000,
                                                             num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_objective='pearson')

    # make sure that the weights are close to the weights
    # that we got from make_regression_data. Take the
    # ceiling before  comparing since just comparing
    # the ceilings should be enough to make sure nothing
    # catastrophic happened. Note though that we cannot
    # test feature weights if we are using feature hashing
    # since model_params is not defined with a featurehasher.
    if not use_feature_hashing:

        # get the weights for this trained model
        learned_weights = learner.model_params[0]

        for feature_name in learned_weights:
            learned_w = math.ceil(learned_weights[feature_name])
            given_w = math.ceil(weightdict[feature_name])
            eq_(learned_w, given_w)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]
    assert_greater(cor, expected_cor_range[0])
    assert_less(cor, expected_cor_range[1])

Example #40

0

Show file

def check_tree_models(name, use_feature_hashing=False, use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_objective='pearson')

    # make sure that the feature importances are as expected.
    if name.endswith('DecisionTreeRegressor'):
        expected_feature_importances = ([
            0.37483895, 0.08816508, 0.25379838, 0.18337128, 0.09982631
        ] if use_feature_hashing else [0.08926899, 0.15585068, 0.75488033])
        expected_cor_range = [0.5, 0.6] if use_feature_hashing else [0.9, 1.0]
    else:
        expected_feature_importances = ([
            0.40195798, 0.06702903, 0.25816559, 0.18185518, 0.09099222
        ] if use_feature_hashing else [0.07974267, 0.16121895, 0.75903838])
        expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances,
                    expected_feature_importances,
                    atol=1e-2,
                    rtol=0)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, expected_cor_range[0])
    assert_less(cor, expected_cor_range[1])

Example #41

0

Show file

File: test_classification.py Project: EducationalTestingService/skll

def test_new_labels_in_test_set_change_order():
    """
    Test classification with an unseen label in the test set when the new label falls between the existing labels
    """
    train_fs, test_fs = make_classification_data(num_labels=3,
                                                 train_test_ratio=0.8)
    # change train labels to create a gap
    train_fs.labels = train_fs.labels * 10
    # add new test labels
    test_fs.labels = test_fs.labels * 10
    test_fs.labels[-3:] = 15

    learner = Learner('SVC')
    learner.train(train_fs, grid_search=False)
    res = learner.evaluate(test_fs)
    yield check_results_with_unseen_labels, res, 4, [15]
    yield assert_almost_equal, res[1], 0.3

Example #42

0

Show file

File: test_classification.py Project: EducationalTestingService/skll

def check_dummy_classifier_predict(model_args, train_labels, expected_output):

    # create hard-coded featuresets based with known labels
    train_fs = FeatureSet('classification_train',
                          ['TrainExample{}'.format(i) for i in range(20)],
                          labels=train_labels,
                          features=[{"feature": i} for i in range(20)])

    test_fs = FeatureSet('classification_test',
                         ['TestExample{}'.format(i) for i in range(10)],
                         features=[{"feature": i} for i in range(20, 30)])

    # Ensure predictions are as expectedfor the given strategy
    learner = Learner('DummyClassifier', model_kwargs=model_args)
    learner.train(train_fs, grid_search=False)
    predictions = learner.predict(test_fs)
    eq_(np.array_equal(expected_output, predictions), True)

Example #43

0

Show file

def check_dummy_classifier_predict(model_args, train_labels, expected_output):

    # create hard-coded featuresets based with known labels
    train_fs = FeatureSet('classification_train',
                          ['TrainExample{}'.format(i) for i in range(20)],
                          labels=train_labels,
                          features=[{"feature": i} for i in range(20)])

    test_fs = FeatureSet('classification_test',
                         ['TestExample{}'.format(i) for i in range(10)],
                         features=[{"feature": i} for i in range(20, 30)])

    # Ensure predictions are as expectedfor the given strategy
    learner = Learner('DummyClassifier', model_kwargs=model_args)
    learner.train(train_fs, grid_search=False)
    predictions = learner.predict(test_fs)
    eq_(np.array_equal(expected_output, predictions), True)

Example #44

0

Show file

File: test_regression.py Project: ChristianGeng/skll

def check_rescaling(name):

    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                sd_noise=4,
                                                num_features=3)

    # instantiate the given learner and its rescaled counterpart
    learner = Learner(name)
    rescaled_learner = Learner('Rescaled' + name)

    # train both the regular regressor and the rescaled regressor
    learner.train(train_fs, grid_objective='pearson')
    rescaled_learner.train(train_fs, grid_objective='pearson')

    # now generate both sets of predictions on the test feature set
    predictions = learner.predict(test_fs)
    rescaled_predictions = rescaled_learner.predict(test_fs)

    # ... and on the training feature set
    train_predictions = learner.predict(train_fs)
    rescaled_train_predictions = rescaled_learner.predict(train_fs)

    # make sure that both sets of correlations are close to perfectly
    # correlated, since the only thing different is that one set has been
    # rescaled
    assert_almost_equal(pearsonr(predictions, rescaled_predictions)[0], 1.0,
                        places=3)

    # make sure that the standard deviation of the rescaled test set
    # predictions is higher than the standard deviation of the regular test set
    # predictions
    p_std = np.std(predictions)
    rescaled_p_std = np.std(rescaled_predictions)
    assert_greater(rescaled_p_std, p_std)

    # make sure that the standard deviation of the rescaled predictions
    # on the TRAINING set (not the TEST) is closer to the standard
    # deviation of the training set labels than the standard deviation
    # of the regular predictions.
    train_y_std = np.std(train_fs.labels)
    train_p_std = np.std(train_predictions)
    rescaled_train_p_std = np.std(rescaled_train_predictions)
    assert_less(abs(rescaled_train_p_std - train_y_std), abs(train_p_std -
                                                             train_y_std))

Example #45

0

Show file

File: test_regression.py Project: EducationalTestingService/skll

def test_additional_metrics():
    """
    Test additional metrics in the results file for a regressor
    """
    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                num_features=3)

    # train a regression model using the train feature set
    learner = Learner('LinearRegression')
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # evaluate the trained model using the test feature set
    results = learner.evaluate(test_fs, output_metrics=['spearman',
                                                        'kendall_tau'])

    # check that the values for the additional metrics are as expected
    additional_scores_dict = results[-1]
    assert_almost_equal(additional_scores_dict['spearman'], 0.9996, places=4)
    assert_almost_equal(additional_scores_dict['kendall_tau'], 0.9846, places=4)

Example #46

0

Show file

File: test_regression.py Project: ChristianGeng/skll

def check_adaboost_regression(base_estimator):
    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                sd_noise=4,
                                                num_features=3)

    # train an AdaBoostClassifier on the training data and evalute on the
    # testing data
    learner = Learner('AdaBoostRegressor', model_kwargs={'base_estimator':
                                                         base_estimator})
    learner.train(train_fs, grid_search=False)

    # now generate the predictions on the test set
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)

Example #47

0

Show file

File: test_classification.py Project: EducationalTestingService/skll

def check_sparse_predict_sampler(use_feature_hashing=False):
    train_fs, test_fs = make_sparse_data(
        use_feature_hashing=use_feature_hashing)

    if use_feature_hashing:
        sampler = 'RBFSampler'
        sampler_parameters = {"gamma": 1.0, "n_components": 50}
    else:
        sampler = 'Nystroem'
        sampler_parameters = {"gamma": 1.0, "n_components": 50,
                              "kernel": 'rbf'}

    learner = Learner('LogisticRegression',
                      sampler=sampler,
                      sampler_kwargs=sampler_parameters)

    learner.train(train_fs, grid_search=False)
    test_score = learner.evaluate(test_fs)[1]

    expected_score = 0.48 if use_feature_hashing else 0.45
    assert_almost_equal(test_score, expected_score)