Example #1
def test_subtract():
    """
    Test to ensure that subtraction works
    """

    # create a feature set
    fs1, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=2,
                                      train_test_ratio=1.0,
                                      random_state=1234)

    # create a second feature set containing only the first two
    # feature names (f01 and f02) but with different feature values
    fs2, _ = make_classification_data(num_examples=100,
                                      num_features=2,
                                      num_labels=2,
                                      train_test_ratio=1.0,
                                      random_state=5678)

    # subtract fs2 from fs1, i.e., the features in fs2
    # should be removed from fs1 but nothing else should change
    fs = fs1 - fs2

    # ensure that the labels are the same in fs and fs1
    assert_array_equal(fs.labels, fs1.labels)

    # ensure that there are only two features left
    eq_(fs.features.shape[1], 2)

    # and that they are f03 and f04
    assert_array_equal(np.array(fs.vectorizer.feature_names_), ['f03', 'f04'])
Example #2
def test_merge_missing_labels():
    """
    Test to ensure that labels are successfully copied when merging
    """

    # create a feature set
    fs1, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    # create a different feature set with no labels specified
    fs2, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      feature_prefix='g',
                                      empty_labels=True,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    # merge the two featuresets in different orders
    fs12 = fs1 + fs2
    fs21 = fs2 + fs1

    # make sure that the labels are the same after merging
    assert_array_equal(fs12.labels, fs1.labels)
    assert_array_equal(fs21.labels, fs1.labels)
Example #3
def test_api_with_custom_prob_metric():
    """Test API with custom probabilistic metric"""

    # register a custom metric from our file that requires probabilities
    input_dir = join(_my_dir, "other")
    custom_metrics_file = join(input_dir, "custom_metrics.py")
    register_custom_metric(custom_metrics_file, "fake_prob_metric")

    # create some classification data
    train_fs, _ = make_classification_data(num_examples=1000,
                                           num_features=10,
                                           num_labels=2)

    # set up a learner to tune using this probabilistic metric
    # this should fail since LinearSVC doesn't support probabilities
    learner1 = Learner("LinearSVC")
    assert_raises_regex(AttributeError,
                        r"has no attribute 'predict_proba'",
                        learner1.train,
                        train_fs,
                        grid_objective="fake_prob_metric")

    # set up another learner with explicit probability support
    # this should work just fine with our custom metric
    learner2 = Learner("SVC", probability=True)
    grid_score, _ = learner2.train(train_fs, grid_objective="fake_prob_metric")
    ok_(grid_score > 0.95)
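The contents of custom_metrics.py are not shown here. A minimal sketch of what a probability-based metric like fake_prob_metric might look like, assuming the function receives true labels and an (n_samples, n_classes) array of predicted probabilities (the name matches the test above, but the body is purely illustrative):

from sklearn.metrics import roc_auc_score


def fake_prob_metric(y_true, y_pred_probs):
    # a toy probabilistic metric: ROC AUC computed from the
    # predicted probability of the positive class
    return roc_auc_score(y_true, y_pred_probs[:, 1])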
Example #4
def test_string_feature():
    """
    Test that string-valued features are properly encoded as binary features
    """
    # create a featureset derived from an original set of
    # features containing 3 numeric features and one
    # string-valued feature that can take six possible
    # values, 'a' through 'f'. This means that the
    # featureset will have 3 numeric + 6 binary features.
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     num_labels=3,
                                     one_string_feature=True,
                                     num_string_values=6,
                                     train_test_ratio=1.0)

    # confirm that the number of features is as expected
    eq_(fs.features.shape, (100, 9))

    # confirm the feature names
    eq_(fs.vectorizer.feature_names_, [
        'f01', 'f02', 'f03', 'f04=a', 'f04=b', 'f04=c', 'f04=d', 'f04=e',
        'f04=f'
    ])

    # confirm that the final six features are binary
    assert_array_equal(fs.features[:, [3, 4, 5, 6, 7, 8]].data, 1)
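The 3-numeric-plus-6-binary expansion is standard DictVectorizer behavior: each observed string value becomes its own indicator column, while numeric values pass through unchanged. A minimal standalone sketch, independent of make_classification_data:

from sklearn.feature_extraction import DictVectorizer

# string values are one-hot encoded as 'name=value' indicator columns
vec = DictVectorizer(sparse=False)
X = vec.fit_transform([{'f01': 1.5, 'f04': 'a'},
                       {'f01': 2.5, 'f04': 'b'}])
print(vec.feature_names_)  # ['f01', 'f04=a', 'f04=b']
print(X)                   # [[1.5 1.  0. ]
                           #  [2.5 0.  1. ]]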
Example #5
def check_filter_labels(inverse=False):

    # create a feature set
    fs, _ = make_classification_data(num_examples=1000,
                                     num_features=4,
                                     num_labels=5,
                                     train_test_ratio=1.0)

    # keep just the instances with labels 0, 1 and 2
    labels_to_filter = [0, 1, 2]

    # do the actual filtering
    fs.filter(labels=labels_to_filter, inverse=inverse)

    # make sure that we removed the right things
    if inverse:
        ids_kept = fs.ids[np.where(
            np.logical_not(np.in1d(fs.labels, labels_to_filter)))]
    else:
        ids_kept = fs.ids[np.where(np.in1d(fs.labels, labels_to_filter))]

    assert_array_equal(fs.ids, np.array(ids_kept))

    # make sure that number of ids, labels and features are the same
    eq_(fs.ids.shape[0], fs.labels.shape[0])
    eq_(fs.labels.shape[0], fs.features.shape[0])
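check_filter_labels is a parameterized helper rather than a test in its own right. In a nose-style suite, such helpers are typically driven by a test generator along the following lines (the wrapper name is an assumption; the same pattern would presumably drive check_filter_ids and check_filter_features in Examples 17 and 19):

def test_filter_labels():
    # exercise the helper both ways: keep the listed labels, then drop them
    yield check_filter_labels, False
    yield check_filter_labels, True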
Example #6
def test_merge_different_vectorizers():
    """
    Test to ensure rejection of merging featuresets with different vectorizers
    """

    # create a featureset that uses a DictVectorizer
    fs1, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    # create another featureset using hashing
    fs2, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      feature_prefix='g',
                                      num_labels=3,
                                      train_test_ratio=1.0,
                                      use_feature_hashing=True)
    # This should raise a ValueError
    fs1 + fs2
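As written, the final fs1 + fs2 would surface as an uncaught error rather than an assertion; the original suite presumably marks these expected-failure tests with nose's raises decorator or an explicit assert_raises (the same applies to Examples 7 and 11 through 14). A minimal sketch, assuming nose is the test runner:

from nose.tools import raises


@raises(ValueError)
def test_merge_different_vectorizers():
    fs1, _ = make_classification_data(num_examples=100, num_features=4,
                                      num_labels=3, train_test_ratio=1.0)
    fs2, _ = make_classification_data(num_examples=100, num_features=4,
                                      feature_prefix='g', num_labels=3,
                                      train_test_ratio=1.0,
                                      use_feature_hashing=True)
    # the decorator turns the expected ValueError into the test's assertion
    fs1 + fs2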
Example #7
def test_write_hashed_featureset():
    """
    Test to check that hashed featuresets cannot be written out
    """
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     use_feature_hashing=True,
                                     feature_bins=2,
                                     random_state=1234)
    output_dir = join(_my_dir, 'output')
    writer = NDJWriter(join(output_dir, 'foo.jsonlines'), fs)
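    # per the docstring, hashed featuresets cannot be written out,
    # so this write is expected to fail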
    writer.write()
Example #8
def test_length():
    """
    Test whether len() returns the number of instances
    """

    # create a featureset
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     num_labels=3,
                                     train_test_ratio=1.0)

    eq_(len(fs), 100)
Example #9
def test_empty_labels():
    """
    Test to check behaviour when labels is None
    """

    # create a feature set with empty labels
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     num_labels=3,
                                     empty_labels=True,
                                     train_test_ratio=1.0)
    assert np.isnan(fs.labels).all()
Example #10
def test_custom_learner_model_loading():
    num_labels = 10

    class_weights = [(0.5 / (num_labels - 1))
                     for x in range(num_labels - 1)] + [0.5]
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=num_labels,
                                                 num_features=5,
                                                 non_negative=True,
                                                 class_weights=class_weights)

    # Write training feature set to a file
    train_path = join(_my_dir, 'train', 'test_model_custom_learner.jsonlines')
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # Write test feature set to a file
    test_path = join(_my_dir, 'test', 'test_model_custom_learner.jsonlines')
    writer = NDJWriter(test_path, test_fs)
    writer.write()

    # run the configuration that trains the custom model and saves it
    cfgfile = 'test_model_save_custom_learner.template.cfg'
    config_template_path = join(_my_dir, 'configs', cfgfile)
    config_path = fill_in_config_paths(config_template_path)

    run_configuration(config_path, quiet=True)

    # save the predictions from disk into memory
    # and delete the predictions file
    outprefix = 'test_model_custom_learner'
    pred_file = join(
        _my_dir, 'output', '{}_{}_CustomLogisticRegressionWrapper'
        '_predictions.tsv'.format(outprefix, outprefix))
    preds1 = read_predictions(pred_file)
    os.unlink(pred_file)

    # run the configuration that loads the saved model
    # and generates the predictions again
    cfgfile = 'test_model_load_custom_learner.template.cfg'
    config_template_path = join(_my_dir, 'configs', cfgfile)
    config_path = fill_in_config_paths(config_template_path)

    run_configuration(config_path, overwrite=False, quiet=True)

    # load the newly generated predictions
    preds2 = read_predictions(pred_file)

    # make sure that they are the same as before
    assert_array_equal(preds1, preds2)
Example #11
def test_merge_different_hashers():
    """
    Test to ensure rejection of merging featuresets with different FeatureHashers
    """

    # create a feature set with 4 feature hashing bins
    fs1, _ = make_classification_data(num_examples=100,
                                      num_features=10,
                                      num_labels=3,
                                      train_test_ratio=1.0,
                                      use_feature_hashing=True,
                                      feature_bins=4)

    # create a second feature set with 3 feature hashing bins
    fs2, _ = make_classification_data(num_examples=100,
                                      num_features=10,
                                      num_labels=3,
                                      feature_prefix='g',
                                      train_test_ratio=1.0,
                                      use_feature_hashing=True,
                                      feature_bins=3)
    # This should raise a ValueError
    fs1 + fs2
Example #12
def test_merge_different_labels_same_ids():
    """
    Test to ensure rejection of merging featuresets that have conflicting labels
    """

    # create a feature set
    fs1, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    # create a different feature set that has everything
    # the same but has different labels for the same IDs
    fs2, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      feature_prefix='g',
                                      train_test_ratio=1.0)

    # artificially modify the class labels
    fs2.labels = fs2.labels + 1

    # This should raise a ValueError
    fs1 + fs2
Example #13
def test_filter_with_hashing():
    """
    Test to ensure rejection of filtering by features when using hashing
    """

    # create a feature set
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=5,
                                     num_labels=3,
                                     train_test_ratio=1.0,
                                     use_feature_hashing=True,
                                     feature_bins=2)

    # attempting to filter by feature names should raise a
    # ValueError since the featureset uses feature hashing
    fs.filter(features=['f1', 'f4'])
Example #14
def test_iteration_without_dictvectorizer():
    """
    Test that iteration is only allowed when the vectorizer is a DictVectorizer
    """

    # create a feature set
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     num_labels=3,
                                     train_test_ratio=1.0,
                                     use_feature_hashing=True,
                                     feature_bins=2)
    # This should raise a ValueError
    for _ in fs:
        pass
Example #15
def test_logistic_custom_learner():
    num_labels = 10

    class_weights = [(0.5 / (num_labels - 1))
                     for x in range(num_labels - 1)] + [0.5]
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=num_labels,
                                                 num_features=5,
                                                 non_negative=True,
                                                 class_weights=class_weights)

    # Write training feature set to a file
    train_path = join(_my_dir, 'train',
                      'test_logistic_custom_learner.jsonlines')
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # Write test feature set to a file
    test_path = join(_my_dir, 'test', 'test_logistic_custom_learner.jsonlines')
    writer = NDJWriter(test_path, test_fs)
    writer.write()

    cfgfile = 'test_logistic_custom_learner.template.cfg'
    config_template_path = join(_my_dir, 'configs', cfgfile)
    config_path = fill_in_config_paths(config_template_path)

    run_configuration(config_path, quiet=True)

    outprefix = 'test_logistic_custom_learner'
    preds = read_predictions(
        join(_my_dir, 'output',
             ('{}_{}_CustomLogisticRegressionWrapper'
              '_predictions.tsv'.format(outprefix, outprefix))))

    expected = read_predictions(
        join(_my_dir, 'output',
             ('{}_{}_LogisticRegression_predictions.tsv'.format(
                 outprefix, outprefix))))

    assert_array_equal(preds, expected)
Example #16
def test_api_with_inverted_custom_metric():
    """Test API with a lower-is-better custom metric"""

    # register a lower-is-better custom metric from our file
    # which is simply 1 minus the precision score
    input_dir = join(_my_dir, "other")
    custom_metrics_file1 = join(input_dir, "custom_metrics.py")
    register_custom_metric(custom_metrics_file1, "one_minus_precision")

    # create some classification data
    train_fs, _ = make_classification_data(num_examples=1000,
                                           num_features=10,
                                           num_labels=2)

    # set up a learner to tune using the lower-is-better custom metric
    learner1 = Learner("LogisticRegression")
    (grid_score1,
     grid_results_dict1) = learner1.train(train_fs,
                                          grid_objective="one_minus_precision")

    # now setup another learner that uses the complementary version
    # of our custom metric (regular precision) for grid search
    learner2 = Learner("LogisticRegression")
    (grid_score2,
     grid_results_dict2) = learner2.train(train_fs, grid_objective="precision")

    # for both learners the ranking of the C hyperparameter values
    # should be identical since when we defined one_minus_precision
    # we set the `greater_is_better` keyword argument to `False`
    assert_array_equal(grid_results_dict1['rank_test_score'],
                       grid_results_dict2['rank_test_score'])

    # furthermore, the final grid score and the mean scores for each
    # C hyperparameter value should follow the same 1-X relationship
    # except that our custom metric should be negated due to the
    # keyword argument that we set when we defined it
    assert_almost_equal(1 - grid_score2, -1 * grid_score1, places=6)
    assert_array_almost_equal(1 - grid_results_dict2['mean_test_score'],
                              -1 * grid_results_dict1['mean_test_score'],
                              decimal=6)
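The sign flips in these assertions follow standard sklearn scorer semantics: a scorer built with greater_is_better=False negates the metric internally so that higher is always better during grid search. A minimal sketch of how one_minus_precision might be defined, independent of SKLL's registration machinery (the actual custom_metrics.py is not shown):

from sklearn.metrics import make_scorer, precision_score


def one_minus_precision(y_true, y_pred):
    # lower is better: perfect precision scores 0
    return 1 - precision_score(y_true, y_pred)


# sklearn negates the returned value internally, which is why the
# test above compares against -1 * grid_score1
scorer = make_scorer(one_minus_precision, greater_is_better=False)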
Example #17
def check_filter_ids(inverse=False):

    # create a feature set
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     num_labels=3,
                                     train_test_ratio=1.0)

    # keep just the IDs after EXAMPLE_50, or do the inverse
    ids_to_filter = ['EXAMPLE_{}'.format(i) for i in range(51, 101)]
    if inverse:
        ids_kept = ['EXAMPLE_{}'.format(i) for i in range(1, 51)]
    else:
        ids_kept = ids_to_filter
    fs.filter(ids=ids_to_filter, inverse=inverse)

    # make sure that we removed the right things
    assert_array_equal(fs.ids, np.array(ids_kept))

    # make sure that number of ids, labels and features are the same
    eq_(fs.ids.shape[0], fs.labels.shape[0])
    eq_(fs.labels.shape[0], fs.features.shape[0])
Example #18
def test_majority_class_custom_learner():
    num_labels = 10

    # This will make data where the last class happens about 50% of the time.
    class_weights = [(0.5 / (num_labels - 1))
                     for x in range(num_labels - 1)] + [0.5]
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=num_labels,
                                                 num_features=5,
                                                 non_negative=True,
                                                 class_weights=class_weights)

    # Write training feature set to a file
    train_path = join(_my_dir, 'train',
                      'test_majority_class_custom_learner.jsonlines')
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # Write test feature set to a file
    test_path = join(_my_dir, 'test',
                     'test_majority_class_custom_learner.jsonlines')
    writer = NDJWriter(test_path, test_fs)
    writer.write()

    cfgfile = 'test_majority_class_custom_learner.template.cfg'
    config_template_path = join(_my_dir, 'configs', cfgfile)
    config_path = fill_in_config_paths(config_template_path)

    run_configuration(config_path, quiet=True)

    outprefix = 'test_majority_class_custom_learner'

    preds = read_predictions(
        join(_my_dir, 'output',
             ('{}_{}_MajorityClassLearner_predictions.tsv'.format(
                 outprefix, outprefix))))
    expected = np.array([float(num_labels - 1) for x in preds])
    assert_array_equal(preds, expected)
Example #19
def check_filter_features(inverse=False):

    # create a feature set
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=5,
                                     num_labels=3,
                                     train_test_ratio=1.0)

    # store the features in a separate matrix before filtering
    X = fs.features.todense()

    # keep only features f01 and f04, or drop them if inverting
    fs.filter(features=['f01', 'f04'], inverse=inverse)

    # make sure that we have the right number of feature columns
    # depending on whether we are inverting
    feature_shape = (100, 3) if inverse else (100, 2)
    eq_(fs.features.shape, feature_shape)

    # and that they are the first and fourth columns
    # of X that we generated, if not inverting and
    # the second, third and fifth, if inverting
    if inverse:
        feature_columns = X[:, [1, 2, 4]]
    else:
        feature_columns = X[:, [0, 3]]

    assert (fs.features.todense() == feature_columns).all()

    # make sure that the feature names that we kept are also correct
    feature_names = ['f02', 'f03', 'f05'] if inverse else ['f01', 'f04']
    assert_array_equal(np.array(fs.vectorizer.feature_names_), feature_names)

    # make sure that number of ids, labels and features are the same
    eq_(fs.ids.shape[0], fs.labels.shape[0])
    eq_(fs.labels.shape[0], fs.features.shape[0])
Example #20
def test_equality():
    """
    Test featureset equality
    """

    # create a featureset
    fs1, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    # create a featureset with the same feature names but
    # doubled feature values and everything else the same
    fs2, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    fs2.features *= 2

    # create a featureset with different feature names
    # and everything else the same
    fs3, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      feature_prefix='g',
                                      train_test_ratio=1.0)

    # create a featureset with a different set of labels
    # and everything else the same
    fs4, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=2,
                                      train_test_ratio=1.0)

    # create a featureset with a different set but same number
    # of IDs and everything else the same
    fs5, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      train_test_ratio=1.0)
    fs5.ids = np.array(['A' + i for i in fs2.ids])

    # create a featureset with a different vectorizer
    # and everything else the same
    fs6, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      train_test_ratio=1.0,
                                      use_feature_hashing=True,
                                      feature_bins=2)

    # create a featureset with a different number of features
    # and everything else the same
    fs7, _ = make_classification_data(num_examples=100,
                                      num_features=5,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    # create a featureset with a different number of examples
    # and everything else the same
    fs8, _ = make_classification_data(num_examples=200,
                                      num_features=4,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    # create a featureset with a different vectorizer instance
    # and everything else the same
    fs9, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    # now check for the expected equalities
    assert_not_equal(fs1, fs2)
    assert_not_equal(fs1, fs3)
    assert_not_equal(fs1, fs4)
    assert_not_equal(fs1, fs5)
    assert_not_equal(fs1, fs6)
    assert_not_equal(fs1, fs7)
    assert_not_equal(fs1, fs8)
    assert_not_equal(id(fs1.vectorizer), id(fs9.vectorizer))
    eq_(fs1, fs9)