def test_subtract():
    """
    Test to ensure that subtraction works
    """
    # create a feature set
    fs1, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=2,
                                      train_test_ratio=1.0,
                                      random_state=1234)

    # create a different feature set with the same feature names
    # but different feature values
    fs2, _ = make_classification_data(num_examples=100,
                                      num_features=2,
                                      num_labels=2,
                                      train_test_ratio=1.0,
                                      random_state=5678)

    # subtract fs2 from fs1, i.e., the features in fs2
    # should be removed from fs1 but nothing else should change
    fs = fs1 - fs2

    # ensure that the labels are the same in fs and fs1
    assert_array_equal(fs.labels, fs1.labels)

    # ensure that there are only two features left
    eq_(fs.features.shape[1], 2)

    # and that they are f03 and f04
    assert_array_equal(np.array(fs.vectorizer.feature_names_),
                       ['f03', 'f04'])


def test_merge_missing_labels():
    """
    Test to ensure that labels are successfully copied when merging
    """
    # create a feature set
    fs1, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    # create a different feature set with no labels specified
    fs2, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      feature_prefix='g',
                                      empty_labels=True,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    # merge the two featuresets in different orders
    fs12 = fs1 + fs2
    fs21 = fs2 + fs1

    # make sure that the labels are the same after merging
    assert_array_equal(fs12.labels, fs1.labels)
    assert_array_equal(fs21.labels, fs1.labels)


def test_api_with_custom_prob_metric():
    """Test API with custom probabilistic metric"""

    # register a custom metric from our file that requires probabilities
    input_dir = join(_my_dir, "other")
    custom_metrics_file = join(input_dir, "custom_metrics.py")
    register_custom_metric(custom_metrics_file, "fake_prob_metric")

    # create some classification data
    train_fs, _ = make_classification_data(num_examples=1000,
                                           num_features=10,
                                           num_labels=2)

    # set up a learner to tune using this probabilistic metric;
    # this should fail since LinearSVC doesn't support probabilities
    learner1 = Learner("LinearSVC")
    assert_raises_regex(AttributeError,
                        r"has no attribute 'predict_proba'",
                        learner1.train,
                        train_fs,
                        grid_objective="fake_prob_metric")

    # set up another learner with explicit probability support;
    # this should work just fine with our custom metric
    learner2 = Learner("SVC", probability=True)
    grid_score, _ = learner2.train(train_fs,
                                   grid_objective="fake_prob_metric")
    ok_(grid_score > 0.95)


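# A plausible sketch of what "fake_prob_metric" looks like inside
# custom_metrics.py (an assumption; the real file is not shown here, and
# the underscored name below marks this as a sketch). Wrapping a
# probability-based metric with make_scorer(..., needs_proba=True) makes
# the resulting scorer call predict_proba(), which is exactly why
# LinearSVC fails in the test above.
from sklearn.metrics import average_precision_score, make_scorer

_fake_prob_metric_sketch = make_scorer(average_precision_score,
                                       needs_proba=True)

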
def test_string_feature():
    """
    Test that string-valued features are properly encoded as binary features
    """
    # create a featureset that is derived from an original
    # set of features containing 3 numeric features and
    # one string-valued feature that can take six possible
    # values from 'a' to 'f'. This means that the
    # featureset will have 3 numeric + 6 binary features.
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     num_labels=3,
                                     one_string_feature=True,
                                     num_string_values=6,
                                     train_test_ratio=1.0)

    # confirm that the number of features is as expected
    eq_(fs.features.shape, (100, 9))

    # confirm the feature names
    eq_(fs.vectorizer.feature_names_,
        ['f01', 'f02', 'f03',
         'f04=a', 'f04=b', 'f04=c',
         'f04=d', 'f04=e', 'f04=f'])

    # confirm that the final six features are binary
    assert_array_equal(fs.features[:, [3, 4, 5, 6, 7, 8]].data, 1)


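# The "f04=a" ... "f04=f" names above come from scikit-learn's
# DictVectorizer, which one-hot encodes string-valued features as
# "name=value" binary columns; a minimal standalone illustration:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)
X = dv.fit_transform([{"f01": 1.0, "f04": "a"},
                      {"f01": 2.0, "f04": "b"}])
print(dv.feature_names_)  # ['f01', 'f04=a', 'f04=b']

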
def check_filter_labels(inverse=False):

    # create a feature set
    fs, _ = make_classification_data(num_examples=1000,
                                     num_features=4,
                                     num_labels=5,
                                     train_test_ratio=1.0)

    # keep just the instances with labels 0, 1, and 2
    labels_to_filter = [0, 1, 2]

    # do the actual filtering
    fs.filter(labels=labels_to_filter, inverse=inverse)

    # make sure that we removed the right things
    if inverse:
        ids_kept = fs.ids[np.where(
            np.logical_not(np.in1d(fs.labels, labels_to_filter)))]
    else:
        ids_kept = fs.ids[np.where(np.in1d(fs.labels, labels_to_filter))]
    assert_array_equal(fs.ids, np.array(ids_kept))

    # make sure that the numbers of ids, labels, and features are the same
    eq_(fs.ids.shape[0], fs.labels.shape[0])
    eq_(fs.labels.shape[0], fs.features.shape[0])


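# check_filter_labels is a parameterized helper; under nose it would
# typically be driven by a generator test like the sketch below (an
# assumption about the driver, whose name is hypothetical; the same
# pattern applies to check_filter_ids and check_filter_features
# further below):
def test_filter_labels():
    yield check_filter_labels, False
    yield check_filter_labels, True

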
@raises(ValueError)
def test_merge_different_vectorizers():
    """
    Test to ensure rejection of merging featuresets with different vectorizers
    """
    # create a featureset with a DictVectorizer
    fs1, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    # create another featureset using hashing
    fs2, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      feature_prefix='g',
                                      num_labels=3,
                                      train_test_ratio=1.0,
                                      use_feature_hashing=True)

    # this should raise a ValueError
    fs1 + fs2


@raises(ValueError)
def test_write_hashed_featureset():
    """
    Test to check that hashed featuresets cannot be written out
    """
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     use_feature_hashing=True,
                                     feature_bins=2,
                                     random_state=1234)
    output_dir = join(_my_dir, 'output')
    writer = NDJWriter(join(output_dir, 'foo.jsonlines'), fs)

    # this should raise a ValueError
    writer.write()


def test_length():
    """
    Test whether len() returns the number of instances
    """
    # create a featureset
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     num_labels=3,
                                     train_test_ratio=1.0)

    eq_(len(fs), 100)


def test_empty_labels():
    """
    Test to check behaviour when labels is None
    """
    # create a feature set with empty labels
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     num_labels=3,
                                     empty_labels=True,
                                     train_test_ratio=1.0)
    assert np.isnan(fs.labels).all()


def test_custom_learner_model_loading():
    num_labels = 10
    class_weights = ([(0.5 / (num_labels - 1))
                      for x in range(num_labels - 1)] + [0.5])
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=num_labels,
                                                 num_features=5,
                                                 non_negative=True,
                                                 class_weights=class_weights)

    # write the training feature set to a file
    train_path = join(_my_dir, 'train',
                      'test_model_custom_learner.jsonlines')
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # write the test feature set to a file
    test_path = join(_my_dir, 'test',
                     'test_model_custom_learner.jsonlines')
    writer = NDJWriter(test_path, test_fs)
    writer.write()

    # run the configuration that trains the custom model and saves it
    cfgfile = 'test_model_save_custom_learner.template.cfg'
    config_template_path = join(_my_dir, 'configs', cfgfile)
    config_path = fill_in_config_paths(config_template_path)
    run_configuration(config_path, quiet=True)

    # read the predictions from disk into memory
    # and delete the predictions file
    outprefix = 'test_model_custom_learner'
    pred_file = join(_my_dir, 'output',
                     '{}_{}_CustomLogisticRegressionWrapper'
                     '_predictions.tsv'.format(outprefix, outprefix))
    preds1 = read_predictions(pred_file)
    os.unlink(pred_file)

    # run the configuration that loads the saved model
    # and generates the predictions again
    cfgfile = 'test_model_load_custom_learner.template.cfg'
    config_template_path = join(_my_dir, 'configs', cfgfile)
    config_path = fill_in_config_paths(config_template_path)
    run_configuration(config_path, overwrite=False, quiet=True)

    # load the newly generated predictions
    preds2 = read_predictions(pred_file)

    # make sure that they are the same as before
    assert_array_equal(preds1, preds2)


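# read_predictions() is a small helper defined elsewhere in this test
# module; the sketch below (under a hypothetical underscored name so as
# not to shadow the real helper) shows what it plausibly does, assuming
# the predictions TSV has a header row with the predicted value in the
# second column:
import csv

def _read_predictions_sketch(path):
    with open(path) as f:
        reader = csv.reader(f, dialect=csv.excel_tab)
        next(reader)  # skip the header row
        return np.array([float(row[1]) for row in reader])

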
@raises(ValueError)
def test_merge_different_hashers():
    """
    Test to ensure rejection of merging featuresets with different FeatureHashers
    """
    # create a feature set with 4 feature hashing bins
    fs1, _ = make_classification_data(num_examples=100,
                                      num_features=10,
                                      num_labels=3,
                                      train_test_ratio=1.0,
                                      use_feature_hashing=True,
                                      feature_bins=4)

    # create a second feature set with 3 feature hashing bins
    fs2, _ = make_classification_data(num_examples=100,
                                      num_features=10,
                                      num_labels=3,
                                      feature_prefix='g',
                                      train_test_ratio=1.0,
                                      use_feature_hashing=True,
                                      feature_bins=3)

    # this should raise a ValueError
    fs1 + fs2


@raises(ValueError)
def test_merge_different_labels_same_ids():
    """
    Test to ensure rejection of merging featuresets that have conflicting labels
    """
    # create a feature set
    fs1, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    # create a different feature set that has everything
    # the same but different labels for the same IDs
    fs2, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      feature_prefix='g',
                                      train_test_ratio=1.0)

    # artificially modify the class labels
    fs2.labels = fs2.labels + 1

    # this should raise a ValueError
    fs1 + fs2


@raises(ValueError)
def test_filter_with_hashing():
    """
    Test to ensure rejection of filtering by features when using hashing
    """
    # create a feature set that uses feature hashing
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=5,
                                     num_labels=3,
                                     train_test_ratio=1.0,
                                     use_feature_hashing=True,
                                     feature_bins=2)

    # attempt to filter by features; this should raise a ValueError
    # since hashed features no longer have names to filter on
    fs.filter(features=['f1', 'f4'])


@raises(ValueError)
def test_iteration_without_dictvectorizer():
    """
    Test that iteration is only allowed if the vectorizer is a DictVectorizer
    """
    # create a feature set that uses feature hashing
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     num_labels=3,
                                     train_test_ratio=1.0,
                                     use_feature_hashing=True,
                                     feature_bins=2)

    # this should raise a ValueError
    for _ in fs:
        pass


def test_logistic_custom_learner():
    num_labels = 10
    class_weights = ([(0.5 / (num_labels - 1))
                      for x in range(num_labels - 1)] + [0.5])
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=num_labels,
                                                 num_features=5,
                                                 non_negative=True,
                                                 class_weights=class_weights)

    # write the training feature set to a file
    train_path = join(_my_dir, 'train',
                      'test_logistic_custom_learner.jsonlines')
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # write the test feature set to a file
    test_path = join(_my_dir, 'test',
                     'test_logistic_custom_learner.jsonlines')
    writer = NDJWriter(test_path, test_fs)
    writer.write()

    cfgfile = 'test_logistic_custom_learner.template.cfg'
    config_template_path = join(_my_dir, 'configs', cfgfile)
    config_path = fill_in_config_paths(config_template_path)
    run_configuration(config_path, quiet=True)

    outprefix = 'test_logistic_custom_learner'
    preds = read_predictions(
        join(_my_dir, 'output',
             ('{}_{}_CustomLogisticRegressionWrapper'
              '_predictions.tsv'.format(outprefix, outprefix))))
    expected = read_predictions(
        join(_my_dir, 'output',
             ('{}_{}_LogisticRegression_predictions.tsv'.format(
                 outprefix, outprefix))))
    assert_array_equal(preds, expected)


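# CustomLogisticRegressionWrapper lives in a separate file referenced by
# the config's custom learner path; a plausible sketch is below (an
# assumption, but one consistent with the test above expecting its
# predictions to match those of the built-in LogisticRegression exactly):
from sklearn.linear_model import LogisticRegression

class CustomLogisticRegressionWrapper(LogisticRegression):
    """A thin pass-through wrapper around scikit-learn's LogisticRegression."""
    pass

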
def test_api_with_inverted_custom_metric():
    """Test API with a lower-is-better custom metric"""

    # register a lower-is-better custom metric from our file,
    # which is simply 1 minus the precision score
    input_dir = join(_my_dir, "other")
    custom_metrics_file1 = join(input_dir, "custom_metrics.py")
    register_custom_metric(custom_metrics_file1, "one_minus_precision")

    # create some classification data
    train_fs, _ = make_classification_data(num_examples=1000,
                                           num_features=10,
                                           num_labels=2)

    # set up a learner to tune using the lower-is-better custom metric
    learner1 = Learner("LogisticRegression")
    (grid_score1,
     grid_results_dict1) = learner1.train(train_fs,
                                          grid_objective="one_minus_precision")

    # now set up another learner that uses the complementary version
    # of our custom metric (regular precision) for grid search
    learner2 = Learner("LogisticRegression")
    (grid_score2,
     grid_results_dict2) = learner2.train(train_fs,
                                          grid_objective="precision")

    # for both learners, the ranking of the C hyperparameter should be
    # identical since, when we defined one_minus_precision, we set the
    # `greater_is_better` keyword argument to `False`
    assert_array_equal(grid_results_dict1['rank_test_score'],
                       grid_results_dict2['rank_test_score'])

    # furthermore, the final grid score and the mean scores for each
    # C hyperparameter value should follow the same 1-X relationship,
    # except that our custom metric is negated due to the keyword
    # argument that we set when we defined it
    assert_almost_equal(1 - grid_score2, -1 * grid_score1, places=6)
    assert_array_almost_equal(1 - grid_results_dict2['mean_test_score'],
                              -1 * grid_results_dict1['mean_test_score'],
                              decimal=6)


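# A plausible sketch of "one_minus_precision" from custom_metrics.py
# (an assumption; the real file is not shown here, and the underscored
# names mark this as a sketch). Setting greater_is_better=False makes
# scikit-learn negate the scorer's output during grid search, which is
# why the assertions above compare against -1 * grid_score1.
from sklearn.metrics import make_scorer, precision_score

def _one_minus_precision(y_true, y_pred):
    return 1 - precision_score(y_true, y_pred)

_one_minus_precision_sketch = make_scorer(_one_minus_precision,
                                          greater_is_better=False)

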
def check_filter_ids(inverse=False):

    # create a feature set
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=4,
                                     num_labels=3,
                                     train_test_ratio=1.0)

    # keep just the IDs after EXAMPLE_50, or the rest if inverting
    ids_to_filter = ['EXAMPLE_{}'.format(i) for i in range(51, 101)]
    if inverse:
        ids_kept = ['EXAMPLE_{}'.format(i) for i in range(1, 51)]
    else:
        ids_kept = ids_to_filter
    fs.filter(ids=ids_to_filter, inverse=inverse)

    # make sure that we removed the right things
    assert_array_equal(fs.ids, np.array(ids_kept))

    # make sure that the numbers of ids, labels, and features are the same
    eq_(fs.ids.shape[0], fs.labels.shape[0])
    eq_(fs.labels.shape[0], fs.features.shape[0])


def test_majority_class_custom_learner():
    num_labels = 10

    # this will make data where the last class happens about 50% of the time
    class_weights = ([(0.5 / (num_labels - 1))
                      for x in range(num_labels - 1)] + [0.5])
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=num_labels,
                                                 num_features=5,
                                                 non_negative=True,
                                                 class_weights=class_weights)

    # write the training feature set to a file
    train_path = join(_my_dir, 'train',
                      'test_majority_class_custom_learner.jsonlines')
    writer = NDJWriter(train_path, train_fs)
    writer.write()

    # write the test feature set to a file
    test_path = join(_my_dir, 'test',
                     'test_majority_class_custom_learner.jsonlines')
    writer = NDJWriter(test_path, test_fs)
    writer.write()

    cfgfile = 'test_majority_class_custom_learner.template.cfg'
    config_template_path = join(_my_dir, 'configs', cfgfile)
    config_path = fill_in_config_paths(config_template_path)
    run_configuration(config_path, quiet=True)

    outprefix = 'test_majority_class_custom_learner'
    preds = read_predictions(
        join(_my_dir, 'output',
             ('{}_{}_MajorityClassLearner_predictions.tsv'.format(
                 outprefix, outprefix))))
    expected = np.array([float(num_labels - 1) for x in preds])
    assert_array_equal(preds, expected)


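# MajorityClassLearner is defined in a separate custom learner file
# referenced by the config; a plausible sketch is below (an assumption,
# but one consistent with the expectation above that it always predicts
# the most frequent training label, i.e., the last class):
from collections import Counter

from sklearn.base import BaseEstimator, ClassifierMixin

class MajorityClassLearner(BaseEstimator, ClassifierMixin):
    """Predict the most frequent label seen during training."""

    def fit(self, X, y):
        self.majority_class_ = Counter(y).most_common(1)[0][0]
        return self

    def predict(self, X):
        return np.full(X.shape[0], self.majority_class_)

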
def check_filter_features(inverse=False):

    # create a feature set
    fs, _ = make_classification_data(num_examples=100,
                                     num_features=5,
                                     num_labels=3,
                                     train_test_ratio=1.0)

    # store the features in a separate matrix before filtering
    X = fs.features.todense()

    # filter to keep features f01 and f04, or drop them if inverting
    fs.filter(features=['f01', 'f04'], inverse=inverse)

    # make sure that we have the right number of feature columns
    # depending on whether we are inverting
    feature_shape = (100, 3) if inverse else (100, 2)
    eq_(fs.features.shape, feature_shape)

    # the kept columns should be the first and fourth columns of the
    # original X if not inverting, and the second, third, and fifth
    # columns if inverting
    if inverse:
        feature_columns = X[:, [1, 2, 4]]
    else:
        feature_columns = X[:, [0, 3]]
    assert (fs.features.todense() == feature_columns).all()

    # make sure that the feature names that we kept are also correct
    feature_names = ['f02', 'f03', 'f05'] if inverse else ['f01', 'f04']
    assert_array_equal(np.array(fs.vectorizer.feature_names_), feature_names)

    # make sure that the numbers of ids, labels, and features are the same
    eq_(fs.ids.shape[0], fs.labels.shape[0])
    eq_(fs.labels.shape[0], fs.features.shape[0])


def test_equality():
    """
    Test featureset equality
    """
    # create a featureset
    fs1, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    # create a featureset with different feature values
    # but everything else the same
    fs2, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      train_test_ratio=1.0)
    fs2.features *= 2

    # create a featureset with different feature names
    # and everything else the same
    fs3, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      feature_prefix='g',
                                      train_test_ratio=1.0)

    # create a featureset with a different set of labels
    # and everything else the same
    fs4, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=2,
                                      train_test_ratio=1.0)

    # create a featureset with a different set but same number
    # of IDs and everything else the same
    fs5, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      train_test_ratio=1.0)
    fs5.ids = np.array(['A' + i for i in fs2.ids])

    # create a featureset with a different vectorizer
    # and everything else the same
    fs6, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      train_test_ratio=1.0,
                                      use_feature_hashing=True,
                                      feature_bins=2)

    # create a featureset with a different number of features
    # and everything else the same
    fs7, _ = make_classification_data(num_examples=100,
                                      num_features=5,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    # create a featureset with a different number of examples
    # and everything else the same
    fs8, _ = make_classification_data(num_examples=200,
                                      num_features=4,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    # create a featureset with a different vectorizer instance
    # and everything else the same
    fs9, _ = make_classification_data(num_examples=100,
                                      num_features=4,
                                      num_labels=3,
                                      train_test_ratio=1.0)

    # now check for the expected (in)equalities
    assert_not_equal(fs1, fs2)
    assert_not_equal(fs1, fs3)
    assert_not_equal(fs1, fs4)
    assert_not_equal(fs1, fs5)
    assert_not_equal(fs1, fs6)
    assert_not_equal(fs1, fs7)
    assert_not_equal(fs1, fs8)

    # the vectorizer instances differ, but the featuresets
    # should still compare equal
    assert_not_equal(id(fs1.vectorizer), id(fs9.vectorizer))
    eq_(fs1, fs9)