def check_rescaling(name, grid_search=False): train_fs, test_fs, _ = make_regression_data(num_examples=2000, sd_noise=4, num_features=3) # instantiate the given learner and its rescaled counterpart learner = Learner(name) rescaled_learner = Learner('Rescaled' + name) # train both the regular regressor and the rescaled regressor # with and without using grid search if grid_search: learner.train(train_fs, grid_search=True, grid_objective='pearson') rescaled_learner.train(train_fs, grid_search=True, grid_objective='pearson') else: learner.train(train_fs, grid_search=False) rescaled_learner.train(train_fs, grid_search=False) # now generate both sets of predictions on the test feature set predictions = learner.predict(test_fs) rescaled_predictions = rescaled_learner.predict(test_fs) # ... and on the training feature set train_predictions = learner.predict(train_fs) rescaled_train_predictions = rescaled_learner.predict(train_fs) # make sure that both sets of correlations are close to perfectly # correlated, since the only thing different is that one set has been # rescaled assert_almost_equal(pearsonr(predictions, rescaled_predictions)[0], 1.0, places=3) # make sure that the standard deviation of the rescaled test set # predictions is higher than the standard deviation of the regular test set # predictions p_std = np.std(predictions) rescaled_p_std = np.std(rescaled_predictions) assert_greater(rescaled_p_std, p_std) # make sure that the standard deviation of the rescaled predictions # on the TRAINING set (not the TEST) is closer to the standard # deviation of the training set labels than the standard deviation # of the regular predictions. train_y_std = np.std(train_fs.labels) train_p_std = np.std(train_predictions) rescaled_train_p_std = np.std(rescaled_train_predictions) assert_less(abs(rescaled_train_p_std - train_y_std), abs(train_p_std - train_y_std))
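# A minimal usage sketch for the check_rescaling helper above, assuming
# nose-style yield generators (the eq_/assert_* helpers suggest nose); the
# regressor names and the wrapper name `test_rescaling` are illustrative,
# not taken from the original module.
def test_rescaling():
    for regressor_name in ['Ridge', 'SVR']:  # illustrative choices
        # without grid search
        yield check_rescaling, regressor_name
        # with grid search
        yield check_rescaling, regressor_name, True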
def check_tree_models(name, use_feature_hashing=False, use_rescaling=False): # create a FeatureSet object with the data we want to use if use_feature_hashing: train_fs, test_fs, _ = make_regression_data(num_examples=5000, num_features=10, use_feature_hashing=True, feature_bins=5) else: train_fs, test_fs, _ = make_regression_data(num_examples=2000, num_features=3) # create the learner if use_rescaling: name = 'Rescaled' + name learner = Learner(name) # train it with the training feature set we created # make sure to set the grid objective to pearson learner.train(train_fs, grid_objective='pearson') # make sure that the feature importances are as expected. if name.endswith('DecisionTreeRegressor'): expected_feature_importances = ([0.37331461, 0.08572699, 0.2543484, 0.1841172, 0.1024928] if use_feature_hashing else [0.08931994, 0.15545093, 0.75522913]) expected_cor_range = [0.5, 0.6] if use_feature_hashing else [0.9, 1.0] else: if use_feature_hashing: expected_feature_importances = [0.40195655, 0.06702161, 0.25814858, 0.18183947, 0.09103379] else: expected_feature_importances = [0.07975691, 0.16122862, 0.75901447] expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0] feature_importances = learner.model.feature_importances_ assert_allclose(feature_importances, expected_feature_importances, rtol=1e-2) # now generate the predictions on the test FeatureSet predictions = learner.predict(test_fs) # now make sure that the predictions are close to # the actual test FeatureSet labels that we generated # using make_regression_data. To do this, we just # make sure that they are correlated with pearson > 0.95 cor, _ = pearsonr(predictions, test_fs.labels) assert_greater(cor, expected_cor_range[0]) assert_less(cor, expected_cor_range[1])
def check_tree_models(name, use_feature_hashing=False, use_rescaling=False): # create a FeatureSet object with the data we want to use if use_feature_hashing: train_fs, test_fs, _ = make_regression_data(num_examples=5000, num_features=10, use_feature_hashing=True, feature_bins=5) else: train_fs, test_fs, _ = make_regression_data(num_examples=2000, num_features=3) # create the learner if use_rescaling: name = 'Rescaled' + name learner = Learner(name) # train it with the training feature set we created # make sure to set the grid objective to pearson learner.train(train_fs, grid_objective='pearson') # make sure that the feature importances are as expected. if name.endswith('DecisionTreeRegressor'): expected_feature_importances = ([0.37483895, 0.08816508, 0.25379838, 0.18337128, 0.09982631] if use_feature_hashing else [0.08926899, 0.15585068, 0.75488033]) expected_cor_range = [0.5, 0.6] if use_feature_hashing else [0.9, 1.0] else: expected_feature_importances = ([0.40195798, 0.06702903, 0.25816559, 0.18185518, 0.09099222] if use_feature_hashing else [0.07974267, 0.16121895, 0.75903838]) expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0] feature_importances = learner.model.feature_importances_ assert_allclose(feature_importances, expected_feature_importances, atol=1e-2, rtol=0) # now generate the predictions on the test FeatureSet predictions = learner.predict(test_fs) # now make sure that the predictions are close to # the actual test FeatureSet labels that we generated # using make_regression_data. To do this, we just # make sure that they are correlated with pearson > 0.95 cor, _ = pearsonr(predictions, test_fs.labels) assert_greater(cor, expected_cor_range[0]) assert_less(cor, expected_cor_range[1])
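# Hypothetical driver for check_tree_models: 'DecisionTreeRegressor' is named
# explicitly in the helper's first branch, while 'RandomForestRegressor' is an
# assumption for the other branch; each learner is exercised with and without
# feature hashing.
def test_tree_models():
    for regressor_name in ['DecisionTreeRegressor', 'RandomForestRegressor']:
        for use_feature_hashing in [False, True]:
            yield check_tree_models, regressor_name, use_feature_hashing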
def test_predict_dict_hasher(): train_file = join(_my_dir, 'other', 'examples_train.jsonlines') test_file = join(_my_dir, 'other', 'examples_test.jsonlines') train_fs = NDJReader.for_path(train_file).read() test_fs = NDJReader.for_path(test_file, feature_hasher=True, num_features=3).read() learner = Learner('LogisticRegression') learner.train(train_fs, grid_search=False) _ = learner.predict(test_fs)
def check_ensemble_models(name, use_feature_hashing=False, use_rescaling=False): # create a FeatureSet object with the data we want to use if use_feature_hashing: train_fs, test_fs, _ = make_regression_data(num_examples=5000, num_features=10, use_feature_hashing=True, feature_bins=5) else: train_fs, test_fs, _ = make_regression_data(num_examples=2000, num_features=3) # create the learner if use_rescaling: name = 'Rescaled' + name learner = Learner(name) # train it with the training feature set we created # make sure to set the grid objective to pearson learner.train(train_fs, grid_objective='pearson') # make sure that the feature importances are as expected. if name.endswith('AdaBoostRegressor'): if use_feature_hashing: expected_feature_importances = [0.33718443, 0.07810721, 0.25621769, 0.19489766, 0.13359301] else: expected_feature_importances = [0.10266744, 0.18681777, 0.71051479] else: expected_feature_importances = ([0.204, 0.172, 0.178, 0.212, 0.234] if use_feature_hashing else [0.262, 0.288, 0.45]) feature_importances = learner.model.feature_importances_ assert_allclose(feature_importances, expected_feature_importances, atol=1e-2, rtol=0) # now generate the predictions on the test FeatureSet predictions = learner.predict(test_fs) # now make sure that the predictions are close to # the actual test FeatureSet labels that we generated # using make_regression_data. To do this, we just # make sure that they are correlated with pearson > 0.95 cor, _ = pearsonr(predictions, test_fs.labels) expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0] assert_greater(cor, expected_cor_range[0]) assert_less(cor, expected_cor_range[1])
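# Hypothetical driver for check_ensemble_models: 'AdaBoostRegressor' is named
# in the helper's first branch; 'GradientBoostingRegressor' is an assumed
# candidate for the other branch.
def test_ensemble_models():
    for regressor_name in ['AdaBoostRegressor', 'GradientBoostingRegressor']:
        for use_feature_hashing in [False, True]:
            yield check_ensemble_models, regressor_name, use_feature_hashing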
def test_predict_hasher_hasher_same_bins(): train_file = join(_my_dir, 'other', 'examples_train.jsonlines') test_file = join(_my_dir, 'other', 'examples_test.jsonlines') train_fs = NDJReader.for_path(train_file, feature_hasher=True, num_features=3).read() test_fs = NDJReader.for_path(test_file, feature_hasher=True, num_features=3).read() learner = Learner('LogisticRegression') learner.train(train_fs, grid_search=False) predictions = learner.predict(test_fs) eq_(len(predictions), test_fs.features.shape[0])
def check_linear_models(name, use_feature_hashing=False, use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        (train_fs,
         test_fs,
         weightdict) = make_regression_data(num_examples=5000,
                                            num_features=10,
                                            use_feature_hashing=True,
                                            feature_bins=5)
    else:
        train_fs, test_fs, weightdict = make_regression_data(num_examples=2000,
                                                             num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # make sure that the weights are close to the weights
    # that we got from make_regression_data. Take the
    # ceiling before comparing since just comparing
    # the ceilings should be enough to make sure nothing
    # catastrophic happened. However, sometimes with
    # feature hashing, the ceiling is not exactly identical
    # so when that fails we want to check that the rounded
    # feature values are the same. One of those two equalities
    # _must_ be satisfied.

    # get the weights for this trained model
    learned_weights = learner.model_params[0]

    for feature_name in learned_weights:
        learned_w_ceil = math.ceil(learned_weights[feature_name])
        given_w_ceil = math.ceil(weightdict[feature_name])
        learned_w_round = round(learned_weights[feature_name], 0)
        given_w_round = round(weightdict[feature_name], 0)
        ceil_equal = learned_w_ceil == given_w_ceil
        round_equal = learned_w_round == given_w_round
        either_equal = ceil_equal or round_equal
        assert either_equal

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)
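# Illustrative driver for check_linear_models; the list of linear regressors
# below is an assumption meant only to show how the helper is parameterized.
def test_linear_models():
    for regressor_name in ['LinearRegression', 'Lasso', 'Ridge']:
        for use_feature_hashing in [False, True]:
            yield check_linear_models, regressor_name, use_feature_hashing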
def test_predict_dict_dict(): train_file = join(_my_dir, 'other', 'examples_train.jsonlines') test_file = join(_my_dir, 'other', 'examples_test.jsonlines') train_fs = NDJReader.for_path(train_file).read() test_fs = NDJReader.for_path(test_file).read() learner = Learner('LogisticRegression') learner.train(train_fs, grid_search=False) predictions = learner.predict(test_fs) eq_(len(predictions), test_fs.features.shape[0])
def check_generate_predictions_console(use_threshold=False): # create some simple classification data without feature hashing train_fs, test_fs = make_classification_data(num_examples=1000, num_features=5) # save the test feature set to an NDJ file input_file = join(_my_dir, 'test', 'test_generate_predictions.jsonlines') writer = NDJWriter(input_file, test_fs) writer.write() # create a learner that uses an SGD classifier learner = Learner('SGDClassifier', probability=use_threshold) # train the learner with grid search learner.train(train_fs, grid_search=True) # get the predictions on the test featureset predictions = learner.predict(test_fs) # if we asked for probabilities, then use the threshold # to convert them into binary predictions if use_threshold: threshold = 0.6 predictions = [int(p[1] >= threshold) for p in predictions] else: predictions = predictions.tolist() threshold = None # save the learner to a file model_file = join(_my_dir, 'output', 'test_generate_predictions_console.model') learner.save(model_file) # now call main() from generate_predictions.py generate_cmd = [] if use_threshold: generate_cmd.append('-t {}'.format(threshold)) generate_cmd.extend([model_file, input_file]) # we need to capture stdout since that's what main() writes to err = '' try: old_stdout = sys.stdout old_stderr = sys.stderr sys.stdout = mystdout = StringIO() sys.stderr = mystderr = StringIO() gp.main(generate_cmd) out = mystdout.getvalue() err = mystderr.getvalue() predictions_after_saving = [int(x) for x in out.strip().split('\n')] eq_(predictions, predictions_after_saving) finally: sys.stdout = old_stdout sys.stderr = old_stderr print(err)
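# Assumed thin wrappers that run the console check once per threshold mode;
# the wrapper names are hypothetical.
def test_generate_predictions_console():
    check_generate_predictions_console(use_threshold=False)


def test_generate_predictions_console_threshold():
    check_generate_predictions_console(use_threshold=True)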
def check_ensemble_models(name, use_feature_hashing=False, use_rescaling=False): # create a FeatureSet object with the data we want to use if use_feature_hashing: train_fs, test_fs, _ = make_regression_data(num_examples=5000, num_features=10, use_feature_hashing=True, feature_bins=5) else: train_fs, test_fs, _ = make_regression_data(num_examples=2000, num_features=3) # create the learner if use_rescaling: name = 'Rescaled' + name learner = Learner(name) # train it with the training feature set we created # make sure to set the grid objective to pearson learner.train(train_fs, grid_search=True, grid_objective='pearson') # make sure that the feature importances are as expected. if name.endswith('AdaBoostRegressor'): if use_feature_hashing: expected_feature_importances = [0.749811, 0.001373, 0.23357, 0.011691, 0.003554] else: expected_feature_importances = [0.10266744, 0.18681777, 0.71051479] else: expected_feature_importances = ([0.735756, 0.001034, 0.242734, 0.015836, 0.00464] if use_feature_hashing else [0.082621, 0.166652, 0.750726]) feature_importances = learner.model.feature_importances_ assert_allclose(feature_importances, expected_feature_importances, atol=1e-2, rtol=0) # now generate the predictions on the test FeatureSet predictions = learner.predict(test_fs) # now make sure that the predictions are close to # the actual test FeatureSet labels that we generated # using make_regression_data. To do this, we just # make sure that they are correlated with pearson > 0.95 cor, _ = pearsonr(predictions, test_fs.labels) assert_greater(cor, 0.95)
def check_tree_models(name, use_feature_hashing=False, use_rescaling=False): # create a FeatureSet object with the data we want to use if use_feature_hashing: train_fs, test_fs, _ = make_regression_data(num_examples=5000, num_features=10, use_feature_hashing=True, feature_bins=5) else: train_fs, test_fs, _ = make_regression_data(num_examples=2000, num_features=3) # create the learner if use_rescaling: name = 'Rescaled' + name learner = Learner(name) # train it with the training feature set we created # make sure to set the grid objective to pearson learner.train(train_fs, grid_search=True, grid_objective='pearson') # make sure that the feature importances are as expected. if name.endswith('DecisionTreeRegressor'): expected_feature_importances = ([0.730811, 0.001834, 0.247603, 0.015241, 0.004511] if use_feature_hashing else [0.08926899, 0.15585068, 0.75488033]) else: expected_feature_importances = ([0.733654, 0.002528, 0.245527, 0.013664, 0.004627] if use_feature_hashing else [0.07974267, 0.16121895, 0.75903838]) feature_importances = learner.model.feature_importances_ assert_allclose(feature_importances, expected_feature_importances, atol=1e-2, rtol=0) # now generate the predictions on the test FeatureSet predictions = learner.predict(test_fs) # now make sure that the predictions are close to # the actual test FeatureSet labels that we generated # using make_regression_data. To do this, we just # make sure that they are correlated with pearson > 0.95 cor, _ = pearsonr(predictions, test_fs.labels) assert_greater(cor, 0.95)
def check_linear_models(name, use_feature_hashing=False, use_rescaling=False): # create a FeatureSet object with the data we want to use if use_feature_hashing: train_fs, test_fs, weightdict = make_regression_data( num_examples=5000, num_features=10, use_feature_hashing=True, feature_bins=5) else: train_fs, test_fs, weightdict = make_regression_data(num_examples=2000, num_features=3) # create the learner if use_rescaling: name = 'Rescaled' + name learner = Learner(name) # train it with the training feature set we created # make sure to set the grid objective to pearson learner.train(train_fs, grid_objective='pearson') # make sure that the weights are close to the weights # that we got from make_regression_data. Take the # ceiling before comparing since just comparing # the ceilings should be enough to make sure nothing # catastrophic happened. Note though that we cannot # test feature weights if we are using feature hashing # since model_params is not defined with a featurehasher. if not use_feature_hashing: # get the weights for this trained model learned_weights = learner.model_params[0] for feature_name in learned_weights: learned_w = math.ceil(learned_weights[feature_name]) given_w = math.ceil(weightdict[feature_name]) eq_(learned_w, given_w) # now generate the predictions on the test FeatureSet predictions = learner.predict(test_fs) # now make sure that the predictions are close to # the actual test FeatureSet labels that we generated # using make_regression_data. To do this, we just # make sure that they are correlated with pearson > 0.95 cor, _ = pearsonr(predictions, test_fs.labels) expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0] assert_greater(cor, expected_cor_range[0]) assert_less(cor, expected_cor_range[1])
def check_dummy_classifier_predict(model_args, train_labels, expected_output):

    # create hard-coded featuresets based on the given labels
    train_fs = FeatureSet('classification_train',
                          ['TrainExample{}'.format(i) for i in range(20)],
                          labels=train_labels,
                          features=[{"feature": i} for i in range(20)])
    test_fs = FeatureSet('classification_test',
                         ['TestExample{}'.format(i) for i in range(10)],
                         features=[{"feature": i} for i in range(20, 30)])

    # ensure that the predictions are as expected for the given strategy
    learner = Learner('DummyClassifier', model_kwargs=model_args)
    learner.train(train_fs, grid_search=False)
    predictions = learner.predict(test_fs)
    eq_(np.array_equal(expected_output, predictions), True)
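# Example invocation of the helper above, reusing the 'most_frequent'
# strategy and the hard-coded labels that appear elsewhere in this module;
# the wrapper name is hypothetical.
def test_dummy_classifier_most_frequent():
    train_labels = ([0] * 14) + ([1] * 6)
    # 0 is the most frequent training label, so all 10 test predictions
    # should be 0
    check_dummy_classifier_predict({"strategy": "most_frequent"},
                                   train_labels,
                                   np.array([0] * 10))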
def check_generate_predictions(use_feature_hashing=False, use_threshold=False, test_on_subset=False): # create some simple classification feature sets for training and testing train_fs, test_fs = make_classification_data( num_examples=1000, num_features=5, use_feature_hashing=use_feature_hashing, feature_bins=4) # create a learner that uses an SGD classifier learner = Learner('SGDClassifier', probability=use_threshold) # train the learner with grid search learner.train(train_fs, grid_search=True) # if we are asked to use only a subset, then filter out # one of the features if we are not using feature hashing, # do nothing if we are using feature hashing if test_on_subset and not use_feature_hashing: test_fs.filter(features=['f01', 'f02', 'f03', 'f04']) # get the predictions on the test featureset predictions = learner.predict(test_fs) # if we asked for probabilities, then use the threshold # to convert them into binary predictions if use_threshold: threshold = 0.6 predictions = [int(p[1] >= threshold) for p in predictions] else: predictions = predictions.tolist() threshold = None # save the learner to a file model_file = join(_my_dir, 'output', 'test_generate_predictions.model') learner.save(model_file) # now use Predictor to generate the predictions and make # sure that they are the same as before saving the model p = gp.Predictor(model_file, threshold=threshold) predictions_after_saving = p.predict(test_fs) eq_(predictions, predictions_after_saving)
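# Assumed wrappers covering the three flags of the helper above, including
# predicting on a test set that contains only a subset of the features;
# the wrapper name is hypothetical.
def test_generate_predictions_variants():
    yield check_generate_predictions, False, False, False
    yield check_generate_predictions, True, False, False
    yield check_generate_predictions, False, True, False
    yield check_generate_predictions, False, False, True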
def check_adaboost_regression(base_estimator):
    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                sd_noise=4,
                                                num_features=3)

    # train an AdaBoostRegressor on the training data and evaluate
    # on the testing data
    learner = Learner('AdaBoostRegressor',
                      model_kwargs={'base_estimator': base_estimator})
    learner.train(train_fs, grid_search=False)

    # now generate the predictions on the test set
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)
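# Illustrative driver for the AdaBoost check; the base estimators below are
# assumptions (scikit-learn estimator instances are assumed to be accepted
# by the `base_estimator` keyword), chosen only to show how the helper is
# meant to be called.
def test_adaboost_regression():
    from sklearn.linear_model import LinearRegression
    from sklearn.svm import SVR
    from sklearn.tree import DecisionTreeRegressor
    for base_estimator in [DecisionTreeRegressor(), LinearRegression(), SVR()]:
        yield check_adaboost_regression, base_estimator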
def check_ransac_regression(base_estimator, pearson_value):
    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                sd_noise=4,
                                                num_features=3)

    # train a RANSACRegressor on the training data and evaluate
    # on the testing data
    model_kwargs = {'base_estimator': base_estimator} if base_estimator else {}
    learner = Learner('RANSACRegressor', model_kwargs=model_kwargs)
    learner.train(train_fs, grid_search=False)

    # now generate the predictions on the test set
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated and the value
    # of the correlation is as expected
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, pearson_value)
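# Illustrative driver for the RANSAC check; the base estimators and the
# expected correlation floors are assumptions chosen only to show how the
# (base_estimator, pearson_value) pair is passed in.
def test_ransac_regression():
    from sklearn.linear_model import LinearRegression
    from sklearn.svm import SVR
    for base_estimator, pearson_value in [(None, 0.95),
                                          (LinearRegression(), 0.95),
                                          (SVR(), 0.45)]:
        yield check_ransac_regression, base_estimator, pearson_value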
def check_non_linear_models(name, use_feature_hashing=False, use_rescaling=False): # create a FeatureSet object with the data we want to use if use_feature_hashing: train_fs, test_fs, weightdict = make_regression_data( num_examples=5000, num_features=10, use_feature_hashing=True, feature_bins=5) else: train_fs, test_fs, weightdict = make_regression_data(num_examples=2000, num_features=3) # create the learner if use_rescaling: name = 'Rescaled' + name learner = Learner(name) # train it with the training feature set we created # make sure to set the grid objective to pearson learner.train(train_fs, grid_objective='pearson') # Note that we cannot check the feature weights here # since `model_params()` is not defined for non-linear # kernels. # now generate the predictions on the test FeatureSet predictions = learner.predict(test_fs) # now make sure that the predictions are close to # the actual test FeatureSet labels that we generated # using make_regression_data. To do this, we just # make sure that they are correlated with pearson > 0.95 cor, _ = pearsonr(predictions, test_fs.labels) expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0] assert_greater(cor, expected_cor_range[0]) assert_less(cor, expected_cor_range[1])
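# Hypothetical driver: an SVR with a non-linear kernel is the natural
# candidate here, since the helper notes that model_params is undefined
# for non-linear kernels.
def test_non_linear_models():
    for use_feature_hashing in [False, True]:
        yield check_non_linear_models, 'SVR', use_feature_hashing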
def test_mlp_classification():
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=3,
                                                 num_features=5)

    # train an MLPClassifier on the training data and evaluate
    # on the testing data
    learner = Learner('MLPClassifier')
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=ConvergenceWarning)
        learner.train(train_fs, grid_search=True)

    # now generate the predictions on the test set
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_classification_data; to do this, we just
    # check the classification accuracy
    accuracy = accuracy_score(predictions, test_fs.labels)
    assert_almost_equal(accuracy, 0.825)
def test_mlp_classification():
    train_fs, test_fs = make_classification_data(num_examples=600,
                                                 train_test_ratio=0.8,
                                                 num_labels=3,
                                                 num_features=5)

    # train an MLPClassifier on the training data and evaluate
    # on the testing data
    learner = Learner('MLPClassifier')
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=ConvergenceWarning)
        learner.train(train_fs, grid_search=False)

    # now generate the predictions on the test set
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_classification_data; to do this, we just
    # check the classification accuracy
    accuracy = accuracy_score(predictions, test_fs.labels)
    assert_almost_equal(accuracy, 0.858, places=3)
def test_dummy_classifier_predict(): # hard-code dataset train_fs = FeatureSet('classification_train', ['TrainExample{}'.format(i) for i in range(20)], labels=([0] * 14) + ([1] * 6), features=[{ "feature": i } for i in range(20)]) test_fs = FeatureSet('classification_test', ['TestExample{}'.format(i) for i in range(10)], features=[{ "feature": i } for i in range(20, 30)]) toy_data = ([{ "strategy": "stratified", "random_state": 12345 }, np.array([1, 0, 0, 0, 0, 0, 1, 0, 1, 0])], [{ "strategy": "most_frequent" }, np.array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])], [{ "strategy": "constant", "constant": 1 }, np.array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])]) # Ensure predictions are correct for all strategies. correct = [] for model_args, expected_output in toy_data: learner = Learner('DummyClassifier', model_kwargs=model_args) learner.train(train_fs) predictions = learner.predict(test_fs) correct.append(np.array_equal(expected_output, predictions)) eq_(correct, [True, True, True])
def check_mlp_regression(use_rescaling=False):
    train_fs, test_fs, _ = make_regression_data(num_examples=500,
                                                sd_noise=4,
                                                num_features=5)

    # train an MLPRegressor (or its rescaled counterpart) on the training
    # data and evaluate on the testing data
    name = 'RescaledMLPRegressor' if use_rescaling else 'MLPRegressor'
    learner = Learner(name)

    # we don't want to see any convergence warnings during training
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=ConvergenceWarning)
        learner.train(train_fs, grid_search=False)

    # now generate the predictions on the test set
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.98)
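# Assumed wrappers for the two rescaling modes of the MLP regression check;
# the wrapper names are hypothetical.
def test_mlp_regression():
    check_mlp_regression(use_rescaling=False)


def test_rescaled_mlp_regression():
    check_mlp_regression(use_rescaling=True)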
def check_predict(model, use_feature_hashing=False): """ This tests whether predict task runs and generates the same number of predictions as samples in the test set. The specified model indicates whether to generate random regression or classification data. """ # create the random data for the given model if model._estimator_type == 'regressor': train_fs, test_fs, _ = \ make_regression_data(use_feature_hashing=use_feature_hashing, feature_bins=5) # feature hashing will not work for Naive Bayes since it requires # non-negative feature values elif model.__name__ == 'MultinomialNB': train_fs, test_fs = \ make_classification_data(use_feature_hashing=False, non_negative=True) else: train_fs, test_fs = \ make_classification_data(use_feature_hashing=use_feature_hashing, feature_bins=25) # create the learner with the specified model learner = Learner(model.__name__) # now train the learner on the training data and use feature hashing when # specified and when we are not using a Naive Bayes model learner.train(train_fs, grid_search=False) # now make predictions on the test set predictions = learner.predict(test_fs) # make sure we have the same number of outputs as the # number of test set samples eq_(len(predictions), test_fs.features.shape[0])
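# Sketch of a driver for check_predict: the `model` argument is a
# scikit-learn estimator *class* (the helper reads `_estimator_type` and
# `__name__` from it); the particular classes below are illustrative.
def test_predict():
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import LinearRegression
    from sklearn.naive_bayes import MultinomialNB
    for model in [RandomForestClassifier, LinearRegression, MultinomialNB]:
        for use_feature_hashing in [False, True]:
            yield check_predict, model, use_feature_hashing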
def check_generate_predictions(use_feature_hashing=False, use_threshold=False):

    # create some simple classification data, optionally using feature hashing
    train_fs, test_fs = make_classification_data(
        num_examples=1000, num_features=5,
        use_feature_hashing=use_feature_hashing, feature_bins=4)

    # create a learner that uses an SGD classifier
    learner = Learner('SGDClassifier', probability=use_threshold)

    # train the learner with grid search
    learner.train(train_fs, grid_search=True)

    # get the predictions on the test featureset
    predictions = learner.predict(test_fs)

    # if we asked for probabilities, then use the threshold
    # to convert them into binary predictions
    if use_threshold:
        threshold = 0.6
        predictions = [int(p[1] >= threshold) for p in predictions]
    else:
        predictions = predictions.tolist()
        threshold = None

    # save the learner to a file
    model_file = join(_my_dir, 'output', 'test_generate_predictions.model')
    learner.save(model_file)

    # now use Predictor to generate the predictions and make
    # sure that they are the same as before saving the model
    p = gp.Predictor(model_file, threshold=threshold)
    predictions_after_saving = p.predict(test_fs)
    eq_(predictions, predictions_after_saving)
def _classify_featureset(args): """ Classification job to be submitted to grid """ # Extract all the arguments. # (There doesn't seem to be a better way to do this since one can't specify # required keyword arguments.) experiment_name = args.pop("experiment_name") task = args.pop("task") sampler = args.pop("sampler") feature_hasher = args.pop("feature_hasher") hasher_features = args.pop("hasher_features") job_name = args.pop("job_name") featureset = args.pop("featureset") featureset_name = args.pop("featureset_name") learner_name = args.pop("learner_name") train_path = args.pop("train_path") test_path = args.pop("test_path") train_set_name = args.pop("train_set_name") test_set_name = args.pop("test_set_name") shuffle = args.pop('shuffle') model_path = args.pop("model_path") prediction_prefix = args.pop("prediction_prefix") grid_search = args.pop("grid_search") grid_objective = args.pop("grid_objective") suffix = args.pop("suffix") log_path = args.pop("log_path") probability = args.pop("probability") results_path = args.pop("results_path") fixed_parameters = args.pop("fixed_parameters") sampler_parameters = args.pop("sampler_parameters") param_grid = args.pop("param_grid") pos_label_str = args.pop("pos_label_str") overwrite = args.pop("overwrite") feature_scaling = args.pop("feature_scaling") min_feature_count = args.pop("min_feature_count") grid_search_jobs = args.pop("grid_search_jobs") grid_search_folds = args.pop("grid_search_folds") cv_folds = args.pop("cv_folds") save_cv_folds = args.pop("save_cv_folds") stratified_folds = args.pop("do_stratified_folds") label_col = args.pop("label_col") id_col = args.pop("id_col") ids_to_floats = args.pop("ids_to_floats") class_map = args.pop("class_map") custom_learner_path = args.pop("custom_learner_path") quiet = args.pop('quiet', False) if args: raise ValueError(("Extra arguments passed to _classify_featureset: " "{}").format(args.keys())) start_timestamp = datetime.datetime.now() with open(log_path, 'w') as log_file: # logging print("Task:", task, file=log_file) if task == 'cross_validate': print(("Cross-validating ({} folds) on {}, feature " + "set {} ...").format(cv_folds, train_set_name, featureset), file=log_file) elif task == 'evaluate': print(("Training on {}, Test on {}, " + "feature set {} ...").format(train_set_name, test_set_name, featureset), file=log_file) elif task == 'train': print("Training on {}, feature set {} ...".format(train_set_name, featureset), file=log_file) else: # predict print(("Training on {}, Making predictions about {}, " + "feature set {} ...").format(train_set_name, test_set_name, featureset), file=log_file) # check whether a trained model on the same data with the same # featureset already exists if so, load it and then use it on test data modelfile = join(model_path, '{}.model'.format(job_name)) if task == 'cross_validate' or (not exists(modelfile) or overwrite): train_examples = _load_featureset(train_path, featureset, suffix, label_col=label_col, id_col=id_col, ids_to_floats=ids_to_floats, quiet=quiet, class_map=class_map, feature_hasher=feature_hasher, num_features=hasher_features) train_set_size = len(train_examples.ids) if not train_examples.has_labels: raise ValueError('Training examples do not have labels') # initialize a classifer object learner = Learner(learner_name, probability=probability, feature_scaling=feature_scaling, model_kwargs=fixed_parameters, pos_label_str=pos_label_str, min_feature_count=min_feature_count, sampler=sampler, sampler_kwargs=sampler_parameters, 
custom_learner_path=custom_learner_path) # load the model if it already exists else: # import the custom learner path here in case we are reusing a # saved model if custom_learner_path: _import_custom_learner(custom_learner_path, learner_name) train_set_size = 'unknown' if exists(modelfile) and not overwrite: print(('\tloading pre-existing %s model: %s') % (learner_name, modelfile)) learner = Learner.from_file(modelfile) # Load test set if there is one if task == 'evaluate' or task == 'predict': test_examples = _load_featureset(test_path, featureset, suffix, label_col=label_col, id_col=id_col, ids_to_floats=ids_to_floats, quiet=quiet, class_map=class_map, feature_hasher=feature_hasher, num_features=hasher_features) test_set_size = len(test_examples.ids) else: test_set_size = 'n/a' # create a list of dictionaries of the results information learner_result_dict_base = {'experiment_name': experiment_name, 'train_set_name': train_set_name, 'train_set_size': train_set_size, 'test_set_name': test_set_name, 'test_set_size': test_set_size, 'featureset': json.dumps(featureset), 'featureset_name': featureset_name, 'shuffle': shuffle, 'learner_name': learner_name, 'task': task, 'start_timestamp': start_timestamp.strftime('%d %b %Y %H:%M:' '%S.%f'), 'version': __version__, 'feature_scaling': feature_scaling, 'grid_search': grid_search, 'grid_objective': grid_objective, 'grid_search_folds': grid_search_folds, 'min_feature_count': min_feature_count, 'cv_folds': cv_folds, 'save_cv_folds': save_cv_folds, 'stratified_folds': stratified_folds, 'scikit_learn_version': SCIKIT_VERSION} # check if we're doing cross-validation, because we only load/save # models when we're not. task_results = None if task == 'cross_validate': print('\tcross-validating', file=log_file) task_results, grid_scores, skll_fold_ids = learner.cross_validate( train_examples, shuffle=shuffle, stratified=stratified_folds, prediction_prefix=prediction_prefix, grid_search=grid_search, grid_search_folds=grid_search_folds, cv_folds=cv_folds, grid_objective=grid_objective, param_grid=param_grid, grid_jobs=grid_search_jobs, save_cv_folds=save_cv_folds) else: # if we have do not have a saved model, we need to train one. if not exists(modelfile) or overwrite: print(('\tfeaturizing and training new ' + '{} model').format(learner_name), file=log_file) if not isinstance(cv_folds, int): grid_search_folds = cv_folds best_score = learner.train(train_examples, shuffle=shuffle, grid_search=grid_search, grid_search_folds=grid_search_folds, grid_objective=grid_objective, param_grid=param_grid, grid_jobs=grid_search_jobs) grid_scores = [best_score] # save model if model_path: learner.save(modelfile) if grid_search: # note: bankers' rounding is used in python 3, # so these scores may be different between runs in # python 2 and 3 at the final decimal place. 
print('\tbest {} grid search score: {}' .format(grid_objective, round(best_score, 3)), file=log_file) else: grid_scores = [None] # print out the tuned parameters and best CV score param_out = ('{}: {}'.format(param_name, param_value) for param_name, param_value in iteritems(learner.model.get_params())) print('\thyperparameters: {}'.format(', '.join(param_out)), file=log_file) # run on test set or cross-validate on training data, # depending on what was asked for if task == 'evaluate': print('\tevaluating predictions', file=log_file) task_results = [learner.evaluate( test_examples, prediction_prefix=prediction_prefix, grid_objective=grid_objective)] elif task == 'predict': print('\twriting predictions', file=log_file) learner.predict(test_examples, prediction_prefix=prediction_prefix) # do nothing here for train end_timestamp = datetime.datetime.now() learner_result_dict_base['end_timestamp'] = end_timestamp.strftime( '%d %b %Y %H:%M:%S.%f') total_time = end_timestamp - start_timestamp learner_result_dict_base['total_time'] = str(total_time) if task == 'cross_validate' or task == 'evaluate': results_json_path = join(results_path, '{}.results.json'.format(job_name)) res = _create_learner_result_dicts(task_results, grid_scores, learner_result_dict_base) # write out the result dictionary to a json file file_mode = 'w' if sys.version_info >= (3, 0) else 'wb' with open(results_json_path, file_mode) as json_file: json.dump(res, json_file, cls=NumpyTypeEncoder) with open(join(results_path, '{}.results'.format(job_name)), 'w') as output_file: _print_fancy_output(res, output_file) else: res = [learner_result_dict_base] # write out the cv folds if required if task == 'cross_validate' and save_cv_folds: skll_fold_ids_file = experiment_name + '_skll_fold_ids.csv' file_mode = 'w' if sys.version_info >= (3, 0) else 'wb' with open(join(results_path, skll_fold_ids_file), file_mode) as output_file: _write_skll_folds(skll_fold_ids, output_file) return res
def _classify_featureset(args): ''' Classification job to be submitted to grid ''' # Extract all the arguments. # (There doesn't seem to be a better way to do this since one can't specify # required keyword arguments.) experiment_name = args.pop("experiment_name") task = args.pop("task") job_name = args.pop("job_name") featureset = args.pop("featureset") learner_name = args.pop("learner_name") train_path = args.pop("train_path") test_path = args.pop("test_path") train_set_name = args.pop("train_set_name") test_set_name = args.pop("test_set_name") model_path = args.pop("model_path") prediction_prefix = args.pop("prediction_prefix") grid_search = args.pop("grid_search") grid_objective = args.pop("grid_objective") suffix = args.pop("suffix") log_path = args.pop("log_path") probability = args.pop("probability") results_path = args.pop("results_path") fixed_parameters = args.pop("fixed_parameters") param_grid = args.pop("param_grid") pos_label_str = args.pop("pos_label_str") overwrite = args.pop("overwrite") feature_scaling = args.pop("feature_scaling") min_feature_count = args.pop("min_feature_count") grid_search_jobs = args.pop("grid_search_jobs") cv_folds = args.pop("cv_folds") label_col = args.pop("label_col") ids_to_floats = args.pop("ids_to_floats") class_map = args.pop("class_map") quiet = args.pop('quiet', False) if args: raise ValueError(("Extra arguments passed to _classify_featureset: " + "{}").format(args.keys())) timestamp = datetime.datetime.now().strftime('%d %b %Y %H:%M:%S') with open(log_path, 'w') as log_file: # logging print("Task:", task, file=log_file) if task == 'cross_validate': print(("Cross-validating on {}, feature " + "set {} ...").format(train_set_name, featureset), file=log_file) elif task == 'evaluate': print(("Training on {}, Test on {}, " + "feature set {} ...").format(train_set_name, test_set_name, featureset), file=log_file) elif task == 'train': print("Training on {}, feature set {} ...".format(train_set_name, featureset), file=log_file) else: # predict print(("Training on {}, Making predictions about {}, " + "feature set {} ...").format(train_set_name, test_set_name, featureset), file=log_file) # check whether a trained model on the same data with the same # featureset already exists if so, load it and then use it on test data modelfile = os.path.join(model_path, '{}.model'.format(job_name)) # load the training and test examples if task == 'cross_validate' or (not os.path.exists(modelfile) or overwrite): train_examples = _load_featureset(train_path, featureset, suffix, label_col=label_col, ids_to_floats=ids_to_floats, quiet=quiet, class_map=class_map) # initialize a classifer object learner = Learner(learner_name, probability=probability, feature_scaling=feature_scaling, model_kwargs=fixed_parameters, pos_label_str=pos_label_str, min_feature_count=min_feature_count) # load the model if it already exists else: if os.path.exists(modelfile) and not overwrite: print(('\tloading pre-existing {} ' + 'model: {}').format(learner_name, modelfile)) learner = Learner.from_file(modelfile) # Load test set if there is one if task == 'evaluate' or task == 'predict': test_examples = _load_featureset(test_path, featureset, suffix, label_col=label_col, ids_to_floats=ids_to_floats, quiet=quiet, class_map=class_map, unlabelled=True) # create a list of dictionaries of the results information learner_result_dict_base = {'experiment_name': experiment_name, 'train_set_name': train_set_name, 'test_set_name': test_set_name, 'featureset': json.dumps(featureset), 'learner_name': 
learner_name, 'task': task, 'timestamp': timestamp, 'version': __version__, 'feature_scaling': feature_scaling, 'grid_search': grid_search, 'grid_objective': grid_objective, 'min_feature_count': min_feature_count} # check if we're doing cross-validation, because we only load/save # models when we're not. task_results = None if task == 'cross_validate': print('\tcross-validating', file=log_file) task_results, grid_scores = learner.cross_validate(train_examples, prediction_prefix=prediction_prefix, grid_search=grid_search, cv_folds=cv_folds, grid_objective=grid_objective, param_grid=param_grid, grid_jobs=grid_search_jobs) else: # if we have do not have a saved model, we need to train one. if not os.path.exists(modelfile) or overwrite: print(('\tfeaturizing and training new ' + '{} model').format(learner_name), file=log_file) grid_search_folds = 5 if not isinstance(cv_folds, int): grid_search_folds = cv_folds best_score = learner.train(train_examples, grid_search=grid_search, grid_search_folds=grid_search_folds, grid_objective=grid_objective, param_grid=param_grid, grid_jobs=grid_search_jobs) grid_scores = [best_score] # save model if model_path: learner.save(modelfile) if grid_search: print('\tbest {} grid search score: {}' .format(grid_objective, round(best_score, 3)), file=log_file) else: grid_scores = [None] # print out the tuned parameters and best CV score param_out = ('{}: {}'.format(param_name, param_value) for param_name, param_value in iteritems(learner.model.get_params())) print('\thyperparameters: {}'.format(', '.join(param_out)), file=log_file) # run on test set or cross-validate on training data, # depending on what was asked for if task == 'evaluate': print('\tevaluating predictions', file=log_file) task_results = [learner.evaluate( test_examples, prediction_prefix=prediction_prefix, grid_objective=grid_objective)] elif task == 'predict': print('\twriting predictions', file=log_file) learner.predict(test_examples, prediction_prefix=prediction_prefix) # do nothing here for train if task == 'cross_validate' or task == 'evaluate': results_json_path = os.path.join(results_path, '{}.results.json'.format(job_name)) res = _create_learner_result_dicts(task_results, grid_scores, learner_result_dict_base) # write out the result dictionary to a json file file_mode = 'w' if sys.version_info >= (3, 0) else 'wb' with open(results_json_path, file_mode) as json_file: json.dump(res, json_file) with open(os.path.join(results_path, '{}.results'.format(job_name)), 'w') as output_file: _print_fancy_output(res, output_file) else: res = [learner_result_dict_base] return res
def _classify_featureset(args): """ Classification job to be submitted to grid. Parameters ---------- args : dict A dictionary with arguments for classifying the ``FeatureSet`` instance. Returns ------- res : list of dicts The results of the classification, in the format of a list of dictionaries. Raises ------ ValueError If extra unknown arguments are passed to the function. """ # Extract all the arguments. # (There doesn't seem to be a better way to do this since one can't specify # required keyword arguments.) experiment_name = args.pop("experiment_name") task = args.pop("task") sampler = args.pop("sampler") feature_hasher = args.pop("feature_hasher") hasher_features = args.pop("hasher_features") job_name = args.pop("job_name") featureset = args.pop("featureset") featureset_name = args.pop("featureset_name") learner_name = args.pop("learner_name") train_path = args.pop("train_path") test_path = args.pop("test_path") train_set_name = args.pop("train_set_name") test_set_name = args.pop("test_set_name") shuffle = args.pop('shuffle') model_path = args.pop("model_path") prediction_prefix = args.pop("prediction_prefix") grid_search = args.pop("grid_search") grid_objective = args.pop("grid_objective") output_metrics = args.pop("output_metrics") suffix = args.pop("suffix") job_log_file = args.pop("log_file") job_log_level = args.pop("log_level") probability = args.pop("probability") pipeline = args.pop("pipeline") results_path = args.pop("results_path") fixed_parameters = args.pop("fixed_parameters") sampler_parameters = args.pop("sampler_parameters") param_grid = args.pop("param_grid") pos_label_str = args.pop("pos_label_str") overwrite = args.pop("overwrite") feature_scaling = args.pop("feature_scaling") min_feature_count = args.pop("min_feature_count") folds_file = args.pop("folds_file") grid_search_jobs = args.pop("grid_search_jobs") grid_search_folds = args.pop("grid_search_folds") cv_folds = args.pop("cv_folds") save_cv_folds = args.pop("save_cv_folds") save_cv_models = args.pop("save_cv_models") use_folds_file_for_grid_search = args.pop("use_folds_file_for_grid_search") stratified_folds = args.pop("do_stratified_folds") label_col = args.pop("label_col") id_col = args.pop("id_col") ids_to_floats = args.pop("ids_to_floats") class_map = args.pop("class_map") custom_learner_path = args.pop("custom_learner_path") custom_metric_path = args.pop("custom_metric_path") quiet = args.pop('quiet', False) learning_curve_cv_folds = args.pop("learning_curve_cv_folds") learning_curve_train_sizes = args.pop("learning_curve_train_sizes") if args: raise ValueError(("Extra arguments passed to _classify_featureset: " "{}").format(args.keys())) start_timestamp = datetime.datetime.now() # create a new SKLL logger for this specific job and # use the given log level logger = get_skll_logger(job_name, job_log_file, log_level=job_log_level) try: # log messages logger.info("Task: {}".format(task)) # check if we have any possible custom metrics possible_custom_metric_names = [] for metric_name in output_metrics + [grid_objective]: # metrics that are not in `SCORERS` or `None` are candidates # (the `None` is a by-product of how jobs with single tuning # objectives are created) if metric_name not in SCORERS and metric_name is not None: possible_custom_metric_names.append(metric_name) # if the metric is already in `SCORERS`, is it a custom one # that we already registered? 
if so, log that elif metric_name in _CUSTOM_METRICS: logger.info( f"custom metric '{metric_name}' is already registered") # initialize list that will hold any invalid metrics # that we could not register as custom metrics invalid_metric_names = [] # if we have possible custom metrics if possible_custom_metric_names: # check that we have a file to load them from if not custom_metric_path: raise ValueError( f"invalid metrics specified: {possible_custom_metric_names}" ) else: # try to register each possible custom metric # raise an exception if we fail, if we don't then # add the custom metric function to `globals()` so # that it serializes properly for gridmap for custom_metric_name in possible_custom_metric_names: try: custom_metric_func = register_custom_metric( custom_metric_path, custom_metric_name) except (AttributeError, NameError, ValueError): invalid_metric_names.append(custom_metric_name) else: logger.info(f"registered '{custom_metric_name}' as a " f"custom metric") globals()[custom_metric_name] = custom_metric_func # raise an error if we have any invalid metrics if invalid_metric_names: raise ValueError( f"invalid metrics specified: {invalid_metric_names}. " f"If these are custom metrics, check the function " f"names.") if task == 'cross_validate': if isinstance(cv_folds, int): num_folds = cv_folds else: # folds_file was used, so count the unique fold ids. num_folds = len(set(cv_folds.values())) logger.info("Cross-validating ({} folds) on {}, feature " "set {} ...".format(num_folds, train_set_name, featureset)) elif task == 'evaluate': logger.info("Training on {}, Test on {}, " "feature set {} ...".format(train_set_name, test_set_name, featureset)) elif task == 'train': logger.info("Training on {}, feature set {} ...".format( train_set_name, featureset)) elif task == 'learning_curve': logger.info("Generating learning curve " "({} 80/20 folds, sizes={}, objective={}) on {}, " "feature set {} ...".format( learning_curve_cv_folds, learning_curve_train_sizes, grid_objective, train_set_name, featureset)) else: # predict logger.info("Training on {}, Making predictions on {}, " "feature set {} ...".format(train_set_name, test_set_name, featureset)) # check whether a trained model on the same data with the same # featureset already exists if so, load it and then use it on test data modelfile = join(model_path, '{}.model'.format(job_name)) if (task in ['cross_validate', 'learning_curve'] or not exists(modelfile) or overwrite): train_examples = load_featureset(train_path, featureset, suffix, label_col=label_col, id_col=id_col, ids_to_floats=ids_to_floats, quiet=quiet, class_map=class_map, feature_hasher=feature_hasher, num_features=hasher_features, logger=logger) train_set_size = len(train_examples.ids) if not train_examples.has_labels: raise ValueError('Training examples do not have labels') # initialize a classifer object learner = Learner(learner_name, probability=probability, pipeline=pipeline, feature_scaling=feature_scaling, model_kwargs=fixed_parameters, pos_label_str=pos_label_str, min_feature_count=min_feature_count, sampler=sampler, sampler_kwargs=sampler_parameters, custom_learner_path=custom_learner_path, logger=logger) # load the model if it already exists else: # import custom learner into global namespace if we are reusing # a saved model if custom_learner_path: globals()[learner_name] = load_custom_learner( custom_learner_path, learner_name) train_set_size = 'unknown' if exists(modelfile) and not overwrite: logger.info("Loading pre-existing {} model: {}".format( learner_name, 
modelfile)) learner = Learner.from_file(modelfile) # attach the job logger to this learner learner.logger = logger # Load test set if there is one if task == 'evaluate' or task == 'predict': test_examples = load_featureset(test_path, featureset, suffix, label_col=label_col, id_col=id_col, ids_to_floats=ids_to_floats, quiet=quiet, class_map=class_map, feature_hasher=feature_hasher, num_features=hasher_features) test_set_size = len(test_examples.ids) else: test_set_size = 'n/a' # compute information about xval and grid folds that can be put in results # in readable form if isinstance(cv_folds, dict): cv_folds_to_print = '{} via folds file'.format( len(set(cv_folds.values()))) else: cv_folds_to_print = str(cv_folds) if isinstance(grid_search_folds, dict): grid_search_folds_to_print = \ '{} via folds file'.format(len(set(grid_search_folds.values()))) else: grid_search_folds_to_print = str(grid_search_folds) # create a list of dictionaries of the results information learner_result_dict_base = { 'experiment_name': experiment_name, 'train_set_name': train_set_name, 'train_set_size': train_set_size, 'test_set_name': test_set_name, 'test_set_size': test_set_size, 'featureset': json.dumps(featureset), 'featureset_name': featureset_name, 'shuffle': shuffle, 'learner_name': learner_name, 'task': task, 'start_timestamp': start_timestamp.strftime('%d %b %Y %H:%M:' '%S.%f'), 'version': __version__, 'feature_scaling': feature_scaling, 'folds_file': folds_file, 'grid_search': grid_search, 'grid_objective': grid_objective, 'grid_search_folds': grid_search_folds_to_print, 'min_feature_count': min_feature_count, 'cv_folds': cv_folds_to_print, 'using_folds_file': isinstance(cv_folds, dict) or isinstance(grid_search_folds, dict), 'save_cv_folds': save_cv_folds, 'save_cv_models': save_cv_models, 'use_folds_file_for_grid_search': use_folds_file_for_grid_search, 'stratified_folds': stratified_folds, 'scikit_learn_version': SCIKIT_VERSION } # check if we're doing cross-validation, because we only load/save # models when we're not. task_results = None if task == 'cross_validate': logger.info('Cross-validating') ( task_results, grid_scores, grid_search_cv_results_dicts, skll_fold_ids, models ) = learner.cross_validate( train_examples, shuffle=shuffle, stratified=stratified_folds, prediction_prefix=prediction_prefix, grid_search=grid_search, grid_search_folds=grid_search_folds, cv_folds=cv_folds, grid_objective=grid_objective, output_metrics=output_metrics, param_grid=param_grid, grid_jobs=grid_search_jobs, save_cv_folds=save_cv_folds, save_cv_models=save_cv_models, use_custom_folds_for_grid_search=use_folds_file_for_grid_search ) if models: for index, m in enumerate(models, start=1): modelfile = join(model_path, '{}_fold{}.model'.format(job_name, index)) m.save(modelfile) elif task == 'learning_curve': logger.info("Generating learning curve(s)") (curve_train_scores, curve_test_scores, computed_curve_train_sizes) = learner.learning_curve( train_examples, grid_objective, cv_folds=learning_curve_cv_folds, train_sizes=learning_curve_train_sizes) else: # if we have do not have a saved model, we need to train one. 
if not exists(modelfile) or overwrite: logger.info("Featurizing and training new {} model".format( learner_name)) (best_score, grid_search_cv_results) = learner.train( train_examples, shuffle=shuffle, grid_search=grid_search, grid_search_folds=grid_search_folds, grid_objective=grid_objective, param_grid=param_grid, grid_jobs=grid_search_jobs) grid_scores = [best_score] grid_search_cv_results_dicts = [grid_search_cv_results] # save model if model_path: learner.save(modelfile) if grid_search: logger.info("Best {} grid search score: {}".format( grid_objective, round(best_score, 3))) else: grid_scores = [None] grid_search_cv_results_dicts = [None] # print out the parameters param_out = ('{}: {}'.format(param_name, param_value) for param_name, param_value in learner.model.get_params().items()) logger.info("Hyperparameters: {}".format(', '.join(param_out))) # run on test set or cross-validate on training data, # depending on what was asked for if task == 'evaluate': logger.info("Evaluating predictions") task_results = [ learner.evaluate(test_examples, prediction_prefix=prediction_prefix, grid_objective=grid_objective, output_metrics=output_metrics) ] elif task == 'predict': logger.info("Writing predictions") # we set `class_labels` to `False` so that if the learner is # probabilistic, probabilities are written instead of labels learner.predict(test_examples, prediction_prefix=prediction_prefix, class_labels=False) # do nothing here for train end_timestamp = datetime.datetime.now() learner_result_dict_base['end_timestamp'] = end_timestamp.strftime( '%d %b %Y %H:%M:%S.%f') total_time = end_timestamp - start_timestamp learner_result_dict_base['total_time'] = str(total_time) if task == 'cross_validate' or task == 'evaluate': results_json_path = join(results_path, '{}.results.json'.format(job_name)) res = _create_learner_result_dicts(task_results, grid_scores, grid_search_cv_results_dicts, learner_result_dict_base) # write out the result dictionary to a json file with open(results_json_path, 'w') as json_file: json.dump(res, json_file, cls=NumpyTypeEncoder) with open(join(results_path, '{}.results'.format(job_name)), 'w') as output_file: _print_fancy_output(res, output_file) elif task == 'learning_curve': results_json_path = join(results_path, '{}.results.json'.format(job_name)) res = {} res.update(learner_result_dict_base) res.update({ 'learning_curve_cv_folds': learning_curve_cv_folds, 'given_curve_train_sizes': learning_curve_train_sizes, 'learning_curve_train_scores_means': np.mean(curve_train_scores, axis=1), 'learning_curve_test_scores_means': np.mean(curve_test_scores, axis=1), 'learning_curve_train_scores_stds': np.std(curve_train_scores, axis=1, ddof=1), 'learning_curve_test_scores_stds': np.std(curve_test_scores, axis=1, ddof=1), 'computed_curve_train_sizes': computed_curve_train_sizes }) # we need to return and write out a list of dictionaries res = [res] # write out the result dictionary to a json file with open(results_json_path, 'w') as json_file: json.dump(res, json_file, cls=NumpyTypeEncoder) # For all other tasks, i.e. 
train or predict else: if results_path: results_json_path = join(results_path, '{}.results.json'.format(job_name)) assert len(grid_scores) == 1 assert len(grid_search_cv_results_dicts) == 1 grid_search_cv_results_dict = {"grid_score": grid_scores[0]} grid_search_cv_results_dict["grid_search_cv_results"] = \ grid_search_cv_results_dicts[0] grid_search_cv_results_dict.update(learner_result_dict_base) # write out the result dictionary to a json file with open(results_json_path, 'w') as json_file: json.dump(grid_search_cv_results_dict, json_file, cls=NumpyTypeEncoder) res = [learner_result_dict_base] # write out the cv folds if required if task == 'cross_validate' and save_cv_folds: skll_fold_ids_file = experiment_name + '_skll_fold_ids.csv' with open(join(results_path, skll_fold_ids_file), 'w') as output_file: _write_skll_folds(skll_fold_ids, output_file) finally: close_and_remove_logger_handlers(logger) return res
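# The json.dump() calls above pass cls=NumpyTypeEncoder because several of the
# result values (for example the learning-curve score means and standard deviations
# computed with np.mean/np.std) are NumPy scalars or arrays, which the stock
# json.JSONEncoder cannot serialize. The real encoder is imported from elsewhere in
# SKLL; the class below is only an illustrative sketch of what such an encoder
# typically does, not the project's actual implementation.

import json

import numpy as np


class _NumpyTypeEncoderSketch(json.JSONEncoder):
    """Illustrative stand-in for an encoder like ``NumpyTypeEncoder``."""

    def default(self, obj):
        # Convert NumPy scalars and arrays into plain Python types that the
        # base encoder already knows how to serialize.
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)


# Example usage (illustration only):
# json.dumps({'scores': np.array([0.91, 0.93])}, cls=_NumpyTypeEncoderSketch)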
def _classify_featureset(args): """ Classification job to be submitted to grid """ # Extract all the arguments. # (There doesn't seem to be a better way to do this since one can't specify # required keyword arguments.) experiment_name = args.pop("experiment_name") task = args.pop("task") sampler = args.pop("sampler") feature_hasher = args.pop("feature_hasher") hasher_features = args.pop("hasher_features") job_name = args.pop("job_name") featureset = args.pop("featureset") featureset_name = args.pop("featureset_name") learner_name = args.pop("learner_name") train_path = args.pop("train_path") test_path = args.pop("test_path") train_set_name = args.pop("train_set_name") test_set_name = args.pop("test_set_name") shuffle = args.pop('shuffle') model_path = args.pop("model_path") prediction_prefix = args.pop("prediction_prefix") grid_search = args.pop("grid_search") grid_objective = args.pop("grid_objective") suffix = args.pop("suffix") log_path = args.pop("log_path") probability = args.pop("probability") results_path = args.pop("results_path") fixed_parameters = args.pop("fixed_parameters") sampler_parameters = args.pop("sampler_parameters") param_grid = args.pop("param_grid") pos_label_str = args.pop("pos_label_str") overwrite = args.pop("overwrite") feature_scaling = args.pop("feature_scaling") min_feature_count = args.pop("min_feature_count") grid_search_jobs = args.pop("grid_search_jobs") grid_search_folds = args.pop("grid_search_folds") cv_folds = args.pop("cv_folds") stratified_folds = args.pop("do_stratified_folds") label_col = args.pop("label_col") id_col = args.pop("id_col") ids_to_floats = args.pop("ids_to_floats") class_map = args.pop("class_map") custom_learner_path = args.pop("custom_learner_path") quiet = args.pop('quiet', False) if args: raise ValueError(("Extra arguments passed to _classify_featureset: " "{}").format(args.keys())) start_timestamp = datetime.datetime.now() with open(log_path, 'w') as log_file: # logging print("Task:", task, file=log_file) if task == 'cross_validate': print(("Cross-validating ({} folds) on {}, feature " + "set {} ...").format(cv_folds, train_set_name, featureset), file=log_file) elif task == 'evaluate': print(("Training on {}, Test on {}, " + "feature set {} ...").format(train_set_name, test_set_name, featureset), file=log_file) elif task == 'train': print("Training on {}, feature set {} ...".format(train_set_name, featureset), file=log_file) else: # predict print(("Training on {}, Making predictions about {}, " + "feature set {} ...").format(train_set_name, test_set_name, featureset), file=log_file) # check whether a trained model on the same data with the same # featureset already exists if so, load it and then use it on test data modelfile = join(model_path, '{}.model'.format(job_name)) if task == 'cross_validate' or (not exists(modelfile) or overwrite): train_examples = _load_featureset(train_path, featureset, suffix, label_col=label_col, id_col=id_col, ids_to_floats=ids_to_floats, quiet=quiet, class_map=class_map, feature_hasher=feature_hasher, num_features=hasher_features) train_set_size = len(train_examples.ids) if not train_examples.has_labels: raise ValueError('Training examples do not have labels') # initialize a classifer object learner = Learner(learner_name, probability=probability, feature_scaling=feature_scaling, model_kwargs=fixed_parameters, pos_label_str=pos_label_str, min_feature_count=min_feature_count, sampler=sampler, sampler_kwargs=sampler_parameters, custom_learner_path=custom_learner_path) # load the model if it already 
exists else: # import the custom learner path here in case we are reusing a # saved model if custom_learner_path: _import_custom_learner(custom_learner_path, learner_name) train_set_size = 'unknown' if exists(modelfile) and not overwrite: print(('\tloading pre-existing %s model: %s') % (learner_name, modelfile), file=log_file) learner = Learner.from_file(modelfile) # Load test set if there is one if task == 'evaluate' or task == 'predict': test_examples = _load_featureset(test_path, featureset, suffix, label_col=label_col, id_col=id_col, ids_to_floats=ids_to_floats, quiet=quiet, class_map=class_map, feature_hasher=feature_hasher, num_features=hasher_features) test_set_size = len(test_examples.ids) else: test_set_size = 'n/a' # create a list of dictionaries of the results information learner_result_dict_base = {'experiment_name': experiment_name, 'train_set_name': train_set_name, 'train_set_size': train_set_size, 'test_set_name': test_set_name, 'test_set_size': test_set_size, 'featureset': json.dumps(featureset), 'featureset_name': featureset_name, 'shuffle': shuffle, 'learner_name': learner_name, 'task': task, 'start_timestamp': start_timestamp.strftime('%d %b %Y %H:%M:' '%S.%f'), 'version': __version__, 'feature_scaling': feature_scaling, 'grid_search': grid_search, 'grid_objective': grid_objective, 'grid_search_folds': grid_search_folds, 'min_feature_count': min_feature_count, 'cv_folds': cv_folds, 'stratified_folds': stratified_folds, 'scikit_learn_version': SCIKIT_VERSION} # check if we're doing cross-validation, because we only load/save # models when we're not. task_results = None if task == 'cross_validate': print('\tcross-validating', file=log_file) task_results, grid_scores = learner.cross_validate( train_examples, shuffle=shuffle, stratified=stratified_folds, prediction_prefix=prediction_prefix, grid_search=grid_search, grid_search_folds=grid_search_folds, cv_folds=cv_folds, grid_objective=grid_objective, param_grid=param_grid, grid_jobs=grid_search_jobs) else: # if we do not have a saved model, we need to train one. if not exists(modelfile) or overwrite: print(('\tfeaturizing and training new ' + '{} model').format(learner_name), file=log_file) if not isinstance(cv_folds, int): grid_search_folds = cv_folds best_score = learner.train(train_examples, shuffle=shuffle, grid_search=grid_search, grid_search_folds=grid_search_folds, grid_objective=grid_objective, param_grid=param_grid, grid_jobs=grid_search_jobs) grid_scores = [best_score] # save model if model_path: learner.save(modelfile) if grid_search: # note: bankers' rounding is used in python 3, # so these scores may be different between runs in # python 2 and 3 at the final decimal place.
print('\tbest {} grid search score: {}' .format(grid_objective, round(best_score, 3)), file=log_file) else: grid_scores = [None] # print out the tuned parameters and best CV score param_out = ('{}: {}'.format(param_name, param_value) for param_name, param_value in iteritems(learner.model.get_params())) print('\thyperparameters: {}'.format(', '.join(param_out)), file=log_file) # run on test set or cross-validate on training data, # depending on what was asked for if task == 'evaluate': print('\tevaluating predictions', file=log_file) task_results = [learner.evaluate( test_examples, prediction_prefix=prediction_prefix, grid_objective=grid_objective)] elif task == 'predict': print('\twriting predictions', file=log_file) learner.predict(test_examples, prediction_prefix=prediction_prefix) # do nothing here for train end_timestamp = datetime.datetime.now() learner_result_dict_base['end_timestamp'] = end_timestamp.strftime( '%d %b %Y %H:%M:%S.%f') total_time = end_timestamp - start_timestamp learner_result_dict_base['total_time'] = str(total_time) if task == 'cross_validate' or task == 'evaluate': results_json_path = join(results_path, '{}.results.json'.format(job_name)) res = _create_learner_result_dicts(task_results, grid_scores, learner_result_dict_base) # write out the result dictionary to a json file file_mode = 'w' if sys.version_info >= (3, 0) else 'wb' with open(results_json_path, file_mode) as json_file: json.dump(res, json_file, cls=NumpyTypeEncoder) with open(join(results_path, '{}.results'.format(job_name)), 'w') as output_file: _print_fancy_output(res, output_file) else: res = [learner_result_dict_base] return res
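# For the 'evaluate' and 'cross_validate' tasks, both versions of the function
# above write the returned list of result dictionaries to a machine-readable
# '<job_name>.results.json' file (next to the human-readable '<job_name>.results'
# file) before returning it. A minimal sketch of reading that JSON back from a
# separate script follows; the directory and job name below are hypothetical, and
# the keys shown are ones populated in learner_result_dict_base above.

import json
from os.path import join

results_dir = 'output/results'  # hypothetical results_path
job_name = 'myexp_featureset1_LogisticRegression'  # hypothetical job name

with open(join(results_dir, '{}.results.json'.format(job_name))) as json_file:
    results = json.load(json_file)

# 'results' is a list with one dictionary per evaluation (or per CV fold).
for result in results:
    print(result['learner_name'], result['task'], result['grid_objective'])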