def test_syntax11_learner(self):
    df = pandas.DataFrame(
        dict(education=['A', 'B', 'A', 'B', 'A'],
             workclass=['X', 'X', 'Y', 'Y', 'Y'],
             y=[1, 0, 1, 0, 0]))
    X = df.drop('y', axis=1)

    exp = Pipeline([
        OneHotVectorizer() << {'edu1': 'education'},
        OneHotHashVectorizer() << {'edu2': 'education'},
        FastLinearBinaryClassifier(max_iterations=1) << {
            'Features': ['edu1', 'edu2'], Role.Label: 'y'}
    ])
    exp.fit(df)
    prediction = exp.predict(X)
    assert isinstance(prediction, pandas.DataFrame)
    assert sorted(list(prediction.columns)) == [
        'PredictedLabel', 'Probability', 'Score']
    assert prediction.shape == (5, 3)

def test_syntax5_regular_expression(self):
    # REVIEW: not implemented yet.
    # The best approach would be to handle regular expressions inside
    # nimbusml. It could be handled in entrypoint.py just before calling
    # nimbusml, or inside Pipeline if it is aware of the input schema.
    df = pandas.DataFrame(
        dict(education=['A', 'B', 'A', 'B', 'A'],
             workclass=['X', 'X', 'Y', 'Y', 'Y'],
             y=[1, 0, 1, 0, 0]))
    X = df.drop('y', axis=1)
    y = df['y']

    exp = Pipeline([
        OneHotVectorizer() << {'f1': 'education'},
        OneHotHashVectorizer() << {'f2': 'education'},
        OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
        Concat() << {'Features': 'f[0-9]+'},
        FastLinearBinaryClassifier(max_iterations=1) << 'Features'
    ])
    exp.fit(X, y)
    prediction = exp.predict(X)
    assert isinstance(prediction, pandas.DataFrame)
    assert sorted(list(prediction.columns)) == [
        'PredictedLabel', 'Probability', 'Score']
    assert prediction.shape == (5, 3)

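# A minimal sketch of how the regular-expression handling mentioned in
# the REVIEW note above might be done on the Python side, assuming the
# input schema (the column names) is known when the pipeline is built.
# `expand_column_regex` is a hypothetical helper, not a nimbusml API:
# it expands a pattern such as 'f[0-9]+' into an explicit column list.
#
#     import re
#
#     def expand_column_regex(pattern, columns):
#         # Keep only the columns whose full name matches the pattern.
#         regex = re.compile(pattern)
#         return [c for c in columns if regex.fullmatch(c)]
#
#     expand_column_regex('f[0-9]+', ['f1', 'f2', 'f3', 'education'])
#     # -> ['f1', 'f2', 'f3']
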
def test_syntax6_regular_expression(self):
    df = pandas.DataFrame(
        dict(education=['A', 'B', 'A', 'B', 'A'],
             workclass=['X', 'X', 'Y', 'Y', 'Y'],
             y=[1, 0, 1, 0, 0]))
    X = df.drop('y', axis=1)
    y = df['y']

    exp = Pipeline([
        OneHotVectorizer() << {'f1': 'education'},
        OneHotHashVectorizer() << {'f2': 'education'},
        OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
        Concat() << {'Features': ['f%d' % i for i in range(1, 4)]},
        Drop() << '~Features',
        FastLinearBinaryClassifier(max_iterations=1)
    ])
    exp.fit(X, y)
    prediction = exp.predict(X)
    assert isinstance(prediction, pandas.DataFrame)
    assert sorted(list(prediction.columns)) == [
        'PredictedLabel', 'Probability', 'Score']
    assert prediction.shape == (5, 3)

def test_syntax4_fail(self):
    df = pandas.DataFrame(
        dict(education=['A', 'B', 'A', 'B', 'A'],
             workclass=['X', 'X', 'Y', 'Y', 'Y'],
             y=[1, 0, 1, 0, 0]))
    X = df.drop('y', axis=1)
    y = df['y']

    exp = Pipeline([
        OneHotVectorizer() << {'edu1': 'education'},
        OneHotHashVectorizer() << {'edu2': 'education'},
        OneHotVectorizer(max_num_terms=2) << {'wki': 'workclass'},
        FastLinearBinaryClassifier(max_iterations=1) << [
            'edu1', 'edu2', 'wki']
    ])
    try:
        exp.fit(X, y)
        raise AssertionError("The fit call should have failed.")
    except RuntimeError as e:
        # nimbusml suggests the missing Concat step in the message.
        assert "ConcatTransform() << {'Input': ['edu1', 'edu2', 'wki']}" \
            in str(e)

def test_syntax4_fail2(self):
    df = pandas.DataFrame(
        dict(education=['A', 'B', 'A', 'B', 'A'],
             workclass=['X', 'X', 'Y', 'Y', 'Y'],
             y=[1, 0, 1, 0, 0]))
    X = df.drop('y', axis=1)
    y = df['y']

    exp = Pipeline([
        OneHotVectorizer() << {'edu1': 'education'},
        OneHotHashVectorizer() << {'edu2': 'education'},
        OneHotVectorizer(max_num_terms=2) << {'wki': 'workclass'},
        FastLinearBinaryClassifier(max_iterations=1) << [
            'edu1', 'edu4', 'wki']
    ])
    try:
        exp.fit(X, y)
        raise AssertionError("The test should not reach this line.")
    except Exception as e:
        assert "Feature column 'edu4' not found" in str(e)

def test_syntax4_dict(self):
    df = pandas.DataFrame(
        dict(education=['A', 'B', 'A', 'B', 'A'],
             workclass=['X', 'X', 'Y', 'Y', 'Y'],
             y=[1, 0, 1, 0, 0]))
    X = df.drop('y', axis=1)
    y = df['y']

    exp = Pipeline([
        OneHotVectorizer() << {'edu1': 'education'},
        OneHotHashVectorizer() << {'edu2': 'education'},
        OneHotVectorizer(max_num_terms=2) << {'wki': 'workclass'},
        Concat() << {'Inputs': ['edu1', 'edu2', 'wki']},
        FastLinearBinaryClassifier(max_iterations=1) << 'Inputs'
    ])
    exp.fit(X, y)
    prediction = exp.predict(X)
    assert isinstance(prediction, pandas.DataFrame)
    assert sorted(list(prediction.columns)) == [
        'PredictedLabel', 'Probability', 'Score']
    assert prediction.shape == (5, 3)

def test_syntax6_change_role(self):
    # REVIEW: the pipeline drops all columns but one -->
    # nimbusml still thinks the features are education and workclass,
    # and does not automatically detect that the only remaining column
    # should play that role (maybe because the label column is still
    # here too, even though the only remaining column without a role
    # is Features).
    df = pandas.DataFrame(dict(education=['A', 'B', 'A', 'B', 'A'],
                               workclass=['X', 'X', 'Y', 'Y', 'Y'],
                               y=[1, 0, 1, 0, 0]))
    X = df.drop('y', axis=1)
    y = df['y']

    exp = Pipeline([
        OneHotVectorizer() << {'f1': 'education'},
        OneHotHashVectorizer() << {'f2': 'education'},
        OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
        Concat() << {'Features': ['f%d' % i for i in range(1, 4)]},
        Drop() << ['education', 'workclass', 'f1', 'f2', 'f3'],
        FastLinearBinaryClassifier(
            maximum_number_of_iterations=1) << ['Features']
    ])
    exp.fit(X, y, verbose=0)
    prediction = exp.predict(X)
    assert isinstance(prediction, pandas.DataFrame)
    assert sorted(list(prediction.columns)) == [
        'PredictedLabel', 'Probability', 'Score']
    assert prediction.shape == (5, 3)

def test_ensemble_supports_user_defined_transforms(self):
    test2_df = test_df.copy(deep=True)
    test2_df = pd.concat([
        test2_df,
        pd.DataFrame({'c1': [9, 11], 'c2': [1, 1]})
    ])

    r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
    r1.fit(train_df)
    result1 = r1.predict(test2_df)

    r2 = OnlineGradientDescentRegressor(**ogdArgs)
    r2.fit(train_df)
    result2 = r2.predict(test2_df)

    r3 = LightGbmRegressor(**lgbmArgs)
    r3.fit(train_df)
    result3 = r3.predict(test2_df)

    r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
    r2 = OnlineGradientDescentRegressor(**ogdArgs)
    r3 = LightGbmRegressor(**lgbmArgs)

    pipeline = Pipeline([
        RangeFilter(min=0, max=10, columns='c1'),
        VotingRegressor(estimators=[r1, r2, r3], combiner='Average')
    ])
    pipeline.fit(train_df)
    result4 = pipeline.predict(test2_df)

    # The RangeFilter drops the appended row with c1=11,
    # leaving three rows to score.
    self.assertEqual(len(result4), 3)

    average1 = (result1[0] + result2[0] + result3[0]) / 3
    average2 = (result1[1] + result2[1] + result3[1]) / 3
    average3 = (result1[2] + result2[2] + result3[2]) / 3
    self.assertAlmostEqual(average1, result4.loc[0, 'Score'], places=5)
    self.assertAlmostEqual(average2, result4.loc[1, 'Score'], places=5)
    self.assertAlmostEqual(average3, result4.loc[2, 'Score'], places=5)

def test_syntax3(self):
    df = pandas.DataFrame(
        dict(education=['A', 'B', 'A', 'B', 'A'],
             workclass=['X', 'X', 'Y', 'Y', 'Y'],
             y=[1, 0, 1, 0, 0]))
    X = df.drop('y', axis=1)
    y = df['y']

    exp = Pipeline([
        OneHotVectorizer() << {'edu1': 'education'},
        OneHotHashVectorizer() << 'education',
        OneHotVectorizer(max_num_terms=2) << 'workclass',
        # Currently the learner does not use edu1 unless it is
        # specified explicitly, so nimbusml does not do what the
        # syntax implicitly suggests. We would need to modify the
        # bridge to look at every available column at each step.
        FastLinearBinaryClassifier(max_iterations=1)
    ])
    exp.fit(X, y)
    prediction = exp.predict(X)
    assert isinstance(prediction, pandas.DataFrame)
    assert sorted(list(prediction.columns)) == [
        'PredictedLabel', 'Probability', 'Score']
    assert prediction.shape == (5, 3)

def test_syntax10_weights(self):
    df = pandas.DataFrame(dict(education=['A', 'B', 'A', 'B', 'A'],
                               workclass=['X', 'X', 'Y', 'Y', 'Y'],
                               weight=[1., 1., 1., 2., 1.],
                               y=[1.1, 2.2, 1.24, 3.4, 3.4]))
    X = df.drop(['y', 'weight'], axis=1)
    y = df['y']
    w = df['weight']

    exp = Pipeline([
        OneHotVectorizer() << ['workclass', 'education'],
        FastLinearRegressor()
    ])
    exp.fit(X, y, weight=w, verbose=0)
    assert exp.nodes[-1].feature_column == 'Features'
    assert exp.nodes[-1].label_column == 'y'
    assert exp.nodes[-1].weight_column == 'weight'

    # The weight column is still required at prediction time but its
    # values do not affect the scores.
    X['weight'] = -5
    prediction = exp.predict(X)
    assert isinstance(prediction, pandas.DataFrame)
    assert list(prediction.columns) == ['Score']
    assert prediction.shape == (5, 1)
    if prediction['Score'].min() < 1.:
        raise Exception(prediction)
    if prediction['Score'].max() > 3.6:
        raise Exception(prediction)
    if len(set(prediction['Score'])) < 4:
        raise Exception(prediction)

def test_test(self):
    transformed_data, transformed_data_df = transform_data()
    fl = FastLinearRegressor(
        feature=['parity', 'in', 'sp', 'stratum'],
        label='age')
    flpipe = Pipeline([fl])
    flpipe.fit(transformed_data)
    metrics, scores = flpipe.test(transformed_data, output_scores=True)
    metrics_df, scores_df = flpipe.test(
        transformed_data_df, output_scores=True)

    assert_array_equal(scores, scores_df)
    assert_array_equal(metrics, metrics_df)

    flpipe.fit(
        transformed_data_df.drop('age', axis=1),
        transformed_data_df['age'])
    metrics, scores = flpipe.test(transformed_data, output_scores=True)
    metrics_df, scores_df = flpipe.test(
        transformed_data_df, output_scores=True)

    assert_array_equal(scores, scores_df)
    assert_array_equal(metrics, metrics_df)

def test_metrics_evaluate_clusterer(self):
    np.random.seed(0)
    df = get_dataset("iris").as_df()
    df.drop(['Species'], inplace=True, axis=1)
    df.Label = [1 if x == 1 else 0 for x in df.Label]
    X_train, X_test, y_train, y_test = \
        train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])

    lr = KMeansPlusPlus(n_clusters=2, initialization_algorithm="Random")
    e = Pipeline([lr])
    e.fit(X_train, y_train.to_frame(), verbose=0)
    metrics, _ = e.test(X_test, y_test)

    assert_almost_equal(metrics['NMI'][0], 0.7, decimal=0,
                        err_msg="NMI loss should be %s" % 0.7)
    assert_almost_equal(metrics['AvgMinScore'][0], 0.032, decimal=2,
                        err_msg="AvgMinScore should be %s" % 0.032)

def test_pipeline_loaded_from_zip_has_feature_contributions(self):
    features = ['age', 'education-num', 'hours-per-week']
    model_nimbusml = Pipeline(
        steps=[FastLinearBinaryClassifier(feature=features)])
    model_nimbusml.fit(train, label)
    fc = model_nimbusml.get_feature_contributions(test)

    # Save the model to zip
    model_filename = get_temp_file(suffix='.zip')
    model_nimbusml.save_model(model_filename)

    # Load the model from zip
    model_nimbusml_zip = Pipeline()
    model_nimbusml_zip.load_model(model_filename)
    fc_zip = model_nimbusml_zip.get_feature_contributions(test)

    # Use all(...) so each expected column is actually checked;
    # a bare list comprehension is always truthy.
    assert all('FeatureContributions.' + feature in fc_zip.columns
               for feature in features)
    assert all(fc['FeatureContributions.' + feature].equals(
        fc_zip['FeatureContributions.' + feature])
        for feature in features)

    os.remove(model_filename)

def test_pipeline_with_no_columns_raise(self):
    trainData = pd.DataFrame({
        "Sentiment": [0, 1, 1, 0, 1, 1],
        "SentimentText": [
            "this is train ",
            "review ",
            "sentence ",
            "an apple",
            "sentence 22",
            "another one one one"
        ]
    })

    ppl = Pipeline([
        NGramFeaturizer(word_feature_extractor=n_gram()),
        LightGbmClassifier()
    ])
    assert ppl is not None

    # Bug 147697
    info = ppl.get_fit_info(trainData[["SentimentText"]],
                            trainData["Sentiment"])
    assert len(info) == 2
    assert len(info[0]) == 3

    with self.assertRaises(RuntimeError):
        # Message:
        # System.InvalidOperationException:
        # 'LightGBM Error, code is -1, error message is
        # 'Cannot construct Dataset since there are not useful features.
        # It should be at least two unique rows.
        # If the num_row (num_data) is small,
        # you can set min_data=1 and min_data_in_bin=1 to fix this.
        # Otherwise please make sure you are using the right dataset.'
        ppl.fit(trainData[["SentimentText"]], trainData["Sentiment"])

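# The workaround suggested by the LightGBM error quoted above is
# exercised in test_pipeline_with_no_columns below: lowering the
# per-leaf and per-group data minimums lets the classifier train on
# this tiny dataset. A minimal sketch with the same data:
#
#     ppl = Pipeline([
#         NGramFeaturizer(word_feature_extractor=n_gram()),
#         LightGbmClassifier(min_data_per_leaf=1, min_data_per_group=1)
#     ])
#     ppl.fit(trainData[["SentimentText"]], trainData["Sentiment"])
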
def test_model_summary(self):
    for learner in learners:
        pipeline = Pipeline(
            [OneHotVectorizer() << categorical_columns, learner])
        train_stream = FileDataStream(train_file, schema=file_schema)
        pipeline.fit(train_stream, label_column)
        pipeline.summary()

def test_two_pipelines_created_using_idv_binary_data_can_be_combined_in_to_one_model(
        self):
    """
    This test verifies that two models can be combined
    even if the transform increases the number of columns.
    """
    # Create and fit a OneHotVectorizer transform using the
    # training data and use it to transform the training data.
    transform_pipeline = Pipeline([OneHotVectorizer() << 'c0'],
                                  random_state=seed)
    transform_pipeline.fit(train_df)
    df = transform_pipeline.transform(train_df,
                                      as_binary_data_stream=True)

    # Create and fit an OnlineGradientDescentRegressor using
    # the transformed training data from the previous step.
    predictor_pipeline = Pipeline(
        [OnlineGradientDescentRegressor(label='c2',
                                        feature=['c0', 'c1'])],
        random_state=seed)
    predictor_pipeline.fit(df)

    # Perform a prediction given the test data using
    # the transform and predictor defined previously.
    df = transform_pipeline.transform(test_df,
                                      as_binary_data_stream=True)
    result_1 = predictor_pipeline.predict(df)

    # Combine the above Pipelines into one Pipeline and use
    # the new Pipeline to get predictions given the test data.
    combined_pipeline = Pipeline.combine_models(transform_pipeline,
                                                predictor_pipeline)
    result_2 = combined_pipeline.predict(test_df)

    # Verify that the prediction from the combined Pipeline
    # matches the prediction from the original two Pipelines.
    self.assertEqual(result_1.loc[0, 'Score'], result_2.loc[0, 'Score'])
    self.assertEqual(result_1.loc[1, 'Score'], result_2.loc[1, 'Score'])

def test_experiment_loadsavemodel(self):
    (train, label) = get_X_y(train_file, label_column, sep=',')
    (test, label1) = get_X_y(test_file, label_column, sep=',')
    cat = OneHotVectorizer() << categorical_columns
    ftree = FastTreesBinaryClassifier()
    pipeline = Pipeline([cat, ftree])
    pipeline.fit(train, label)
    metrics1, scores1 = pipeline.test(
        test, label1, 'binary', output_scores=True)
    sum1 = metrics1.sum().sum()

    # Create an empty temp file to hold the saved model.
    (fd, modelfilename) = tempfile.mkstemp(suffix='.model.bin')
    fl = os.fdopen(fd, 'w')
    fl.close()
    pipeline.save_model(modelfilename)

    pipeline2 = Pipeline()
    pipeline2.load_model(modelfilename)
    metrics2, scores2 = pipeline2.test(
        test, label1, 'binary', output_scores=True)
    sum2 = metrics2.sum().sum()

    assert_equal(sum1, sum2,
                 "model metrics don't match after loading model")

def test_notvectorized_output_predictor_model(self):
    """
    This test verifies that the predictor model output by a combined
    (with featurizers) pipeline runs successfully on featurized data
    that contains no vector columns.
    """
    df = train_df.drop(['c0'], axis=1)

    # Create and fit a RangeFilter transform using the training
    # data and use it to transform the training data.
    transform_pipeline = Pipeline(
        [RangeFilter(min=0.0, max=4.5) << 'c2'],
        random_state=seed)
    transform_pipeline.fit(df)
    df1 = transform_pipeline.transform(df)

    # Create and fit a combined model and output the predictor model.
    combined_pipeline = Pipeline([
        RangeFilter(min=0.0, max=4.5) << 'c2',
        OnlineGradientDescentRegressor(label='c2')
    ], random_state=seed)
    combined_pipeline.fit(df, output_predictor_model=True)
    result_1 = combined_pipeline.predict(df)

    # Load the predictor pipeline and score the featurized data.
    predictor_pipeline = Pipeline()
    predictor_pipeline.load_model(combined_pipeline.predictor_model)
    result_2 = predictor_pipeline.predict(df1)

    self.assertEqual(result_1.loc[0, 'Score'], result_2.loc[0, 'Score'])
    self.assertEqual(result_1.loc[1, 'Score'], result_2.loc[1, 'Score'])

def test_model_summary_not_supported(self):
    for learner in learners_not_supported:
        pipeline = Pipeline(
            [OneHotVectorizer() << categorical_columns, learner])
        train_stream = FileDataStream(train_file, schema=file_schema)
        pipeline.fit(train_stream, label_column)
        assert_raises(TypeError, pipeline.summary)

def test_unpickled_pipeline_has_feature_contributions(self):
    features = ['age', 'education-num', 'hours-per-week']
    model_nimbusml = Pipeline(
        steps=[FastLinearBinaryClassifier(feature=features)])
    model_nimbusml.fit(train, label)
    fc = model_nimbusml.get_feature_contributions(test)

    # Save with pickle
    pickle_filename = get_temp_file(suffix='.p')
    with open(pickle_filename, 'wb') as f:
        pickle.dump(model_nimbusml, f)

    # Unpickle model
    with open(pickle_filename, "rb") as f:
        model_nimbusml_pickle = pickle.load(f)
    fc_pickle = model_nimbusml_pickle.get_feature_contributions(test)

    # Use all(...) so each expected column is actually checked;
    # a bare list comprehension is always truthy.
    assert all('FeatureContributions.' + feature in fc_pickle.columns
               for feature in features)
    assert all(fc['FeatureContributions.' + feature].equals(
        fc_pickle['FeatureContributions.' + feature])
        for feature in features)

    os.remove(pickle_filename)

def test_metrics_evaluate_regressor(self):
    np.random.seed(0)
    df = get_dataset("iris").as_df()
    df.drop(['Species'], inplace=True, axis=1)
    df.Label = [1 if x == 1 else 0 for x in df.Label]
    X_train, X_test, y_train, y_test = \
        train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])

    lr = FastTreesRegressor()
    e = Pipeline([lr])
    e.fit(X_train, y_train.to_frame(), verbose=0)
    metrics, _ = e.test(X_test, y_test)
    # TODO: debug fluctuations, and increase decimal precision on checks
    assert_almost_equal(metrics['L1(avg)'][0], 0.107, decimal=1,
                        err_msg="L1(avg) should be %s" % 0.107)
    assert_almost_equal(metrics['L2(avg)'][0], 0.0453, decimal=1,
                        err_msg="L2(avg) should be %s" % 0.0453)
    assert_almost_equal(metrics['Loss-fn(avg)'][0], 0.0453, decimal=1,
                        err_msg="Loss-fn(avg) should be %s" % 0.0453)

def test_syntax8_label(self):
    df = pandas.DataFrame(dict(education=['A', 'B', 'A', 'B', 'A'],
                               workclass=['X', 'X', 'Y', 'Y', 'Y'],
                               yy=[1.1, 2.2, 1.24, 3.4, 3.4]))
    X = df.drop('yy', axis=1)

    exp = Pipeline([
        MeanVarianceScaler() << {'new_y': 'yy'},
        OneHotVectorizer() << ['workclass', 'education'],
        Drop() << 'yy',
        FastLinearRegressor() << {'Feature': ['workclass', 'education'],
                                  Role.Label: 'new_y'}
    ])
    exp.fit(df, verbose=0)
    assert exp.nodes[-1].feature_column_ == 'Features'
    assert exp.nodes[-1].label_column_ == 'new_y'

    # Column 'yy' is still required at prediction time because it is
    # transformed all along the pipeline.
    X['yy'] = 0.0
    prediction = exp.predict(X, verbose=0)
    assert isinstance(prediction, pandas.DataFrame)
    assert list(prediction.columns) == ['Score']
    assert prediction.shape == (5, 1)
    if prediction['Score'].min() < 0.4:
        raise Exception(prediction)
    if prediction['Score'].max() > 2.00:
        raise Exception(prediction)

def test_syntax7_rename(self):
    # Error messages are usually not informative enough:
    # a missing column gives no indication of the other columns.
    # The error here is (one transform should handle it):
    # 'The label column 'y' of the training data has a data type
    # not suitable for binary classification: Vec<Key<U4, 0-1>, 2>.
    # Type must be Bool, R4, R8 or Key with two classes.
    df = pandas.DataFrame(
        dict(education=['A', 'B', 'A', 'B', 'A'],
             workclass=['X', 'X', 'Y', 'Y', 'Y'],
             y=['red', 'white', 'red', 'white', 'white']))
    X = df.drop('y', axis=1)
    y = df['y']

    exp = Pipeline([
        OneHotVectorizer() << 'y',
        OneHotVectorizer() << ['workclass', 'education'],
        TypeConverter(result_type='R4') << {'yi': 'y'},
        Drop() << 'y',
        FastLinearBinaryClassifier(max_iterations=1) << 'yi'
    ])
    exp.fit(X, y, verbose=0)
    prediction = exp.predict(X)
    assert isinstance(prediction, pandas.DataFrame)
    assert list(prediction.columns) == ['Score']
    assert prediction.shape == (5, 1)
    assert prediction['Score'].min() > 0.01
    assert prediction['Score'].max() < 0.05

def test_syntax12_mixed2(self):
    X = pandas.DataFrame(dict(education=['A', 'B', 'A', 'B', 'A'],
                              workclass=['X', 'X', 'Y', 'Y', 'Y'],
                              weight=[10., 1., 1., 1., 1.],
                              y=[1.1, 2.2, 1.24, 3.4, 3.4]))

    exp = Pipeline([
        OneHotVectorizer(columns=['workclass', 'education']),
        Concat(columns={'Feature': ['workclass', 'education']}),
        FastTreesRegressor(num_trees=5, feature='Feature',
                           weight='weight') << {Role.Label: 'y'}
    ])
    exp.fit(X, verbose=0)
    assert exp.nodes[-1].feature_column_ == 'Feature'
    assert exp.nodes[-1].label_column_ == 'y'
    assert exp.nodes[-1].weight_column_ == 'weight'

    # 'y' and 'weight' are still required at prediction time, so they
    # are replaced by fake values. The test does not fail, but the
    # weight is not taken into account when scoring.
    X['y'] = -5
    X['weight'] = -5
    prediction = exp.predict(X)
    assert isinstance(prediction, pandas.DataFrame)
    assert list(prediction.columns) == ['Score']
    assert prediction.shape == (5, 1)

def test_pipeline_with_no_columns(self):
    trainData = pd.DataFrame({
        "Sentiment": [0, 1, 1, 0, 1, 1],
        "SentimentText": [
            "this is train ",
            "review ",
            "sentence ",
            "an apple",
            "sentence 22",
            "another one one one"
        ]
    })

    ppl = Pipeline([
        NGramFeaturizer(word_feature_extractor=n_gram()),
        LightGbmClassifier(min_data_per_leaf=1, min_data_per_group=1)
    ])
    assert ppl is not None

    # Bug 147697
    info = ppl.get_fit_info(trainData[["SentimentText"]],
                            trainData["Sentiment"])
    assert len(info) == 2
    assert len(info[0]) == 3

    ppl.fit(trainData[["SentimentText"]], trainData["Sentiment"])

    ppl = Pipeline([
        NGramFeaturizer(word_feature_extractor=n_gram()),
        LightGbmClassifier(min_data_per_leaf=1, min_data_per_group=1)
    ])
    assert ppl is not None
    ppl.fit(trainData[["SentimentText"]],
            np.array(trainData["Sentiment"]))

def test_metrics_evaluate_binary(self):
    np.random.seed(0)
    df = get_dataset("iris").as_df()
    df.drop(['Species'], inplace=True, axis=1)
    df.Label = [1 if x == 1 else 0 for x in df.Label]
    X_train, X_test, y_train, y_test = \
        train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])

    lr = LogisticRegressionBinaryClassifier()
    e = Pipeline([lr])
    e.fit(X_train, y_train, verbose=0)
    metrics, _ = e.test(X_test, y_test)
    # TODO: debug fluctuations, and increase decimal precision on checks
    assert_almost_equal(metrics['AUC'][0], 0.980, decimal=1,
                        err_msg="AUC should be %s" % 0.980)
    assert_almost_equal(metrics['Accuracy'][0], 0.632, decimal=1,
                        err_msg="Accuracy should be %s" % 0.632)
    assert_almost_equal(metrics['Positive precision'][0], 1, decimal=1,
                        err_msg="Positive precision should be %s" % 1)
    assert_almost_equal(metrics['Positive recall'][0], 0.125, decimal=1,
                        err_msg="Positive recall should be %s" % 0.125)
    assert_almost_equal(metrics['Negative precision'][0], 0.611,
                        decimal=1,
                        err_msg="Negative precision should be %s" % 0.611)
    assert_almost_equal(metrics['Negative recall'][0], 1, decimal=1,
                        err_msg="Negative recall should be %s" % 1)
    assert_almost_equal(metrics['Log-loss'][0], 0.686, decimal=1,
                        err_msg="Log-loss should be %s" % 0.686)
    assert_almost_equal(metrics['Log-loss reduction'][0], 0.3005,
                        decimal=3,
                        err_msg="Log-loss reduction should be %s" % 0.3005)
    assert_almost_equal(
        metrics['Test-set entropy (prior Log-Loss/instance)'][0], 0.981,
        decimal=1,
        err_msg="Test-set entropy (prior Log-Loss/instance) "
                "should be %s" % 0.981)
    assert_almost_equal(metrics['F1 Score'][0], 0.222, decimal=1,
                        err_msg="F1 Score should be %s" % 0.222)
    assert_almost_equal(metrics['AUPRC'][0], 0.966, decimal=1,
                        err_msg="AUPRC should be %s" % 0.966)

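# Sanity check on the values asserted above: log-loss reduction appears
# to be the relative improvement of the model's log-loss over the prior
# (the test-set entropy), i.e. (0.981 - 0.686) / 0.981 ~= 0.30, which is
# consistent with the 0.3005 checked for 'Log-loss reduction'.
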
def _test_schema_syntax_shift_df(self):
    df = pandas.DataFrame(
        data=dict(X1=[0.1, 0.2], X2=[0.1, 0.2],
                  yl=[1, 0], tx=['e', 'r']))
    exp = Pipeline(
        [OneHotVectorizer() << 'tx', FastLinearBinaryClassifier()])
    exp.fit(df, 'yl')

def test_trees(self):
    (train, label) = get_X_y(train_file, label_column, sep=',')
    (test, label1) = get_X_y(test_file, label_column, sep=',')
    pipeline = Pipeline([OneHotVectorizer() << categorical_columns,
                         FastTreesBinaryClassifier()])
    pipeline.fit(train, label)
    out_data = pipeline.predict(test)
    check_accuracy(test_file, label_column, out_data, 0.65)

def test_default_label(self):
    df = get_dataset("iris").as_df()
    df.drop(['Species'], inplace=True, axis=1)
    df.Label = [1 if x == 1 else 0 for x in df.Label]

    # 1: label and feature roles both explicit.
    pipeline = Pipeline([
        ColumnConcatenator() << {
            'Features': ["Petal_Length", "Sepal_Length"]},
        FastTreesBinaryClassifier(number_of_trees=2) << {
            Role.Label: 'Label', Role.Feature: 'Features'}
    ])
    model = pipeline.fit(df, verbose=0)
    probabilities0 = model.predict_proba(df)

    # 2: only the feature role is explicit; the label defaults
    # to the 'Label' column.
    pipeline = Pipeline([
        ColumnConcatenator() << {
            'Features': ["Petal_Length", "Sepal_Length"]},
        FastTreesBinaryClassifier(number_of_trees=2) << {
            Role.Feature: 'Features'}
    ])
    model = pipeline.fit(df, verbose=0)
    probabilities = model.predict_proba(df)
    assert_array_almost_equal(probabilities0, probabilities)

    # 3: no roles are explicit; both default.
    pipeline = Pipeline([
        ColumnConcatenator() << {
            'Features': ["Petal_Length", "Sepal_Length"]},
        FastTreesBinaryClassifier(number_of_trees=2)
    ])
    model = pipeline.fit(df, verbose=0)
    probabilities = model.predict_proba(df)
    assert_array_almost_equal(probabilities0, probabilities)

    # 4: only the label role is explicit; the feature defaults.
    pipeline = Pipeline([
        ColumnConcatenator() << {
            'Features': ["Petal_Length", "Sepal_Length"]},
        FastTreesBinaryClassifier(number_of_trees=2) << {
            Role.Label: 'Label'}
    ])
    model = pipeline.fit(df, verbose=0)
    probabilities = model.predict_proba(df)
    assert_array_almost_equal(probabilities0, probabilities)

def test_trees_file(self):
    pipeline = Pipeline([OneHotVectorizer() << categorical_columns,
                         FastTreesBinaryClassifier() << {
                             'Label': label_column}])
    train_stream = FileDataStream(train_file, schema=file_schema)
    pipeline.fit(train_stream, label_column)
    test_stream = FileDataStream(test_file, schema=file_schema)
    out_data = pipeline.predict(test_stream)
    check_accuracy(test_file, label_column, out_data, 0.65)