def test_same_schema_with_dataframe_input(self):
    """Compare an inline RangeFilter pipeline with a DatasetTransformer one.

    Both pipelines are trained and scored on copies of ``train_df`` /
    ``test_df`` with column ``c0`` dropped; their predictions must be equal.
    """
    upper_bound = 4.5
    train_no_c0 = train_df.drop(['c0'], axis=1)
    test_no_c0 = test_df.drop(['c0'], axis=1)

    # Reference: the filter and the regressor live in a single pipeline.
    reference = Pipeline([
        RangeFilter(min=0.0, max=upper_bound) << 'c2',
        OnlineGradientDescentRegressor(label='c2', feature=['c1'])
    ], random_state=seed)
    reference.fit(train_no_c0)
    expected = reference.predict(test_no_c0)

    # Combined: the filter is fitted on its own, then plugged into a
    # second pipeline through DatasetTransformer.
    filter_only = Pipeline([RangeFilter(min=0.0, max=upper_bound) << 'c2'])
    filter_only.fit(train_no_c0)

    combined = Pipeline([
        DatasetTransformer(transform_model=filter_only.model),
        OnlineGradientDescentRegressor(label='c2', feature=['c1'])
    ], random_state=seed)
    combined.fit(train_no_c0)

    # The transform model file is removed before predicting with the
    # combined pipeline.
    os.remove(filter_only.model)

    actual = combined.predict(test_no_c0)
    self.assertTrue(expected.equals(actual))
def test_different_schema_with_dataframe_input(self):
    """Compare an inline OneHotVectorizer pipeline with a DatasetTransformer one.

    The transform here is a OneHotVectorizer on ``c0``; predictions from
    the reference and the combined pipeline must be equal.
    """
    # Reference: vectorizer and regressor in a single pipeline.
    reference = Pipeline([
        OneHotVectorizer() << 'c0',
        OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
    ], random_state=seed)
    reference.fit(train_df)
    expected = reference.predict(test_df)

    # Combined: the vectorizer is fitted separately and reused through
    # DatasetTransformer.
    vectorizer_only = Pipeline([OneHotVectorizer() << 'c0'],
                               random_state=seed)
    vectorizer_only.fit(train_df)

    combined = Pipeline([
        DatasetTransformer(transform_model=vectorizer_only.model),
        OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
    ], random_state=seed)
    combined.fit(train_df)

    # The transform model file is removed before predicting.
    os.remove(vectorizer_only.model)

    actual = combined.predict(test_df)
    self.assertTrue(expected.equals(actual))
def test_combining_two_dataset_transformers(self):
    """Chain two DatasetTransformers and compare with the inline pipeline.

    A RangeFilter on ``c2`` and a OneHotVectorizer on ``c0`` are fitted as
    separate pipelines, then both are plugged into one combined pipeline.
    """
    upper_bound = 4.5

    # Reference: both transforms and the regressor in a single pipeline.
    reference = Pipeline([
        RangeFilter(min=0.0, max=upper_bound) << 'c2',
        OneHotVectorizer() << 'c0',
        OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
    ], random_state=seed)
    reference.fit(train_df)
    expected = reference.predict(test_df)

    # Each transform is fitted in its own pipeline.
    filter_only = Pipeline([RangeFilter(min=0.0, max=upper_bound) << 'c2'])
    filter_only.fit(train_df)

    vectorizer_only = Pipeline([OneHotVectorizer() << 'c0'],
                               random_state=seed)
    vectorizer_only.fit(train_df)

    combined = Pipeline([
        DatasetTransformer(transform_model=filter_only.model),
        DatasetTransformer(transform_model=vectorizer_only.model),
        OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])
    ], random_state=seed)
    combined.fit(train_df)

    # Both transform model files are removed before predicting.
    os.remove(filter_only.model)
    os.remove(vectorizer_only.model)

    actual = combined.predict(test_df)
    self.assertTrue(expected.equals(actual))
def test_notvectorized_output_predictor_model(self):
    """
    Verify that the predictor model output by a combined (featurizer +
    learner) pipeline scores already-featurized, non-vector data and
    gives the same scores as the combined pipeline on the raw data.
    """
    raw = train_df.drop(['c0'], axis=1)

    # Fit a standalone RangeFilter on the training data and use it to
    # produce the featurized frame.
    featurizer = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c2'],
                          random_state=seed)
    featurizer.fit(raw)
    featurized = featurizer.transform(raw)

    # Fit the combined pipeline and ask it to emit a predictor model.
    combined = Pipeline([
        RangeFilter(min=0.0, max=4.5) << 'c2',
        OnlineGradientDescentRegressor(label='c2')
    ], random_state=seed)
    combined.fit(raw, output_predictor_model=True)
    combined_scores = combined.predict(raw)

    # Load only the predictor model and score the featurized frame.
    predictor_only = Pipeline()
    predictor_only.load_model(combined.predictor_model)
    predictor_scores = predictor_only.predict(featurized)

    # The first two rows must carry identical scores.
    for row in (0, 1):
        self.assertEqual(combined_scores.loc[row, 'Score'],
                         predictor_scores.loc[row, 'Score'])
def test_predict(self):
    """Predictions must match for both inputs returned by transform_data().

    The pipeline is fitted on the first returned object and then asked to
    score both the first and the second (``*_df``) object.
    """
    data, data_df = transform_data()

    regressor = FastLinearRegressor(
        feature=['parity', 'in', 'sp', 'stratum'], label='age')
    pipe = Pipeline([regressor])
    pipe.fit(data)

    from_data = pipe.predict(data)
    from_frame = pipe.predict(data_df)
    assert_array_equal(from_data, from_frame)
def test_model_datastream(self):
    """Round-trip a fitted pipeline through pickle and save_model/load_model.

    For each persistence mechanism the reloaded pipeline must give (to two
    decimals) the same summed predictions on the first five test rows and
    the same summed metrics as the original pipeline.
    """
    model_nimbusml = Pipeline(
        steps=[
            ('cat', OneHotVectorizer() << categorical_columns),
            ('linear', FastLinearBinaryClassifier(
                shuffle=False,
                number_of_threads=1))])
    model_nimbusml.fit(train, label)

    # Save with pickle
    pickle_filename = get_temp_file(suffix='.p')
    with open(pickle_filename, 'wb') as f:
        pickle.dump(model_nimbusml, f)
    with open(pickle_filename, "rb") as f:
        model_nimbusml_pickle = pickle.load(f)
    os.remove(pickle_filename)

    score1 = model_nimbusml.predict(test).head(5)
    score2 = model_nimbusml_pickle.predict(test).head(5)

    metrics, _ = model_nimbusml.test(test, test_label, output_scores=True)
    metrics_pickle, _ = model_nimbusml_pickle.test(
        test, test_label, output_scores=True)

    assert_almost_equal(score1.sum().sum(), score2.sum().sum(), decimal=2)
    assert_almost_equal(
        metrics.sum().sum(),
        metrics_pickle.sum().sum(),
        decimal=2)

    # Save load with pipeline methods
    model_filename = get_temp_file(suffix='.m')
    try:
        model_nimbusml.save_model(model_filename)
        model_nimbusml_load = Pipeline()
        model_nimbusml_load.load_model(model_filename)

        score1 = model_nimbusml.predict(test).head(5)
        score2 = model_nimbusml_load.predict(test).head(5)

        metrics, _ = model_nimbusml.test(
            test, test_label, output_scores=True)
        # Keep the metrics in their own name instead of rebinding the
        # loaded pipeline variable.
        metrics_load, _ = model_nimbusml_load.test(
            test, test_label, evaltype='binary', output_scores=True)

        assert_almost_equal(
            score1.sum().sum(), score2.sum().sum(), decimal=2)
        assert_almost_equal(
            metrics.sum().sum(),
            metrics_load.sum().sum(),
            decimal=2)
    finally:
        # Remove the temporary model even when an assertion above fails.
        if os.path.exists(model_filename):
            os.remove(model_filename)
def test_syntax_concat_slots(self):
    """Fit and run a transform-only pipeline that concatenates two columns.

    The test only checks that fit and predict complete; nothing is
    asserted about the output.
    """
    columns = dict(
        education=['A', 'B', 'A', 'B', 'A'],
        workclass=['X', 'X', 'Y', 'Y', 'Y'],
        weight=[10., 1., 1., 1., 1.],
        y=[1.1, 2.2, 1.24, 3.4, 3.4])
    X = pandas.DataFrame(columns)

    steps = [
        OneHotVectorizer() << ['workclass', 'education'],
        Concat() << {'newcol': ['workclass', 'education']},
    ]
    exp = Pipeline(steps)
    exp.fit(X, verbose=0)
    exp.predict(X)
def test_pipeline_subclass_can_override_predict(self):
    """A Pipeline subclass may replace predict() with its own behaviour.

    First checks that the stock Pipeline reproduces the labels, then that
    CustomPipeline.predict returns the value passed as
    ``test_return_value``.
    """
    X, y = generate_dataset_1()

    # Baseline: the regular pipeline predicts the training labels.
    base = Pipeline([LogisticRegressionBinaryClassifier()])
    base.fit(X, y)
    predicted_labels = base.predict(X)['PredictedLabel']
    labels_match = np.array_equal(predicted_labels.values, y['y'].values)
    self.assertTrue(labels_match)

    # Subclass: predict() accepts and returns the extra keyword value.
    custom = CustomPipeline([LogisticRegressionBinaryClassifier()])
    custom.fit(X, y)
    self.assertEqual(custom.predict(X, test_return_value=3), 3)
def test_syntax6_change_role(self):
    """Train on a single 'Features' column after dropping everything else."""
    # REVIEW: the pipeline drops all columns but one, yet nimbusml still
    # considers education and workclass to be the features. It does not
    # detect automatically that the only remaining column should play
    # that role (maybe because the label column is present too, even
    # though the only remaining column without a role is Features).
    frame = pandas.DataFrame(dict(
        education=['A', 'B', 'A', 'B', 'A'],
        workclass=['X', 'X', 'Y', 'Y', 'Y'],
        y=[1, 0, 1, 0, 0]))
    X = frame.drop('y', axis=1)
    y = frame['y']

    steps = [
        OneHotVectorizer() << {'f1': 'education'},
        OneHotHashVectorizer() << {'f2': 'education'},
        OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
        Concat() << {'Features': ['f%d' % i for i in range(1, 4)]},
        Drop() << ['education', 'workclass', 'f1', 'f2', 'f3'],
        FastLinearBinaryClassifier(maximum_number_of_iterations=1)
        << ['Features'],
    ]
    exp = Pipeline(steps)
    exp.fit(X, y, verbose=0)

    prediction = exp.predict(X)
    assert isinstance(prediction, pandas.DataFrame)
    expected_columns = ['PredictedLabel', 'Probability', 'Score']
    assert sorted(list(prediction.columns)) == expected_columns
    assert prediction.shape == (5, 3)
def test_syntax6_regular_expression(self):
    """Drop columns with the '~Features' pattern before the learner."""
    frame = pandas.DataFrame(dict(
        education=['A', 'B', 'A', 'B', 'A'],
        workclass=['X', 'X', 'Y', 'Y', 'Y'],
        y=[1, 0, 1, 0, 0]))
    X = frame.drop('y', axis=1)
    y = frame['y']

    steps = [
        OneHotVectorizer() << {'f1': 'education'},
        OneHotHashVectorizer() << {'f2': 'education'},
        OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
        Concat() << {'Features': ['f%d' % i for i in range(1, 4)]},
        Drop() << '~Features',
        FastLinearBinaryClassifier(max_iterations=1),
    ]
    exp = Pipeline(steps)
    exp.fit(X, y)

    prediction = exp.predict(X)
    assert isinstance(prediction, pandas.DataFrame)
    expected_columns = ['PredictedLabel', 'Probability', 'Score']
    assert sorted(list(prediction.columns)) == expected_columns
    assert prediction.shape == (5, 3)
def test_ensemble_supports_user_defined_transforms(self):
    """VotingRegressor behind a RangeFilter averages the three regressors.

    Two extra rows (``c1`` = 9 and 11) are added to a copy of ``test_df``.
    The pipeline's RangeFilter(min=0, max=10) on ``c1`` is expected to
    leave three rows, each scored as the mean of the three individually
    trained regressors.
    """
    extra_rows = pd.DataFrame({'c1': [9, 11], 'c2': [1, 1]})
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # equivalent call (the original index labels are kept, as before).
    test2_df = pd.concat([test_df.copy(deep=True), extra_rows])

    # Score the extended test set with each regressor on its own.
    r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
    r1.fit(train_df)
    result1 = r1.predict(test2_df)

    r2 = OnlineGradientDescentRegressor(**ogdArgs)
    r2.fit(train_df)
    result2 = r2.predict(test2_df)

    r3 = LightGbmRegressor(**lgbmArgs)
    r3.fit(train_df)
    result3 = r3.predict(test2_df)

    # Fresh, unfitted estimators for the ensemble.
    r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
    r2 = OnlineGradientDescentRegressor(**ogdArgs)
    r3 = LightGbmRegressor(**lgbmArgs)

    pipeline = Pipeline([
        RangeFilter(min=0, max=10, columns='c1'),
        VotingRegressor(estimators=[r1, r2, r3], combiner='Average')
    ])
    pipeline.fit(train_df)
    result4 = pipeline.predict(test2_df)

    self.assertEqual(len(result4), 3)

    averages = [
        (result1[row] + result2[row] + result3[row]) / 3
        for row in (0, 1, 2)
    ]
    for row, average in enumerate(averages):
        self.assertAlmostEqual(average, result4.loc[row, 'Score'], places=5)
def test_syntax8_label(self):
    """Use a transformed column ('new_y') as the label of the learner."""
    frame = pandas.DataFrame(dict(
        education=['A', 'B', 'A', 'B', 'A'],
        workclass=['X', 'X', 'Y', 'Y', 'Y'],
        yy=[1.1, 2.2, 1.24, 3.4, 3.4]))
    X = frame.drop('yy', axis=1)

    steps = [
        MeanVarianceScaler() << {'new_y': 'yy'},
        OneHotVectorizer() << ['workclass', 'education'],
        Drop() << 'yy',
        FastLinearRegressor() << {'Feature': ['workclass', 'education'],
                                  Role.Label: 'new_y'},
    ]
    exp = Pipeline(steps)
    exp.fit(frame, verbose=0)

    learner = exp.nodes[-1]
    assert learner.feature_column_ == 'Features'
    assert learner.label_column_ == 'new_y'

    # The pipeline transforms 'yy', so the column has to be present at
    # prediction time as well; a dummy value is enough.
    X['yy'] = 0.0
    prediction = exp.predict(X, verbose=0)
    assert isinstance(prediction, pandas.DataFrame)
    assert list(prediction.columns) == ['Score']
    assert prediction.shape == (5, 1)

    # Scores must stay within [0.4, 2.00].
    if prediction['Score'].min() < 0.4:
        raise Exception(prediction)
    if prediction['Score'].max() > 2.00:
        raise Exception(prediction)
def test_syntax12_mixed2(self):
    """Mix constructor arguments and the ``<<`` operator to assign roles."""
    X = pandas.DataFrame(dict(
        education=['A', 'B', 'A', 'B', 'A'],
        workclass=['X', 'X', 'Y', 'Y', 'Y'],
        weight=[10., 1., 1., 1., 1.],
        y=[1.1, 2.2, 1.24, 3.4, 3.4]))

    vectorizer = OneHotVectorizer(columns=['workclass', 'education'])
    concat = Concat(columns={'Feature': ['workclass', 'education']})
    regressor = FastTreesRegressor(
        num_trees=5, feature='Feature', weight='weight') << {
            Role.Label: 'y'}

    exp = Pipeline([vectorizer, concat, regressor])
    exp.fit(X, verbose=0)

    learner = exp.nodes[-1]
    assert learner.feature_column_ == 'Feature'
    assert learner.label_column_ == 'y'
    assert learner.weight_column_ == 'weight'

    # Both y and weight are required at prediction time, so they are
    # overwritten with fake values. The test does not fail, but the
    # weight is not taken into account.
    X['y'] = -5
    X['weight'] = -5
    prediction = exp.predict(X)
    assert isinstance(prediction, pandas.DataFrame)
    assert list(prediction.columns) == ['Score']
    assert prediction.shape == (5, 1)
def nimbus_pred(model_path, test_set_path):
    """Load a saved pipeline, score a CSV test set and print the result.

    :param model_path: path handed to ``Pipeline.load_model``.
    :param test_set_path: path of the CSV file read with ``pd.read_csv``;
        its ``c`` column is converted to the pandas "category" dtype.
    """
    features = pd.read_csv(test_set_path)
    features['c'] = features['c'].astype("category")

    pipeline = Pipeline()
    pipeline.load_model(model_path)

    print(pipeline.predict(features))
def test_syntax10_weights(self):
    """Pass the weight column through ``fit(..., weight=w)``."""
    frame = pandas.DataFrame(dict(
        education=['A', 'B', 'A', 'B', 'A'],
        workclass=['X', 'X', 'Y', 'Y', 'Y'],
        weight=[1., 1., 1., 2., 1.],
        y=[1.1, 2.2, 1.24, 3.4, 3.4]))
    X = frame.drop(['y', 'weight'], axis=1)
    y = frame['y']
    w = frame['weight']

    exp = Pipeline([
        OneHotVectorizer() << ['workclass', 'education'],
        FastLinearRegressor(),
    ])
    exp.fit(X, y, weight=w, verbose=0)

    learner = exp.nodes[-1]
    assert learner.feature_column == 'Features'
    assert learner.label_column == 'y'
    assert learner.weight_column == 'weight'

    # A fake weight column is supplied at prediction time.
    X['weight'] = -5
    prediction = exp.predict(X)
    assert isinstance(prediction, pandas.DataFrame)
    assert list(prediction.columns) == ['Score']
    assert prediction.shape == (5, 1)

    # Scores must stay within [1., 3.6] and take at least 4 distinct
    # values.
    if prediction['Score'].min() < 1.:
        raise Exception(prediction)
    if prediction['Score'].max() > 3.6:
        raise Exception(prediction)
    if len(set(prediction['Score'])) < 4:
        raise Exception(prediction)
def test_syntax3(self):
    """Mix renamed ('edu1') and in-place vectorizer outputs before a learner."""
    frame = pandas.DataFrame(dict(
        education=['A', 'B', 'A', 'B', 'A'],
        workclass=['X', 'X', 'Y', 'Y', 'Y'],
        y=[1, 0, 1, 0, 0]))
    X = frame.drop('y', axis=1)
    y = frame['y']

    steps = [
        OneHotVectorizer() << {'edu1': 'education'},
        OneHotHashVectorizer() << 'education',
        OneHotVectorizer(max_num_terms=2) << 'workclass',
        # Currently the learner does not use edu1 unless it is specified
        # explicitly, so nimbusml does not do what the syntax implicitly
        # tells. The bridge would need to look into every column
        # available at a given step.
        FastLinearBinaryClassifier(max_iterations=1),
    ]
    exp = Pipeline(steps)
    exp.fit(X, y)

    prediction = exp.predict(X)
    assert isinstance(prediction, pandas.DataFrame)
    expected_columns = ['PredictedLabel', 'Probability', 'Score']
    assert sorted(list(prediction.columns)) == expected_columns
    assert prediction.shape == (5, 3)
def test_syntax11_learner(self):
    """Assign features and label to the learner through a role dictionary."""
    frame = pandas.DataFrame(dict(
        education=['A', 'B', 'A', 'B', 'A'],
        workclass=['X', 'X', 'Y', 'Y', 'Y'],
        y=[1, 0, 1, 0, 0]))
    X = frame.drop('y', axis=1)

    learner_roles = {'Features': ['edu1', 'edu2'], Role.Label: 'y'}
    exp = Pipeline([
        OneHotVectorizer() << {'edu1': 'education'},
        OneHotHashVectorizer() << {'edu2': 'education'},
        FastLinearBinaryClassifier(max_iterations=1) << learner_roles,
    ])
    # The label is part of the frame passed to fit; predict gets X only.
    exp.fit(frame)

    prediction = exp.predict(X)
    assert isinstance(prediction, pandas.DataFrame)
    expected_columns = ['PredictedLabel', 'Probability', 'Score']
    assert sorted(list(prediction.columns)) == expected_columns
    assert prediction.shape == (5, 3)
def test_syntax4_dict(self):
    """Concatenate renamed columns into 'Inputs' and train on that column."""
    frame = pandas.DataFrame(dict(
        education=['A', 'B', 'A', 'B', 'A'],
        workclass=['X', 'X', 'Y', 'Y', 'Y'],
        y=[1, 0, 1, 0, 0]))
    X = frame.drop('y', axis=1)
    y = frame['y']

    steps = [
        OneHotVectorizer() << {'edu1': 'education'},
        OneHotHashVectorizer() << {'edu2': 'education'},
        OneHotVectorizer(max_num_terms=2) << {'wki': 'workclass'},
        Concat() << {'Inputs': ['edu1', 'edu2', 'wki']},
        FastLinearBinaryClassifier(max_iterations=1) << 'Inputs',
    ]
    exp = Pipeline(steps)
    exp.fit(X, y)

    prediction = exp.predict(X)
    assert isinstance(prediction, pandas.DataFrame)
    expected_columns = ['PredictedLabel', 'Probability', 'Score']
    assert sorted(list(prediction.columns)) == expected_columns
    assert prediction.shape == (5, 3)
def test_syntax5_regular_expression(self):
    """Select the Concat inputs with the pattern 'f[0-9]+'."""
    # REVIEW: not implemented yet.
    # The best would be to handle regular expressions inside nimbusml.
    # It could be handled in entrypoint.py just before calling nimbusml,
    # or inside Pipeline if it is aware of the input schema.
    frame = pandas.DataFrame(dict(
        education=['A', 'B', 'A', 'B', 'A'],
        workclass=['X', 'X', 'Y', 'Y', 'Y'],
        y=[1, 0, 1, 0, 0]))
    X = frame.drop('y', axis=1)
    y = frame['y']

    steps = [
        OneHotVectorizer() << {'f1': 'education'},
        OneHotHashVectorizer() << {'f2': 'education'},
        OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
        Concat() << {'Features': 'f[0-9]+'},
        FastLinearBinaryClassifier(max_iterations=1) << 'Features',
    ]
    exp = Pipeline(steps)
    exp.fit(X, y)

    prediction = exp.predict(X)
    assert isinstance(prediction, pandas.DataFrame)
    expected_columns = ['PredictedLabel', 'Probability', 'Score']
    assert sorted(list(prediction.columns)) == expected_columns
    assert prediction.shape == (5, 3)
def test_syntax7_rename(self):
    """Convert a string label through OneHotVectorizer and TypeConverter.

    The prediction is expected to be a single 'Score' column of five rows
    whose values all lie strictly between 0.01 and 0.05.
    """
    # Error messages are usually not informative enough.
    # Missing column --> no indication of other columns.
    # Error is (one transform should handle it)
    # 'The label column 'y' of the training data has a data type
    # not suitable for binary classification: Vec<Key<U4, 0-1>, 2>.
    # Type must be Bool, R4, R8 or Key with two classes.
    df = pandas.DataFrame(dict(
        education=['A', 'B', 'A', 'B', 'A'],
        workclass=['X', 'X', 'Y', 'Y', 'Y'],
        y=['red', 'white', 'red', 'white', 'white']))
    X = df.drop('y', axis=1)
    y = df['y']

    exp = Pipeline([
        OneHotVectorizer() << 'y',
        OneHotVectorizer() << ['workclass', 'education'],
        TypeConverter(result_type='R4') << {'yi': 'y'},
        Drop() << 'y',
        FastLinearBinaryClassifier(max_iterations=1) << 'yi'
    ])
    exp.fit(X, y, verbose=0)

    prediction = exp.predict(X)
    assert isinstance(prediction, pandas.DataFrame)
    assert list(prediction.columns) == ['Score']
    assert prediction.shape == (5, 1)
    # DataFrame.min()/max() return a Series, and asserting on a Series
    # comparison raises "truth value of a Series is ambiguous". Compare
    # the scalar bounds of the single 'Score' column instead.
    assert prediction['Score'].min() > 0.01
    assert prediction['Score'].max() < 0.05
def test_two_pipelines_created_using_idv_binary_data_can_be_combined_in_to_one_model(
        self):
    """
    Verify that two models can be combined even when the transform
    increases the number of columns.
    """
    # Fit a OneHotVectorizer on the training data and transform that
    # data, requesting the output as a binary data stream.
    transformer = Pipeline([OneHotVectorizer() << 'c0'], random_state=seed)
    transformer.fit(train_df)
    train_stream = transformer.transform(train_df,
                                         as_binary_data_stream=True)

    # Fit an OnlineGradientDescentRegressor on the transformed training
    # data.
    predictor = Pipeline(
        [OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1'])],
        random_state=seed)
    predictor.fit(train_stream)

    # Two-step prediction: transform the test data, then score it.
    test_stream = transformer.transform(test_df,
                                        as_binary_data_stream=True)
    two_step = predictor.predict(test_stream)

    # One-step prediction: merge both pipelines and score the raw test
    # data directly.
    combined = Pipeline.combine_models(transformer, predictor)
    one_step = combined.predict(test_df)

    # The first two scores must be identical for both approaches.
    for row in (0, 1):
        self.assertEqual(two_step.loc[row, 'Score'],
                         one_step.loc[row, 'Score'])
def test_trees(self):
    """Train FastTreesBinaryClassifier on in-memory data and check accuracy.

    The predictions are passed to ``check_accuracy`` with 0.65
    (presumably the minimum acceptable accuracy; see that helper).
    """
    train, label = get_X_y(train_file, label_column, sep=',')
    test, label1 = get_X_y(test_file, label_column, sep=',')

    steps = [
        OneHotVectorizer() << categorical_columns,
        FastTreesBinaryClassifier(),
    ]
    pipeline = Pipeline(steps)
    pipeline.fit(train, label)

    out_data = pipeline.predict(test)
    check_accuracy(test_file, label_column, out_data, 0.65)
def test_model_datastream(self):
    """Round-trip a fitted pipeline through pickle and save_model/load_model.

    For each persistence mechanism the reloaded pipeline must give (to two
    decimals) the same summed predictions on the first five test rows and
    the same summed metrics as the original pipeline.
    """
    model_nimbusml = Pipeline(
        steps=[('cat', OneHotVectorizer() << categorical_columns),
               ('linear',
                FastLinearBinaryClassifier(shuffle=False, train_threads=1)
                )])
    model_nimbusml.fit(train, label)

    # Save with pickle; context managers make sure both file handles are
    # closed (and the dump flushed) before the file is read back.
    with open('nimbusml_model.p', 'wb') as f:
        pickle.dump(model_nimbusml, f)
    with open("nimbusml_model.p", "rb") as f:
        model_nimbusml_pickle = pickle.load(f)

    score1 = model_nimbusml.predict(test).head(5)
    score2 = model_nimbusml_pickle.predict(test).head(5)

    metrics, _ = model_nimbusml.test(test, test_label, output_scores=True)
    metrics_pickle, _ = model_nimbusml_pickle.test(
        test, test_label, output_scores=True)

    assert_almost_equal(score1.sum().sum(), score2.sum().sum(), decimal=2)
    assert_almost_equal(metrics.sum().sum(), metrics_pickle.sum().sum(),
                        decimal=2)

    # Save load with pipeline methods
    model_nimbusml.save_model('model.nimbusml.m')
    model_nimbusml_load = Pipeline()
    model_nimbusml_load.load_model('model.nimbusml.m')

    score1 = model_nimbusml.predict(test).head(5)
    score2 = model_nimbusml_load.predict(test).head(5)

    metrics, _ = model_nimbusml.test(test, test_label, output_scores=True)
    # Keep the metrics in their own name instead of rebinding the loaded
    # pipeline variable.
    metrics_load, _ = model_nimbusml_load.test(
        test, test_label, evaltype='binary', output_scores=True)

    assert_almost_equal(score1.sum().sum(), score2.sum().sum(), decimal=2)
    assert_almost_equal(metrics.sum().sum(), metrics_load.sum().sum(),
                        decimal=2)
def test_trees_file(self):
    """Train FastTreesBinaryClassifier from file streams and check accuracy.

    The label role is set on the learner and ``label_column`` is also
    passed to ``fit``.
    """
    steps = [
        OneHotVectorizer() << categorical_columns,
        FastTreesBinaryClassifier() << {'Label': label_column},
    ]
    pipeline = Pipeline(steps)

    training_stream = FileDataStream(train_file, schema=file_schema)
    pipeline.fit(training_stream, label_column)

    scoring_stream = FileDataStream(test_file, schema=file_schema)
    out_data = pipeline.predict(scoring_stream)
    check_accuracy(test_file, label_column, out_data, 0.65)
def test_linear_with_train_test_schema(self):
    """Train FastLinearBinaryClassifier on in-memory data and check accuracy."""
    train, label = get_X_y(train_file, label_column, sep=',')
    test, label1 = get_X_y(test_file, label_column, sep=',')

    steps = [
        OneHotVectorizer() << categorical_columns,
        FastLinearBinaryClassifier(train_threads=1, shuffle=False),
    ]
    pipeline = Pipeline(steps)
    pipeline.fit(train, label)

    out_data = pipeline.predict(test)
    check_accuracy(test_file, label_column, out_data, 0.65)
def test_linear_file_role(self):
    """Train from a file stream whose label role is set on the stream itself."""
    steps = [
        OneHotVectorizer() << categorical_columns,
        FastLinearBinaryClassifier(train_threads=1, shuffle=False),
    ]
    pipeline = Pipeline(steps)

    # The label is declared through the stream's role, so fit() gets no
    # separate label argument.
    training_stream = FileDataStream(train_file, schema=file_schema)
    training_stream._set_role('Label', label_column)
    pipeline.fit(training_stream)

    scoring_stream = FileDataStream(test_file, schema=file_schema)
    out_data = pipeline.predict(scoring_stream)
    check_accuracy(test_file, label_column, out_data, 0.65)
def test_linear_file(self):
    """Train from a file stream, passing the label column name to fit()."""
    steps = [
        OneHotVectorizer() << categorical_columns,
        FastLinearBinaryClassifier(train_threads=1, shuffle=False),
    ]
    pipeline = Pipeline(steps)

    training_stream = FileDataStream(train_file, schema=file_schema)
    # The schema must expose both the separator and the header options.
    schema_options = training_stream.schema.options
    assert 'sep' in schema_options
    assert 'header' in schema_options
    pipeline.fit(training_stream, label_column)

    scoring_stream = FileDataStream(test_file, schema=file_schema)
    out_data = pipeline.predict(scoring_stream)
    check_accuracy(test_file, label_column, out_data, 0.65)
def test_syntax11_append_insert(self):
    """Build a pipeline with append/insert/del, then check it after fit."""
    frame = pandas.DataFrame(dict(
        education=['A', 'B', 'A', 'B', 'A'],
        workclass=['X', 'X', 'Y', 'Y', 'Y'],
        y=[1, 0, 1, 0, 0]))
    X = frame.drop('y', axis=1)

    # Assemble the pipeline step by step.
    exp = Pipeline()
    named_hasher = ("OneHotHashVectorizer",
                    OneHotHashVectorizer() << {'edu2': 'education'})
    exp.append(named_hasher)
    exp.insert(0, OneHotVectorizer() << {'edu1': 'education'})
    learner = FastLinearBinaryClassifier(
        maximum_number_of_iterations=1) << {
            'Features': ['edu1', 'edu2'],
            Role.Label: 'y'}
    exp.append(learner)
    # Append one step too many and remove it again.
    exp.append(OneHotHashVectorizer() << {'edu2': 'education'})
    del exp[-1]
    assert len(exp) == 3

    exp.fit(frame, verbose=0)
    prediction = exp.predict(X)
    assert isinstance(prediction, pandas.DataFrame)
    expected_columns = ['PredictedLabel', 'Probability', 'Score']
    assert sorted(list(prediction.columns)) == expected_columns
    assert prediction.shape == (5, 3)

    # Modifying the fitted pipeline: when a RuntimeError is raised, its
    # message is checked.
    # NOTE(review): if no RuntimeError is raised these checks pass
    # silently -- confirm whether that is intended.
    fitted_message = "Model is fitted and cannot be modified"
    try:
        exp.append(OneHotHashVectorizer() << {'edu2': 'education'})
    except RuntimeError as e:
        assert fitted_message in str(e)
    try:
        exp.insert(0, OneHotHashVectorizer() << {'edu2': 'education'})
    except RuntimeError as e:
        assert fitted_message in str(e)
    try:
        del exp[0]
    except RuntimeError as e:
        assert fitted_message in str(e)

    # Positional lookup, performed twice as in the original test.
    step = exp[1][1]
    assert step.__class__.__name__ == "OneHotHashVectorizer"
    step = exp[1][1]
    assert step.__class__.__name__ == "OneHotHashVectorizer"

    # Lookup by step name.
    matches = exp['OneHotHashVectorizer']
    assert len(matches) == 1

    graph = exp.graph_
    assert len(graph.nodes) >= len(exp)
def test_ensemble_supports_output_predictor_model(self):
    """An ensemble pipeline can emit a predictor-only model.

    Three pipelines score the same four-row test frame: a ground-truth
    pipeline, a duplicate fitted with ``output_predictor_model=True``, and
    a pipeline loaded from the predictor model alone. The first two rows
    must score identically. The row counts asserted are 2, 2 and 4: the
    pipelines containing the RangeFilter return two rows, the
    predictor-only one returns all four.
    """
    extra_rows = pd.DataFrame({'c1': [9, 11], 'c2': [1, 1]})
    # DataFrame.append was removed in pandas 2.0; pd.concat with
    # ignore_index=True is the equivalent call.
    test2_df = pd.concat([test_df.copy(deep=True), extra_rows],
                         ignore_index=True)
    test2_df = test2_df.astype({'c1': np.float32, 'c2': np.float32})

    # Create a ground truth pipeline
    r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
    r2 = OnlineGradientDescentRegressor(**ogdArgs)
    combined_pipeline = Pipeline([
        RangeFilter(min=0.0, max=4.5) << 'c1',
        VotingRegressor(estimators=[r1, r2], combiner='Average')
    ])
    combined_pipeline.fit(train_df)
    result_1 = combined_pipeline.predict(test2_df)

    # Create a duplicate pipeline but also request a predictor model
    r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
    r2 = OnlineGradientDescentRegressor(**ogdArgs)
    combined_pipeline = Pipeline([
        RangeFilter(min=0.0, max=4.5) << 'c1',
        VotingRegressor(estimators=[r1, r2], combiner='Average')
    ])
    combined_pipeline.fit(train_df, output_predictor_model=True)
    result_2 = combined_pipeline.predict(test2_df)

    # Create a predictor model only pipeline
    predictor_pipeline = Pipeline()
    predictor_pipeline.load_model(combined_pipeline.predictor_model)
    result_3 = predictor_pipeline.predict(test2_df)

    # Verify the first rows are equal
    self.assertEqual(result_1.loc[0, 'Score'], result_2.loc[0, 'Score'])
    self.assertEqual(result_2.loc[0, 'Score'], result_3.loc[0, 'Score'])

    # Verify the second rows are equal
    self.assertEqual(result_1.loc[1, 'Score'], result_2.loc[1, 'Score'])
    self.assertEqual(result_2.loc[1, 'Score'], result_3.loc[1, 'Score'])

    # Verify the number of rows
    self.assertEqual(len(result_1), 2)
    self.assertEqual(len(result_2), 2)
    self.assertEqual(len(result_3), 4)
def test_ensemble_with_average_and_median_combiner(self):
    """Check VotingRegressor against hand-computed average and median.

    Three regressors are trained and scored individually; the ensemble's
    first two scores must match their mean ('Average' combiner, five
    decimal places) and their median ('Median' combiner, exact equality).
    """
    # Individual predictions used as the ground truth.
    ols = OrdinaryLeastSquaresRegressor(**olsrArgs)
    ols.fit(train_df)
    result1 = ols.predict(test_df)

    ogd = OnlineGradientDescentRegressor(**ogdArgs)
    ogd.fit(train_df)
    result2 = ogd.predict(test_df)

    lgbm = LightGbmRegressor(**lgbmArgs)
    lgbm.fit(train_df)
    result3 = lgbm.predict(test_df)

    # 'Average' combiner, with fresh unfitted estimators.
    estimators = [
        OrdinaryLeastSquaresRegressor(**olsrArgs),
        OnlineGradientDescentRegressor(**ogdArgs),
        LightGbmRegressor(**lgbmArgs),
    ]
    pipeline = Pipeline([VotingRegressor(estimators=estimators,
                                         combiner='Average')])
    pipeline.fit(train_df)
    result4 = pipeline.predict(test_df)

    averages = [
        (result1[row] + result2[row] + result3[row]) / 3
        for row in (0, 1)
    ]
    for row, average in enumerate(averages):
        self.assertAlmostEqual(average, result4.loc[row, 'Score'], places=5)

    # 'Median' combiner, again with fresh unfitted estimators.
    estimators = [
        OrdinaryLeastSquaresRegressor(**olsrArgs),
        OnlineGradientDescentRegressor(**ogdArgs),
        LightGbmRegressor(**lgbmArgs),
    ]
    pipeline = Pipeline([VotingRegressor(estimators=estimators,
                                         combiner='Median')])
    pipeline.fit(train_df)
    result4 = pipeline.predict(test_df)

    # The median of three values is the middle element once sorted.
    medians = [
        sorted([result1.loc[row], result2.loc[row], result3.loc[row]])[1]
        for row in (0, 1)
    ]
    for row, median in enumerate(medians):
        self.assertEqual(median, result4.loc[row, 'Score'])