def test_pickle_pipeline(self):
    """Round-trip a fitted sklearn Pipeline of nimbusml steps through
    pickle; the restored pipeline must score with identical accuracy even
    after the on-disk model files are deleted."""
    np.random.seed(0)
    X_train, y_train = get_X_y(
        train_file, label_column, sep=',', features=selected_features)
    X_test, y_test = get_X_y(
        test_file, label_column, sep=',', features=selected_features)
    if 'F1' in X_train.columns:
        raise Exception("F1 is in the dataset")

    encoder = OneHotVectorizer() << 'age'
    learner = FastTreesBinaryClassifier()
    model = Pipeline(steps=[("cat", encoder), ("ftree", learner)])
    model.fit(X_train, y_train)
    truth = y_test.values.ravel()
    accu1 = np.mean(truth == model.predict(X_test).values)

    # Serialize first, then remove the backing model files so the restored
    # pipeline can only come from the pickle payload itself.
    payload = pickle.dumps(model)
    os.remove(encoder.model_)
    os.remove(learner.model_)
    restored = pickle.loads(payload)
    accu2 = np.mean(truth == restored.predict(X_test).values)
    assert_equal(
        accu1, accu2, "accuracy mismatch after unpickling pipeline")
def test_pipeline_clone(self):
    """sklearn.clone of a Pipeline wrapping a nimbusml pipeline must
    predict identically, and the original must survive a refit."""
    X_train, y_train = get_X_y(
        train_file, label_column, sep=',', features=selected_features)
    X_test, y_test = get_X_y(
        test_file, label_column, sep=',', features=selected_features)
    if 'F1' in X_train.columns:
        raise Exception("F1 is in the dataset")

    inner = nimbusmlPipeline(
        [OneHotVectorizer() << 'age', FastTreesBinaryClassifier()])
    outer = Pipeline(steps=[('nimbusml', inner)])
    outer.fit(X_train, y_train)
    expected = outer.predict(X_test)

    duplicate = clone(outer)
    assert_frame_equal(expected, duplicate.predict(X_test))

    # Cloning must not break the original: refit and score again.
    outer.fit(X_train, y_train)
    assert_frame_equal(expected, outer.predict(X_test))
def test_pickle_pipeline_and_nimbusml_pipeline(self):
    """A sklearn Pipeline wrapping a nimbusml Pipeline must survive
    pickling with identical accuracy and identical score frames."""
    X_train, y_train = get_X_y(
        train_file, label_column, sep=',', features=selected_features)
    X_test, y_test = get_X_y(
        test_file, label_column, sep=',', features=selected_features)
    if 'F1' in X_train.columns:
        raise Exception("F1 is in the dataset")

    inner = nimbusmlPipeline(
        [OneHotVectorizer() << 'age', FastTreesBinaryClassifier()])
    outer = Pipeline(steps=[('nimbusml', inner)])
    outer.fit(X_train, y_train)
    scores = outer.predict(X_test)
    truth = y_test.values.ravel()
    accu1 = np.mean(truth == scores["PredictedLabel"].values)

    # Round-trip through pickle and re-score with the restored pipeline.
    restored = pickle.loads(pickle.dumps(outer))
    scores2 = restored.predict(X_test)
    accu2 = np.mean(truth == scores2["PredictedLabel"].values)
    assert_equal(
        accu1, accu2, "accuracy mismatch after unpickling pipeline")
    assert_frame_equal(scores, scores2)
def test_experiment_loadsavemodel(self):
    """save_model/load_model round trip must reproduce identical binary
    test metrics."""
    train, label = get_X_y(train_file, label_column, sep=',')
    test, label1 = get_X_y(test_file, label_column, sep=',')

    pipeline = Pipeline([
        OneHotVectorizer() << categorical_columns,
        FastTreesBinaryClassifier(),
    ])
    pipeline.fit(train, label)
    metrics1, scores1 = pipeline.test(
        test, label1, 'binary', output_scores=True)
    sum1 = metrics1.sum().sum()

    # mkstemp hands back an open fd; close it before nimbusml writes to
    # the same path.
    fd, modelfilename = tempfile.mkstemp(suffix='.model.bin')
    fl = os.fdopen(fd, 'w')
    fl.close()
    pipeline.save_model(modelfilename)

    reloaded = Pipeline()
    reloaded.load_model(modelfilename)
    metrics2, scores2 = reloaded.test(
        test, label1, 'binary', output_scores=True)
    assert_equal(sum1, metrics2.sum().sum(),
                 "model metrics don't match after loading model")
def test_pipeline_grid_search(self):
    """GridSearchCV over mixed sklearn (PCA) and nimbusml steps must pick
    the searched hyper-parameters and apply them to the best estimator."""
    X_train, y_train = get_X_y(
        train_file, label_column, sep=',', features=selected_features)
    X_test, y_test = get_X_y(
        test_file, label_column, sep=',', features=selected_features)
    if 'F1' in X_train.columns:
        raise Exception("F1 is in the dataset")

    pipe = Pipeline(steps=[
        ("cat", OneHotVectorizer() << 'age'),
        ('pca', PCA(5)),
        ("ftree", FastTreesBinaryClassifier(number_of_trees=5)),
    ])
    search = GridSearchCV(
        pipe, dict(pca__n_components=[2], ftree__number_of_trees=[11]))
    search.fit(X_train, y_train)
    assert search.best_params_ == {
        'ftree__number_of_trees': 11, 'pca__n_components': 2}

    # The winning estimator must actually carry the searched value.
    final_step = search.best_estimator_.steps[-1][1]
    assert final_step.number_of_trees == 11
def test_clone_sweep(self):
    """A cloned pipeline swept over the same grid must find the same
    optimal number_of_trees as the original."""
    np.random.seed(0)
    X_train, y_train = get_X_y(
        train_file, label_column, sep=',', encoding='utf-8')
    X_test, y_test = get_X_y(
        test_file, label_column, sep=',', encoding='utf-8')

    pipe = Pipeline(steps=[
        ('cat', OneHotHashVectorizer() << categorical_columns),
        ('learner', FastTreesBinaryClassifier(
            number_of_trees=100, number_of_leaves=5)),
    ])
    param_grid = dict(learner__number_of_trees=[1, 5, 10])

    grid = GridSearchCV(pipe, param_grid)
    grid.fit(X_train, y_train)

    # Sweep the clone with an identical grid; results must agree.
    grid1 = GridSearchCV(pipe.clone(), param_grid)
    grid1.fit(X_train, y_train)

    assert (grid.best_params_['learner__number_of_trees'] ==
            grid1.best_params_['learner__number_of_trees'])
def test_trees(self):
    """FastTrees on one-hot-encoded categoricals must beat 0.65 accuracy."""
    train, label = get_X_y(train_file, label_column, sep=',')
    test, label1 = get_X_y(test_file, label_column, sep=',')
    model = Pipeline([
        OneHotVectorizer() << categorical_columns,
        FastTreesBinaryClassifier(),
    ])
    model.fit(train, label)
    predictions = model.predict(test)
    check_accuracy(test_file, label_column, predictions, 0.65)
def test_linear_with_train_test_schema(self):
    """FastLinear (single-threaded, no shuffle) on one-hot-encoded
    categoricals must beat 0.65 accuracy."""
    train, label = get_X_y(train_file, label_column, sep=',')
    test, label1 = get_X_y(test_file, label_column, sep=',')
    model = Pipeline([
        OneHotVectorizer() << categorical_columns,
        FastLinearBinaryClassifier(train_threads=1, shuffle=False),
    ])
    model.fit(train, label)
    predictions = model.predict(test)
    check_accuracy(test_file, label_column, predictions, 0.65)
def test_trees(self):
    """Named-step FastTrees pipeline must beat 0.77 accuracy
    (scikit-style check)."""
    np.random.seed(0)
    train, label = get_X_y(train_file, label_column, sep=',')
    test, label1 = get_X_y(test_file, label_column, sep=',')
    model = Pipeline(steps=[
        ('cat', OneHotVectorizer() << categorical_columns),
        ('linear', FastTreesBinaryClassifier()),
    ])
    model.fit(train, label)
    predictions = model.predict(test)
    check_accuracy_scikit(test_file, label_column, predictions, 0.77)
def test_linear(self):
    """Named-step FastLinear pipeline (deterministic settings) must beat
    0.779 accuracy (scikit-style check)."""
    np.random.seed(0)
    train, label = get_X_y(train_file, label_column, sep=',')
    test, label1 = get_X_y(test_file, label_column, sep=',')
    model = Pipeline(steps=[
        ('cat', OneHotVectorizer() << categorical_columns),
        ('linear', FastLinearBinaryClassifier(
            shuffle=False, number_of_threads=1)),
    ])
    model.fit(train, label)
    predictions = model.predict(test)
    check_accuracy_scikit(test_file, label_column, predictions, 0.779)
def check_accuracy(test_file, label_column, predictions, threshold, sep=','):
    """Assert that `predictions` beat `threshold` accuracy on the labels
    read from `test_file`.

    Parameters:
        test_file: path to the labelled test csv.
        label_column: name of the label column in the file.
        predictions: DataFrame with a 'PredictedLabel' column.
        threshold: minimum acceptable accuracy (exclusive).
        sep: field separator of the test file.
    """
    (test, label) = get_X_y(test_file, label_column, sep=sep)
    # Fixed: `.ix` was deprecated in pandas 0.20 and removed in pandas 1.0;
    # `.loc` is the label-based equivalent for selecting the column.
    accuracy = np.mean(
        label[label_column].values ==
        predictions.loc[:, 'PredictedLabel'].values)
    assert_greater(accuracy, threshold,
                   "accuracy should be greater than %s" % threshold)
def test_error_conditions(self):
    """Sweeping a parameter the learner does not expose must make
    GridSearchCV.fit raise ValueError."""
    np.random.seed(0)
    (X_train, y_train) = get_X_y(train_file, label_column, sep=',',
                                 encoding='utf-8')
    (X_test, y_test) = get_X_y(test_file, label_column, sep=',',
                               encoding='utf-8')
    cat = OneHotHashVectorizer() << categorical_columns
    # Fixed for consistency with the sibling tests: use the current
    # number_of_trees/number_of_leaves parameter names instead of the
    # stale num_trees/num_leaves spellings.
    learner = FastTreesBinaryClassifier(number_of_trees=100,
                                        number_of_leaves=5)
    pipe = Pipeline(steps=[('cat', cat), ('learner', learner)])
    # 'wrong_arg' is not a parameter of the learner, so the sweep must fail.
    param_grid = dict(learner__wrong_arg=[1, 5, 10])
    grid = GridSearchCV(pipe, param_grid)
    assert_raises(ValueError, grid.fit, X_train, y_train)
def test_lightgbmclassifier(self):
    """LightGBM on capped n-gram features of detox text must beat 0.58
    accuracy."""
    np.random.seed(0)
    train_file = get_dataset('wiki_detox_train').as_filepath()
    train, label = get_X_y(train_file, label_column='Sentiment',
                           sep='\t', encoding="utf-8")
    X_train, X_test, y_train, y_test = train_test_split(
        train['SentimentText'], label)

    # Map raw text reviews into a (slot-capped) n-gram vector space.
    featurizer = NGramFeaturizer(
        word_feature_extractor=n_gram(),
        vector_normalizer='None') << 'SentimentText'
    X_train = featurizer.fit_transform(X_train, max_slots=5000)
    X_test = featurizer.transform(X_test, max_slots=5000)

    model = LightGbmClassifier().fit(X_train, y_train, verbose=0)
    predicted = model.predict(X_test)
    accuracy = np.mean(y_test.values.ravel() == predicted.values)
    assert_greater(
        accuracy, 0.58, "accuracy should be greater than %s" % 0.58)
def test_naivebayesclassifier(self):
    """Naive Bayes on n-gram features of detox text must beat 0.5
    accuracy."""
    np.random.seed(0)
    train_file = get_dataset("wiki_detox_train").as_filepath()
    train, label = get_X_y(train_file, label_column='Sentiment', sep='\t')
    X_train, X_test, y_train, y_test = train_test_split(
        train['SentimentText'], label)

    # Map raw text reviews into an n-gram vector space.
    featurizer = NGramFeaturizer(
        word_feature_extractor=n_gram(),
        vector_normalizer='None') << 'SentimentText'
    X_train = featurizer.fit_transform(X_train)
    X_test = featurizer.transform(X_test)

    model = NaiveBayesClassifier()
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    accuracy = np.mean(y_test == list(predicted))[0]
    assert_greater(
        accuracy, 0.5, "accuracy should be greater than %s" % 0.5)
def test_feature_union(self):
    """A sklearn FeatureUnion mixing a sklearn encoder with a nimbusml
    one must feed FastLinear and beat 0.709 accuracy."""
    np.random.seed(0)
    train, label = get_X_y(train_file, label_column, sep=',',
                           features=selected_features)
    test, label1 = get_X_y(test_file, label_column, sep=',',
                           features=selected_features)

    union = FeatureUnion(transformer_list=[
        ('onehot', OneHotEncoder()),
        ('cat', OneHotVectorizer()),
    ])
    model = Pipeline(steps=[
        ('fu', union),
        ('linear', FastLinearBinaryClassifier(
            shuffle=False, number_of_threads=1)),
    ])
    model.fit(train, label)
    predictions = model.predict(test)
    check_accuracy_scikit(test_file, label_column, predictions, 0.709)
def test_parallel(self):
    """Fitting with parallel=8 must yield the same result as parallel=1."""
    (train, label) = get_X_y(train_file, label_column, sep=',')
    cat = OneHotVectorizer() << categorical_columns
    ftree = FastTreesBinaryClassifier()
    pipeline = Pipeline([cat, ftree])
    # NOTE(review): if Pipeline.fit follows the sklearn convention of
    # returning the pipeline itself, both names alias the same object and
    # the assertion below is vacuously true — confirm what fit returns and
    # whether this comparison actually checks the fitted state.
    result = pipeline.fit(train, label, parallel=8)
    result2 = pipeline.fit(train, label, parallel=1)
    assert_true(result == result2)
def test_uciadult_sweep(self):
    """Grid-search number_of_trees, then confirm by full retraining that
    the sweep winner (10 trees) dominates the alternatives on AUC."""
    np.random.seed(0)
    X_train, y_train = get_X_y(
        train_file, label_column, sep=',', encoding='utf-8')
    X_test, y_test = get_X_y(
        test_file, label_column, sep=',', encoding='utf-8')

    # number_of_trees=100 is never actually run: the grid below only
    # sweeps 1, 5 and 10.
    pipe = Pipeline(steps=[
        ('cat', OneHotHashVectorizer() << categorical_columns),
        ('learner', FastTreesBinaryClassifier(
            number_of_trees=100, number_of_leaves=5)),
    ])
    param_grid = dict(learner__number_of_trees=[1, 5, 10])
    grid = GridSearchCV(pipe, param_grid)
    grid.fit(X_train, y_train)
    assert grid.best_params_['learner__number_of_trees'] == 10

    # Retrain at each candidate in sweep order and collect train-set AUC.
    auc = {}
    for trees in (1, 5, 10):
        pipe.set_params(learner__number_of_trees=trees)
        pipe.fit(X_train, y_train)
        metrics, _ = pipe.test(X_train, y_train)
        auc[trees] = metrics['AUC'][0]

    assert auc[10] > auc[5]
    assert auc[10] > auc[1]
    assert auc[10] > 0.59
def test_pipeline_get_params(self):
    """get_params(deep=True) must expose both sklearn's 'steps' entry and
    the nested nimbusml pipeline's own parameters."""
    X_train, y_train = get_X_y(
        train_file, label_column, sep=',', features=selected_features)
    X_test, y_test = get_X_y(
        test_file, label_column, sep=',', features=selected_features)
    if 'F1' in X_train.columns:
        raise Exception("F1 is in the dataset")

    inner = nimbusmlPipeline(
        [OneHotVectorizer() << 'age', FastTreesBinaryClassifier()])
    outer = Pipeline(steps=[('nimbusml', inner)])
    outer.fit(X_train, y_train)

    params = outer.get_params(deep=True)
    assert 'steps' in params
    first_step = params['steps'][0]
    assert len(first_step) == 2
    # Nested nimbusml parameters surface under the 'nimbusml__' prefix.
    assert 'nimbusml' in params
    assert 'nimbusml__random_state' in params
    assert 'nimbusml__steps' in params
def test_pickle_predictor(self):
    """A bare fitted predictor must survive pickling with identical
    accuracy."""
    np.random.seed(0)
    X_train, y_train = get_X_y(
        train_file, label_column, sep=',', features=selected_features)
    X_test, y_test = get_X_y(
        test_file, label_column, sep=',', features=selected_features)

    model = FastTreesBinaryClassifier().fit(X_train, y_train)
    truth = y_test.values.ravel()
    accu1 = np.mean(truth == model.predict(X_test).values)

    # Round-trip through pickle and re-score with the restored predictor.
    restored = pickle.loads(pickle.dumps(model))
    accu2 = np.mean(truth == restored.predict(X_test).values)
    assert_equal(accu1, accu2,
                 "accuracy mismatch after unpickling predictor")
def test_ngramfeaturizer(self):
    """NGramFeaturizer on the first 100 detox rows must produce a feature
    matrix with a known grand total (30513)."""
    np.random.seed(0)
    train_file = get_dataset('wiki_detox_train').as_filepath()
    (train, label) = get_X_y(train_file, label_column='Sentiment',
                             sep='\t', encoding="utf-8")
    X_train, X_test, y_train, y_test = train_test_split(
        train['SentimentText'], label)

    # Map raw text reviews into an n-gram vector space.
    texttransform = NGramFeaturizer(
        word_feature_extractor=n_gram(),
        vector_normalizer='None') << 'SentimentText'
    features = texttransform.fit_transform(X_train[:100])
    # Fixed: the original shadowed the builtin `sum` and took a no-op
    # `.iloc[:]` copy before reducing; sum directly over the frame.
    total = features.sum().sum()
    assert_equal(total, 30513, "sum of all features is incorrect!")
def test_pickle_transform(self):
    """A fitted transform must survive pickling and produce identical
    output on the same input."""
    np.random.seed(0)
    X_train, y_train = get_X_y(
        train_file, label_column, sep=',', features=selected_features)

    encoder = (OneHotVectorizer() << ['age']).fit(X_train, verbose=0)
    out1 = encoder.transform(X_train)

    # Round-trip through pickle and re-run the transform.
    restored = pickle.loads(pickle.dumps(encoder))
    out2 = restored.transform(X_train)
    assert_equal(out1.sum().sum(), out2.sum().sum(),
                 "data mismatch after unpickling transform")
def test_performance_syntax(self):
    """Train and score the same model with the two nimbusml column
    syntaxes -- explicit columns=/feature= keywords vs the << operator --
    and check the resulting accuracy and AUC agree within 0.02."""
    train_file = get_dataset('uciadult_train').as_filepath()
    test_file = get_dataset('uciadult_test').as_filepath()
    # Explicit IDV schema describing the raw UCI Adult csv layout.
    file_schema = 'sep=, col=label:R4:0 col=Features:R4:9-14 ' \
                  'col=workclass:TX:1 col=education:TX:2 ' \
                  'col=marital-status:TX:3 col=occupation:TX:4 ' \
                  'col=relationship:TX:5 col=ethnicity:TX:6 ' \
                  'col=sex:TX:7 col=native-country-region:TX:8 header+'
    categorical_columns = [
        'workclass', 'education', 'marital-status', 'occupation',
        'relationship', 'ethnicity', 'sex', 'native-country-region'
    ]
    label_column = 'label'
    na_columns = ['Features']
    feature_columns_idv = na_columns + categorical_columns

    # Variant 1: explicit keyword syntax (columns=/feature=/label=).
    exp = Pipeline([
        OneHotHashVectorizer(columns=categorical_columns),
        Handler(columns=na_columns),
        FastLinearBinaryClassifier(feature=feature_columns_idv,
                                   label=label_column)
    ])
    train_data = FileDataStream(train_file, schema=file_schema)
    exp.fit(train_data, label_column, verbose=0)
    print("train time %s" % exp._run_time)
    test_data = FileDataStream(test_file, schema=file_schema)
    out_data = exp.predict(test_data)
    print("predict time %s" % exp._run_time)
    (test, label_test) = get_X_y(test_file, label_column, sep=',')
    (acc1, auc1) = evaluate_binary_classifier(
        label_test.iloc[:, 0].values,
        out_data.loc[:, 'PredictedLabel'].values,
        out_data.loc[:, 'Probability'].values)
    print('ACC %s, AUC %s' % (acc1, auc1))

    # Variant 2: the same model expressed with the << operator syntax.
    exp = Pipeline([
        OneHotHashVectorizer() << categorical_columns,
        Handler() << na_columns,
        FastLinearBinaryClassifier() << feature_columns_idv
    ])
    train_data = FileDataStream(train_file, schema=file_schema)
    exp.fit(train_data, label_column, verbose=0)
    print("train time %s" % exp._run_time)
    test_data = FileDataStream(test_file, schema=file_schema)
    out_data = exp.predict(test_data)
    print("predict time %s" % exp._run_time)
    (test, label_test) = get_X_y(test_file, label_column, sep=',')
    (acc2, auc2) = evaluate_binary_classifier(
        label_test.iloc[:, 0].values,
        out_data.loc[:, 'PredictedLabel'].values,
        out_data.loc[:, 'Probability'].values)
    print('ACC %s, AUC %s' % (acc2, auc2))
    # Both syntaxes should train essentially the same model.
    assert abs(acc1 - acc2) < 0.02
    assert abs(auc1 - auc2) < 0.02
# Shared fixtures for the load/save tests: UCI Adult income csv files,
# split into feature frames (train/test) and the binary 'label' column.
from nimbusml import Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import FastLinearBinaryClassifier
from nimbusml.utils import get_X_y
from numpy.testing import assert_almost_equal

train_file = get_dataset('uciadult_train').as_filepath()
test_file = get_dataset('uciadult_test').as_filepath()
categorical_columns = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'ethnicity', 'sex', 'native-country-region'
]
label_column = 'label'
(train, label) = get_X_y(train_file, label_column, sep=',')
(test, test_label) = get_X_y(test_file, label_column, sep=',')


class TestLoadSave(unittest.TestCase):

    def test_model_dataframe(self):
        # Fit a one-hot + linear pipeline on the dataframe inputs with
        # deterministic settings (no shuffle, single thread).
        model_nimbusml = Pipeline(
            steps=[('cat', OneHotVectorizer() << categorical_columns),
                   ('linear', FastLinearBinaryClassifier(shuffle=False,
                                                         train_threads=1)
                    )])
        model_nimbusml.fit(train, label)
        # Save with pickle
        # NOTE(review): this method looks truncated in the visible chunk --
        # presumably the pickled model is reloaded and verified further on;
        # confirm against the full file.
        pickle.dump(model_nimbusml, open('nimbusml_model.p', 'wb'))
# End-to-end sentiment example: featurize wiki-detox text with word
# n-grams and train a Naive Bayes classifier on the result.
import numpy as np
from nimbusml import Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.text import NGramFeaturizer
from nimbusml.feature_extraction.text.extractor import Ngram
from nimbusml.naive_bayes import NaiveBayesClassifier
from nimbusml.utils import get_X_y
from sklearn.model_selection import train_test_split

# use 'wiki_detox_train' data set to create test and train data
# Sentiment SentimentText
# 1 ==RUDE== Dude, you are rude upload that carl picture back, or else.
# 1 == OK! == IM GOING TO VANDALIZE WILD ONES WIKI THEN!!!
np.random.seed(0)
train_file = get_dataset("wiki_detox_train").as_filepath()
(train, label) = get_X_y(train_file, label_column='Sentiment', sep='\t')
X_train, X_test, y_train, y_test = train_test_split(train, label)

# map text reviews to vector space
texttransform = NGramFeaturizer(word_feature_extractor=Ngram(),
                                vector_normalizer='None') << 'SentimentText'
# The classifier consumes the featurized 'SentimentText' column.
nb = NaiveBayesClassifier(feature=['SentimentText'])
ppl = Pipeline([texttransform, nb])
ppl.fit(X_train, y_train)
scores = ppl.predict(X_test)['PredictedLabel']

# evaluate the model