def test_pickle_pipeline_and_nimbusml_pipeline(self):
    """A sklearn Pipeline wrapping a nimbusml Pipeline must survive a
    pickle round-trip: the restored model scores exactly like the original."""
    X_train, y_train = get_X_y(
        train_file, label_column, sep=',', features=selected_features)
    X_test, y_test = get_X_y(
        test_file, label_column, sep=',', features=selected_features)
    # Sanity guard: the selected feature set must not include column F1.
    if 'F1' in X_train.columns:
        raise Exception("F1 is in the dataset")

    encoder = OneHotVectorizer() << 'age'
    learner = FastTreesBinaryClassifier()
    inner_pipe = nimbusmlPipeline([encoder, learner])
    outer_pipe = Pipeline(steps=[('nimbusml', inner_pipe)])
    outer_pipe.fit(X_train, y_train)

    predictions = outer_pipe.predict(X_test)
    acc_before = np.mean(
        y_test.values.ravel() == predictions["PredictedLabel"].values)

    # Unpickle model and score. We should get the exact same accuracy as
    # above.
    payload = pickle.dumps(outer_pipe)
    restored_pipe = pickle.loads(payload)
    predictions_after = restored_pipe.predict(X_test)
    acc_after = np.mean(
        y_test.values.ravel() == predictions_after["PredictedLabel"].values)

    assert_equal(
        acc_before,
        acc_after,
        "accuracy mismatch after unpickling pipeline")
    assert_frame_equal(predictions, predictions_after)
def test_pipeline_clone(self):
    """sklearn.clone of a fitted wrapper pipeline must predict identically,
    and the original pipeline must remain refittable afterwards."""
    X_train, y_train = get_X_y(
        train_file, label_column, sep=',', features=selected_features)
    X_test, y_test = get_X_y(
        test_file, label_column, sep=',', features=selected_features)
    # Sanity guard: the selected feature set must not include column F1.
    if 'F1' in X_train.columns:
        raise Exception("F1 is in the dataset")

    encoder = OneHotVectorizer() << 'age'
    learner = FastTreesBinaryClassifier()
    inner_pipe = nimbusmlPipeline([encoder, learner])
    outer_pipe = Pipeline(steps=[('nimbusml', inner_pipe)])
    outer_pipe.fit(X_train, y_train)
    baseline = outer_pipe.predict(X_test)

    # A clone must reproduce the same predictions.
    cloned = clone(outer_pipe)
    cloned_preds = cloned.predict(X_test)
    assert_frame_equal(baseline, cloned_preds)

    # checks we can fit again
    outer_pipe.fit(X_train, y_train)
    refit_preds = outer_pipe.predict(X_test)
    assert_frame_equal(baseline, refit_preds)
def test_lr_named_steps_iris(self):
    """Named steps in a nimbusml Pipeline fit and predict on iris data."""
    iris = load_iris()
    # we only take the first two features.
    features = iris.data[:, :2]
    df = pd.DataFrame(features, columns=['X1', 'X2'])
    df['Label'] = iris.target

    pipe = nimbusmlPipeline([
        ('norm', MeanVarianceScaler() << ['X1', 'X2']),
        ('lr', LogisticRegressionClassifier() << ['X1', 'X2']),
    ])
    pipe.fit(df)

    head = pipe.predict(df).head()
    assert len(head) == 5
def test_char_tokenizer(self):
    """ColumnConcatenator followed by CharTokenizer transforms a text
    column without raising and yields a non-None result."""
    reviews = pd.DataFrame(data=dict(review=[
        "I really did not like the taste of it",
        "It was surprisingly quite good!",
        "I will never ever ever go to that place again!!",
        "The best ever!! It was amazingly good and super fast",
        "I wish I had gone earlier, it was that great",
        "somewhat dissapointing. I'd probably wont try again",
        "Never visit again... rascals!"
    ]))

    tokenizer = CharTokenizer(['review'])
    concatenator = ColumnConcatenator() >> 'features' << [['review']]

    # Concatenate first, then tokenize the combined column.
    pipeline = nimbusmlPipeline([concatenator, tokenizer])
    transformed = pipeline.fit_transform(reviews)
    assert transformed is not None
def test_pipeline_get_params(self):
    """get_params(deep=True) on the sklearn wrapper must expose both the
    'steps' list and the nested nimbusml pipeline's own parameters."""
    X_train, y_train = get_X_y(
        train_file, label_column, sep=',', features=selected_features)
    X_test, y_test = get_X_y(
        test_file, label_column, sep=',', features=selected_features)
    # Sanity guard: the selected feature set must not include column F1.
    if 'F1' in X_train.columns:
        raise Exception("F1 is in the dataset")

    encoder = OneHotVectorizer() << 'age'
    learner = FastTreesBinaryClassifier()
    inner_pipe = nimbusmlPipeline([encoder, learner])
    outer_pipe = Pipeline(steps=[('nimbusml', inner_pipe)])
    outer_pipe.fit(X_train, y_train)

    params = outer_pipe.get_params(deep=True)
    assert 'steps' in params
    first_step = params['steps'][0]
    # Each step is a (name, estimator) pair.
    assert len(first_step) == 2
    # Deep params include the step itself plus its nested parameters.
    assert 'nimbusml' in params
    assert 'nimbusml__random_state' in params
    assert 'nimbusml__steps' in params