def test_ensemble_rejects_estimators_with_incorrect_type(self):
    r1 = OrdinaryLeastSquaresRegressor(**olsrArgs)
    r2 = OnlineGradientDescentRegressor(**ogdArgs)
    r3 = LogisticRegressionClassifier()
    try:
        vr = VotingRegressor(estimators=[r1, r2, r3], combiner='Average')
    except Exception as e:
        print(e)
    else:
        self.fail('VotingRegressor should only work with regressors.')
def test_pass_predict_proba_multiclass_with_pipeline(self):
    algos = [
        LogisticRegressionClassifier(),
        FastLinearClassifier(),
        LightGbmClassifier()
    ]
    for algo in algos:
        assert_almost_equal(
            proba_sum(Pipeline([algo])),
            38.0,
            decimal=3,
            err_msg=invalid_predict_proba_output)
def test_lr_named_steps_iris(self):
    iris = load_iris()
    X = iris.data[:, :2]  # we only take the first two features
    y = iris.target
    df = pd.DataFrame(X, columns=['X1', 'X2'])
    df['Label'] = y
    pipe = nimbusmlPipeline([
        ('norm', MeanVarianceScaler() << ['X1', 'X2']),
        ('lr', LogisticRegressionClassifier() << ['X1', 'X2'])
    ])
    pipe.fit(df)
    pred = pipe.predict(df).head()
    assert len(pred) == 5
def test_score_multiclass(self):
    np.random.seed(0)
    df = get_dataset("iris").as_df()
    df.drop(['Species'], inplace=True, axis=1)
    df.Label = [1 if x == 1 else 0 for x in df.Label]
    X_train, X_test, y_train, y_test = \
        train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])

    lr = LogisticRegressionClassifier(train_threads=1)
    e = Pipeline([lr])
    e.fit(X_train, y_train.to_frame())
    metrics = e.score(X_test, y_test)
    print(metrics)
    assert_almost_equal(
        metrics,
        0.7631578947368421,
        decimal=5,
        err_msg="Accuracy(micro-avg) should be %s" % 0.7631578947368421)
def test_metrics_evaluate_multiclass(self):
    np.random.seed(0)
    df = get_dataset("iris").as_df()
    df.drop(['Species'], inplace=True, axis=1)
    df.Label = [1 if x == 1 else 0 for x in df.Label]
    X_train, X_test, y_train, y_test = \
        train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])

    lr = LogisticRegressionClassifier()
    e = Pipeline([lr])
    e.fit(X_train, y_train.to_frame(), verbose=0)
    metrics, _ = e.test(X_test, y_test)
    # TODO: debug fluctuations and increase decimal precision on checks
    assert_almost_equal(
        metrics['Accuracy(micro-avg)'][0],
        0.763,
        decimal=1,
        err_msg="Accuracy(micro-avg) should be %s" % 0.763)
    assert_almost_equal(
        metrics['Accuracy(macro-avg)'][0],
        0.718,
        decimal=1,
        err_msg="Accuracy(macro-avg) should be %s" % 0.718)
    assert_almost_equal(
        metrics['Log-loss'][0],
        0.419,
        decimal=3,
        err_msg="Log-loss should be %s" % 0.419)
    assert_almost_equal(
        metrics['Log-loss reduction'][0],
        0.38476,
        decimal=3,
        err_msg="Log-loss reduction should be %s" % 0.38476)
    assert_almost_equal(
        metrics['(class 0)'][0],
        0.223,
        decimal=1,
        err_msg="(class 0) should be %s" % 0.223)
    assert_almost_equal(
        metrics['(class 1)'][0],
        0.688,
        decimal=1,
        err_msg="(class 1) should be %s" % 0.688)
def test_pass_predict_proba_multiclass(self):
    assert_almost_equal(
        proba_sum(LogisticRegressionClassifier()),
        38.0,
        decimal=3,
        err_msg=invalid_predict_proba_output)
def test_fail_decision_function_multiclass_with_pipeline(self):
    check_unsupported_decision_function(
        self,
        Pipeline([LogisticRegressionClassifier()]),
        X_train,
        y_train,
        X_test)
def test_fail_decision_function_multiclass(self):
    check_unsupported_decision_function(
        self,
        LogisticRegressionClassifier(),
        X_train,
        y_train,
        X_test)
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import LogisticRegressionClassifier

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path)
print(data.head())
#    age  case education  induced  parity  ...  row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6  ...        1            2  ...
# 1   42     1    0-5yrs        1       1  ...        2            0  ...
# 2   39     1    0-5yrs        2       6  ...        3            0  ...
# 3   34     1    0-5yrs        2       4  ...        4            0  ...
# 4   35     1   6-11yrs        1       3  ...        5            1  ...

# define the training pipeline
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    LogisticRegressionClassifier(feature=['parity', 'edu'], label='induced')
])

# train, predict, and evaluate
# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)

# print predictions
print(predictions.head())
#   PredictedLabel   Score.0   Score.1   Score.2
# 0              2  0.171122  0.250151  0.578727
# 1              0  0.678313  0.220665  0.101022
# 2              2  0.171122  0.250151  0.578727
# 3              0  0.360849  0.289190  0.349961
# 4              0  0.556921  0.260420  0.182658

# print evaluation metrics
print(metrics)
###############################################################################
# LogisticRegressionClassifier
import numpy as np
import pandas as pd
from nimbusml.datasets import get_dataset
from nimbusml.linear_model import LogisticRegressionClassifier
from sklearn.model_selection import train_test_split

# use the 'iris' data set to create test and train data
#    Sepal_Length  Sepal_Width  Petal_Length  Petal_Width  Label Species  Setosa
# 0           5.1          3.5           1.4          0.2      0  setosa     1.0
# 1           4.9          3.0           1.4          0.2      0  setosa     1.0
np.random.seed(0)

df = get_dataset("iris").as_df()
df.drop(['Species'], inplace=True, axis=1)

X_train, X_test, y_train, y_test = \
    train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])

lr = LogisticRegressionClassifier().fit(X_train, y_train)

scores = lr.predict(X_test)
scores = pd.to_numeric(scores)

# evaluate the model
print('Accuracy:', np.mean(y_test == [i for i in scores]))
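# A small follow-up sketch (not part of the original example): assuming the
# fitted classifier exposes predict_proba, the same interface exercised by the
# multiclass tests above, the per-class probabilities can be inspected
# alongside the hard-label predictions.
probs = lr.predict_proba(X_test)  # one column of probabilities per class
print(probs[:5])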
import numpy as np
from nimbusml import Pipeline, FileDataStream, DataSchema
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import LogisticRegressionClassifier, \
    FastLinearRegressor
from nimbusml.model_selection import CV
from nimbusml.preprocessing.missing_values import Indicator, Handler

# Case 1: Default usage of CV

path = get_dataset('infert').as_filepath()
schema = DataSchema.read_schema(path, numeric_dtype=np.float32)
data = FileDataStream.read_csv(path, schema=schema)

pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    LogisticRegressionClassifier(feature=['age', 'spontaneous', 'edu'],
                                 label='induced')
])

# Do 3-fold cross-validation
cv_results = CV(pipeline).fit(data, cv=3)

# print summary statistic of metrics
print(cv_results['metrics_summary'])

# print metrics for all folds
print(cv_results['metrics'])

# print confusion matrix for fold 1
cm = cv_results['confusion_matrix']
print(cm[cm.Fold == 1])
    'col=sex:TX:7 col=native-country-region:TX:8 header+'
label_column = 'label'
learners = [
    FastForestBinaryClassifier(),
    FastForestRegressor(),
    FastTreesBinaryClassifier(),
    FastTreesRegressor(),
    FastTreesTweedieRegressor(),
    LightGbmRegressor(),
    LightGbmBinaryClassifier(),
    AveragedPerceptronBinaryClassifier(),
    FastLinearBinaryClassifier(),
    FastLinearClassifier(),
    FastLinearRegressor(),
    LogisticRegressionBinaryClassifier(),
    LogisticRegressionClassifier(),
    OnlineGradientDescentRegressor(),
    SgdBinaryClassifier(),
    # SymSgdBinaryClassifier(),
    OrdinaryLeastSquaresRegressor(),
    PoissonRegressionRegressor()
]

learners_not_supported = [
    NaiveBayesClassifier(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    KMeansPlusPlus(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    FactorizationMachineBinaryClassifier(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    PcaAnomalyDetector(),
# 0           5.1          3.5           1.4          0.2      0  setosa  1.0
# 1           4.9          3.0           1.4          0.2      0  setosa  1.0
np.random.seed(0)
df = get_dataset("iris").as_df()

X_train, X_test, y_train, y_test = \
    train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])

mycols = [
    'Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Setosa'
]

# drop the 'Species' column using the ColumnDropper transform and
# select mycols for training using the ColumnConcatenator transform
dropcols = ColumnDropper() << 'Species'
concat = ColumnConcatenator() << {Role.Feature: mycols}

pipeline = Pipeline([dropcols, concat, LogisticRegressionClassifier()])
pipeline.fit(X_train, y_train)
scores1 = pipeline.predict(X_test)

# select mycols using the ColumnSelector transform
select = ColumnSelector() << mycols
pipeline2 = Pipeline([select, LogisticRegressionClassifier()])
pipeline2.fit(X_train, y_train)
scores2 = pipeline2.predict(X_test)

# verify that we get identical results in both experiments
print(scores1.head())
print(scores2.head())
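# Note on the design choice shown above: both pipelines restrict training to
# the same five columns. The first drops 'Species' with ColumnDropper and then
# declares the remaining columns as features via ColumnConcatenator, while the
# second keeps only the named columns directly with ColumnSelector, so the two
# runs are expected to produce matching predictions.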