def test_GamBinaryClassifier(self):
    """GamBinaryClassifier should reach reasonable accuracy on 'infert'.

    Trains on a random split of the built-in 'infert' dataset (with the
    categorical education column one-hot encoded) and checks that test
    accuracy clears a low bar of 0.70.
    """
    np.random.seed(0)  # make the train/test split deterministic
    df = get_dataset("infert").as_df()
    # strip ': ' from column names so they are usable as feature names
    df.columns = [i.replace(': ', '') for i in df.columns]
    # encode the categorical education column
    df = (OneHotVectorizer() << 'education_str').fit_transform(df)
    X_train, X_test, y_train, y_test = \
        train_test_split(df.loc[:, df.columns != 'case'], df['case'])
    lr = GamBinaryClassifier().fit(X_train, y_train)
    scores = lr.predict(X_test)
    # element-wise comparison of labels vs. predictions
    acc = np.mean(y_test == list(scores))
    # fixed message: original read "accuracy should 0.7" (verb missing)
    assert_greater(acc, 0.70, "accuracy should be greater than %s" % 0.70)
def test_ovr_accuracy(self):
    """OneVsRestClassifier (with probabilities) over several binary learners
    should reach at least micro-averaged accuracy 0.65 for each of them."""
    binary_learners = [
        # TODO: BUG 231482 , why doesnt FM work
        # FactorizationMachineBinaryClassifier(),
        LogisticRegressionBinaryClassifier(number_of_threads=1),
        FastForestBinaryClassifier(
            minimum_example_count_per_leaf=1, number_of_threads=1),
        GamBinaryClassifier(number_of_threads=1),
        AveragedPerceptronBinaryClassifier(),
        FastTreesBinaryClassifier(
            minimum_example_count_per_leaf=1, number_of_threads=1),
        FastLinearBinaryClassifier(number_of_threads=1),
        SgdBinaryClassifier(number_of_threads=1),
        # SymSgdBinaryClassifier(number_of_threads=1),
    ]
    for learner in binary_learners:
        ovr = OneVsRestClassifier(classifier=learner, use_probabilities=True)
        metrics = accuracy(ovr)
        micro_acc = metrics['Accuracy(micro-avg)'][0]
        # The learners span a wide accuracy range, so the bar is deliberately
        # low; this also exercises Pipeline + Ova + learner end to end.
        assert_greater(
            micro_acc, 0.65,
            "{} accuracy is too low {}".format(learner.__class__, micro_acc))
def test_failing_predict_proba_called_with_use_probabilites_false(self):
    """For every supported binary learner, predict_proba must fail when the
    OVR wrapper was trained with use_probabilities=False."""
    base_classifiers = [
        # TODO: BUG 231482 , why doesnt FM work
        # FactorizationMachineBinaryClassifier(),
        LogisticRegressionBinaryClassifier(),
        FastForestBinaryClassifier(minimum_example_count_per_leaf=1),
        GamBinaryClassifier(),
        AveragedPerceptronBinaryClassifier(),
        FastTreesBinaryClassifier(minimum_example_count_per_leaf=1),
        LightGbmBinaryClassifier(),
        FastLinearBinaryClassifier(),
        SgdBinaryClassifier(),
        # SymSgdBinaryClassifier(),
    ]
    for base_clf in base_classifiers:
        wrapper = OneVsRestClassifier(
            classifier=base_clf, use_probabilities=False)
        check_predict_proba_when_trained_with_use_probabilites_false(
            self, wrapper, base_clf)
def test_failing_decision_function_called_with_use_probabilites_true(self):
    """For every supported binary learner, decision_function must fail when
    the OVR wrapper was trained with use_probabilities=True."""
    clfs = [
        # TODO: BUG 231482 , why doesnt FM work
        # FactorizationMachineBinaryClassifier(),
        LogisticRegressionBinaryClassifier(),
        # fixed: was min_split=1 — every sibling test in this file passes
        # minimum_example_count_per_leaf=1 to these classifiers; keep the
        # parameter name consistent
        FastForestBinaryClassifier(minimum_example_count_per_leaf=1),
        GamBinaryClassifier(),
        AveragedPerceptronBinaryClassifier(),
        FastTreesBinaryClassifier(minimum_example_count_per_leaf=1),
        LightGbmBinaryClassifier(),
        FastLinearBinaryClassifier(),
        SgdBinaryClassifier(),
        # SymSgdBinaryClassifier(),
    ]
    for clf in clfs:
        ovr = OneVsRestClassifier(classifier=clf, use_probabilities=True)
        check_decision_function_when_trained_with_use_probabilites_true(
            self, ovr, clf)
def test_decision_function_produces_distribution_not_sum_to_1(self):
    """Raw decision-function scores from an OVR trained with
    use_probabilities=False should NOT average to 1.0 across classes."""
    base_classifiers = [
        # TODO: BUG 231482 , why doesnt FM work
        # FactorizationMachineBinaryClassifier(),
        LogisticRegressionBinaryClassifier(),
        FastForestBinaryClassifier(minimum_example_count_per_leaf=1),
        GamBinaryClassifier(),
        AveragedPerceptronBinaryClassifier(),
        FastTreesBinaryClassifier(minimum_example_count_per_leaf=1),
        LightGbmBinaryClassifier(),
        FastLinearBinaryClassifier(),
        SgdBinaryClassifier(),
        # SymSgdBinaryClassifier(),
    ]
    for base_clf in base_classifiers:
        wrapper = OneVsRestClassifier(
            classifier=base_clf, use_probabilities=False)
        scoremean = decfun_average(wrapper)
        assert_not_equal(
            scoremean, 1.0,
            '{} raw scores should not sum to 1.0 over 3 classes'.format(
                base_clf.__class__))
def test_predict_proba_produces_distribution_sum_to_1(self):
    """Per-row probabilities from a default OVR wrapper should average to
    exactly 1.0 across the 3 classes for each supported binary learner."""
    base_classifiers = [
        # TODO: BUG 231482 , why doesnt FM work
        # FactorizationMachineBinaryClassifier(),
        LogisticRegressionBinaryClassifier(),
        FastForestBinaryClassifier(minimum_example_count_per_leaf=1),
        GamBinaryClassifier(),
        AveragedPerceptronBinaryClassifier(),
        FastTreesBinaryClassifier(minimum_example_count_per_leaf=1),
        LightGbmBinaryClassifier(),
        FastLinearBinaryClassifier(),
        SgdBinaryClassifier(),
        # TODO: why symsgd does not sum to 1.0
        # SymSgdBinaryClassifier(),
    ]
    for base_clf in base_classifiers:
        wrapper = OneVsRestClassifier(classifier=base_clf)
        probmean = proba_average(wrapper)
        assert_equal(
            probmean, 1.0,
            '{} probabilites {} do not sum to 1.0 over 3 classes'.format(
                base_clf.__class__, probmean))
# data input (as a FileDataStream)
csv_path = get_dataset('infert').as_filepath()
stream = FileDataStream.read_csv(csv_path)
print(stream.head())
#    age  case education  induced  parity  ... row_num  spontaneous  ...
# 0   26     1    0-5yrs         1       6 ...       1            2  ...
# 1   42     1    0-5yrs         1       1 ...       2            0  ...
# 2   39     1    0-5yrs         2       6 ...       3            0  ...
# 3   34     1    0-5yrs         2       4 ...       4            0  ...
# 4   35     1   6-11yrs         1       3 ...       5            1  ...

# define the training pipeline: one-hot encode 'education' into 'edu',
# then fit the GAM on age + the encoded column
encoder = OneHotVectorizer(columns={'edu': 'education'})
learner = GamBinaryClassifier(feature=['age', 'edu'], label='case')
pipe = Pipeline([encoder, learner])

# train, predict, and evaluate
# TODO: Replace with CV
metrics, predictions = pipe.fit(stream).test(stream, output_scores=True)

# print predictions
print(predictions.head())
#    PredictedLabel     Score
# 0               0 -0.050461
# 1               0 -0.049737
# 2               0 -0.049737
# 3               0 -0.050461
# 4               0 -0.050552

# print evaluation metrics
# GamBinaryClassifier
import numpy as np
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import GamBinaryClassifier
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from sklearn.model_selection import train_test_split

# use the built-in data set 'infert' to create test and train data
#   Unnamed: 0  education   age  parity  induced  case  spontaneous  stratum \
# 0          1        0.0  26.0     6.0      1.0   1.0          2.0      1.0
# 1          2        0.0  42.0     1.0      1.0   1.0          0.0      2.0
#   pooled.stratum education_str
# 0            3.0        0-5yrs
# 1            1.0        0-5yrs
np.random.seed(0)  # deterministic split
data = get_dataset("infert").as_df()

# remove ': ' from column names, and encode the categorical column
data.columns = [col.replace(': ', '') for col in data.columns]
data = (OneHotVectorizer() << 'education_str').fit_transform(data)

X_train, X_test, y_train, y_test = \
    train_test_split(data.loc[:, data.columns != 'case'], data['case'])

model = GamBinaryClassifier().fit(X_train, y_train)
scores = model.predict(X_test)

# evaluate the model
print('Accuracy:', np.mean(y_test == [i for i in scores]))
# SymSgdBinaryClassifier(), OrdinaryLeastSquaresRegressor(), PoissonRegressionRegressor() ] learners_not_supported = [ NaiveBayesClassifier(), # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView KMeansPlusPlus(), # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView FactorizationMachineBinaryClassifier(), PcaAnomalyDetector(), # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView # PcaTransformer(), # REVIEW: crashes GamBinaryClassifier(), # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView GamRegressor( ), # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView LightGbmClassifier(), # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView # LightGbmRanker(), # REVIEW: crashes # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView OneVsRestClassifier(FastLinearBinaryClassifier()), ] class TestModelSummary(unittest.TestCase): def test_model_summary(self): for learner in learners: pipeline = Pipeline(