def test_ovr_accuracy(self):
    """Each supported binary learner, wrapped in OneVsRestClassifier with
    probabilities enabled, must clear a minimal accuracy bar."""
    binary_learners = [
        # TODO: BUG 231482 , why doesnt FM work
        # FactorizationMachineBinaryClassifier(),
        LogisticRegressionBinaryClassifier(number_of_threads=1),
        FastForestBinaryClassifier(
            minimum_example_count_per_leaf=1, number_of_threads=1),
        GamBinaryClassifier(number_of_threads=1),
        AveragedPerceptronBinaryClassifier(),
        FastTreesBinaryClassifier(
            minimum_example_count_per_leaf=1, number_of_threads=1),
        FastLinearBinaryClassifier(number_of_threads=1),
        SgdBinaryClassifier(number_of_threads=1),
        # SymSgdBinaryClassifier(number_of_threads=1),
    ]
    for learner in binary_learners:
        one_vs_rest = OneVsRestClassifier(
            classifier=learner, use_probabilities=True)
        metrics = accuracy(one_vs_rest)
        micro_avg = metrics['Accuracy(micro-avg)'][0]
        # algos will have wide range of accuracy, so use low bar. Also
        # checks Pipeline + Ova + clf
        assert_greater(
            micro_avg,
            0.65,
            "{} accuracy is too low {}".format(
                learner.__class__, micro_avg))
def test_failing_predict_proba_called_with_use_probabilites_false(self):
    """predict_proba must fail when the OVR model was trained with
    use_probabilities=False, for every supported binary learner."""
    binary_learners = [
        # TODO: BUG 231482 , why doesnt FM work
        # FactorizationMachineBinaryClassifier(),
        LogisticRegressionBinaryClassifier(),
        FastForestBinaryClassifier(minimum_example_count_per_leaf=1),
        GamBinaryClassifier(),
        AveragedPerceptronBinaryClassifier(),
        FastTreesBinaryClassifier(minimum_example_count_per_leaf=1),
        LightGbmBinaryClassifier(),
        FastLinearBinaryClassifier(),
        SgdBinaryClassifier(),
        # SymSgdBinaryClassifier(),
    ]
    for learner in binary_learners:
        one_vs_rest = OneVsRestClassifier(
            classifier=learner, use_probabilities=False)
        check_predict_proba_when_trained_with_use_probabilites_false(
            self, one_vs_rest, learner)
def test_failing_decision_function_called_with_use_probabilites_true(self):
    """decision_function must fail when the OVR model was trained with
    use_probabilities=True, for every supported binary learner."""
    clfs = [
        # TODO: BUG 231482 , why doesnt FM work
        # FactorizationMachineBinaryClassifier(),
        LogisticRegressionBinaryClassifier(),
        # FIX: 'min_split' is not the parameter name used anywhere else in
        # this file; the sibling tests construct these learners with
        # 'minimum_example_count_per_leaf', so use the same name here.
        FastForestBinaryClassifier(minimum_example_count_per_leaf=1),
        GamBinaryClassifier(),
        AveragedPerceptronBinaryClassifier(),
        FastTreesBinaryClassifier(minimum_example_count_per_leaf=1),
        LightGbmBinaryClassifier(),
        FastLinearBinaryClassifier(),
        SgdBinaryClassifier(),
        # SymSgdBinaryClassifier(),
    ]
    for clf in clfs:
        ovr = OneVsRestClassifier(classifier=clf, use_probabilities=True)
        check_decision_function_when_trained_with_use_probabilites_true(
            self, ovr, clf)
def test_decision_function_produces_distribution_not_sum_to_1(self):
    """Raw decision-function scores from an OVR model trained with
    use_probabilities=False are not normalized across the 3 classes."""
    binary_learners = [
        # TODO: BUG 231482 , why doesnt FM work
        # FactorizationMachineBinaryClassifier(),
        LogisticRegressionBinaryClassifier(),
        FastForestBinaryClassifier(minimum_example_count_per_leaf=1),
        GamBinaryClassifier(),
        AveragedPerceptronBinaryClassifier(),
        FastTreesBinaryClassifier(minimum_example_count_per_leaf=1),
        LightGbmBinaryClassifier(),
        FastLinearBinaryClassifier(),
        SgdBinaryClassifier(),
        # SymSgdBinaryClassifier(),
    ]
    for learner in binary_learners:
        one_vs_rest = OneVsRestClassifier(
            classifier=learner, use_probabilities=False)
        mean_score = decfun_average(one_vs_rest)
        assert_not_equal(
            mean_score,
            1.0,
            '{} raw scores should not sum to 1.0 over 3 classes'.format(
                learner.__class__))
def test_predict_proba_produces_distribution_sum_to_1(self):
    """Class probabilities from the default (probability-mode) OVR model
    must form a distribution summing to 1.0 across the 3 classes."""
    binary_learners = [
        # TODO: BUG 231482 , why doesnt FM work
        # FactorizationMachineBinaryClassifier(),
        LogisticRegressionBinaryClassifier(),
        FastForestBinaryClassifier(minimum_example_count_per_leaf=1),
        GamBinaryClassifier(),
        AveragedPerceptronBinaryClassifier(),
        FastTreesBinaryClassifier(minimum_example_count_per_leaf=1),
        LightGbmBinaryClassifier(),
        FastLinearBinaryClassifier(),
        SgdBinaryClassifier(),
        # TODO: why symsgd does not sum to 1.0
        # SymSgdBinaryClassifier(),
    ]
    for learner in binary_learners:
        one_vs_rest = OneVsRestClassifier(classifier=learner)
        mean_probability = proba_average(one_vs_rest)
        assert_equal(
            mean_probability,
            1.0,
            '{} probabilites {} do not sum to 1.0 over 3 classes'.format(
                learner.__class__, mean_probability))
'LightGbmBinaryClassifier': LightGbmBinaryClassifier(minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'LightGbmClassifier': LightGbmClassifier(minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'LightGbmRegressor': LightGbmRegressor(minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'LightGbmRanker': LightGbmRanker(minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'NGramFeaturizer': NGramFeaturizer(word_feature_extractor=n_gram()), 'SgdBinaryClassifier': SgdBinaryClassifier(number_of_threads=1, shuffle=False), 'SkipFilter': SkipFilter(count=5), 'TakeFilter': TakeFilter(count=100000), 'IidSpikeDetector': IidSpikeDetector(columns=['F0']), 'IidChangePointDetector': IidChangePointDetector(columns=['F0']), 'SsaSpikeDetector': SsaSpikeDetector(columns=['F0'], seasonal_window_size=2), 'SsaChangePointDetector': SsaChangePointDetector(columns=['F0'], seasonal_window_size=2), 'SsaForecaster': SsaForecaster(columns=['F0'], window_size=2,
learners = [ FastForestBinaryClassifier(), FastForestRegressor(), FastTreesBinaryClassifier(), FastTreesRegressor(), FastTreesTweedieRegressor(), LightGbmRegressor(), LightGbmBinaryClassifier(), AveragedPerceptronBinaryClassifier(), FastLinearBinaryClassifier(), FastLinearClassifier(), FastLinearRegressor(), LogisticRegressionBinaryClassifier(), LogisticRegressionClassifier(), OnlineGradientDescentRegressor(), SgdBinaryClassifier(), # SymSgdBinaryClassifier(), OrdinaryLeastSquaresRegressor(), PoissonRegressionRegressor() ] learners_not_supported = [ NaiveBayesClassifier(), # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView KMeansPlusPlus(), # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView FactorizationMachineBinaryClassifier(), PcaAnomalyDetector(), # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView # PcaTransformer(), # REVIEW: crashes
# 0 3.0 0-5yrs # 1 1.0 0-5yrs train_file = get_dataset("infert").as_filepath() schema = "col=none:R4:0 col=education:R4:1 col=age:R4:2 col=parity:R4:3 " \ "col=induced:R4:4 col=case:R4:5 col=spontaneous:R4:6 " \ "col=stratum:R4:7 col=pooledstratum:R4:8 col=educationstr:R4:9 " \ "sep=, header=+" fds = FileDataStream(train_file, schema=schema) # target and features columns y = 'case' X = [ 'educationstr', 'age', 'parity', 'induced', 'spontaneous', 'stratum', 'pooledstratum'] # set up pipeline pipe = Pipeline([ OneHotVectorizer() << 'educationstr', SgdBinaryClassifier() << {Role.Label: y, Role.Feature: X} ]) # train and evaluate the model metrics, scores = pipe.fit(fds, y).test(fds, y, output_scores=True) print(metrics)
# data input (as a FileDataStream) path = get_dataset('infert').as_filepath() data = FileDataStream.read_csv(path) print(data.head()) # age case education induced parity ... row_num spontaneous ... # 0 26 1 0-5yrs 1 6 ... 1 2 ... # 1 42 1 0-5yrs 1 1 ... 2 0 ... # 2 39 1 0-5yrs 2 6 ... 3 0 ... # 3 34 1 0-5yrs 2 4 ... 4 0 ... # 4 35 1 6-11yrs 1 3 ... 5 1 ... # define the training pipeline pipeline = Pipeline([ OneHotVectorizer(columns={'edu': 'education'}), SgdBinaryClassifier(feature=['parity', 'edu'], label='case') ]) # train, predict, and evaluate # TODO: Replace with CV metrics, predictions = pipeline.fit(data).test(data, output_scores=True) # print predictions print(predictions.head()) # PredictedLabel Probability Score # 0 0 0.363427 -0.560521 # 1 0 0.378848 -0.494439 # 2 0 0.363427 -0.560521 # 3 0 0.369564 -0.534088 # 4 0 0.336350 -0.679603 # print evaluation metrics
############################################################################### # Exponential Loss from nimbusml.linear_model import SgdBinaryClassifier from nimbusml.loss import Exp # specify loss function using string keyword trainer1 = SgdBinaryClassifier(loss='exp') # can also use the loss class instead of string. trainer1 = SgdBinaryClassifier(loss=Exp()) # equivalent to loss='exp' trainer2 = SgdBinaryClassifier(loss=Exp(beta=0.4))
def test_sgdbinaryclassifier(self):
    """SgdBinaryClassifier must exceed the 0.87 accuracy bar on the
    shared test fixture."""
    threshold = 0.87
    accuracy = get_accuracy(self, SgdBinaryClassifier())
    # FIX: the previous message ("accuracy should be %s" % 0.87) never
    # reported the observed accuracy, so failures were undiagnosable;
    # include both the observed value and the threshold.
    assert_greater(
        accuracy,
        threshold,
        "accuracy %s should be greater than %s" % (accuracy, threshold))