def test_ovr_accuracy(self):
        clfs = [
            # TODO: BUG 231482, why doesn't FM work
            # FactorizationMachineBinaryClassifier(),
            LogisticRegressionBinaryClassifier(number_of_threads=1),
            FastForestBinaryClassifier(minimum_example_count_per_leaf=1,
                                       number_of_threads=1),
            GamBinaryClassifier(number_of_threads=1),
            AveragedPerceptronBinaryClassifier(),
            FastTreesBinaryClassifier(minimum_example_count_per_leaf=1,
                                      number_of_threads=1),
            FastLinearBinaryClassifier(number_of_threads=1),
            SgdBinaryClassifier(number_of_threads=1),
            # SymSgdBinaryClassifier(number_of_threads=1),
        ]

        for clf in clfs:
            ovr = OneVsRestClassifier(classifier=clf, use_probabilities=True)
            metrics = accuracy(ovr)
            accu = metrics['Accuracy(micro-avg)'][0]
            # algorithms have a wide range of accuracies, so use a low bar;
            # this also exercises Pipeline + OVA + clf together
            assert_greater(
                accu, 0.65,
                "{} accuracy is too low {}".format(clf.__class__, accu))

    def test_failing_predict_proba_called_with_use_probabilites_false(self):
        clfs = [
            # TODO: BUG 231482, why doesn't FM work
            # FactorizationMachineBinaryClassifier(),
            LogisticRegressionBinaryClassifier(),
            FastForestBinaryClassifier(minimum_example_count_per_leaf=1),
            GamBinaryClassifier(),
            AveragedPerceptronBinaryClassifier(),
            FastTreesBinaryClassifier(minimum_example_count_per_leaf=1),
            LightGbmBinaryClassifier(),
            FastLinearBinaryClassifier(),
            SgdBinaryClassifier(),
            # SymSgdBinaryClassifier(),
        ]

        for clf in clfs:
            ovr = OneVsRestClassifier(classifier=clf, use_probabilities=False)
            check_predict_proba_when_trained_with_use_probabilites_false(
                self, ovr, clf)

    def test_failing_decision_function_called_with_use_probabilites_true(self):
        clfs = [
            # TODO: BUG 231482, why doesn't FM work
            # FactorizationMachineBinaryClassifier(),
            LogisticRegressionBinaryClassifier(),
            FastForestBinaryClassifier(minimum_example_count_per_leaf=1),
            GamBinaryClassifier(),
            AveragedPerceptronBinaryClassifier(),
            FastTreesBinaryClassifier(minimum_example_count_per_leaf=1),
            LightGbmBinaryClassifier(),
            FastLinearBinaryClassifier(),
            SgdBinaryClassifier(),
            # SymSgdBinaryClassifier(),
        ]

        for clf in clfs:
            ovr = OneVsRestClassifier(classifier=clf, use_probabilities=True)
            check_decision_function_when_trained_with_use_probabilites_true(
                self, ovr, clf)

    def test_decision_function_produces_distribution_not_sum_to_1(self):
        clfs = [
            # TODO: BUG 231482, why doesn't FM work
            # FactorizationMachineBinaryClassifier(),
            LogisticRegressionBinaryClassifier(),
            FastForestBinaryClassifier(minimum_example_count_per_leaf=1),
            GamBinaryClassifier(),
            AveragedPerceptronBinaryClassifier(),
            FastTreesBinaryClassifier(minimum_example_count_per_leaf=1),
            LightGbmBinaryClassifier(),
            FastLinearBinaryClassifier(),
            SgdBinaryClassifier(),
            # SymSgdBinaryClassifier(),
        ]

        for clf in clfs:
            ovr = OneVsRestClassifier(classifier=clf, use_probabilities=False)
            scoremean = decfun_average(ovr)
            assert_not_equal(
                scoremean, 1.0,
                '{} raw scores should not sum to 1.0 over 3 classes'.format(
                    clf.__class__))

    def test_predict_proba_produces_distribution_sum_to_1(self):
        clfs = [
            # TODO: BUG 231482, why doesn't FM work
            # FactorizationMachineBinaryClassifier(),
            LogisticRegressionBinaryClassifier(),
            FastForestBinaryClassifier(minimum_example_count_per_leaf=1),
            GamBinaryClassifier(),
            AveragedPerceptronBinaryClassifier(),
            FastTreesBinaryClassifier(minimum_example_count_per_leaf=1),
            LightGbmBinaryClassifier(),
            FastLinearBinaryClassifier(),
            SgdBinaryClassifier(),
            # TODO: why SymSgd probabilities do not sum to 1.0
            # SymSgdBinaryClassifier(),
        ]

        for clf in clfs:
            ovr = OneVsRestClassifier(classifier=clf)
            probmean = proba_average(ovr)
            assert_equal(
                probmean, 1.0,
                '{} probabilities {} do not sum to 1.0 over 3 classes'.format(
                    clf.__class__, probmean))
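
# The helpers accuracy(), proba_average() and decfun_average() used by the
# tests above are not shown in this excerpt. A minimal sketch of what they
# might look like, assuming a small separable 3-class dataset; the data,
# names and rounding below are illustrative assumptions, not the original
# implementation.
import numpy as np
import pandas as pd
from nimbusml import Pipeline

_rng = np.random.RandomState(0)
_X = pd.DataFrame(_rng.rand(90, 4).astype(np.float32),
                  columns=['f0', 'f1', 'f2', 'f3'])
# three classes that are learnable from feature f0
_y = pd.DataFrame({'label': (_X['f0'] > 0.33).astype(int) +
                            (_X['f0'] > 0.66).astype(int)})


def accuracy(ovr):
    # fit the OVA wrapper inside a Pipeline and return multiclass metrics
    pipeline = Pipeline([ovr])
    metrics, _ = pipeline.fit(_X, _y).test(_X, _y, output_scores=True)
    return metrics


def proba_average(ovr):
    # mean of the per-row sums of predict_proba; ~1.0 for a distribution
    probs = Pipeline([ovr]).fit(_X, _y).predict_proba(_X)
    return round(float(probs.sum(axis=1).mean()), 6)


def decfun_average(ovr):
    # mean of the per-row sums of the raw decision_function scores
    scores = Pipeline([ovr]).fit(_X, _y).decision_function(_X)
    return round(float(scores.sum(axis=1).mean()), 6)
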
    'LightGbmBinaryClassifier':
        LightGbmBinaryClassifier(minimum_example_count_per_group=1,
                                 minimum_example_count_per_leaf=1),
    'LightGbmClassifier':
        LightGbmClassifier(minimum_example_count_per_group=1,
                           minimum_example_count_per_leaf=1),
    'LightGbmRegressor':
        LightGbmRegressor(minimum_example_count_per_group=1,
                          minimum_example_count_per_leaf=1),
    'LightGbmRanker':
        LightGbmRanker(minimum_example_count_per_group=1,
                       minimum_example_count_per_leaf=1),
    'NGramFeaturizer':
        NGramFeaturizer(word_feature_extractor=n_gram()),
    'SgdBinaryClassifier':
        SgdBinaryClassifier(number_of_threads=1, shuffle=False),
    'SkipFilter':
        SkipFilter(count=5),
    'TakeFilter':
        TakeFilter(count=100000),
    'IidSpikeDetector':
        IidSpikeDetector(columns=['F0']),
    'IidChangePointDetector':
        IidChangePointDetector(columns=['F0']),
    'SsaSpikeDetector':
        SsaSpikeDetector(columns=['F0'], seasonal_window_size=2),
    'SsaChangePointDetector':
        SsaChangePointDetector(columns=['F0'], seasonal_window_size=2),
    'SsaForecaster':
        SsaForecaster(columns=['F0'],
                      window_size=2,
                      # assumed values for the remaining required arguments
                      series_length=5,
                      train_size=5,
                      horizon=1),

learners = [
    FastForestBinaryClassifier(),
    FastForestRegressor(),
    FastTreesBinaryClassifier(),
    FastTreesRegressor(),
    FastTreesTweedieRegressor(),
    LightGbmRegressor(),
    LightGbmBinaryClassifier(),
    AveragedPerceptronBinaryClassifier(),
    FastLinearBinaryClassifier(),
    FastLinearClassifier(),
    FastLinearRegressor(),
    LogisticRegressionBinaryClassifier(),
    LogisticRegressionClassifier(),
    OnlineGradientDescentRegressor(),
    SgdBinaryClassifier(),
    # SymSgdBinaryClassifier(),
    OrdinaryLeastSquaresRegressor(),
    PoissonRegressionRegressor()
]

learners_not_supported = [
    NaiveBayesClassifier(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    KMeansPlusPlus(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    FactorizationMachineBinaryClassifier(),
    PcaAnomalyDetector(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    # PcaTransformer(), # REVIEW: crashes
]
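
# A minimal sketch of how these lists are presumably consumed: each supported
# learner is fitted inside a Pipeline and asked for its model summary, while
# the learners_not_supported entries would be expected to raise instead. The
# dataset and column choices below are assumptions made for illustration.
from nimbusml import Pipeline
from nimbusml.datasets import get_dataset

_df = get_dataset('infert').as_df()
_features = _df[['age', 'parity', 'induced', 'spontaneous']].astype('float32')
_label = _df['case']

for learner in learners:
    pipeline = Pipeline([learner])
    pipeline.fit(_features, _label)
    # summary() returns a DataFrame of model weights/statistics
    print(type(learner).__name__)
    print(pipeline.summary().head())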
Example #8

from nimbusml import Pipeline, FileDataStream, Role
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import SgdBinaryClassifier

train_file = get_dataset("infert").as_filepath()
schema = "col=none:R4:0 col=education:R4:1 col=age:R4:2 col=parity:R4:3 " \
         "col=induced:R4:4 col=case:R4:5 col=spontaneous:R4:6 " \
         "col=stratum:R4:7 col=pooledstratum:R4:8 col=educationstr:R4:9 " \
         "sep=, header=+"
fds = FileDataStream(train_file, schema=schema)

# target and features columns
y = 'case'
X = [
    'educationstr',
    'age',
    'parity',
    'induced',
    'spontaneous',
    'stratum',
    'pooledstratum']

# set up pipeline
pipe = Pipeline([
    OneHotVectorizer() << 'educationstr',
    SgdBinaryClassifier() << {Role.Label: y, Role.Feature: X}
])

# train and evaluate the model
metrics, scores = pipe.fit(fds, y).test(fds, y, output_scores=True)
print(metrics)
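
# the fitted pipeline can also score data directly; a brief follow-up sketch
# using the standard Pipeline.predict() call
predictions = pipe.predict(fds)
print(predictions.head())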
Example #9
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import SgdBinaryClassifier

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()

data = FileDataStream.read_csv(path)
print(data.head())
#    age  case education  induced  parity ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6 ...       1            2  ...
# 1   42     1    0-5yrs        1       1 ...       2            0  ...
# 2   39     1    0-5yrs        2       6 ...       3            0  ...
# 3   34     1    0-5yrs        2       4 ...       4            0  ...
# 4   35     1   6-11yrs        1       3 ...       5            1  ...

# define the training pipeline
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    SgdBinaryClassifier(feature=['parity', 'edu'], label='case')
])

# train, predict, and evaluate
# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)

# print predictions
print(predictions.head())
#   PredictedLabel  Probability     Score
# 0               0     0.363427 -0.560521
# 1               0     0.378848 -0.494439
# 2               0     0.363427 -0.560521
# 3               0     0.369564 -0.534088
# 4               0     0.336350 -0.679603
# print evaluation metrics
print(metrics)
Example #10
###############################################################################
# Exponential Loss
from nimbusml.linear_model import SgdBinaryClassifier
from nimbusml.loss import Exp

# specify loss function using string keyword
trainer1 = SgdBinaryClassifier(loss='exp')

# the loss class can also be used instead of the string keyword

trainer1 = SgdBinaryClassifier(loss=Exp())  # equivalent to loss='exp'
trainer2 = SgdBinaryClassifier(loss=Exp(beta=0.4))
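
# A brief usage sketch: both trainers plug into the same fit/test calls
# regardless of how the loss was specified. The dataset and columns here are
# assumptions, reusing the infert example from above.
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset

data = FileDataStream.read_csv(get_dataset('infert').as_filepath())
pipeline = Pipeline([SgdBinaryClassifier(loss=Exp(beta=0.4),
                                         feature=['age', 'parity'],
                                         label='case')])
metrics, scores = pipeline.fit(data).test(data, output_scores=True)
print(metrics)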
    def test_sgdbinaryclassifier(self):
        accuracy = get_accuracy(self, SgdBinaryClassifier())
        assert_greater(accuracy, 0.87,
                       "accuracy should be greater than %s" % 0.87)
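
# get_accuracy() is not shown in this excerpt. A plausible minimal sketch,
# assuming the infert dataset and the 'Accuracy' entry of nimbusml's binary
# classification metrics; the dataset and column names are assumptions.
def get_accuracy(test_case, learner):
    from nimbusml import Pipeline, FileDataStream, Role
    from nimbusml.datasets import get_dataset
    from nimbusml.feature_extraction.categorical import OneHotVectorizer

    data = FileDataStream.read_csv(get_dataset('infert').as_filepath())
    pipeline = Pipeline([
        OneHotVectorizer(columns={'edu': 'education'}),
        learner << {Role.Label: 'case',
                    Role.Feature: ['age', 'parity', 'edu']},
    ])
    metrics, _ = pipeline.fit(data).test(data, output_scores=True)
    return metrics['Accuracy'][0]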