def test_lightgbmbinaryclassifier(self):
        np.random.seed(0)

        df = get_dataset("infert").as_df()

        # remove : and ' ' from column names, and encode categorical column
        df.columns = [i.replace(': ', '') for i in df.columns]
        df = (OneHotVectorizer() << 'education_str').fit_transform(df)

        X_train, X_test, y_train, y_test = \
            train_test_split(df.loc[:, df.columns != 'case'], df['case'])

        lr = LightGbmBinaryClassifier().fit(X_train, y_train)
        scores = lr.predict(X_test)
        # compare positionally; building a list sidesteps pandas index
        # alignment between y_test and the predictions
        accuracy = np.mean(y_test == [i for i in scores])
        assert_greater(accuracy, 0.98, "accuracy should be greater than 0.98")
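        # equivalently, with scikit-learn's accuracy_score (list() keeps the
        # comparison positional, matching the trick above):
        #
        #     from sklearn.metrics import accuracy_score
        #     accuracy = accuracy_score(list(y_test), list(scores))
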
    def test_failing_predict_proba_called_with_use_probabilites_false(self):
        clfs = [
            # TODO: BUG 231482, why doesn't FM work?
            # FactorizationMachineBinaryClassifier(),
            LogisticRegressionBinaryClassifier(),
            FastForestBinaryClassifier(minimum_example_count_per_leaf=1),
            GamBinaryClassifier(),
            AveragedPerceptronBinaryClassifier(),
            FastTreesBinaryClassifier(minimum_example_count_per_leaf=1),
            LightGbmBinaryClassifier(),
            FastLinearBinaryClassifier(),
            SgdBinaryClassifier(),
            # SymSgdBinaryClassifier(),
        ]

        for clf in clfs:
            ovr = OneVsRestClassifier(classifier=clf, use_probabilities=False)
            check_predict_proba_when_trained_with_use_probabilites_false(
                self, ovr, clf)
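        # the helper asserts that predict_proba raises on an OVR model trained
        # with use_probabilities=False, since only raw scores are available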
Example #3
    def test_failing_decision_function_called_with_use_probabilites_true(self):
        clfs = [
            # TODO: BUG 231482, why doesn't FM work?
            # FactorizationMachineBinaryClassifier(),
            LogisticRegressionBinaryClassifier(),
            FastForestBinaryClassifier(minimum_example_count_per_leaf=1),
            GamBinaryClassifier(),
            AveragedPerceptronBinaryClassifier(),
            FastTreesBinaryClassifier(minimum_example_count_per_leaf=1),
            LightGbmBinaryClassifier(),
            FastLinearBinaryClassifier(),
            SgdBinaryClassifier(),
            # SymSgdBinaryClassifier(),
        ]

        for clf in clfs:
            ovr = OneVsRestClassifier(classifier=clf, use_probabilities=True)
            check_decision_function_when_trained_with_use_probabilites_true(
                self, ovr, clf)

    def test_decision_function_produces_distribution_not_sum_to_1(self):
        clfs = [
            # TODO: BUG 231482, why doesn't FM work?
            # FactorizationMachineBinaryClassifier(),
            LogisticRegressionBinaryClassifier(),
            FastForestBinaryClassifier(minimum_example_count_per_leaf=1),
            GamBinaryClassifier(),
            AveragedPerceptronBinaryClassifier(),
            FastTreesBinaryClassifier(minimum_example_count_per_leaf=1),
            LightGbmBinaryClassifier(),
            FastLinearBinaryClassifier(),
            SgdBinaryClassifier(),
            # SymSgdBinaryClassifier(),
        ]

        for clf in clfs:
            ovr = OneVsRestClassifier(classifier=clf, use_probabilities=False)
            scoremean = decfun_average(ovr)
            assert_not_equal(
                scoremean, 1.0,
                '{} raw scores should not sum to 1.0 over 3 classes'.format(
                    clf.__class__))
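        # decfun_average (a helper defined elsewhere in this module) mirrors
        # proba_average in the next test, but over decision_function outputs:
        # raw margins are not normalized, so their per-row sums need not be 1.0
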
    def test_predict_proba_produces_distribution_sum_to_1(self):
        clfs = [
            # TODO: BUG 231482, why doesn't FM work?
            # FactorizationMachineBinaryClassifier(),
            LogisticRegressionBinaryClassifier(),
            FastForestBinaryClassifier(minimum_example_count_per_leaf=1),
            GamBinaryClassifier(),
            AveragedPerceptronBinaryClassifier(),
            FastTreesBinaryClassifier(minimum_example_count_per_leaf=1),
            LightGbmBinaryClassifier(),
            FastLinearBinaryClassifier(),
            SgdBinaryClassifier(),
            # TODO: why doesn't SymSgd sum to 1.0?
            # SymSgdBinaryClassifier(),
        ]

        for clf in clfs:
            ovr = OneVsRestClassifier(classifier=clf)
            probmean = proba_average(ovr)
            assert_equal(
                probmean, 1.0,
                '{} probabilities {} do not sum to 1.0 over 3 classes'.format(
                    clf.__class__, probmean))
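        # a hedged sketch of what proba_average (defined elsewhere in this
        # module) plausibly computes; names and data are illustrative only:
        #
        #     from sklearn.datasets import make_classification
        #
        #     def proba_average(ovr):
        #         X, y = make_classification(n_classes=3, n_informative=3)
        #         probs = ovr.fit(X, y).predict_proba(X)
        #         return probs.sum(axis=1).mean()  # 1.0 iff rows are normalized
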
Example #6

from nimbusml import FileDataStream, Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import LightGbmBinaryClassifier
from nimbusml.ensemble.booster import Goss
from nimbusml.feature_extraction.categorical import OneHotVectorizer

path = get_dataset('infert').as_filepath()

data = FileDataStream.read_csv(path)
print(data.head())
#    age  case education  induced  parity ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6 ...       1            2  ...
# 1   42     1    0-5yrs        1       1 ...       2            0  ...
# 2   39     1    0-5yrs        2       6 ...       3            0  ...
# 3   34     1    0-5yrs        2       4 ...       4            0  ...
# 4   35     1   6-11yrs        1       3 ...       5            1  ...

# define the training pipeline
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    LightGbmBinaryClassifier(feature=['induced', 'edu'],
                             label='case',
                             booster=Goss(top_rate=0.9))
])
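
# Goss (Gradient-based One-Side Sampling) keeps the top_rate fraction of
# examples with the largest gradients each iteration and subsamples the rest,
# which speeds up boosting at a small cost in accuracy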

# train, predict, and evaluate
# TODO: Replace with CV
metrics, predictions = pipeline.fit(data, 'case').test(data,
                                                       output_scores=True)

# print predictions
print(predictions.head())
#   PredictedLabel  Probability     Score
# 0               1     0.612220  0.913309
# 1               1     0.612220  0.913309
# 2               0     0.334486 -1.375929
# 3               0     0.334486 -1.375929
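
# the metrics frame returned by .test() holds the evaluation results
# (AUC, accuracy, log-loss, ...) and can be printed like any DataFrame
print(metrics)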
# (truncated in the source; this snippet begins mid-definition)
#     'check_estimators_pickle')

OMITTED_CHECKS_ALWAYS = 'check_estimators_nan_inf'

NOBINARY_CHECKS = [
    'check_estimator_sparse_data', 'check_dtype_object',
    'check_fit_score_takes_y', 'check_fit2d_predict1d', 'check_fit1d_1feature',
    'check_dont_overwrite_parameters', 'check_supervised_y_2d',
    'check_estimators_fit_returns_self', 'check_estimators_overwrite_params',
    'check_estimators_dtypes', 'check_classifiers_classes',
    'check_classifiers_train'
]

INSTANCES = {
    'LightGbmBinaryClassifier':
    LightGbmBinaryClassifier(minimum_example_count_per_group=1,
                             minimum_example_count_per_leaf=1),
    'LightGbmClassifier':
    LightGbmClassifier(minimum_example_count_per_group=1,
                       minimum_example_count_per_leaf=1),
    'LightGbmRegressor':
    LightGbmRegressor(minimum_example_count_per_group=1,
                      minimum_example_count_per_leaf=1),
    'LightGbmRanker':
    LightGbmRanker(minimum_example_count_per_group=1,
                   minimum_example_count_per_leaf=1),
    'NGramFeaturizer':
    NGramFeaturizer(word_feature_extractor=n_gram()),
    'SkipFilter':
    SkipFilter(count=5),
    'TakeFilter':
    TakeFilter(count=100000),
}

# categorical text columns (list name assumed from its use in TestModelSummary)
categorical_columns = [
    'workclass', 'education', 'marital-status', 'occupation', 'relationship',
    'ethnicity', 'sex', 'native-country-region'
]
file_schema = 'sep=, col=label:R4:0 col=Features:R4:9-14 col=workclass:TX:1 ' \
              'col=education:TX:2 col=marital-status:TX:3 ' \
              'col=occupation:TX:4 col=relationship:TX:5 col=ethnicity:TX:6 ' \
              'col=sex:TX:7 col=native-country-region:TX:8 header+'
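
# schema syntax: col=<name>:<type>:<column-or-range>, where R4 is float32 and
# TX is text; 'sep=,' sets the delimiter and 'header+' marks a header row.
# A minimal two-column variant, for illustration only:
#   'sep=, col=label:R4:0 col=text:TX:1 header+'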
label_column = 'label'
learners = [
    FastForestBinaryClassifier(),
    FastForestRegressor(),
    FastTreesBinaryClassifier(),
    FastTreesRegressor(),
    FastTreesTweedieRegressor(),
    LightGbmRegressor(),
    LightGbmBinaryClassifier(),
    AveragedPerceptronBinaryClassifier(),
    FastLinearBinaryClassifier(),
    FastLinearClassifier(),
    FastLinearRegressor(),
    LogisticRegressionBinaryClassifier(),
    LogisticRegressionClassifier(),
    OnlineGradientDescentRegressor(),
    SgdBinaryClassifier(),
    # SymSgdBinaryClassifier(),
    OrdinaryLeastSquaresRegressor(),
    PoissonRegressionRegressor()
]

learners_not_supported = [
    NaiveBayesClassifier(),
]  # truncated in the source; closed here so the snippet parses
Example #9
# (list name assumed: TestModelSummary below iterates over `learners`)
learners = [
    PoissonRegressionRegressor(),
    OneVsRestClassifier(FastLinearBinaryClassifier()),
    GamRegressor(),
    GamBinaryClassifier(),
    PcaAnomalyDetector(),
    FactorizationMachineBinaryClassifier(),
    KMeansPlusPlus(n_clusters=2),
    NaiveBayesClassifier(),
    FastForestBinaryClassifier(number_of_trees=2),
    FastForestRegressor(number_of_trees=2),
    FastTreesBinaryClassifier(number_of_trees=2),
    FastTreesRegressor(number_of_trees=2),
    FastTreesTweedieRegressor(number_of_trees=2),
    LightGbmRegressor(number_of_iterations=2),
    LightGbmClassifier(),
    LightGbmBinaryClassifier(number_of_iterations=2)
]

learners_not_supported = [
    # PcaTransformer(),  # REVIEW: crashes
]


class TestModelSummary(unittest.TestCase):

    def test_model_summary(self):
        for learner in learners:
            pipeline = Pipeline(
                [OneHotVectorizer() << categorical_columns, learner])
            train_stream = FileDataStream(train_file, schema=file_schema)
            pipeline.fit(train_stream, label_column)
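            # plausible continuation of this truncated test: nimbusml
            # pipelines expose the trained learner's parameters via summary()
            summary = pipeline.summary()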
Example #10
# (truncated in the source; this snippet begins mid-definition)
#     'check_estimators_pickle')

OMITTED_CHECKS_ALWAYS = 'check_estimators_nan_inf'

NOBINARY_CHECKS = [
    'check_estimator_sparse_data', 'check_dtype_object',
    'check_fit_score_takes_y', 'check_fit2d_predict1d', 'check_fit1d_1feature',
    'check_dont_overwrite_parameters', 'check_supervised_y_2d',
    'check_estimators_fit_returns_self', 'check_estimators_overwrite_params',
    'check_estimators_dtypes', 'check_classifiers_classes',
    'check_classifiers_train'
]

INSTANCES = {
    'LightGbmBinaryClassifier':
    LightGbmBinaryClassifier(min_data_per_group=1, min_data_per_leaf=1),
    'LightGbmClassifier':
    LightGbmClassifier(min_data_per_group=1, min_data_per_leaf=1),
    'LightGbmRegressor':
    LightGbmRegressor(min_data_per_group=1, min_data_per_leaf=1),
    'LightGbmRanker':
    LightGbmRanker(min_data_per_group=1, min_data_per_leaf=1),
    'NGramFeaturizer':
    NGramFeaturizer(word_feature_extractor=n_gram()),
    'SkipFilter':
    SkipFilter(count=5),
    'TensorFlowScorer':
    TensorFlowScorer(model=os.path.join(this, '..', 'nimbusml', 'examples',
                                        'frozen_saved_model.pb'),
                     columns={'c': ['a', 'b']}),
}
# LightGbmBinaryClassifier
import numpy as np
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import LightGbmBinaryClassifier
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from sklearn.model_selection import train_test_split

# use the built-in data set 'infert' to create test and train data
#   Unnamed: 0  education   age  parity  induced  case  spontaneous  stratum  \
# 0           1        0.0  26.0     6.0      1.0   1.0          2.0      1.0
# 1           2        0.0  42.0     1.0      1.0   1.0          0.0      2.0
#   pooled.stratum education_str
# 0             3.0        0-5yrs
# 1             1.0        0-5yrs
np.random.seed(0)

df = get_dataset("infert").as_df()

# remove : and ' ' from column names, and encode categorical column
df.columns = [i.replace(': ', '') for i in df.columns]
df = (OneHotVectorizer() << 'education_str').fit_transform(df)

X_train, X_test, y_train, y_test = \
    train_test_split(df.loc[:, df.columns != 'case'], df['case'])

# train the LightGBM binary classifier and score the held-out split
lgbm = LightGbmBinaryClassifier().fit(X_train, y_train)
scores = lgbm.predict(X_test)

# evaluate the model
print('Accuracy:', np.mean(y_test == [i for i in scores]))
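
# class probabilities are also available; assuming predict_proba's usual
# ndarray return (one column per class), P(case=1) is the second column:
probs = lgbm.predict_proba(X_test)
print('P(case=1), first rows:', probs[:5, 1])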