    def test_ovr_accuracy(self):
        clfs = [
            # TODO: BUG 231482, why doesn't FM work?
            # FactorizationMachineBinaryClassifier(),
            LogisticRegressionBinaryClassifier(number_of_threads=1),
            FastForestBinaryClassifier(minimum_example_count_per_leaf=1,
                                       number_of_threads=1),
            GamBinaryClassifier(number_of_threads=1),
            AveragedPerceptronBinaryClassifier(),
            FastTreesBinaryClassifier(minimum_example_count_per_leaf=1,
                                      number_of_threads=1),
            FastLinearBinaryClassifier(number_of_threads=1),
            SgdBinaryClassifier(number_of_threads=1),
            # SymSgdBinaryClassifier(number_of_threads=1),
        ]

        for clf in clfs:
            ovr = OneVsRestClassifier(classifier=clf, use_probabilities=True)
            metrics = accuracy(ovr)
            accu = metrics['Accuracy(micro-avg)'][0]
            # Algorithms vary widely in accuracy, so use a low bar. This also
            # exercises the Pipeline + OVA + classifier combination.
            assert_greater(
                accu, 0.65,
                "{} accuracy is too low {}".format(clf.__class__, accu))

    def test_failing_predict_proba_called_with_use_probabilites_false(self):
        clfs = [
            # TODO: BUG 231482, why doesn't FM work?
            # FactorizationMachineBinaryClassifier(),
            LogisticRegressionBinaryClassifier(),
            FastForestBinaryClassifier(minimum_example_count_per_leaf=1),
            GamBinaryClassifier(),
            AveragedPerceptronBinaryClassifier(),
            FastTreesBinaryClassifier(minimum_example_count_per_leaf=1),
            LightGbmBinaryClassifier(),
            FastLinearBinaryClassifier(),
            SgdBinaryClassifier(),
            # SymSgdBinaryClassifier(),
        ]

        for clf in clfs:
            ovr = OneVsRestClassifier(classifier=clf, use_probabilities=False)
            check_predict_proba_when_trained_with_use_probabilites_false(
                self, ovr, clf)
Example #3
    def test_failing_decision_function_called_with_use_probabilites_true(self):
        clfs = [
            # TODO: BUG 231482, why doesn't FM work?
            # FactorizationMachineBinaryClassifier(),
            LogisticRegressionBinaryClassifier(),
            FastForestBinaryClassifier(minimum_example_count_per_leaf=1),
            GamBinaryClassifier(),
            AveragedPerceptronBinaryClassifier(),
            FastTreesBinaryClassifier(minimum_example_count_per_leaf=1),
            LightGbmBinaryClassifier(),
            FastLinearBinaryClassifier(),
            SgdBinaryClassifier(),
            # SymSgdBinaryClassifier(),
        ]

        for clf in clfs:
            ovr = OneVsRestClassifier(classifier=clf, use_probabilities=True)
            check_decision_function_when_trained_with_use_probabilites_true(
                self, ovr, clf)

    def test_decision_function_produces_distribution_not_sum_to_1(self):
        clfs = [
            # TODO: BUG 231482, why doesn't FM work?
            # FactorizationMachineBinaryClassifier(),
            LogisticRegressionBinaryClassifier(),
            FastForestBinaryClassifier(minimum_example_count_per_leaf=1),
            GamBinaryClassifier(),
            AveragedPerceptronBinaryClassifier(),
            FastTreesBinaryClassifier(minimum_example_count_per_leaf=1),
            LightGbmBinaryClassifier(),
            FastLinearBinaryClassifier(),
            SgdBinaryClassifier(),
            # SymSgdBinaryClassifier(),
        ]

        for clf in clfs:
            ovr = OneVsRestClassifier(classifier=clf, use_probabilities=False)
            scoremean = decfun_average(ovr)
            assert_not_equal(
                scoremean, 1.0,
                '{} raw scores should not sum to 1.0 over 3 classes'.format(
                    clf.__class__))

    def test_predict_proba_produces_distribution_sum_to_1(self):
        clfs = [
            # TODO: BUG 231482, why doesn't FM work?
            # FactorizationMachineBinaryClassifier(),
            LogisticRegressionBinaryClassifier(),
            FastForestBinaryClassifier(minimum_example_count_per_leaf=1),
            GamBinaryClassifier(),
            AveragedPerceptronBinaryClassifier(),
            FastTreesBinaryClassifier(minimum_example_count_per_leaf=1),
            LightGbmBinaryClassifier(),
            FastLinearBinaryClassifier(),
            SgdBinaryClassifier(),
            # TODO: why SymSgd probabilities do not sum to 1.0
            # SymSgdBinaryClassifier(),
        ]

        for clf in clfs:
            ovr = OneVsRestClassifier(classifier=clf)
            probmean = proba_average(ovr)
            assert_equal(
                probmean, 1.0,
                '{} probabilities {} do not sum to 1.0 over 3 classes'.format(
                    clf.__class__, probmean))
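
# The accuracy, decfun_average, and proba_average helpers used above are
# defined elsewhere in the original test module. A minimal sketch of what
# proba_average could look like, assuming nimbusml's iris sample data; the
# name and data handling below are illustrative, not the original code:
def proba_average_sketch(ovr):
    from nimbusml.datasets import get_dataset
    df = get_dataset('iris').as_df().drop(['Species', 'Setosa'], axis=1)
    X, y = df.drop('Label', axis=1), df['Label']
    # mean per-row sum of class probabilities; 1.0 for a true distribution
    probs = ovr.fit(X, y).predict_proba(X)
    return round(float(probs.sum(axis=1).mean()), 6)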
INSTANCES = {
    'AveragedPerceptronBinaryClassifier': AveragedPerceptronBinaryClassifier(
        feature=['education_str.0-5yrs', 'education_str.6-11yrs', 'education_str.12+ yrs']),
    'Binner': Binner(num_bins=3),
    'CharTokenizer': CharTokenizer(columns={'SentimentText_Transform': 'SentimentText'}),
    'ColumnConcatenator': ColumnConcatenator(columns={'Features': [
        'Sepal_Length',
        'Sepal_Width',
        'Petal_Length',
        'Petal_Width',
        'Setosa']}),
    'ColumnSelector': ColumnSelector(columns=['Sepal_Width', 'Sepal_Length']),
    'ColumnDuplicator': ColumnDuplicator(columns={'dup': 'Sepal_Width'}),
    'CountSelector': CountSelector(count=5, columns=['Sepal_Width']),
    'DateTimeSplitter': DateTimeSplitter(prefix='dt'),
    'FastForestBinaryClassifier': FastForestBinaryClassifier(feature=['Sepal_Width', 'Sepal_Length'],
                                                             label='Setosa'),
    'FastLinearBinaryClassifier': FastLinearBinaryClassifier(feature=['Sepal_Width', 'Sepal_Length'],
                                                             label='Setosa'),
    'FastTreesTweedieRegressor': FastTreesTweedieRegressor(label='Ozone'),
    'Filter': Filter(columns=['Petal_Length', 'Petal_Width']),
    'FromKey': Pipeline([
        ToKey(columns=['Sepal_Length']),
        FromKey(columns=['Sepal_Length'])
    ]),
    # GlobalContrastRowScaler currently requires a vector input to work
    'GlobalContrastRowScaler': Pipeline([
        ColumnConcatenator() << {
            'concated_columns': [
                'Petal_Length',
                'Sepal_Width',
                'Sepal_Length']},
        # scale the concatenated vector (output column name is illustrative)
        GlobalContrastRowScaler(columns={'normed_columns': 'concated_columns'})
    ]),
    # (remaining instances omitted)
}
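
# A sketch of how these overrides might be consumed by generic estimator
# checks; the lookup helper below is an illustrative assumption, not the
# original module's code:
def get_instance_sketch(component_class):
    # use the hand-configured instance when one is listed, else the default
    return INSTANCES.get(component_class.__name__, component_class())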

import unittest

from nimbusml import FileDataStream, Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import (FastForestBinaryClassifier, FastForestRegressor,
                               FastTreesBinaryClassifier, FastTreesRegressor,
                               FastTreesTweedieRegressor,
                               LightGbmBinaryClassifier, LightGbmRegressor)
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import (AveragedPerceptronBinaryClassifier,
                                   FastLinearBinaryClassifier,
                                   FastLinearClassifier, FastLinearRegressor,
                                   LogisticRegressionBinaryClassifier,
                                   LogisticRegressionClassifier,
                                   OnlineGradientDescentRegressor,
                                   SgdBinaryClassifier)
from nimbusml.multiclass import OneVsRestClassifier
from nimbusml.naive_bayes import NaiveBayesClassifier
from sklearn.utils.testing import assert_raises

train_file = get_dataset("uciadult_train").as_filepath()
categorical_columns = [
    'workclass', 'education', 'marital-status', 'occupation', 'relationship',
    'ethnicity', 'sex', 'native-country-region'
]
file_schema = 'sep=, col=label:R4:0 col=Features:R4:9-14 col=workclass:TX:1 ' \
              'col=education:TX:2 col=marital-status:TX:3 ' \
              'col=occupation:TX:4 col=relationship:TX:5 col=ethnicity:TX:6 ' \
              'col=sex:TX:7 col=native-country-region:TX:8 header+'
label_column = 'label'
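
# Build the input stream from the schema above (a sketch; the original module
# constructs the stream inside each test):
data_stream = FileDataStream(train_file, schema=file_schema)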
learners = [
    FastForestBinaryClassifier(),
    FastForestRegressor(),
    FastTreesBinaryClassifier(),
    FastTreesRegressor(),
    FastTreesTweedieRegressor(),
    LightGbmRegressor(),
    LightGbmBinaryClassifier(),
    AveragedPerceptronBinaryClassifier(),
    FastLinearBinaryClassifier(),
    FastLinearClassifier(),
    FastLinearRegressor(),
    LogisticRegressionBinaryClassifier(),
    LogisticRegressionClassifier(),
    OnlineGradientDescentRegressor(),
    SgdBinaryClassifier(),
    # SymSgdBinaryClassifier(),
    # (list continues in Example #9 below)
]
Example #8
# FastForestBinaryClassifier
import numpy as np
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import FastForestBinaryClassifier
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from sklearn.model_selection import train_test_split

# use the built-in data set 'infert' to create test and train data
#   Unnamed: 0  education   age  parity  induced  case  spontaneous  stratum  \
# 0           1        0.0  26.0     6.0      1.0   1.0          2.0      1.0
# 1           2        0.0  42.0     1.0      1.0   1.0          0.0      2.0
#   pooled.stratum education_str
# 0             3.0        0-5yrs
# 1             1.0        0-5yrs
np.random.seed(0)

df = get_dataset("infert").as_df()

# remove ': ' from column names, and one-hot encode the categorical column
df.columns = [i.replace(': ', '') for i in df.columns]
df = (OneHotVectorizer() << 'education_str').fit_transform(df)

X_train, X_test, y_train, y_test = \
    train_test_split(df.loc[:, df.columns != 'case'], df['case'])

fforest = FastForestBinaryClassifier().fit(X_train, y_train)
scores = fforest.predict(X_test)

# evaluate the model
print('Accuracy:', np.mean(y_test == [i for i in scores]))
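
# The probability outputs can be evaluated as well; a short sketch using
# sklearn's AUC metric (assuming predict_proba, which nimbusml binary
# classifiers expose):
from sklearn.metrics import roc_auc_score
probs = fforest.predict_proba(X_test)
print('AUC:', roc_auc_score(y_test, probs[:, 1]))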
Example #9
learners = [
    LogisticRegressionClassifier(),
    OnlineGradientDescentRegressor(),
    SgdBinaryClassifier(),
    # Error on linux
    # Unable to load shared library 'SymSgdNative' or one of its dependencies
    # SymSgdBinaryClassifier(),
    OrdinaryLeastSquaresRegressor(),
    PoissonRegressionRegressor(),
    OneVsRestClassifier(FastLinearBinaryClassifier()),
    GamRegressor(),
    GamBinaryClassifier(),
    PcaAnomalyDetector(),
    FactorizationMachineBinaryClassifier(),
    KMeansPlusPlus(n_clusters=2),
    NaiveBayesClassifier(),
    FastForestBinaryClassifier(number_of_trees=2),
    FastForestRegressor(number_of_trees=2),
    FastTreesBinaryClassifier(number_of_trees=2),
    FastTreesRegressor(number_of_trees=2),
    FastTreesTweedieRegressor(number_of_trees=2),
    LightGbmRegressor(number_of_iterations=2),
    LightGbmClassifier(),
    LightGbmBinaryClassifier(number_of_iterations=2)
]

learners_not_supported = [
    # PcaTransformer(),  # REVIEW: crashes
]


class TestModelSummary(unittest.TestCase):
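    # The test body was truncated in this listing; a minimal sketch of a
    # summary check over the learners list above, assuming Pipeline.summary()
    # (the method name below is illustrative):
    def test_model_summary_sketch(self):
        for learner in learners:
            pipeline = Pipeline(
                [OneHotVectorizer() << categorical_columns, learner])
            train_stream = FileDataStream(train_file, schema=file_schema)
            pipeline.fit(train_stream, label_column)
            # every trained pipeline should produce a non-empty summary
            assert pipeline.summary() is not None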

# FastForestBinaryClassifier with a FileDataStream input; the head of this
# example was truncated, so the imports and the start of the read_csv call
# are a plausible reconstruction:
import numpy
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import FastForestBinaryClassifier
from nimbusml.feature_extraction.categorical import OneHotVectorizer

# load the 'infert' sample with numeric columns as float32, renaming
# columns 0 and 5
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path, sep=',',
                               numeric_dtype=numpy.float32,
                               names={
                                   0: 'row_num',
                                   5: 'case'
                               })
print(data.head())
#    age  case education  induced  parity  pooled.stratum  row_num  ...
# 0  26.0   1.0    0-5yrs      1.0     6.0             3.0      1.0  ...
# 1  42.0   1.0    0-5yrs      1.0     1.0             1.0      2.0  ...
# 2  39.0   1.0    0-5yrs      2.0     6.0             4.0      3.0  ...
# 3  34.0   1.0    0-5yrs      2.0     4.0             2.0      4.0  ...
# 4  35.0   1.0   6-11yrs      1.0     3.0            32.0      5.0  ...

# define the training pipeline
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    FastForestBinaryClassifier(feature=['age', 'edu', 'induced'], label='case')
])

# train, predict, and evaluate
# TODO: Replace with CV
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)

# print predictions
print(predictions.head())
#   PredictedLabel      Score
# 0             0.0 -26.985743
# 1             0.0 -26.562090
# 2             0.0 -24.832508
# 3             0.0 -23.799389
# 4             0.0 -19.612534
# print evaluation metrics
print(metrics)