Beispiel #1
0
 def test_ensemble_rejects_estimators_with_incorrect_type(self):
     """VotingRegressor must reject a classifier among its estimators.

     Two regressors and one LogisticRegressionClassifier are handed to
     VotingRegressor; constructing it is expected to raise. The test
     fails when construction succeeds.
     """
     regressors = [
         OrdinaryLeastSquaresRegressor(**olsrArgs),
         OnlineGradientDescentRegressor(**ogdArgs),
     ]
     classifier = LogisticRegressionClassifier()
     with self.assertRaises(
             Exception,
             msg='VotingRegressor should only work with regressors.') as ctx:
         VotingRegressor(estimators=regressors + [classifier],
                         combiner='Average')
     # Keep the rejection reason visible in the test output.
     print(ctx.exception)
Beispiel #2
0
 def test_pass_predict_proba_multiclass_with_pipeline(self):
     """Check proba_sum for each multiclass learner wrapped in a Pipeline.

     Each learner is placed alone in a Pipeline, and the value returned by
     proba_sum for that pipeline must equal 38.0 to three decimal places.
     """
     classifiers = (
         LogisticRegressionClassifier(),
         FastLinearClassifier(),
         LightGbmClassifier(),
     )
     for classifier in classifiers:
         wrapped = Pipeline([classifier])
         total = proba_sum(wrapped)
         assert_almost_equal(total, 38.0, decimal=3,
                             err_msg=invalid_predict_proba_output)
Beispiel #3
0
 def test_lr_named_steps_iris(self):
     """Fit a pipeline built from named (name, step) tuples on iris data.

     Only the first two iris features are used. The test passes when the
     head of the prediction output contains five rows.
     """
     iris = load_iris()
     # Restrict the data to its first two feature columns.
     features = iris.data[:, :2]
     df = pd.DataFrame(features, columns=['X1', 'X2'])
     df['Label'] = iris.target

     scaler = MeanVarianceScaler() << ['X1', 'X2']
     classifier = LogisticRegressionClassifier() << ['X1', 'X2']
     pipe = nimbusmlPipeline([('norm', scaler), ('lr', classifier)])
     pipe.fit(df)

     first_rows = pipe.predict(df).head()
     assert len(first_rows) == 5
Beispiel #4
0
    def test_score_multiclass(self):
        """Check Pipeline.score for LogisticRegressionClassifier on iris.

        The 'Species' column is dropped and 'Label' is rewritten to 1 where
        it equals 1 and 0 elsewhere. NumPy's global RNG is seeded before the
        split (presumably so train_test_split is reproducible). The score on
        the test part must match the recorded value to five decimals.
        """
        np.random.seed(0)
        df = get_dataset("iris").as_df()
        df.drop(['Species'], inplace=True, axis=1)
        df.Label = [int(x == 1) for x in df.Label]

        feature_frame = df.loc[:, df.columns != 'Label']
        label_series = df['Label']
        X_train, X_test, y_train, y_test = train_test_split(
            feature_frame, label_series)

        expected = 0.7631578947368421
        e = Pipeline([LogisticRegressionClassifier(train_threads=1)])
        e.fit(X_train, y_train.to_frame())
        metrics = e.score(X_test, y_test)
        print(metrics)
        assert_almost_equal(
            metrics, expected, decimal=5,
            err_msg="Accuracy(micro-avg) should be %s" % expected)
Beispiel #5
0
    def test_metrics_evaluate_multiclass(self):
        """Check the metrics returned by Pipeline.test on iris.

        The 'Species' column is dropped and 'Label' is rewritten to 1 where
        it equals 1 and 0 elsewhere. A LogisticRegressionClassifier pipeline
        is fitted on the training part, and the first row of each metric
        column from test() is compared with a recorded value.
        """
        np.random.seed(0)
        df = get_dataset("iris").as_df()
        df.drop(['Species'], inplace=True, axis=1)
        df.Label = [int(x == 1) for x in df.Label]
        features = df.loc[:, df.columns != 'Label']
        X_train, X_test, y_train, y_test = train_test_split(
            features, df['Label'])

        e = Pipeline([LogisticRegressionClassifier()])
        e.fit(X_train, y_train.to_frame(), verbose=0)
        metrics, _ = e.test(X_test, y_test)

        # TODO: debug fluctuations, and increase decimal precision on checks
        # Each entry: (metric column, expected value, decimals compared).
        expectations = (
            ('Accuracy(micro-avg)', 0.763, 1),
            ('Accuracy(macro-avg)', 0.718, 1),
            ('Log-loss', 0.419, 3),
            ('Log-loss reduction', 0.38476, 3),
            ('(class 0)', 0.223, 1),
            ('(class 1)', 0.688, 1),
        )
        for column, expected, places in expectations:
            assert_almost_equal(
                metrics[column][0], expected, decimal=places,
                err_msg="%s should be %s" % (column, expected))
Beispiel #6
0
 def test_pass_predict_proba_multiclass(self):
     """Check proba_sum for a bare LogisticRegressionClassifier.

     The value returned by proba_sum must equal 38.0 to three decimals.
     """
     total = proba_sum(LogisticRegressionClassifier())
     assert_almost_equal(total, 38.0, decimal=3,
                         err_msg=invalid_predict_proba_output)
Beispiel #7
0
 def test_fail_decision_function_multiclass_with_pipeline(self):
     """Run the unsupported-decision_function check on a Pipeline.

     A Pipeline holding only LogisticRegressionClassifier is passed to
     check_unsupported_decision_function with the module-level
     train/test data.
     """
     pipeline = Pipeline([LogisticRegressionClassifier()])
     check_unsupported_decision_function(self, pipeline, X_train, y_train,
                                         X_test)
Beispiel #8
0
 def test_fail_decision_function_multiclass(self):
     """Run the unsupported-decision_function check on the bare learner.

     A LogisticRegressionClassifier is passed to
     check_unsupported_decision_function with the module-level
     train/test data.
     """
     classifier = LogisticRegressionClassifier()
     check_unsupported_decision_function(self, classifier, X_train, y_train,
                                         X_test)
Beispiel #9
0
# read the 'infert' data set from its file path as a FileDataStream
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path)

print(data.head())
#    age  case education  induced  parity ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6 ...       1            2  ...
# 1   42     1    0-5yrs        1       1 ...       2            0  ...
# 2   39     1    0-5yrs        2       6 ...       3            0  ...
# 3   34     1    0-5yrs        2       4 ...       4            0  ...
# 4   35     1   6-11yrs        1       3 ...       5            1  ...

# training pipeline: one-hot encode 'education' as 'edu', then learn
# 'induced' from the 'parity' and 'edu' features
steps = [
    OneHotVectorizer(columns={'edu': 'education'}),
    LogisticRegressionClassifier(feature=['parity', 'edu'], label='induced'),
]
pipeline = Pipeline(steps)

# train on the data, then predict and evaluate on the same data
# TODO: Replace with CV
trained = pipeline.fit(data)
metrics, predictions = trained.test(data, output_scores=True)

# show the first predictions
print(predictions.head())
#   PredictedLabel   Score.0   Score.1   Score.2
# 0               2  0.171122  0.250151  0.578727
# 1               0  0.678313  0.220665  0.101022
# 2               2  0.171122  0.250151  0.578727
# 3               0  0.360849  0.289190  0.349961
# 4               0  0.556921  0.260420  0.182658
# the evaluation metrics are held in `metrics` (not printed in this excerpt)
Beispiel #10
0
###############################################################################
# LogisticRegressionClassifier
import numpy as np
import pandas as pd
from nimbusml.datasets import get_dataset
from nimbusml.linear_model import LogisticRegressionClassifier
from sklearn.model_selection import train_test_split

# build train and test data from the 'iris' data set
#    Sepal_Length  Sepal_Width  Petal_Length  Petal_Width Label Species  Setosa
# 0           5.1          3.5           1.4          0.2     0  setosa     1.0
# 1           4.9          3.0           1.4          0.2     0  setosa     1.0
np.random.seed(0)

df = get_dataset("iris").as_df()
df.drop(['Species'], inplace=True, axis=1)

# every column except 'Label' is a feature
features = df.loc[:, df.columns != 'Label']
labels = df['Label']
X_train, X_test, y_train, y_test = train_test_split(features, labels)

classifier = LogisticRegressionClassifier()
lr = classifier.fit(X_train, y_train)

# predictions for the held-out rows, converted to numeric values
scores = pd.to_numeric(lr.predict(X_test))

# evaluate the model: fraction of test labels that match the predictions
matches = y_test == list(scores)
print('Accuracy:', np.mean(matches))
Beispiel #11
0
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import LogisticRegressionClassifier, \
    FastLinearRegressor
from nimbusml.model_selection import CV
from nimbusml.preprocessing.missing_values import Indicator, Handler

# Case 1: Default usage of CV

# read 'infert' with a schema that uses float32 for numeric columns
path = get_dataset('infert').as_filepath()
schema = DataSchema.read_schema(path, numeric_dtype=np.float32)
data = FileDataStream.read_csv(path, schema=schema)

encoder = OneHotVectorizer(columns={'edu': 'education'})
learner = LogisticRegressionClassifier(
    feature=['age', 'spontaneous', 'edu'], label='induced')
pipeline = Pipeline([encoder, learner])

# Do 3-fold cross-validation
cross_validator = CV(pipeline)
cv_results = cross_validator.fit(data, cv=3)

# print summary statistic of metrics
metrics_summary = cv_results['metrics_summary']
print(metrics_summary)

# print metrics for all folds
fold_metrics = cv_results['metrics']
print(fold_metrics)

# print confusion matrix for fold 1
cm = cv_results['confusion_matrix']
fold_one = cm[cm.Fold == 1]
print(fold_one)
              'col=sex:TX:7 col=native-country-region:TX:8 header+'
label_column = 'label'

# Learner classes, each instantiated below with default arguments and in
# this order.
_learner_classes = (
    FastForestBinaryClassifier,
    FastForestRegressor,
    FastTreesBinaryClassifier,
    FastTreesRegressor,
    FastTreesTweedieRegressor,
    LightGbmRegressor,
    LightGbmBinaryClassifier,
    AveragedPerceptronBinaryClassifier,
    FastLinearBinaryClassifier,
    FastLinearClassifier,
    FastLinearRegressor,
    LogisticRegressionBinaryClassifier,
    LogisticRegressionClassifier,
    OnlineGradientDescentRegressor,
    SgdBinaryClassifier,
    # SymSgdBinaryClassifier is left out, as in the original list.
    OrdinaryLeastSquaresRegressor,
    PoissonRegressionRegressor,
)
learners = [learner_class() for learner_class in _learner_classes]

learners_not_supported = [
    NaiveBayesClassifier(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    KMeansPlusPlus(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    FactorizationMachineBinaryClassifier(),
    PcaAnomalyDetector(),
# 0           5.1          3.5           1.4          0.2     0  setosa     1.0
# 1           4.9          3.0           1.4          0.2     0  setosa     1.0
np.random.seed(0)
df = get_dataset("iris").as_df()

X_train, X_test, y_train, y_test = \
    train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])

mycols = [
    'Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Setosa'
]

# Experiment 1:
# drop 'Species' column using ColumnDropper Transform
# select mycols for training using ColumnConcatenator transform
dropcols = ColumnDropper() << 'Species'
concat = ColumnConcatenator() << {Role.Feature: mycols}

pipeline = Pipeline([dropcols, concat, LogisticRegressionClassifier()])
pipeline.fit(X_train, y_train)
scores1 = pipeline.predict(X_test)

# Experiment 2:
# select mycols using ColumnSelector transform
select = ColumnSelector() << mycols
pipeline2 = Pipeline([select, LogisticRegressionClassifier()])
# Fit and predict with pipeline2 itself; reusing `pipeline` here would only
# repeat experiment 1 and make the comparison below meaningless.
pipeline2.fit(X_train, y_train)
scores2 = pipeline2.predict(X_test)

# Verify that we get identical results in both Experiments
print(scores1.head())
print(scores2.head())