Beispiel #1
0
    def test_syntax12_mixed2(self):
        """Fit a regressor with feature, weight and label roles, then predict.

        The feature and weight columns are given as constructor arguments,
        while the label is attached with ``<< {Role.Label: 'y'}``.
        """
        frame = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'weight': [10., 1., 1., 1., 1.],
            'y': [1.1, 2.2, 1.24, 3.4, 3.4],
        })

        encoder = OneHotVectorizer(columns=['workclass', 'education'])
        merger = Concat(columns={'Feature': ['workclass', 'education']})
        learner = FastTreesRegressor(
            num_trees=5, feature='Feature', weight='weight')
        pipeline = Pipeline([encoder, merger, learner << {Role.Label: 'y'}])
        pipeline.fit(frame, verbose=0)

        # The last node is the trainer; it should expose the resolved roles.
        trainer = pipeline.nodes[-1]
        assert trainer.feature_column_ == 'Feature'
        assert trainer.label_column_ == 'y'
        assert trainer.weight_column_ == 'weight'

        # Per the original notes, y and weight must still be present when
        # predicting, so both are overwritten with a fake value; the weight
        # is reportedly not taken into account by the prediction.
        for column in ('y', 'weight'):
            frame[column] = -5
        scores = pipeline.predict(frame)
        assert isinstance(scores, pandas.DataFrame)
        assert list(scores.columns) == ['Score']
        assert scores.shape == (5, 1)
Beispiel #2
0
    def test_metrics_evaluate_regressor(self):
        """Compare the metrics returned by ``Pipeline.test`` with references.

        The iris frame loses its 'Species' column, the label is reduced to
        0/1, and a default FastTreesRegressor is trained on one split and
        evaluated on the other.
        """
        np.random.seed(0)
        df = get_dataset("iris").as_df()
        df.drop(['Species'], inplace=True, axis=1)
        df['Label'] = [int(value == 1) for value in df['Label']]

        features = df.loc[:, df.columns != 'Label']
        X_train, X_test, y_train, y_test = train_test_split(
            features, df['Label'])

        pipeline = Pipeline([FastTreesRegressor()])
        pipeline.fit(X_train, y_train.to_frame(), verbose=0)
        metrics, _ = pipeline.test(X_test, y_test)

        # TODO: debug fluctuations, and increase decimal precision on checks
        checks = (
            ('L1(avg)', 0.107, "L1 loss should be %s"),
            ('L2(avg)', 0.0453, "L2(avg) should be %s"),
            ('Loss-fn(avg)', 0.0453, "Loss-fn(avg)loss should be %s"),
        )
        for metric_name, expected, template in checks:
            assert_almost_equal(metrics[metric_name][0],
                                expected,
                                decimal=1,
                                err_msg=template % expected)
Beispiel #3
0
    def test_label_column_defaults_to_label_when_label_column_in_input_data(
            self):
        """Dry-run a fit on a frame that contains a 'Label' column.

        No label role is given to the regressor. The JSON returned by the
        dry run is parsed and passed to ``verify_regressor_nodes`` together
        with the expected label, feature columns and trainer name.
        """
        frame = pd.DataFrame({
            'c1': [2, 3, 4, 5],
            'c2': [3, 4, 5, 6],
            'c3': [4, 5, 6, 7],
            'Label': [0, 1, 2, 1],
        })

        graph_json = Pipeline([FastTreesRegressor()]).fit(frame, dry_run=True)
        graph = json.loads(graph_json)

        self.verify_regressor_nodes(
            graph, "Label", ["c1", "c2", "c3"], "Trainers.FastTreeRegressor")
Beispiel #4
0
    def test_score_regressor(self):
        """Check the value returned by ``Pipeline.score`` for a regressor.

        The iris frame loses its 'Species' column, the label is reduced to
        0/1, and a single-threaded FastTreesRegressor is trained on one split.
        The score on the held-out split is compared with a reference value
        to five decimal places.
        """
        np.random.seed(0)
        df = get_dataset("iris").as_df()
        df.drop(['Species'], inplace=True, axis=1)
        df.Label = [1 if x == 1 else 0 for x in df.Label]
        X_train, X_test, y_train, y_test = \
            train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])

        # Single source of truth for the reference value used in both the
        # comparison and the failure message.
        expected = 0.814061733686017

        lr = FastTreesRegressor(train_threads=1)
        e = Pipeline([lr])
        e.fit(X_train, y_train.to_frame())
        score = e.score(X_test, y_test)
        print(score)
        # The value under test is the output of Pipeline.score, so the
        # failure message names the score (it used to say "L1 loss").
        assert_almost_equal(score,
                            expected,
                            decimal=5,
                            err_msg="score should be %s" % expected)
 def test_syntax12_group(self):
     """Fit a regressor with a group-id column and check the stored roles.

     ``gr`` is converted with ``ToKey`` and passed to the learner through
     the ``group_id`` argument; the label is attached with ``<<``. After
     fitting, the trainer node must expose the role columns under the
     ``*_column_name_`` attributes and none of the older spellings.
     """
     frame = pandas.DataFrame({
         'education': ['A', 'B', 'A', 'B', 'A'],
         'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
         'gr': [0, 0, 1, 1, 1],
         'y': [1.1, 2.2, 1.24, 3.4, 3.4],
     })
     steps = [
         OneHotVectorizer(columns=['workclass', 'education']),
         Concat(columns={'Feature': ['workclass', 'education']}),
         ToKey() << 'gr',
         FastTreesRegressor(
             number_of_trees=5, feature='Feature', group_id='gr')
         << {Role.Label: 'y'},
     ]
     pipeline = Pipeline(steps)
     pipeline.fit(frame, verbose=0)

     trainer = pipeline.nodes[-1]
     for absent in ('feature_', 'group_id_'):
         assert not hasattr(trainer, absent)
     assert trainer.feature_column_name_ == 'Feature'
     assert trainer.label_column_name_ == 'y'
     for absent in ('group_id_column', 'groupid_column_', 'groupid_column'):
         assert not hasattr(trainer, absent)
     # On failure, list every attribute that does exist to ease debugging.
     if not hasattr(trainer, 'row_group_column_name_'):
         available = ", ".join(sorted(dir(trainer)))
         raise AssertionError("Attribute not found: {0}".format(available))
     assert trainer.row_group_column_name_ == 'gr'

     # Before predicting, y is overwritten and a 'weight' column is added,
     # both set to a fake value; the original notes say such columns are
     # required at prediction time even though they are not used.
     for column in ('y', 'weight'):
         frame[column] = -5
     scores = pipeline.predict(frame)
     assert isinstance(scores, pandas.DataFrame)
     assert list(scores.columns) == ['Score']
     assert scores.shape == (5, 1)
# Location of the UCI adult training data on disk.
train_file = get_dataset("uciadult_train").as_filepath()

# Text columns of the data set.
categorical_columns = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'ethnicity',
    'sex',
    'native-country-region',
]

# Loader schema: comma separated, header row present; 'label' comes from
# column 0, 'Features' from columns 9-14, the text columns from 1-8.
file_schema = (
    'sep=, col=label:R4:0 col=Features:R4:9-14 col=workclass:TX:1 '
    'col=education:TX:2 col=marital-status:TX:3 '
    'col=occupation:TX:4 col=relationship:TX:5 col=ethnicity:TX:6 '
    'col=sex:TX:7 col=native-country-region:TX:8 header+'
)
label_column = 'label'

# Learner instances, all built with default arguments.
# SymSgdBinaryClassifier() was commented out in the original list and
# stays excluded.
learners = [
    FastForestBinaryClassifier(),
    FastForestRegressor(),
    FastTreesBinaryClassifier(),
    FastTreesRegressor(),
    FastTreesTweedieRegressor(),
    LightGbmRegressor(),
    LightGbmBinaryClassifier(),
    AveragedPerceptronBinaryClassifier(),
    FastLinearBinaryClassifier(),
    FastLinearClassifier(),
    FastLinearRegressor(),
    LogisticRegressionBinaryClassifier(),
    LogisticRegressionClassifier(),
    OnlineGradientDescentRegressor(),
    SgdBinaryClassifier(),
    OrdinaryLeastSquaresRegressor(),
    PoissonRegressionRegressor(),
]
Beispiel #7
0
    # Error on linux
    # Unable to load shared library 'SymSgdNative' or one of its dependencies
    #SymSgdBinaryClassifier(),
    OrdinaryLeastSquaresRegressor(),
    PoissonRegressionRegressor(),
    OneVsRestClassifier(FastLinearBinaryClassifier()),
    GamRegressor(),
    GamBinaryClassifier(),
    PcaAnomalyDetector(),
    FactorizationMachineBinaryClassifier(),
    KMeansPlusPlus(n_clusters=2),
    NaiveBayesClassifier(),
    FastForestBinaryClassifier(number_of_trees=2), 
    FastForestRegressor(number_of_trees=2),
    FastTreesBinaryClassifier(number_of_trees=2),
    FastTreesRegressor(number_of_trees=2),
    FastTreesTweedieRegressor(number_of_trees=2),
    LightGbmRegressor(number_of_iterations=2),
    LightGbmClassifier(),
    LightGbmBinaryClassifier(number_of_iterations=2)
]

# Currently empty. PcaTransformer() is kept out of this list because it
# crashes (marked REVIEW in the original).
learners_not_supported = []


class TestModelSummary(unittest.TestCase):

    def test_model_summary(self):
        for learner in learners:
Beispiel #8
0
###############################################################################
# FastTreesRegressor
import numpy as np
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import FastTreesRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# use the built-in data set 'airquality' to create test and train data
#    Unnamed: 0  Ozone  Solar_R  Wind  Temp  Month  Day
# 0           1   41.0    190.0   7.4    67      5    1
# 1           2   36.0    118.0   8.0    72      5    2
np.random.seed(0)

# Rows without an Ozone value have no usable label, so drop them BEFORE
# filling the remaining gaps with 0. Filling first would turn every missing
# label into 0 and make the notnull() filter a no-op.
df = get_dataset("airquality").as_df()
df = df[df.Ozone.notnull()].fillna(0)

# every column except 'Ozone' is a feature; 'Ozone' is the target
X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:, df.columns != 'Ozone'], df['Ozone'])

# train a model and score
ftree = FastTreesRegressor().fit(X_train, y_train)
scores = ftree.predict(X_test)

# evaluate the model
print('R-squared fit:', r2_score(y_test, scores))
# Load the 'infert' data set from disk as a FileDataStream.
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path)
print(data.head())
#   age  case education  induced  parity  ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6 ...       1            2  ...
# 1   42     1    0-5yrs        1       1 ...       2            0  ...
# 2   39     1    0-5yrs        2       6 ...       3            0  ...
# 3   34     1    0-5yrs        2       4 ...       4            0  ...
# 4   35     1   6-11yrs        1       3 ...       5            1  ...

# Training pipeline: one-hot encode 'education' into 'edu', then regress
# 'age' on 'induced' and 'edu'.
encode_education = OneHotVectorizer(columns={'edu': 'education'})
age_regressor = FastTreesRegressor(feature=['induced', 'edu'], label='age')
pipeline = Pipeline([encode_education, age_regressor])

# Fit and evaluate on the same data, also returning the per-row scores.
# TODO: Replace with CV
fitted = pipeline.fit(data)
metrics, predictions = fitted.test(data, output_scores=True)

# Show the first predictions.
print(predictions.head())
#       Score
# 0  35.171112
# 1  35.171112
# 2  34.118595
# 3  34.118595
# 4  32.484325
# print evaluation metrics