def test_pipeline_with_no_columns(self):
        trainData = pd.DataFrame({
            "Sentiment": [0, 1, 1, 0, 1, 1],
            "SentimentText": [
                "this is train ", "review ", "sentence ", "an apple",
                "sentence 22", "another one one one"
            ]
        })

        ppl = Pipeline([
            NGramFeaturizer(word_feature_extractor=n_gram()),
            LightGbmClassifier(min_data_per_leaf=1, min_data_per_group=1)
        ])
        assert ppl is not None

        # Bug 147697
        info = ppl.get_fit_info(trainData[["SentimentText"]],
                                trainData["Sentiment"])
        assert len(info) == 2
        assert len(info[0]) == 3
        ppl.fit(trainData[["SentimentText"]], trainData["Sentiment"])

        ppl = Pipeline([
            NGramFeaturizer(word_feature_extractor=n_gram()),
            LightGbmClassifier(min_data_per_leaf=1, min_data_per_group=1)
        ])
        assert ppl is not None
        ppl.fit(trainData[["SentimentText"]], np.array(trainData["Sentiment"]))
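The two assertions above pin down get_fit_info's return shape: a two-element
tuple whose first element has one entry per node in the training graph (the
input data plus the two pipeline steps). A minimal sketch of unpacking it,
assuming the node entries are dicts with a 'name' key (the test only
confirms the lengths):

# Sketch only: 'name' is an assumed key of the node dicts;
# the assertions above only verify the tuple and list lengths.
nodes, _ = ppl.get_fit_info(trainData[["SentimentText"]],
                            trainData["Sentiment"])
for node in nodes:
    print(node.get('name', node))  # one line per graph node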
Example #2
    def test_lightgbmclassifier(self):
        np.random.seed(0)
        train_file = get_dataset('wiki_detox_train').as_filepath()
        (train,
         label) = get_X_y(train_file,
                          label_column='Sentiment',
                          sep='\t',
                          encoding="utf-8")
        X_train, X_test, y_train, y_test = train_test_split(
            train['SentimentText'], label)

        # map text reviews to vector space
        texttransform = NGramFeaturizer(
            word_feature_extractor=n_gram(),
            vector_normalizer='None') << 'SentimentText'
        X_train = texttransform.fit_transform(X_train, max_slots=5000)
        X_test = texttransform.transform(X_test, max_slots=5000)

        mymodel = LightGbmClassifier().fit(X_train, y_train, verbose=0)
        scores = mymodel.predict(X_test)
        accuracy = np.mean(y_test.values.ravel() == scores.values)
        assert_greater(accuracy, 0.58,
                       "accuracy should be greater than %s" % 0.58)

    def test_pipeline_with_no_columns_raise(self):
        trainData = pd.DataFrame({
            "Sentiment": [0, 1, 1, 0, 1, 1],
            "SentimentText": [
                "this is train ", "review ", "sentence ", "an apple",
                "sentence 22", "another one one one"
            ]
        })

        ppl = Pipeline([
            NGramFeaturizer(word_feature_extractor=n_gram()),
            LightGbmClassifier()
        ])
        assert ppl is not None

        # Bug 147697
        info = ppl.get_fit_info(trainData[["SentimentText"]],
                                trainData["Sentiment"])
        assert len(info) == 2
        assert len(info[0]) == 3
        with self.assertRaises(RuntimeError):
            # Message
            # System.InvalidOperationException:
            # 'LightGBM Error, code is -1, error message is
            # 'Cannot construct Dataset since there are not useful features.
            # It should be at least two unique rows.
            # If the num_row (num_data) is small,
            # you can set min_data=1 and min_data_in_bin=1 to fix this.
            # Otherwise please make sure you are using the right dataset.'
            ppl.fit(trainData[["SentimentText"]], trainData["Sentiment"])
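The error message quoted above points at the remedy the first example in
this listing already uses: lowering LightGBM's per-leaf and per-group
minimums so the six-row frame can be binned. A minimal sketch with the same
toy data:

# Sketch: the identical pipeline fits once the minimums are lowered,
# exactly as test_pipeline_with_no_columns does above.
ppl = Pipeline([
    NGramFeaturizer(word_feature_extractor=n_gram()),
    LightGbmClassifier(min_data_per_leaf=1, min_data_per_group=1)
])
ppl.fit(trainData[["SentimentText"]], trainData["Sentiment"])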
Example #4
    def test_pass_predict_proba_multiclass_with_pipeline(self):
        algos = [
            LogisticRegressionClassifier(),
            FastLinearClassifier(),
            LightGbmClassifier()
        ]
        for algo in algos:
            assert_almost_equal(proba_sum(Pipeline([algo])),
                                38.0,
                                decimal=3,
                                err_msg=invalid_predict_proba_output)
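proba_sum is a test helper that is not part of this listing. Because each
row of predict_proba sums to one, the expected value of 38.0 implies the
real helper scores 38 rows. A plausible sketch, with the synthetic data and
the helper body both being assumptions:

import numpy as np
import pandas as pd

def proba_sum(pipeline):
    # Hypothetical helper: the real one is not shown in this listing.
    # Rows of predict_proba sum to 1, so the result equals the number
    # of scored rows (38 in the real test, 120 here).
    rng = np.random.RandomState(0)
    X = pd.DataFrame(rng.rand(120, 4), columns=['f0', 'f1', 'f2', 'f3'])
    y = pd.Series(rng.randint(0, 3, 120))
    pipeline.fit(X, y)
    return pipeline.predict_proba(X).sum()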
Example #5
def train_data_type_ppl(fit_X_type=None, fit_Y_type=None, predict_X_type=None):
    data = [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [1.0, 2.0, 3.0], [2.0, 2.0, 2.0]]
    label = [1, 0, 1, 1]
    if fit_X_type == "sparse":
        model = Pipeline([Binner(), LightGbmClassifier(minimum_example_count_per_leaf=1)])
    else:
        model = Pipeline([Binner(), LogisticRegressionBinaryClassifier()])
    data_with_new_type = transform_data(data, fit_X_type)
    label_with_new_type = transform_data(label, fit_Y_type)
    model.fit(data_with_new_type, label_with_new_type)
    metrics, scores = model.test(
        data_with_new_type, label_with_new_type, output_scores=True)
    test_data_with_new_type = transform_data(data, predict_X_type)
    return model.predict(test_data_with_new_type), scores, metrics
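transform_data is referenced throughout these snippets but never shown. A
plausible sketch, where the branch names are assumptions inferred from the
type strings used in these examples ("dataframe", "series", "array",
"sparse"):

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

def transform_data(data, data_type=None):
    # Hypothetical helper: the real one is not part of this listing.
    if data_type == 'dataframe':
        return pd.DataFrame(data)
    if data_type == 'series':
        return pd.Series(data)
    if data_type == 'array':
        return np.array(data)
    if data_type == 'sparse':
        return csr_matrix(np.array(data))
    return data  # None: hand the raw lists through unchanged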
    def test_syntax1_passing(self):
        df, X, y = self.get_simple_df()
        exp = Pipeline([
            OneHotVectorizer() << {
                'f1': 'education2'
            },
            OneHotVectorizer(max_num_terms=2) << {
                'f3': 'workclass'
            },
            LightGbmClassifier(min_data_per_leaf=1) << ['f1', 'f3']
        ])
        exp.fit(X, y)
        res = exp.transform(X)
        assert res.shape == (5, 16)
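The << operator used above is nimbusml's column-selection shorthand; the
same input-to-output mapping can also be written with the columns keyword,
as Example #14 below does:

# Equivalent to OneHotVectorizer() << {'f1': 'education2'}:
# produce output column 'f1' from input column 'education2'.
OneHotVectorizer(columns={'f1': 'education2'})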
Example #7
def train_data_type_ppl(fit_X_type=None, fit_Y_type=None, predict_X_type=None):
    data = [
        "This is sentence 1", "Talk about second", "Thrid one",
        "Final example."
    ]
    label = [1, 0, 1, 1]
    model = Pipeline([
        NGramFeaturizer(),
        LightGbmClassifier(min_data_per_leaf=1, n_thread=1)
    ])
    data_with_new_type = transform_data(data, fit_X_type)
    label_with_new_type = transform_data(label, fit_Y_type)
    model.fit(data_with_new_type, label_with_new_type)
    metrics, scores = model.test(data_with_new_type,
                                 label_with_new_type,
                                 output_scores=True)
    test_data_with_new_type = transform_data(data, predict_X_type)
    return model.predict(test_data_with_new_type), scores, metrics
    def test_pipeline_name_error(self):
        trainData = pd.DataFrame({
            "Sentiment": [0, 1, 1, 0, 1, 1],
            "SentimentText": [
                "this is train ", "review ", "sentence ", "an apple",
                "sentence 22", "another one one one"
            ]
        })
        NGramFeaturizer(word_feature_extractor=n_gram()).fit_transform(
            trainData[["SentimentText"]])

        msg = "Parameters ['NumLeaves', 'min_data', 'min_data_in_bin', " \
              "'minsplit'] are not allowed"
        with self.assertRaises(NameError, msg=msg):
            LightGbmClassifier(min_data=1,
                               min_data_in_bin=1,
                               min_data_per_leaf=1,
                               minsplit=1,
                               NumLeaves=2)
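The hyperparameter spelling that is accepted depends on the nimbusml
release, and both spellings appear in this listing: min_data_per_leaf in
the older-API snippets and minimum_example_count_per_leaf in the newer
ones. A spelling from the other release's API can raise the same NameError
tested above:

# Accepted spellings, as used by the other examples in this listing:
LightGbmClassifier(min_data_per_leaf=1)               # older releases
LightGbmClassifier(minimum_example_count_per_leaf=1)  # newer releases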
Example #9
def train_data_type_single(fit_X_type="dataframe",
                           fit_Y_type=None,
                           predict_X_type=None):
    data = [[1, 2, 3], [2, 3, 4], [1, 2, 3], [2, 2, 2]]
    label = [1, 0, 1, 1]
    if fit_X_type == "sparse":
        model = LightGbmClassifier(minimum_example_count_per_leaf=1)
    else:
        model = LogisticRegressionBinaryClassifier()
    data_with_new_type = transform_data(data, fit_X_type)
    label_with_new_type = transform_data(label, fit_Y_type)
    model.fit(data_with_new_type, label_with_new_type)
    test_data_with_new_type = transform_data(data, predict_X_type)
    return model.predict(test_data_with_new_type)
Example #10
NOBINARY_CHECKS = [
    'check_estimator_sparse_data', 'check_dtype_object',
    'check_fit_score_takes_y', 'check_fit2d_predict1d', 'check_fit1d_1feature',
    'check_dont_overwrite_parameters', 'check_supervised_y_2d',
    'check_estimators_fit_returns_self', 'check_estimators_overwrite_params',
    'check_estimators_dtypes', 'check_classifiers_classes',
    'check_classifiers_train'
]

INSTANCES = {
    'LightGbmBinaryClassifier':
    LightGbmBinaryClassifier(minimum_example_count_per_group=1,
                             minimum_example_count_per_leaf=1),
    'LightGbmClassifier':
    LightGbmClassifier(minimum_example_count_per_group=1,
                       minimum_example_count_per_leaf=1),
    'LightGbmRegressor':
    LightGbmRegressor(minimum_example_count_per_group=1,
                      minimum_example_count_per_leaf=1),
    'LightGbmRanker':
    LightGbmRanker(minimum_example_count_per_group=1,
                   minimum_example_count_per_leaf=1),
    'NGramFeaturizer':
    NGramFeaturizer(word_feature_extractor=n_gram()),
    'SkipFilter':
    SkipFilter(count=5),
    'TakeFilter':
    TakeFilter(count=100000),
    'IidSpikeDetector':
    IidSpikeDetector(columns=['F0']),
    'IidChangePointDetector':
    IidChangePointDetector(columns=['F0']),  # assumed arg, mirroring
                                             # IidSpikeDetector above;
                                             # the listing is cut off here
}
Example #11
###############################################################################
# LightGbmClassifier
import numpy as np
import pandas as pd
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import LightGbmClassifier
from sklearn.model_selection import train_test_split

np.random.seed(0)

# use 'iris' data set to create test and train data
df = get_dataset("iris").as_df()
print(df.head())
#    Sepal_Length  Sepal_Width  Petal_Length  Petal_Width Label Species  Setosa
# 0           5.1          3.5           1.4          0.2     0  setosa     1.0
# 1           4.9          3.0           1.4          0.2     0  setosa     1.0

df.drop(['Species'], inplace=True, axis=1)

X_train, X_test, y_train, y_test = \
    train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])
lr = LightGbmClassifier().fit(X_train, y_train)

scores = lr.predict(X_test)
scores = pd.to_numeric(scores)

# evaluate the model
print('Accuracy:', np.mean(y_test.values == scores.values))
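For per-class probabilities rather than hard labels, the classifier also
exposes predict_proba (the same method Example #4 exercises via
proba_sum); a short sketch:

# Sketch: probability of each class for the same test rows.
probs = lr.predict_proba(X_test)  # one column per class label
print(probs[:2])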
Example #12
learners_not_supported = [
    NaiveBayesClassifier(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    KMeansPlusPlus(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    FactorizationMachineBinaryClassifier(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    PcaAnomalyDetector(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    # PcaTransformer(), # REVIEW: crashes
    GamBinaryClassifier(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    GamRegressor(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    LightGbmClassifier(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    # LightGbmRanker(), # REVIEW: crashes
    OneVsRestClassifier(FastLinearBinaryClassifier()),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
]


class TestModelSummary(unittest.TestCase):
    def test_model_summary(self):
        for learner in learners:
            pipeline = Pipeline(
                [OneHotVectorizer() << categorical_columns, learner])
            train_stream = FileDataStream(train_file, schema=file_schema)
            pipeline.fit(train_stream, label_column)
            pipeline.summary()
Example #13
OMITTED_CHECKS_ALWAYS = 'check_estimators_nan_inf'

NOBINARY_CHECKS = [
    'check_estimator_sparse_data', 'check_dtype_object',
    'check_fit_score_takes_y', 'check_fit2d_predict1d', 'check_fit1d_1feature',
    'check_dont_overwrite_parameters', 'check_supervised_y_2d',
    'check_estimators_fit_returns_self', 'check_estimators_overwrite_params',
    'check_estimators_dtypes', 'check_classifiers_classes',
    'check_classifiers_train'
]

INSTANCES = {
    'LightGbmBinaryClassifier':
    LightGbmBinaryClassifier(min_data_per_group=1, min_data_per_leaf=1),
    'LightGbmClassifier':
    LightGbmClassifier(min_data_per_group=1, min_data_per_leaf=1),
    'LightGbmRegressor':
    LightGbmRegressor(min_data_per_group=1, min_data_per_leaf=1),
    'LightGbmRanker':
    LightGbmRanker(min_data_per_group=1, min_data_per_leaf=1),
    'NGramFeaturizer':
    NGramFeaturizer(word_feature_extractor=n_gram()),
    'SkipFilter':
    SkipFilter(count=5),
    'TensorFlowScorer':
    TensorFlowScorer(model=os.path.join(this, '..', 'nimbusml', 'examples',
                                        'frozen_saved_model.pb'),
                     columns={'c': ['a', 'b']}),
}

MULTI_OUTPUT_EX = [
Example #14
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import LightGbmClassifier
from nimbusml.ensemble.booster import Dart
from nimbusml.feature_extraction.categorical import OneHotVectorizer

path = get_dataset('infert').as_filepath()

data = FileDataStream.read_csv(path)
print(data.head())
#    age  case education  induced  parity ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6 ...       1            2  ...
# 1   42     1    0-5yrs        1       1 ...       2            0  ...
# 2   39     1    0-5yrs        2       6 ...       3            0  ...
# 3   34     1    0-5yrs        2       4 ...       4            0  ...
# 4   35     1   6-11yrs        1       3 ...       5            1  ...

# define the training pipeline
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    LightGbmClassifier(feature=['parity', 'edu'],
                       label='induced',
                       booster=Dart(reg_lambda=0.1))
])

# train, predict, and evaluate
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)

# print predictions
print(predictions.head())
#   PredictedLabel   Score.0   Score.1   Score.2
# 0               2  0.070722  0.145439  0.783839
# 1               0  0.737733  0.260116  0.002150
# 2               2  0.070722  0.145439  0.783839
# 3               0  0.490715  0.091749  0.417537
# 4               0  0.562419  0.197818  0.239763
# print evaluation metrics
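print(metrics)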