    def test_lightgbmranker_scikit(self):
        np.random.seed(0)

        file_path = get_dataset("gen_tickettrain").as_filepath()

        # scikit-learn paradigm: load the data into a pandas DataFrame
        df = pd.read_csv(file_path, encoding='utf-8')
        df['group'] = df['group'].astype(np.uint32)

        # construct a scikit pipeline
        pipe = Pipeline([
            # the group_id column must be of key type
            ('lgbm', LightGbmRanker(feature=[
                'Class', 'dep_day', 'duration'], group_id='group'))
        ])

        # Train Scikit Pipeline
        X = df.drop(['rank'], axis=1)
        y = df['rank']
        pipe.fit(X, y)
        # Predict
        scores = pipe.predict(X)

        assert_almost_equal(
            scores.values,
            self.nimbusml_per_instance_scores_sampleinputextraction(),
            decimal=6)
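        # An equivalent sketch (an assumption, not part of the original
        # test): convert 'group' to key type inside the pipeline with a
        # ToKey step (from nimbusml.preprocessing) instead of relying on
        # the astype(np.uint32) cast above.
        pipe_tokey = Pipeline([
            ToKey(columns={'group': 'group'}),
            LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
                           group_id='group')
        ])
        pipe_tokey.fit(X, y)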
Example #2
 def test_pipeline_clone_dataframe_roles_shift_operator(self):
     pipe = Pipeline([
         LightGbmRanker(number_of_iterations=1, number_of_leaves=4) << {
             Role.Feature: features,
             Role.Label: 'label_1',
             Role.GroupId: 'group_2'}
     ])
     fit_test_clone_and_check(pipe, df, debug=False)
Example #3
 def test_pipeline_clone_dataframe_roles_arguments(self):
     pipe = Pipeline([
         LightGbmRanker(feature=features,
                        label='label_1',
                        group_id='group_2',
                        num_boost_round=1,
                        num_leaves=4)
     ])
     fit_test_clone_and_check(pipe, df)
Example #4
 def test_nofit_pipeline_clone(self):
     pipe = Pipeline([
         LightGbmRanker(feature=features,
                        label='label_1',
                        group_id='group_2',
                        number_of_iterations=1,
                        number_of_leaves=4)
     ])
     clone_and_check(pipe)
Example #5
 def test_pipeline_clone_filedatastream_roles_shift_operator(self):
     pipe = Pipeline([
         ToKey() << {'group_2': 'group_2'},
         LightGbmRanker(number_of_iterations=1, number_of_leaves=4) << {
             Role.Feature: features,
             Role.Label: 'label_1',
             Role.GroupId: 'group_2'}
     ])
     fit_test_clone_and_check(pipe, fds)
Example #6
 def test_nofit_pipeline_clone(self):
     pipe = Pipeline([
         LightGbmRanker(feature=features,
                        label='label_1',
                        group_id='group_2',
                        num_boost_round=1,
                        num_leaves=4)
     ])
     clone_and_check(pipe)
Example #7
 def test_pipeline_clone_filedatastream_roles_arguments(self):
     pipe = Pipeline([
         ToKey() << {'group_2': 'group_2'},
         LightGbmRanker(feature=features,
                        label='label_1',
                        group_id='group_2',
                        number_of_iterations=1,
                        number_of_leaves=4)
     ])
     fit_test_clone_and_check(pipe, fds)
Example #8
    def test_lightgbmranker_asdataframe(self):
        # Data file
        file_path = get_dataset("gen_tickettrain").as_filepath()

        df = pd.read_csv(file_path, encoding='utf-8')
        df['group'] = df['group'].astype(np.uint32)

        e = Pipeline([ToKey(columns={'rank': 'rank', 'group': 'group'}),
                      LightGbmRanker() << {
                          Role.Feature: ['Class', 'dep_day', 'duration'],
                          Role.Label: 'rank', Role.GroupId: 'group'}])

        e.fit(df)

        metrics, _ = e.test(df)
        assert_almost_equal(
            metrics['NDCG@1'][0],
            0.43571429,
            decimal=7,
            err_msg="NDCG@1 should be %s" %
                    0.43571429)
        assert_almost_equal(
            metrics['NDCG@2'][0],
            0.5128226,
            decimal=7,
            err_msg="NDCG@2 should be %s" %
                    0.5128226)
        assert_almost_equal(
            metrics['NDCG@3'][0],
            0.55168069,
            decimal=7,
            err_msg="NDCG@3 should be %s" %
                    0.55168069)
        assert_almost_equal(
            metrics['DCG@1'][0],
            4.688759,
            decimal=3,
            err_msg="DCG@1 should be %s" %
                    4.688759)
        assert_almost_equal(
            metrics['DCG@2'][0],
            9.012395,
            decimal=3,
            err_msg="DCG@2 should be %s" %
                    9.012395)
        assert_almost_equal(
            metrics['DCG@3'][0],
            11.446943,
            decimal=3,
            err_msg="DCG@3 should be %s" %
                    11.446943)
    @classmethod
    def setUpClass(self):
        adult_path = get_dataset('uciadult_train').as_filepath()
        self.classification_data = FileDataStream.read_csv(adult_path)
        binary_pipeline = Pipeline([
            OneHotVectorizer(columns=['education']),
            LogisticRegressionBinaryClassifier(feature=['age', 'education'],
                                               label='label',
                                               number_of_threads=1)
        ])
        self.binary_model = binary_pipeline.fit(self.classification_data)
        self.binary_pfi = self.binary_model.permutation_feature_importance(
            self.classification_data)
        classifier_pipeline = Pipeline([
            OneHotVectorizer(columns=['education']),
            FastLinearClassifier(feature=['age', 'education'],
                                 label='label',
                                 number_of_threads=1,
                                 shuffle=False)
        ])
        self.classifier_model = classifier_pipeline.fit(
            self.classification_data)
        self.classifier_pfi = self.classifier_model.permutation_feature_importance(
            self.classification_data)

        infert_path = get_dataset('infert').as_filepath()
        self.regression_data = FileDataStream.read_csv(infert_path)
        regressor_pipeline = Pipeline([
            OneHotVectorizer(columns=['education']),
            FastLinearRegressor(feature=['induced', 'education'],
                                label='age',
                                number_of_threads=1,
                                shuffle=False)
        ])
        self.regressor_model = regressor_pipeline.fit(self.regression_data)
        self.regressor_pfi = self.regressor_model.permutation_feature_importance(
            self.regression_data)

        ticket_path = get_dataset('gen_tickettrain').as_filepath()
        self.ranking_data = FileDataStream.read_csv(ticket_path)
        ranker_pipeline = Pipeline([
            ToKey(columns=['group']),
            LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
                           label='rank',
                           group_id='group',
                           random_state=0,
                           number_of_threads=1)
        ])
        self.ranker_model = ranker_pipeline.fit(self.ranking_data)
        self.ranker_pfi = self.ranker_model.permutation_feature_importance(
            self.ranking_data)
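
    # Hypothetical helper, not in the original suite: a minimal sketch of
    # inspecting the PFI tables computed above. It assumes
    # permutation_feature_importance returns a pandas DataFrame with a
    # 'FeatureName' column; the metric columns differ by task and by
    # nimbusml version.
    def print_pfi_summary(self):
        for name, pfi in [('binary', self.binary_pfi),
                          ('classifier', self.classifier_pfi),
                          ('regressor', self.regressor_pfi),
                          ('ranker', self.ranker_pfi)]:
            print(name, list(pfi.columns))
            print(pfi['FeatureName'])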
    def test_lightgbmranker_asfilestream(self):
        # Data file
        file_path = get_dataset("gen_tickettrain").as_filepath()

        # Pure-nimbusml paradigm
        train_stream = FileDataStream.read_csv(file_path, encoding='utf-8')

        # pipeline
        pipeline = Pipeline([
            # the group_id column must be of key type
            ToKey(columns={
                'rank': 'rank',
                'group': 'group'
            }),
            LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
                           label='rank',
                           group_id='group')
        ])

        # train
        pipeline.fit(train_stream)

        # test
        eval_stream = FileDataStream.read_csv(file_path)
        metrics, _ = pipeline.test(eval_stream)
        assert_almost_equal(metrics['NDCG@1'][0],
                            43.571429,
                            decimal=5,
                            err_msg="NDCG@1 should be %s" % 43.571429)
        assert_almost_equal(metrics['NDCG@2'][0],
                            51.28226,
                            decimal=5,
                            err_msg="NDCG@2 should be %s" % 51.28226)
        assert_almost_equal(metrics['NDCG@3'][0],
                            55.168069,
                            decimal=5,
                            err_msg="NDCG@3 should be %s" % 55.168069)
        assert_almost_equal(metrics['DCG@1'][0],
                            4.688759,
                            decimal=3,
                            err_msg="DCG@1 should be %s" % 4.688759)
        assert_almost_equal(metrics['DCG@2'][0],
                            9.012395,
                            decimal=3,
                            err_msg="DCG@2 should be %s" % 9.012395)
        assert_almost_equal(metrics['DCG@3'][0],
                            11.446943,
                            decimal=3,
                            err_msg="DCG@3 should be %s" % 11.446943)
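
        # Note: the NDCG values asserted here are on a 0-100 scale, while
        # test_lightgbmranker_asdataframe above asserts the same quantities
        # on a 0-1 scale (43.571429 vs. 0.43571429); the reported scale
        # appears to have differed across nimbusml releases, which is why
        # both forms occur among these examples.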
Example #11
 def check_cv_with_defaults_df(
         self,
         label_name='rank',
         group_id='group',
         features=['price', 'Class', 'dep_day', 'nbr_stops', 'duration'],
         **params):
     steps = [
         ToKey() << {
             group_id: group_id
         },
          LightGbmRanker(min_data_per_leaf=1,
                         feature=features,
                         label=label_name,
                         group_id=group_id)
     ]
     data = self.data_pandas()
     check_cv(pipeline=Pipeline(steps), X=data, **params)
Example #12
 def check_cv_with_defaults2(self,
                             label_name='Label',
                             group_id='GroupId',
                             features='Features_1',
                             **params):
     steps = [
         OneHotHashVectorizer(output_kind='Key') << {
             group_id: group_id
         },
         ColumnConcatenator() << {
             'Features': [features]
         },
         LightGbmRanker(min_data_per_leaf=1) << {
             Role.GroupId: group_id
         }
     ]
     data = self.data_wt_rename(label_name, group_id, features)
     check_cv(pipeline=Pipeline(steps), X=data, **params)
    def test_lightgbmranker_asdataframe_groupid(self):
        # Data file
        file_path = get_dataset("gen_tickettrain").as_filepath()

        df = pd.read_csv(file_path, encoding='utf-8')
        df['group'] = df['group'].astype(np.uint32)

        e = Pipeline([
            ToKey(columns={
                'rank': 'rank',
                'group': 'group'
            }),
            LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
                           label='rank',
                           group_id='group')
        ])

        e.fit(df)

        metrics, _ = e.test(df)
        assert_almost_equal(metrics['NDCG@1'][0],
                            43.571429,
                            decimal=5,
                            err_msg="NDCG@1 should be %s" % 43.571429)
        assert_almost_equal(metrics['NDCG@2'][0],
                            51.28226,
                            decimal=5,
                            err_msg="NDCG@2 should be %s" % 51.28226)
        assert_almost_equal(metrics['NDCG@3'][0],
                            55.168069,
                            decimal=5,
                            err_msg="NDCG@3 should be %s" % 55.168069)
        assert_almost_equal(metrics['DCG@1'][0],
                            4.688759,
                            decimal=3,
                            err_msg="DCG@1 should be %s" % 4.688759)
        assert_almost_equal(metrics['DCG@2'][0],
                            9.012395,
                            decimal=3,
                            err_msg="DCG@2 should be %s" % 9.012395)
        assert_almost_equal(metrics['DCG@3'][0],
                            11.446943,
                            decimal=3,
                            err_msg="DCG@3 should be %s" % 11.446943)
Example #14
 def check_cv_with_defaults(self,
                            label_name='Label',
                            group_id='GroupId',
                            features='Features_1',
                            **params):
     steps = [
         OneHotHashVectorizer(output_kind='Key') << {
             group_id: group_id
         },
          # even if all the needed roles are specified in the following
          # line, they are still not passed correctly
         LightGbmRanker(min_data_per_leaf=1) << {
             Role.GroupId: group_id,
             Role.Feature: features,
             Role.Label: label_name
         }
     ]
     data = self.data(label_name, group_id, features)
     check_cv(pipeline=Pipeline(steps), X=data, **params)
Example #15
 def check_cv_with_defaults2(self,
                             label_name='Label',
                             group_id='GroupId',
                             features='Features_1',
                             **params):
      # REVIEW: Replace ToKey() back with OneHotHashVectorizer() and reinstate the
      # metrics checks once issue https://github.com/dotnet/machinelearning/issues/1939
      # is resolved.
     params.pop('expected_metrics', None)
     steps = [
         ToKey() << {
             group_id: group_id
         },
         ColumnConcatenator() << {
             'Features': [features]
         },
         LightGbmRanker(min_data_per_leaf=1) << {
             Role.GroupId: group_id
         }
     ]
     data = self.data_wt_rename(label_name, group_id, features)
     check_cv(pipeline=Pipeline(steps), X=data, **params)
Example #16
 def check_cv_with_defaults(self,
                            label_name='Label',
                            group_id='GroupId',
                            features='Features_1',
                            **params):
      # REVIEW: Replace ToKey() back with OneHotHashVectorizer() and reinstate the
      # metrics checks once issue https://github.com/dotnet/machinelearning/issues/1939
      # is resolved.
     params.pop('expected_metrics', None)
     steps = [
         ToKey() << {
             group_id: group_id
         },
          # even if all the needed roles are specified in the following
          # line, they are still not passed correctly
         LightGbmRanker(min_data_per_leaf=1) << {
             Role.GroupId: group_id,
             Role.Feature: features,
             Role.Label: label_name
         }
     ]
     data = self.data(label_name, group_id, features)
     check_cv(pipeline=Pipeline(steps), X=data, **params)
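
 # Hypothetical variant, not in the original suite: a minimal sketch of the
 # workaround used elsewhere in these examples, passing the roles as
 # constructor arguments rather than via the << operator, which sidesteps
 # the role-passing issue noted in the comment above.
 def check_cv_with_ctor_roles(self,
                              label_name='Label',
                              group_id='GroupId',
                              features='Features_1',
                              **params):
     steps = [
         ToKey() << {
             group_id: group_id
         },
         LightGbmRanker(min_data_per_leaf=1,
                        feature=[features],
                        label=label_name,
                        group_id=group_id)
     ]
     data = self.data(label_name, group_id, features)
     check_cv(pipeline=Pipeline(steps), X=data, **params)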
Example #17
 def check_cv_with_non_defaults(self,
                                label_name='label',
                                group_id='groupid',
                                features='Features_1',
                                **params):
     steps = [
         ToKey(columns={
             'groupid2': group_id,
             'label2': label_name
         }),
         LightGbmRanker() << {
             Role.GroupId: 'groupid2',
             Role.Label: 'label2',
             Role.Feature: [features]
         }
     ]
     data = self.data(label_name, group_id, features)
     cv = CV(steps)
     results = cv.fit(data, groups='groupid', cv=4)
     check_cv_results(cv._learner_type,
                      results,
                      n_folds=4,
                      expected_metrics={})
Example #18
    def test_get_fit_info_ranker(self):
        file_path = get_dataset("gen_tickettrain").as_filepath()
        file_schema = 'sep=, col=Label_1:R4:0 col=GroupId_2:TX:1 ' \
                      'col=Features_3:R4:3-5'
        train_stream = FileDataStream(file_path, schema=file_schema)
        pipeline = Pipeline([
            ToKey() << {
                'GroupId_2': 'GroupId_2'
            },
            ColumnConcatenator() << {
                'Features': ['Features_3']
            },
            LightGbmRanker() << {
                Role.Feature: 'Features',
                Role.Label: 'Label_1',
                Role.GroupId: 'GroupId_2'
            }
        ])

        info = pipeline.get_fit_info(train_stream)
        last = info[0][-1]
        inp = last['inputs']
        assert 'GroupId:GroupId_2' in inp
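
        # A minimal sketch, not part of the original test: walk the full
        # get_fit_info structure. info[0] is a list of per-node summaries,
        # each a dict with keys such as 'name', 'inputs' and 'outputs'.
        for node in info[0]:
            print(node.get('name'), node.get('inputs'), node.get('outputs'))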
###############################################################################
# LightGbmRanker
import numpy as np
import pandas as pd
from nimbusml import Pipeline, Role
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import LightGbmRanker

np.random.seed(0)
file_path = get_dataset("gen_tickettrain").as_filepath()

df = pd.read_csv(file_path)
df['group'] = df['group'].astype(np.uint32)

X = df.drop(['rank'], axis=1)
y = df['rank']

e = Pipeline([
    LightGbmRanker() << {
        Role.Feature: ['Class', 'dep_day', 'duration'],
        Role.Label: 'rank',
        Role.GroupId: 'group'}
])

e.fit(df)

# test
metrics, scores = e.test(X, y, evaltype='ranking',
                         group_id='group', output_scores=True)
print(metrics)
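
# Per-instance scores come back alongside the metrics because
# output_scores=True was passed above; a minimal usage sketch:
print(scores.head())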
    'check_estimators_dtypes', 'check_classifiers_classes',
    'check_classifiers_train'
]

INSTANCES = {
    'LightGbmBinaryClassifier':
    LightGbmBinaryClassifier(minimum_example_count_per_group=1,
                             minimum_example_count_per_leaf=1),
    'LightGbmClassifier':
    LightGbmClassifier(minimum_example_count_per_group=1,
                       minimum_example_count_per_leaf=1),
    'LightGbmRegressor':
    LightGbmRegressor(minimum_example_count_per_group=1,
                      minimum_example_count_per_leaf=1),
    'LightGbmRanker':
    LightGbmRanker(minimum_example_count_per_group=1,
                   minimum_example_count_per_leaf=1),
    'NGramFeaturizer':
    NGramFeaturizer(word_feature_extractor=n_gram()),
    'SkipFilter':
    SkipFilter(count=5),
    'TakeFilter':
    TakeFilter(count=100000),
    'IidSpikeDetector':
    IidSpikeDetector(columns=['F0']),
    'IidChangePointDetector':
    IidChangePointDetector(columns=['F0']),
    'SsaSpikeDetector':
    SsaSpikeDetector(columns=['F0'], seasonal_window_size=2),
    'SsaChangePointDetector':
    SsaChangePointDetector(columns=['F0'], seasonal_window_size=2),
    'SsaForecaster':
Example #21
 # GlobalContrastRowScaler currently requires a vector input to work
 'GlobalContrastRowScaler': Pipeline([
     ColumnConcatenator() << {
         'concated_columns': [
             'Petal_Length',
             'Sepal_Width',
             'Sepal_Length']},
     GlobalContrastRowScaler(columns={'normed_columns': 'concated_columns'})
 ]),
 'Handler': Handler(replace_with='Mean', columns={'NewVals': 'Petal_Length'}),
 'IidSpikeDetector': IidSpikeDetector(columns=['Sepal_Length']),
 'IidChangePointDetector': IidChangePointDetector(columns=['Sepal_Length']),
 'Indicator': Indicator(columns={'Has_Nan': 'Petal_Length'}),
 'KMeansPlusPlus': KMeansPlusPlus(n_clusters=3, feature=['Sepal_Width', 'Sepal_Length']),
 'LightGbmRanker': LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
                                  label='rank',
                                  group_id='group'),
 'Loader': Loader(columns={'ImgPath': 'Path'}),
 'LpScaler': Pipeline([
     ColumnConcatenator() << {
         'concated_columns': [
             'Petal_Length',
             'Sepal_Width',
             'Sepal_Length']},
     LpScaler(columns={'normed_columns': 'concated_columns'})
 ]),
 'MutualInformationSelector': Pipeline([
     ColumnConcatenator(columns={'Features': ['Sepal_Width', 'Sepal_Length', 'Petal_Width']}),
     MutualInformationSelector(
         columns='Features',
         label='Label',
Example #22
    'check_fit_score_takes_y', 'check_fit2d_predict1d', 'check_fit1d_1feature',
    'check_dont_overwrite_parameters', 'check_supervised_y_2d',
    'check_estimators_fit_returns_self', 'check_estimators_overwrite_params',
    'check_estimators_dtypes', 'check_classifiers_classes',
    'check_classifiers_train'
]

INSTANCES = {
    'LightGbmBinaryClassifier':
    LightGbmBinaryClassifier(min_data_per_group=1, min_data_per_leaf=1),
    'LightGbmClassifier':
    LightGbmClassifier(min_data_per_group=1, min_data_per_leaf=1),
    'LightGbmRegressor':
    LightGbmRegressor(min_data_per_group=1, min_data_per_leaf=1),
    'LightGbmRanker':
    LightGbmRanker(min_data_per_group=1, min_data_per_leaf=1),
    'NGramFeaturizer':
    NGramFeaturizer(word_feature_extractor=n_gram()),
    'SkipFilter':
    SkipFilter(count=5),
    'TensorFlowScorer':
    TensorFlowScorer(model=os.path.join(this, '..', 'nimbusml', 'examples',
                                        'frozen_saved_model.pb'),
                     columns={'c': ['a', 'b']}),
}

MULTI_OUTPUT_EX = [
    'FastLinearClassifier', 'FastLinearRegressor',
    'LogisticRegressionClassifier', 'FastTreesRegressor',
    'FastForestRegressor', 'FastTreesTweedieRegressor',
    'OneClassSvmAnomalyDetector', 'NaiveBayesClassifier',