def test_lightgbmranker_scikit(self):
    """Train and score a LightGbmRanker through the scikit-learn fit/predict API
    and compare per-instance scores against the cached reference values."""
    np.random.seed(0)
    file_path = get_dataset("gen_tickettrain").as_filepath()

    # Pure-nimbusml paradigm: load the frame and key-type the group column.
    df = pd.read_csv(file_path, encoding='utf-8')
    df['group'] = df['group'].astype(np.uint32)

    # The group_id column must be of key type.
    pipe = Pipeline([
        ('lgbm', LightGbmRanker(
            feature=['Class', 'dep_day', 'duration'],
            group_id='group')),
    ])

    # Train the scikit pipeline on a features/label split.
    X = df.drop(['rank'], axis=1)
    y = df['rank']
    pipe.fit(X, y)

    # Predicted scores must match the reference per-instance scores.
    scores = pipe.predict(X)
    assert_almost_equal(
        scores.values,
        self.nimbusml_per_instance_scores_sampleinputextraction(),
        decimal=6)
def test_pipeline_clone_dataframe_roles_shift_operator(self):
    """Clone-check a ranker pipeline whose roles are assigned via the << operator."""
    ranker = LightGbmRanker(number_of_iterations=1, number_of_leaves=4) << {
        Role.Feature: features,
        Role.Label: 'label_1',
        Role.GroupId: 'group_2',
    }
    fit_test_clone_and_check(Pipeline([ranker]), df, debug=False)
def test_pipeline_clone_dataframe_roles_arguments(self):
    """Clone-check a ranker pipeline whose roles are passed as constructor args."""
    # NOTE(review): num_boost_round/num_leaves are older spellings of
    # number_of_iterations/number_of_leaves used by sibling tests — confirm
    # the targeted nimbusml version still accepts them.
    ranker = LightGbmRanker(
        feature=features,
        label='label_1',
        group_id='group_2',
        num_boost_round=1,
        num_leaves=4)
    fit_test_clone_and_check(Pipeline([ranker]), df)
def test_nofit_pipeline_clone(self):
    """An unfitted pipeline must still be cloneable."""
    ranker = LightGbmRanker(
        feature=features,
        label='label_1',
        group_id='group_2',
        number_of_iterations=1,
        number_of_leaves=4)
    clone_and_check(Pipeline([ranker]))
def test_pipeline_clone_filedatastream_roles_shift_operator(self):
    """Clone-check a FileDataStream pipeline whose roles are assigned via <<."""
    key_step = ToKey() << {'group_2': 'group_2'}
    ranker = LightGbmRanker(number_of_iterations=1, number_of_leaves=4) << {
        Role.Feature: features,
        Role.Label: 'label_1',
        Role.GroupId: 'group_2',
    }
    fit_test_clone_and_check(Pipeline([key_step, ranker]), fds)
def test_nofit_pipeline_clone(self):
    """An unfitted pipeline must still be cloneable."""
    # NOTE(review): num_boost_round/num_leaves are older spellings of
    # number_of_iterations/number_of_leaves — confirm the targeted
    # nimbusml version still accepts them.
    ranker = LightGbmRanker(
        feature=features,
        label='label_1',
        group_id='group_2',
        num_boost_round=1,
        num_leaves=4)
    clone_and_check(Pipeline([ranker]))
def test_pipeline_clone_filedatastream_roles_arguments(self):
    """Clone-check a FileDataStream pipeline whose roles are constructor args."""
    steps = [
        ToKey() << {'group_2': 'group_2'},
        LightGbmRanker(
            feature=features,
            label='label_1',
            group_id='group_2',
            number_of_iterations=1,
            number_of_leaves=4),
    ]
    fit_test_clone_and_check(Pipeline(steps), fds)
def test_lightgbmranker_asdataframe(self):
    """Train a ranker on a DataFrame and verify metrics against known values."""
    # Data file
    file_path = get_dataset("gen_tickettrain").as_filepath()
    df = pd.read_csv(file_path, encoding='utf-8')
    # The group column must be key-typed; uint32 plus ToKey handles that.
    df['group'] = df['group'].astype(np.uint32)

    e = Pipeline([
        ToKey(columns={'rank': 'rank', 'group': 'group'}),
        LightGbmRanker() << {
            Role.Feature: ['Class', 'dep_day', 'duration'],
            Role.Label: 'rank',
            Role.GroupId: 'group'},
    ])
    e.fit(df)

    metrics, _ = e.test(df)
    # (metric name, expected value, decimal precision)
    expected = [
        ('NDCG@1', 0.43571429, 7),
        ('NDCG@2', 0.5128226, 7),
        ('NDCG@3', 0.55168069, 7),
        ('DCG@1', 4.688759, 3),
        ('DCG@2', 9.012395, 3),
        ('DCG@3', 11.446943, 3),
    ]
    for name, value, decimals in expected:
        assert_almost_equal(
            metrics[name][0], value, decimal=decimals,
            err_msg="%s should be %s" % (name, value))
def setUpClass(self):
    """Fit one model per task type and cache its permutation feature importance.

    NOTE(review): named setUpClass but declared with ``self``; presumably
    decorated as a classmethod outside this view — confirm.
    """
    # --- Binary classification PFI ---
    adult_path = get_dataset('uciadult_train').as_filepath()
    self.classification_data = FileDataStream.read_csv(adult_path)
    self.binary_model = Pipeline([
        OneHotVectorizer(columns=['education']),
        LogisticRegressionBinaryClassifier(
            feature=['age', 'education'],
            label='label',
            number_of_threads=1),
    ]).fit(self.classification_data)
    self.binary_pfi = self.binary_model.permutation_feature_importance(
        self.classification_data)

    # --- Multiclass classification PFI (same data as binary) ---
    self.classifier_model = Pipeline([
        OneHotVectorizer(columns=['education']),
        FastLinearClassifier(
            feature=['age', 'education'],
            label='label',
            number_of_threads=1,
            shuffle=False),
    ]).fit(self.classification_data)
    self.classifier_pfi = self.classifier_model.permutation_feature_importance(
        self.classification_data)

    # --- Regression PFI ---
    infert_path = get_dataset('infert').as_filepath()
    self.regression_data = FileDataStream.read_csv(infert_path)
    self.regressor_model = Pipeline([
        OneHotVectorizer(columns=['education']),
        FastLinearRegressor(
            feature=['induced', 'education'],
            label='age',
            number_of_threads=1,
            shuffle=False),
    ]).fit(self.regression_data)
    self.regressor_pfi = self.regressor_model.permutation_feature_importance(
        self.regression_data)

    # --- Ranking PFI ---
    ticket_path = get_dataset('gen_tickettrain').as_filepath()
    self.ranking_data = FileDataStream.read_csv(ticket_path)
    self.ranker_model = Pipeline([
        ToKey(columns=['group']),
        LightGbmRanker(
            feature=['Class', 'dep_day', 'duration'],
            label='rank',
            group_id='group',
            random_state=0,
            number_of_threads=1),
    ]).fit(self.ranking_data)
    self.ranker_pfi = self.ranker_model.permutation_feature_importance(
        self.ranking_data)
def test_lightgbmranker_asfilestream(self):
    """Train a ranker from a FileDataStream and verify ranking metrics."""
    # Data file
    file_path = get_dataset("gen_tickettrain").as_filepath()

    # Pure-nimbusml paradigm
    train_stream = FileDataStream.read_csv(file_path, encoding='utf-8')

    # The group_id column must be of key type.
    pipeline = Pipeline([
        ToKey(columns={'rank': 'rank', 'group': 'group'}),
        LightGbmRanker(
            feature=['Class', 'dep_day', 'duration'],
            label='rank',
            group_id='group'),
    ])

    # Train, then evaluate on a fresh stream over the same file.
    pipeline.fit(train_stream)
    eval_stream = FileDataStream.read_csv(file_path)
    metrics, _ = pipeline.test(eval_stream)

    # (metric name, expected value, decimal precision)
    expected = [
        ('NDCG@1', 43.571429, 5),
        ('NDCG@2', 51.28226, 5),
        ('NDCG@3', 55.168069, 5),
        ('DCG@1', 4.688759, 3),
        ('DCG@2', 9.012395, 3),
        ('DCG@3', 11.446943, 3),
    ]
    for name, value, decimals in expected:
        assert_almost_equal(
            metrics[name][0], value, decimal=decimals,
            err_msg="%s should be %s" % (name, value))
def check_cv_with_defaults_df(
        self,
        label_name='rank',
        group_id='group',
        features=None,
        **params):
    """Run check_cv over the pandas ticket data with a minimal ranker pipeline.

    Parameters
    ----------
    label_name : str, column to use as the ranking label.
    group_id : str, column holding the query/group ids (key-typed via ToKey).
    features : list of str or None, feature columns; defaults to the ticket
        dataset's numeric columns.
    **params : forwarded to check_cv (e.g. expected metrics).

    Fixes
    -----
    * label_name and group_id were accepted but ignored — the ranker was
      hard-coded to 'rank'/'group'; they are now passed through. Default
      calls behave exactly as before.
    * The mutable list default for *features* is replaced with a None
      sentinel to avoid a shared mutable default argument.
    """
    if features is None:
        features = ['price', 'Class', 'dep_day', 'nbr_stops', 'duration']
    steps = [
        # The group id column must be key-typed for the ranker.
        ToKey() << {group_id: group_id},
        LightGbmRanker(
            min_data_per_leaf=1,
            feature=features,
            label=label_name,
            group_id=group_id),
    ]
    data = self.data_pandas()
    check_cv(pipeline=Pipeline(steps), X=data, **params)
def check_cv_with_defaults2(self, label_name='Label', group_id='GroupId',
                            features='Features_1', **params):
    """Cross-validate a ranker whose group id is hash-keyed and whose
    features are concatenated into a single 'Features' column."""
    key_step = OneHotHashVectorizer(output_kind='Key') << {group_id: group_id}
    concat_step = ColumnConcatenator() << {'Features': [features]}
    ranker = LightGbmRanker(min_data_per_leaf=1) << {Role.GroupId: group_id}
    data = self.data_wt_rename(label_name, group_id, features)
    check_cv(
        pipeline=Pipeline([key_step, concat_step, ranker]),
        X=data,
        **params)
def test_lightgbmranker_asdataframe_groupid(self):
    """Train a ranker on a DataFrame using the group_id argument and verify metrics."""
    # Data file
    file_path = get_dataset("gen_tickettrain").as_filepath()
    df = pd.read_csv(file_path, encoding='utf-8')
    # The group column must be key-typed; uint32 plus ToKey handles that.
    df['group'] = df['group'].astype(np.uint32)

    e = Pipeline([
        ToKey(columns={'rank': 'rank', 'group': 'group'}),
        LightGbmRanker(
            feature=['Class', 'dep_day', 'duration'],
            label='rank',
            group_id='group'),
    ])
    e.fit(df)

    metrics, _ = e.test(df)
    # (metric name, expected value, decimal precision)
    expected = [
        ('NDCG@1', 43.571429, 5),
        ('NDCG@2', 51.28226, 5),
        ('NDCG@3', 55.168069, 5),
        ('DCG@1', 4.688759, 3),
        ('DCG@2', 9.012395, 3),
        ('DCG@3', 11.446943, 3),
    ]
    for name, value, decimals in expected:
        assert_almost_equal(
            metrics[name][0], value, decimal=decimals,
            err_msg="%s should be %s" % (name, value))
def check_cv_with_defaults(self, label_name='Label', group_id='GroupId',
                           features='Features_1', **params):
    """Cross-validate a ranker with roles assigned via the << operator."""
    hash_to_key = OneHotHashVectorizer(output_kind='Key') << {
        group_id: group_id}
    # Even when all the roles are specified on the ranker below, the roles
    # are still not passed correctly.
    ranker = LightGbmRanker(min_data_per_leaf=1) << {
        Role.GroupId: group_id,
        Role.Feature: features,
        Role.Label: label_name,
    }
    data = self.data(label_name, group_id, features)
    check_cv(pipeline=Pipeline([hash_to_key, ranker]), X=data, **params)
def check_cv_with_defaults2(self, label_name='Label', group_id='GroupId',
                            features='Features_1', **params):
    """Cross-validate a ranker with a ToKey'd group id and concatenated features."""
    # REVIEW: Replace back ToKey() with OneHotHashVectorizer() and reinstate
    # metrics checks once issue
    # https://github.com/dotnet/machinelearning/issues/1939 is resolved.
    params.pop('expected_metrics', None)
    key_step = ToKey() << {group_id: group_id}
    concat_step = ColumnConcatenator() << {'Features': [features]}
    ranker = LightGbmRanker(min_data_per_leaf=1) << {Role.GroupId: group_id}
    data = self.data_wt_rename(label_name, group_id, features)
    check_cv(
        pipeline=Pipeline([key_step, concat_step, ranker]),
        X=data,
        **params)
def check_cv_with_defaults(self, label_name='Label', group_id='GroupId',
                           features='Features_1', **params):
    """Cross-validate a ranker whose roles are set via << on a ToKey'd group id."""
    # REVIEW: Replace back ToKey() with OneHotHashVectorizer() and reinstate
    # metrics checks once issue
    # https://github.com/dotnet/machinelearning/issues/1939 is resolved.
    params.pop('expected_metrics', None)
    key_step = ToKey() << {group_id: group_id}
    # Even when all the roles are specified on the ranker below, the roles
    # are still not passed correctly.
    ranker = LightGbmRanker(min_data_per_leaf=1) << {
        Role.GroupId: group_id,
        Role.Feature: features,
        Role.Label: label_name,
    }
    data = self.data(label_name, group_id, features)
    check_cv(pipeline=Pipeline([key_step, ranker]), X=data, **params)
def check_cv_with_non_defaults(self, label_name='label', group_id='groupid',
                               features='Features_1', **params):
    """Cross-validate a ranker after renaming the label/group columns via ToKey."""
    rename_step = ToKey(columns={
        'groupid2': group_id,
        'label2': label_name,
    })
    ranker = LightGbmRanker() << {
        Role.GroupId: 'groupid2',
        Role.Label: 'label2',
        Role.Feature: [features],
    }
    data = self.data(label_name, group_id, features)
    cv = CV([rename_step, ranker])
    cv_results = cv.fit(data, groups='groupid', cv=4)
    check_cv_results(
        cv._learner_type, cv_results, n_folds=4, expected_metrics={})
def test_get_fit_info_ranker(self):
    """get_fit_info must report the GroupId role wiring on the ranker node."""
    file_path = get_dataset("gen_tickettrain").as_filepath()
    file_schema = 'sep=, col=Label_1:R4:0 col=GroupId_2:TX:1 ' \
                  'col=Features_3:R4:3-5'
    train_stream = FileDataStream(file_path, schema=file_schema)

    pipeline = Pipeline([
        ToKey() << {'GroupId_2': 'GroupId_2'},
        ColumnConcatenator() << {'Features': ['Features_3']},
        LightGbmRanker() << {
            Role.Feature: 'Features',
            Role.Label: 'Label_1',
            Role.GroupId: 'GroupId_2',
        },
    ])

    info = pipeline.get_fit_info(train_stream)
    # Inspect the inputs of the final node (the ranker).
    last = info[0][-1]
    inp = last['inputs']
    assert 'GroupId:GroupId_2' in inp
###############################################################################
# LightGbmRanker
import numpy as np
import pandas as pd
from nimbusml import Pipeline, Role
from nimbusml.datasets import get_dataset
from nimbusml.ensemble import LightGbmRanker

np.random.seed(0)

# Load the ticket training data; the group column must be key-typed (uint32).
file_path = get_dataset("gen_tickettrain").as_filepath()
df = pd.read_csv(file_path)
df['group'] = df['group'].astype(np.uint32)

X = df.drop(['rank'], axis=1)
y = df['rank']

# Train a ranker with roles assigned via the << operator.
e = Pipeline([
    LightGbmRanker() << {
        Role.Feature: ['Class', 'dep_day', 'duration'],
        Role.Label: 'rank',
        Role.GroupId: 'group'},
])
e.fit(df)

# Evaluate as a ranking task and print the metrics.
metrics, scores = e.test(
    X, y, evaltype='ranking', group_id='group', output_scores=True)
print(metrics)
'check_estimators_dtypes', 'check_classifiers_classes', 'check_classifiers_train' ] INSTANCES = { 'LightGbmBinaryClassifier': LightGbmBinaryClassifier(minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'LightGbmClassifier': LightGbmClassifier(minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'LightGbmRegressor': LightGbmRegressor(minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'LightGbmRanker': LightGbmRanker(minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'NGramFeaturizer': NGramFeaturizer(word_feature_extractor=n_gram()), 'SkipFilter': SkipFilter(count=5), 'TakeFilter': TakeFilter(count=100000), 'IidSpikeDetector': IidSpikeDetector(columns=['F0']), 'IidChangePointDetector': IidChangePointDetector(columns=['F0']), 'SsaSpikeDetector': SsaSpikeDetector(columns=['F0'], seasonal_window_size=2), 'SsaChangePointDetector': SsaChangePointDetector(columns=['F0'], seasonal_window_size=2), 'SsaForecaster':
# GlobalContrastRowScaler currently requires a vector input to work 'GlobalContrastRowScaler': Pipeline([ ColumnConcatenator() << { 'concated_columns': [ 'Petal_Length', 'Sepal_Width', 'Sepal_Length']}, GlobalContrastRowScaler(columns={'normed_columns': 'concated_columns'}) ]), 'Handler': Handler(replace_with='Mean', columns={'NewVals': 'Petal_Length'}), 'IidSpikeDetector': IidSpikeDetector(columns=['Sepal_Length']), 'IidChangePointDetector': IidChangePointDetector(columns=['Sepal_Length']), 'Indicator': Indicator(columns={'Has_Nan': 'Petal_Length'}), 'KMeansPlusPlus': KMeansPlusPlus(n_clusters=3, feature=['Sepal_Width', 'Sepal_Length']), 'LightGbmRanker': LightGbmRanker(feature=['Class', 'dep_day', 'duration'], label='rank', group_id='group'), 'Loader': Loader(columns={'ImgPath': 'Path'}), 'LpScaler': Pipeline([ ColumnConcatenator() << { 'concated_columns': [ 'Petal_Length', 'Sepal_Width', 'Sepal_Length']}, LpScaler(columns={'normed_columns': 'concated_columns'}) ]), 'MutualInformationSelector': Pipeline([ ColumnConcatenator(columns={'Features': ['Sepal_Width', 'Sepal_Length', 'Petal_Width']}), MutualInformationSelector( columns='Features', label='Label',
'check_fit_score_takes_y', 'check_fit2d_predict1d', 'check_fit1d_1feature', 'check_dont_overwrite_parameters', 'check_supervised_y_2d', 'check_estimators_fit_returns_self', 'check_estimators_overwrite_params', 'check_estimators_dtypes', 'check_classifiers_classes', 'check_classifiers_train' ] INSTANCES = { 'LightGbmBinaryClassifier': LightGbmBinaryClassifier(min_data_per_group=1, min_data_per_leaf=1), 'LightGbmClassifier': LightGbmClassifier(min_data_per_group=1, min_data_per_leaf=1), 'LightGbmRegressor': LightGbmRegressor(min_data_per_group=1, min_data_per_leaf=1), 'LightGbmRanker': LightGbmRanker(min_data_per_group=1, min_data_per_leaf=1), 'NGramFeaturizer': NGramFeaturizer(word_feature_extractor=n_gram()), 'SkipFilter': SkipFilter(count=5), 'TensorFlowScorer': TensorFlowScorer(model=os.path.join(this, '..', 'nimbusml', 'examples', 'frozen_saved_model.pb'), columns={'c': ['a', 'b']}), } MULTI_OUTPUT_EX = [ 'FastLinearClassifier', 'FastLinearRegressor', 'LogisticRegressionClassifier', 'FastTreesRegressor', 'FastForestRegressor', 'FastTreesTweedieRegressor', 'OneClassSvmAnomalyDetector', 'NaiveBayesClassifier',