    def test_syntax11_append_insert(self):

        df = pandas.DataFrame(dict(education=['A', 'B', 'A', 'B', 'A'],
                                   workclass=['X', 'X', 'Y', 'Y', 'Y'],
                                   y=[1, 0, 1, 0, 0]))
        X = df.drop('y', axis=1)

        exp = Pipeline()
        exp.append(
            ("OneHotHashVectorizer",
             OneHotHashVectorizer() << {
                 'edu2': 'education'}))
        exp.insert(0, OneHotVectorizer() << {'edu1': 'education'})
        exp.append(
            FastLinearBinaryClassifier(
                maximum_number_of_iterations=1) << {
                'Features': [
                    'edu1',
                    'edu2'],
                Role.Label: 'y'})
        exp.append(OneHotHashVectorizer() << {'edu2': 'education'})
        del exp[-1]
        assert len(exp) == 3

        exp.fit(df, verbose=0)
        prediction = exp.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        assert sorted(list(prediction.columns)) == [
            'PredictedLabel', 'Probability', 'Score']
        assert prediction.shape == (5, 3)

        try:
            exp.append(OneHotHashVectorizer() << {'edu2': 'education'})
            raise AssertionError("The test should not reach this line.")
        except RuntimeError as e:
            assert "Model is fitted and cannot be modified" in str(e)
        try:
            exp.insert(0, OneHotHashVectorizer() << {'edu2': 'education'})
            raise AssertionError("The test should not reach this line.")
        except RuntimeError as e:
            assert "Model is fitted and cannot be modified" in str(e)
        try:
            del exp[0]
            raise AssertionError("The test should not reach this line.")
        except RuntimeError as e:
            assert "Model is fitted and cannot be modified" in str(e)

        obj = exp[1][1]
        assert obj.__class__.__name__ == "OneHotHashVectorizer"
        res = exp['OneHotHashVectorizer']
        assert len(res) == 1
        graph = exp.graph_
        assert len(graph.nodes) >= len(exp)
Example #2
    def test_syntax3(self):

        df = pandas.DataFrame(
            dict(education=['A', 'B', 'A', 'B', 'A'],
                 workclass=['X', 'X', 'Y', 'Y', 'Y'],
                 y=[1, 0, 1, 0, 0]))
        X = df.drop('y', axis=1)
        y = df['y']

        exp = Pipeline([
            OneHotVectorizer() << {
                'edu1': 'education'
            },
            OneHotHashVectorizer() << 'education',
            OneHotVectorizer(max_num_terms=2) << 'workclass',
            # Currently the learner does not use edu1 unless it is
            # specified explicitly, so nimbusml does not do what the
            # syntax implicitly suggests. We would need to modify the
            # bridge to look into every available column at each step
            # (see the sketch after this pipeline).
            FastLinearBinaryClassifier(max_iterations=1)
        ])
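        # A minimal sketch of the explicit form the comment above refers
        # to (hypothetical, not run here): name the hashed output and list
        # every engineered column for the learner, as test_syntax11_learner
        # below does:
        #   OneHotHashVectorizer() << {'edu2': 'education'},
        #   FastLinearBinaryClassifier(max_iterations=1) << {
        #       'Features': ['edu1', 'edu2'], Role.Label: 'y'}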
        exp.fit(X, y)
        prediction = exp.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        assert sorted(list(
            prediction.columns)) == ['PredictedLabel', 'Probability', 'Score']
        assert prediction.shape == (5, 3)
Example #3
    def test_syntax11_learner(self):

        df = pandas.DataFrame(
            dict(education=['A', 'B', 'A', 'B', 'A'],
                 workclass=['X', 'X', 'Y', 'Y', 'Y'],
                 y=[1, 0, 1, 0, 0]))
        X = df.drop('y', axis=1)

        exp = Pipeline([
            OneHotVectorizer() << {
                'edu1': 'education'
            },
            OneHotHashVectorizer() << {
                'edu2': 'education'
            },
            FastLinearBinaryClassifier(max_iterations=1) << {
                'Features': ['edu1', 'edu2'],
                Role.Label: 'y'
            }
        ])
        exp.fit(df)
        prediction = exp.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        assert sorted(list(
            prediction.columns)) == ['PredictedLabel', 'Probability', 'Score']
        assert prediction.shape == (5, 3)
Example #4
    def test_syntax6_regular_expression(self):

        df = pandas.DataFrame(
            dict(education=['A', 'B', 'A', 'B', 'A'],
                 workclass=['X', 'X', 'Y', 'Y', 'Y'],
                 y=[1, 0, 1, 0, 0]))
        X = df.drop('y', axis=1)
        y = df['y']

        exp = Pipeline([
            OneHotVectorizer() << {
                'f1': 'education'
            },
            OneHotHashVectorizer() << {
                'f2': 'education'
            },
            OneHotVectorizer(max_num_terms=2) << {
                'f3': 'workclass'
            },
            Concat() << {
                'Features': ['f%d' % i for i in range(1, 4)]
            },
            Drop() << '~Features',
            FastLinearBinaryClassifier(max_iterations=1)
        ])
        exp.fit(X, y)
        prediction = exp.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        assert sorted(list(
            prediction.columns)) == ['PredictedLabel', 'Probability', 'Score']
        assert prediction.shape == (5, 3)
Example #5
    def test_syntax5_regular_expression(self):
        # REVIEW: not implemented yet
        # The best option would be to handle regular expressions inside
        # nimbusml. They could be handled in entrypoint.py just before
        # calling nimbusml, or inside Pipeline if it is aware of the
        # input schema (see the sketch below).
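        # A minimal sketch of that expansion, assuming the input schema is
        # known up front (plain Python, not an existing nimbusml API):
        import re
        known_schema = ['education', 'workclass', 'f1', 'f2', 'f3']
        expanded = [c for c in known_schema if re.fullmatch('f[0-9]+', c)]
        assert expanded == ['f1', 'f2', 'f3']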

        df = pandas.DataFrame(
            dict(education=['A', 'B', 'A', 'B', 'A'],
                 workclass=['X', 'X', 'Y', 'Y', 'Y'],
                 y=[1, 0, 1, 0, 0]))
        X = df.drop('y', axis=1)
        y = df['y']

        exp = Pipeline([
            OneHotVectorizer() << {
                'f1': 'education'
            },
            OneHotHashVectorizer() << {
                'f2': 'education'
            },
            OneHotVectorizer(max_num_terms=2) << {
                'f3': 'workclass'
            },
            Concat() << {
                'Features': 'f[0-9]+'
            },
            FastLinearBinaryClassifier(max_iterations=1) << 'Features'
        ])
        exp.fit(X, y)
        prediction = exp.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        assert sorted(list(
            prediction.columns)) == ['PredictedLabel', 'Probability', 'Score']
        assert prediction.shape == (5, 3)
Example #6
    def test_syntax4_fail2(self):

        df = pandas.DataFrame(
            dict(education=['A', 'B', 'A', 'B', 'A'],
                 workclass=['X', 'X', 'Y', 'Y', 'Y'],
                 y=[1, 0, 1, 0, 0]))
        X = df.drop('y', axis=1)
        y = df['y']

        exp = Pipeline([
            OneHotVectorizer() << {
                'edu1': 'education'
            },
            OneHotHashVectorizer() << {
                'edu2': 'education'
            },
            OneHotVectorizer(max_num_terms=2) << {
                'wki': 'workclass'
            },
            FastLinearBinaryClassifier(max_iterations=1) <<
            ['edu1', 'edu4', 'wki']
        ])
        try:
            exp.fit(X, y)
            raise AssertionError("The test should not reach this line.")
        except Exception as e:
            assert "Feature column 'edu4' not found" in str(e)
Example #7
    def test_syntax4_fail(self):

        df = pandas.DataFrame(
            dict(education=['A', 'B', 'A', 'B', 'A'],
                 workclass=['X', 'X', 'Y', 'Y', 'Y'],
                 y=[1, 0, 1, 0, 0]))
        X = df.drop('y', axis=1)
        y = df['y']

        exp = Pipeline([
            OneHotVectorizer() << {
                'edu1': 'education'
            },
            OneHotHashVectorizer() << {
                'edu2': 'education'
            },
            OneHotVectorizer(max_num_terms=2) << {
                'wki': 'workclass'
            },
            FastLinearBinaryClassifier(max_iterations=1) <<
            ['edu1', 'edu2', 'wki']
        ])
        try:
            exp.fit(X, y)
            assert False
        except RuntimeError as e:
            assert "ConcatTransform() << {'Input': ['edu1', 'edu2', 'wki']}" \
                   in str(e)
Example #8
    def test_syntax4_dict(self):

        df = pandas.DataFrame(
            dict(education=['A', 'B', 'A', 'B', 'A'],
                 workclass=['X', 'X', 'Y', 'Y', 'Y'],
                 y=[1, 0, 1, 0, 0]))
        X = df.drop('y', axis=1)
        y = df['y']

        exp = Pipeline([
            OneHotVectorizer() << {
                'edu1': 'education'
            },
            OneHotHashVectorizer() << {
                'edu2': 'education'
            },
            OneHotVectorizer(max_num_terms=2) << {
                'wki': 'workclass'
            },
            Concat() << {
                'Inputs': ['edu1', 'edu2', 'wki']
            },
            FastLinearBinaryClassifier(max_iterations=1) << 'Inputs'
        ])
        exp.fit(X, y)
        prediction = exp.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        assert sorted(list(
            prediction.columns)) == ['PredictedLabel', 'Probability', 'Score']
        assert prediction.shape == (5, 3)
Example #9
    def test_clone_sweep(self):
        # grid search, then clone pipeline and grid search again
        # results should be same
        np.random.seed(0)
        (X_train, y_train) = get_X_y(train_file,
                                     label_column,
                                     sep=',',
                                     encoding='utf-8')
        (X_test, y_test) = get_X_y(test_file,
                                   label_column,
                                   sep=',',
                                   encoding='utf-8')

        cat = OneHotHashVectorizer() << categorical_columns
        learner = FastTreesBinaryClassifier(number_of_trees=100,
                                            number_of_leaves=5)
        pipe = Pipeline(steps=[('cat', cat), ('learner', learner)])

        param_grid = dict(learner__number_of_trees=[1, 5, 10])
        grid = GridSearchCV(pipe, param_grid)
        grid.fit(X_train, y_train)

        pipe1 = pipe.clone()
        grid1 = GridSearchCV(pipe1, param_grid)
        grid1.fit(X_train, y_train)

        assert grid.best_params_[
            'learner__number_of_trees'] == grid1.best_params_[
                'learner__number_of_trees']
Example #10
    def test_hyperparameters_sweep(self):
        # general test with combination of named and unnamed steps
        np.random.seed(0)
        df = pd.DataFrame(
            dict(education=['A', 'A', 'A', 'A', 'B', 'A', 'B'],
                 workclass=['X', 'Y', 'X', 'X', 'X', 'Y', 'Y'],
                 y=[1, 0, 1, 1, 0, 1, 0]))
        X = df.drop('y', axis=1)
        y = df['y']
        pipe = Pipeline([
            ('cat', OneHotVectorizer() << 'education'),
            # unnamed step, stays the same during grid search
            OneHotHashVectorizer() << 'workclass',
            # number_of_trees=0 will never actually be run by grid search
            ('learner',
             FastTreesBinaryClassifier(number_of_trees=0, number_of_leaves=2))
        ])

        param_grid = dict(cat__output_kind=['Indicator', 'Binary'],
                          learner__number_of_trees=[1, 2, 3])
        grid = GridSearchCV(pipe, param_grid)

        grid.fit(X, y)
        print(grid.best_params_)
        assert grid.best_params_ == {
            'cat__output_kind': 'Indicator',
            'learner__number_of_trees': 1
        }
Example #11
    def test_syntax6_change_role(self):
        # REVIEW: the pipeline drops all columns but one -->
        # nimbusml still thinks the Features are education, workclass
        # and does not automatically detect that the only remaining
        # columns should play that role
        # (maybe because the label column is here too, even though
        # the only remaining column without a role is Features).
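        # The workaround below is therefore to drop everything except
        # 'Features' and to point the learner at 'Features' explicitly.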
        df = pandas.DataFrame(dict(education=['A', 'B', 'A', 'B', 'A'],
                                   workclass=['X', 'X', 'Y', 'Y', 'Y'],
                                   y=[1, 0, 1, 0, 0]))
        X = df.drop('y', axis=1)
        y = df['y']

        exp = Pipeline([
            OneHotVectorizer() << {'f1': 'education'},
            OneHotHashVectorizer() << {'f2': 'education'},
            OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
            Concat() << {'Features': ['f%d' % i for i in range(1, 4)]},
            Drop() << ['education', 'workclass', 'f1', 'f2', 'f3'],
            FastLinearBinaryClassifier(maximum_number_of_iterations=1) << ['Features']
        ])
        exp.fit(X, y, verbose=0)
        prediction = exp.predict(X)
        assert isinstance(prediction, pandas.DataFrame)
        assert sorted(list(prediction.columns)) == [
            'PredictedLabel', 'Probability', 'Score']
        assert prediction.shape == (5, 3)
Example #12
 def test_split_start(self):
     long_transforms = [
         OneHotVectorizer(columns={'edu': 'education'}),
         OneHotHashVectorizer(columns={'edu_hash': 'education'}),
         ColumnDropper(columns='education')
     ]
     pipeline = self.pipeline(
         transforms=long_transforms,
         learner_arguments={'feature': ['Features', 'edu', 'edu_hash']})
     check_cv(pipeline, self.data('Label'), split_start='try_all')
Example #13
    def test_error_conditions(self):
        # grid search on a wrong param
        np.random.seed(0)
        (X_train, y_train) = get_X_y(train_file,
                                     label_column, sep=',', encoding='utf-8')
        (X_test, y_test) = get_X_y(test_file,
                                   label_column, sep=',', encoding='utf-8')

        cat = OneHotHashVectorizer() << categorical_columns
        learner = FastTreesBinaryClassifier(num_trees=100, num_leaves=5)
        pipe = Pipeline(steps=[('cat', cat), ('learner', learner)])

        param_grid = dict(learner__wrong_arg=[1, 5, 10])
        grid = GridSearchCV(pipe, param_grid)

        assert_raises(ValueError, grid.fit, X_train, y_train)
Example #14
 def check_cv_with_defaults_df(
         self,
         label_name='rank',
         group_id='group',
         features=['price', 'Class', 'dep_day', 'nbr_stops', 'duration'],
         **params):
     steps = [
         OneHotHashVectorizer(output_kind='Key') << {
             group_id: group_id
         },
         LightGbmRanker(min_data_per_leaf=1,
                        feature=features,
                        label='rank',
                        group_id='group')
     ]
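      # (output_kind='Key' turns the hashed group column into the key type
      # that the ranker's GroupId role expects)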
     data = self.data_pandas()
     check_cv(pipeline=Pipeline(steps), X=data, **params)
Example #15
 def check_cv_with_defaults2(self,
                             label_name='Label',
                             group_id='GroupId',
                             features='Features_1',
                             **params):
     steps = [
         OneHotHashVectorizer(output_kind='Key') << {
             group_id: group_id
         },
         ColumnConcatenator() << {
             'Features': [features]
         },
         LightGbmRanker(min_data_per_leaf=1) << {
             Role.GroupId: group_id
         }
     ]
     data = self.data_wt_rename(label_name, group_id, features)
     check_cv(pipeline=Pipeline(steps), X=data, **params)
Example #16
    def test_numeric_columns(self):
        path = get_dataset('infert').as_filepath()
        data = FileDataStream.read_csv(path, sep=',',
                                       numeric_dtype=np.float32)

        xf = OneHotHashVectorizer(
            columns={
                'edu': 'education',
                'in': 'induced',
                'sp': 'spontaneous'},
            number_of_bits=2)
        xf.fit_transform(data)

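        # the dict form above writes the hashed values to new columns
        # ('edu', 'in', 'sp'); the list form below transforms the named
        # columns in place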
        xf = OneHotHashVectorizer(
            columns=[
                'education',
                'induced',
                'spontaneous'],
            number_of_bits=2)
        xf.fit_transform(data)
Example #17
 def check_cv_with_defaults(self,
                            label_name='Label',
                            group_id='GroupId',
                            features='Features_1',
                            **params):
     steps = [
         OneHotHashVectorizer(output_kind='Key') << {
             group_id: group_id
         },
          # even if we specify all the roles needed in the following
          # lines, the roles are still not passed correctly
         LightGbmRanker(min_data_per_leaf=1) << {
             Role.GroupId: group_id,
             Role.Feature: features,
             Role.Label: label_name
         }
     ]
     data = self.data(label_name, group_id, features)
     check_cv(pipeline=Pipeline(steps), X=data, **params)
Example #18
    def test_uciadult_sweep(self):
        # grid search over number_of_trees, then confirm the best
        # number_of_trees with a full training run
        np.random.seed(0)
        (X_train, y_train) = get_X_y(train_file,
                                     label_column,
                                     sep=',',
                                     encoding='utf-8')
        (X_test, y_test) = get_X_y(test_file,
                                   label_column,
                                   sep=',',
                                   encoding='utf-8')

        cat = OneHotHashVectorizer() << categorical_columns
        # number_of_trees=100 will never actually be run by grid search
        # as it's not in param_grid below
        learner = FastTreesBinaryClassifier(number_of_trees=100,
                                            number_of_leaves=5)
        pipe = Pipeline(steps=[('cat', cat), ('learner', learner)])

        param_grid = dict(learner__number_of_trees=[1, 5, 10])
        grid = GridSearchCV(pipe, param_grid)

        grid.fit(X_train, y_train)
        assert grid.best_params_['learner__number_of_trees'] == 10

        # compare AUC on number_of_trees 1, 5, 10
        pipe.set_params(learner__number_of_trees=1)
        pipe.fit(X_train, y_train)
        metrics1, _ = pipe.test(X_train, y_train)

        pipe.set_params(learner__number_of_trees=5)
        pipe.fit(X_train, y_train)
        metrics5, _ = pipe.test(X_train, y_train)

        pipe.set_params(learner__number_of_trees=10)
        pipe.fit(X_train, y_train)
        metrics10, _ = pipe.test(X_train, y_train)

        assert metrics10['AUC'][0] > metrics5['AUC'][0]
        assert metrics10['AUC'][0] > metrics1['AUC'][0]
        assert metrics10['AUC'][0] > 0.59
Example #19
# GridSearchCV with Pipeline: hyperparameter grid search.
import pandas as pd
from nimbusml import Pipeline
from nimbusml.ensemble import FastTreesBinaryClassifier
from nimbusml.feature_extraction.categorical import OneHotHashVectorizer, \
    OneHotVectorizer
from sklearn.model_selection import GridSearchCV

df = pd.DataFrame(
    dict(education=['A', 'B', 'A', 'B', 'A'],
         workclass=['X', 'X', 'Y', 'Y', 'Y'],
         y=[1, 0, 1, 0, 0]))
X = df.drop('y', axis=1)
y = df['y']
pipe = Pipeline([
    ('cat', OneHotVectorizer() << 'education'),
    # unnamed step, stays the same during grid search
    OneHotHashVectorizer() << 'workclass',
    # this instance of FastTreesBinaryClassifier with num_trees=0 will
    # never be run by grid search as it's not part of param_grid below
    ('learner', FastTreesBinaryClassifier(num_trees=0, num_leaves=2))
])

param_grid = dict(cat__output_kind=['Ind', 'Bin'],
                  learner__num_trees=[1, 2, 3])
grid = GridSearchCV(pipe, param_grid, cv=3, iid='warn')
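# (note: iid was deprecated in scikit-learn 0.22 and removed in 0.24; drop
# the argument on newer releases)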

grid.fit(X, y)
print(grid.best_params_)
# {'cat__output_kind': 'Ind', 'learner__num_trees': 1}
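
# A possible follow-up (a sketch): GridSearchCV refits the best parameters on
# the full data by default, so the best estimator is ready to use.
best_pipe = grid.best_estimator_
print(best_pipe.predict(X))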
Example #20
    def test_performance_syntax(self):
        train_file = get_dataset('uciadult_train').as_filepath()
        test_file = get_dataset('uciadult_test').as_filepath()
        file_schema = 'sep=, col=label:R4:0 col=Features:R4:9-14 ' \
                      'col=workclass:TX:1 col=education:TX:2 ' \
                      'col=marital-status:TX:3 col=occupation:TX:4 ' \
                      'col=relationship:TX:5 col=ethnicity:TX:6 ' \
                      'col=sex:TX:7 col=native-country-region:TX:8 header+'
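        # (the schema reads: comma-separated file with a header row; 'label'
        # is a float32 (R4) column at position 0, 'Features' spans float32
        # columns 9-14, and positions 1-8 are text (TX) columns)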
        categorical_columns = [
            'workclass', 'education', 'marital-status', 'occupation',
            'relationship', 'ethnicity', 'sex', 'native-country-region'
        ]
        label_column = 'label'
        na_columns = ['Features']
        feature_columns_idv = na_columns + categorical_columns

        exp = Pipeline([
            OneHotHashVectorizer(columns=categorical_columns),
            Handler(columns=na_columns),
            FastLinearBinaryClassifier(feature=feature_columns_idv,
                                       label=label_column)
        ])

        train_data = FileDataStream(train_file, schema=file_schema)
        exp.fit(train_data, label_column, verbose=0)
        print("train time %s" % exp._run_time)

        test_data = FileDataStream(test_file, schema=file_schema)
        out_data = exp.predict(test_data)
        print("predict time %s" % exp._run_time)

        (test, label_test) = get_X_y(test_file, label_column, sep=',')
        (acc1, auc1) = evaluate_binary_classifier(
            label_test.iloc[:, 0].values,
            out_data.loc[:, 'PredictedLabel'].values,
            out_data.loc[:, 'Probability'].values)

        print('ACC %s, AUC %s' % (acc1, auc1))

        exp = Pipeline([
            OneHotHashVectorizer() << categorical_columns,
            Handler() << na_columns,
            FastLinearBinaryClassifier() << feature_columns_idv
        ])

        train_data = FileDataStream(train_file, schema=file_schema)
        exp.fit(train_data, label_column, verbose=0)
        print("train time %s" % exp._run_time)

        test_data = FileDataStream(test_file, schema=file_schema)
        out_data = exp.predict(test_data)
        print("predict time %s" % exp._run_time)

        (test, label_test) = get_X_y(test_file, label_column, sep=',')
        (acc2, auc2) = evaluate_binary_classifier(
            label_test.iloc[:, 0].values,
            out_data.loc[:, 'PredictedLabel'].values,
            out_data.loc[:, 'Probability'].values)
        print('ACC %s, AUC %s' % (acc2, auc2))
        assert abs(acc1 - acc2) < 0.02
        assert abs(auc1 - auc2) < 0.02
Example #21
             'Sepal_Length']},
     LpScaler(columns={'normed_columns': 'concated_columns'})
 ]),
 'MutualInformationSelector': Pipeline([
     ColumnConcatenator(columns={'Features': ['Sepal_Width', 'Sepal_Length', 'Petal_Width']}),
     MutualInformationSelector(
         columns='Features',
         label='Label',
         slots_in_output=2)  # only accept one column
 ]),
 'NaiveBayesClassifier': NaiveBayesClassifier(feature=['Sepal_Width', 'Sepal_Length']),
 'NGramFeaturizer': NGramFeaturizer(word_feature_extractor=Ngram(),
                                    char_feature_extractor=Ngram(),
                                    keep_diacritics=True,
                                    columns={ 'features': ['SentimentText']}),
 'OneHotHashVectorizer': OneHotHashVectorizer(columns=['education_str']),
 'OneHotVectorizer': OneHotVectorizer(columns=['education_str']),
 'OneVsRestClassifier(AveragedPerceptronBinaryClassifier)': \
     OneVsRestClassifier(AveragedPerceptronBinaryClassifier(),
                         use_probabilities=True,
                         feature=['age',
                                  'education_str.0-5yrs',
                                  'education_str.6-11yrs',
                                  'education_str.12+ yrs'],
                         label='induced'),
 'OneVsRestClassifier(LinearSvmBinaryClassifier)': \
     OneVsRestClassifier(LinearSvmBinaryClassifier(),
                         use_probabilities=True,
                         feature=['age',
                                  'education_str.0-5yrs',
                                  'education_str.6-11yrs',
Example #23
###############################################################################
# OneHotHashVectorizer
from nimbusml import FileDataStream, Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotHashVectorizer
from nimbusml.feature_selection import CountSelector

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path, sep=',')
print(data.head())
#   age  case education  induced  parity  ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6 ...       1            2  ...
# 1   42     1    0-5yrs        1       1 ...       2            0  ...
# 2   39     1    0-5yrs        2       6 ...       3            0  ...
# 3   34     1    0-5yrs        2       4 ...       4            0  ...
# 4   35     1   6-11yrs        1       3 ...       5            1  ...

pip = Pipeline([
    OneHotHashVectorizer(columns={'edu': 'education'}, number_of_bits=2),
    CountSelector(count=5, columns=['edu'])
])
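# (number_of_bits=2 hashes 'education' into at most 2**2 = 4 slots; the
# CountSelector then keeps only slots with at least count=5 non-default
# values)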
features_selection = pip.fit_transform(data)
print(features_selection.head())
#   age  case  edu.0  edu.1 education  induced  parity  pooled.stratum  ...
# 0   26     1    0.0    1.0    0-5yrs        1       6               3  ...
# 1   42     1    0.0    1.0    0-5yrs        1       1               1  ...
# 2   39     1    0.0    1.0    0-5yrs        2       6               4  ...
# 3   34     1    0.0    1.0    0-5yrs        2       4               2  ...
# 4   35     1    1.0    0.0   6-11yrs        1       3              32  ...
Example #24
                  True, False, True, False, True, False, True
              ]))

test_reviews = pandas.DataFrame(data=dict(review=[
    "This is great", "I hate it", "Love it", "Really like it", "I hate it",
    "I like it a lot", "I love it", "I do like it", "I really hate it",
    "I love it"
]))

# OneHotHashVectorizer transform: the entire string is treated as a category.
# If the output column name is the same as the input column, the original
# input column values are replaced. number_of_bits=6 will hash into 2^6 - 1
# dimensions.

y = train_reviews['like']
X = train_reviews.loc[:, train_reviews.columns != 'like']

cat = OneHotHashVectorizer(number_of_bits=6) << 'review'
X = cat.fit_transform(X)

# view the transformed numerical values and column names
print(X)

mymodel = LogisticRegressionBinaryClassifier().fit(X, y)

X_test = cat.transform(test_reviews)

scores = mymodel.predict(X_test)

# view the scores
print(scores)
Example #25
# GridSearchCV with Pipeline: grid search over learners
import pandas as pd
from nimbusml import Pipeline
from nimbusml.ensemble import FastTreesBinaryClassifier, GamBinaryClassifier
from nimbusml.feature_extraction.categorical import OneHotHashVectorizer
from nimbusml.linear_model import FastLinearBinaryClassifier, \
    LogisticRegressionBinaryClassifier
from sklearn.model_selection import GridSearchCV

df = pd.DataFrame(dict(education=['A', 'B', 'A', 'B', 'A'],
                       workclass=['X', 'X', 'Y', 'Y', 'Y'],
                       y=[1, 0, 1, 0, 0]))
X = df.drop('y', axis=1)
y = df['y']

cat = OneHotHashVectorizer() << ['education', 'workclass']
learner = FastTreesBinaryClassifier()
pipe = Pipeline(steps=[('cat', cat), ('learner', learner)])

param_grid = dict(cat__hash_bits=[1, 2, 4, 6, 8, 16],
                  learner=[
                      FastLinearBinaryClassifier(),
                      FastTreesBinaryClassifier(),
                      LogisticRegressionBinaryClassifier(),
                      GamBinaryClassifier()
                  ])
grid = GridSearchCV(pipe, param_grid, cv=3, iid='warn')

grid.fit(X, y)
print(grid.best_params_['learner'].__class__.__name__)
# FastLinearBinaryClassifier
Example #26
###############################################################################
# OneHotHashVectorizer
from nimbusml import FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotHashVectorizer

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path, sep=',',
                               dtype={'spontaneous': str})
# read 'spontaneous' as str: OneHotHashVectorizer errors on numeric input
print(data.head())
#    age  case education  induced  parity ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6 ...       1            2  ...
# 1   42     1    0-5yrs        1       1 ...       2            0  ...
# 2   39     1    0-5yrs        2       6 ...       3            0  ...
# 3   34     1    0-5yrs        2       4 ...       4            0  ...
# 4   35     1   6-11yrs        1       3 ...       5            1  ...

xf = OneHotHashVectorizer(columns={'edu': 'education', 'sp': 'spontaneous'})

# fit and transform
features = xf.fit_transform(data)

print(features.head())
#    age  case  edu.0   edu.1003   ...    sp.995    ...   spontaneous  stratum
# 0    26     1    0.0        0.0   ...       0.0   ...           2.0      1.0
# 1    42     1    0.0        0.0   ...       0.0   ...           0.0      2.0
# 2    39     1    0.0        0.0   ...       0.0   ...           0.0      3.0