Esempio n. 1
0
    def test_syntax12_mixed2(self):
        """Mix keyword roles and the ``<<`` operator on a tree regressor.

        The feature and weight columns are given as constructor keywords and
        the label through ``<< {Role.Label: 'y'}``. After fitting, the last
        node must report all three column names, and ``predict`` must return
        a one-column ``Score`` frame with one row per input row.
        """
        data = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'weight': [10., 1., 1., 1., 1.],
            'y': [1.1, 2.2, 1.24, 3.4, 3.4],
        })

        encoder = OneHotVectorizer(columns=['workclass', 'education'])
        merger = Concat(columns={'Feature': ['workclass', 'education']})
        regressor = FastTreesRegressor(
            num_trees=5, feature='Feature', weight='weight') << {
                Role.Label: 'y'}

        pipeline = Pipeline([encoder, merger, regressor])
        pipeline.fit(data, verbose=0)

        learner = pipeline.nodes[-1]
        assert learner.feature_column_ == 'Feature'
        assert learner.label_column_ == 'y'
        assert learner.weight_column_ == 'weight'

        # The label and weight columns must still be present at prediction
        # time, so they are overwritten with fake values. Per the original
        # author's note: the test does not fail, but the weight is not
        # taken into account.
        data['y'] = -5
        data['weight'] = -5
        scores = pipeline.predict(data)

        assert isinstance(scores, pandas.DataFrame)
        assert list(scores.columns) == ['Score']
        assert scores.shape == (5, 1)
Esempio n. 2
0
    def test_syntax6_regular_expression(self):
        """Build ``Features`` from three encoded columns, then classify.

        Each encoder writes to a new column (``f1``..``f3``) through the
        ``<< {output: input}`` syntax, ``Concat`` gathers them into
        ``Features`` and ``Drop() << '~Features'`` follows (presumably
        dropping everything except ``Features`` -- confirm against the
        nimbusml column syntax). The prediction frame must hold the three
        binary-classifier output columns for the five input rows.
        """
        frame = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1, 0, 1, 0, 0],
        })
        features = frame.drop('y', axis=1)
        labels = frame['y']

        education_onehot = OneHotVectorizer() << {'f1': 'education'}
        education_hashed = OneHotHashVectorizer() << {'f2': 'education'}
        workclass_onehot = OneHotVectorizer(max_num_terms=2) << {
            'f3': 'workclass'}
        # Gather f1, f2 and f3 into a single vector column.
        encoded_names = ['f{0}'.format(index) for index in (1, 2, 3)]
        merger = Concat() << {'Features': encoded_names}
        dropper = Drop() << '~Features'
        classifier = FastLinearBinaryClassifier(max_iterations=1)

        pipeline = Pipeline([
            education_onehot,
            education_hashed,
            workclass_onehot,
            merger,
            dropper,
            classifier,
        ])
        pipeline.fit(features, labels)
        result = pipeline.predict(features)

        assert isinstance(result, pandas.DataFrame)
        expected_columns = ['PredictedLabel', 'Probability', 'Score']
        assert sorted(result.columns) == expected_columns
        assert result.shape == (5, 3)
Esempio n. 3
0
    def test_syntax5_regular_expression(self):
        """Select the ``Concat`` inputs with the pattern ``'f[0-9]+'``.

        Instead of listing ``f1``..``f3`` explicitly, ``Concat`` receives a
        single pattern string as its input specification. The classifier is
        pointed at the resulting ``Features`` column, and the prediction
        frame must hold the three binary-classifier output columns for the
        five input rows.
        """
        # REVIEW (original author): not implemented yet.
        # The best would be to handle regular expressions inside nimbusml.
        # It could be handled in entrypoint.py just before calling nimbusml,
        # or inside Pipeline if it is aware of the input schema.
        frame = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1, 0, 1, 0, 0],
        })
        features = frame.drop('y', axis=1)
        labels = frame['y']

        education_onehot = OneHotVectorizer() << {'f1': 'education'}
        education_hashed = OneHotHashVectorizer() << {'f2': 'education'}
        workclass_onehot = OneHotVectorizer(max_num_terms=2) << {
            'f3': 'workclass'}
        merger = Concat() << {'Features': 'f[0-9]+'}
        classifier = FastLinearBinaryClassifier(max_iterations=1) << 'Features'

        pipeline = Pipeline([
            education_onehot,
            education_hashed,
            workclass_onehot,
            merger,
            classifier,
        ])
        pipeline.fit(features, labels)
        result = pipeline.predict(features)

        assert isinstance(result, pandas.DataFrame)
        expected_columns = ['PredictedLabel', 'Probability', 'Score']
        assert sorted(result.columns) == expected_columns
        assert result.shape == (5, 3)
Esempio n. 4
0
    def test_syntax4_dict(self):
        """Route columns between nodes with ``<< {output: input}`` dicts.

        Three encoders write to ``edu1``, ``edu2`` and ``wki``; ``Concat``
        merges them into ``Inputs``, which the classifier is told to use.
        The prediction frame must hold the three binary-classifier output
        columns for the five input rows.
        """
        frame = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1, 0, 1, 0, 0],
        })
        features = frame.drop('y', axis=1)
        labels = frame['y']

        education_onehot = OneHotVectorizer() << {'edu1': 'education'}
        education_hashed = OneHotHashVectorizer() << {'edu2': 'education'}
        workclass_onehot = OneHotVectorizer(max_num_terms=2) << {
            'wki': 'workclass'}
        merger = Concat() << {'Inputs': ['edu1', 'edu2', 'wki']}
        classifier = FastLinearBinaryClassifier(max_iterations=1) << 'Inputs'

        pipeline = Pipeline([
            education_onehot,
            education_hashed,
            workclass_onehot,
            merger,
            classifier,
        ])
        pipeline.fit(features, labels)
        result = pipeline.predict(features)

        assert isinstance(result, pandas.DataFrame)
        expected_columns = ['PredictedLabel', 'Probability', 'Score']
        assert sorted(result.columns) == expected_columns
        assert result.shape == (5, 3)
Esempio n. 5
0
    def test_syntax6_change_role(self):
        """Drop every column but ``Features`` and name it for the learner.

        After encoding and concatenation, the raw and intermediate columns
        are dropped explicitly and the classifier is given ``['Features']``
        as its input. The prediction frame must hold the three
        binary-classifier output columns for the five input rows.
        """
        # REVIEW (original author): the pipeline drops all columns but one,
        # yet nimbusml still thinks the features are education and workclass.
        # It does not automatically detect that the only remaining column
        # should play that role (maybe because the label column is here too,
        # even though the only remaining column without a role is Features).
        frame = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1, 0, 1, 0, 0],
        })
        features = frame.drop('y', axis=1)
        labels = frame['y']

        encoded_names = ['f{0}'.format(index) for index in (1, 2, 3)]
        steps = [
            OneHotVectorizer() << {'f1': 'education'},
            OneHotHashVectorizer() << {'f2': 'education'},
            OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
            Concat() << {'Features': encoded_names},
            Drop() << ['education', 'workclass', 'f1', 'f2', 'f3'],
            FastLinearBinaryClassifier(
                maximum_number_of_iterations=1) << ['Features'],
        ]
        pipeline = Pipeline(steps)
        pipeline.fit(features, labels, verbose=0)
        result = pipeline.predict(features)

        assert isinstance(result, pandas.DataFrame)
        expected_columns = ['PredictedLabel', 'Probability', 'Score']
        assert sorted(result.columns) == expected_columns
        assert result.shape == (5, 3)
Esempio n. 6
0
    def test_syntax_concat_slots(self):
        """Fit and run a transform-only pipeline ending in ``Concat``.

        The test makes no assertion on the output; it passes as long as
        neither ``fit`` nor ``predict`` raises.
        """
        data = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'weight': [10., 1., 1., 1., 1.],
            'y': [1.1, 2.2, 1.24, 3.4, 3.4],
        })

        encoder = OneHotVectorizer() << ['workclass', 'education']
        merger = Concat() << {'newcol': ['workclass', 'education']}

        pipeline = Pipeline([encoder, merger])
        pipeline.fit(data, verbose=0)
        pipeline.predict(data)
Esempio n. 7
0
 def test_syntax12_fail(self):
     """Check that a learner rejects a role its entrypoint does not allow.

     ``FastLinearBinaryClassifier`` is given a ``group_id`` column. Building
     or fitting the pipeline must raise a ``RuntimeError`` or ``NameError``
     whose message states that the parameter is not allowed; any other
     message is re-raised, and the absence of an error fails the test.
     """
     X = pandas.DataFrame(dict(education=['A', 'B', 'A', 'B', 'A'],
                               workclass=['X', 'X', 'Y', 'Y', 'Y'],
                               weight=[10., 1., 1., 1., 1.],
                               y=[1.1, 2.2, 1.24, 3.4, 3.4]))
     expected = "Parameter 'group_id' is not allowed " \
         "for class 'FastLinearBinaryClassifier'"
     try:
         pipeline = Pipeline([
             OneHotVectorizer(columns=['workclass', 'education']),
             Concat(columns={'Feature': ['workclass', 'education']}),
             FastLinearBinaryClassifier(feature='Feature',
                                        group_id='weight') << {
                 Role.Label: 'y'}
         ])
         # Fit the pipeline instance, not the Pipeline class, so that a
         # check deferred to fit time is exercised as well.
         pipeline.fit(X)
     except (RuntimeError, NameError) as e:
         if expected not in str(e):
             # Unexpected failure: propagate it with its original traceback.
             raise
     else:
         # Explicit raise rather than ``assert False``, which is stripped
         # when Python runs with -O.
         raise AssertionError(
             "FastLinearBinaryClassifier accepted 'group_id' "
             "without raising an exception")
Esempio n. 8
0
 def test_syntax12_group(self):
     """Check how ``group_id`` is exposed on a fitted tree regressor.

     The regressor receives ``feature`` and ``group_id`` as keywords and
     the label through ``<<``. After fitting, the last node must expose
     ``feature_column_name_``, ``label_column_name_`` and
     ``row_group_column_name_`` with the expected values, and must not
     carry any of the alternative attribute spellings checked below.
     ``predict`` must return a one-column ``Score`` frame.
     """
     data = pandas.DataFrame({
         'education': ['A', 'B', 'A', 'B', 'A'],
         'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
         'gr': [0, 0, 1, 1, 1],
         'y': [1.1, 2.2, 1.24, 3.4, 3.4],
     })

     encoder = OneHotVectorizer(columns=['workclass', 'education'])
     merger = Concat(columns={'Feature': ['workclass', 'education']})
     group_key = ToKey() << 'gr'
     regressor = FastTreesRegressor(
         number_of_trees=5, feature='Feature', group_id='gr') << {
             Role.Label: 'y'}

     pipeline = Pipeline([encoder, merger, group_key, regressor])
     pipeline.fit(data, verbose=0)

     learner = pipeline.nodes[-1]
     for absent in ('feature_', 'group_id_'):
         assert not hasattr(learner, absent)
     assert learner.feature_column_name_ == 'Feature'
     assert learner.label_column_name_ == 'y'
     for absent in ('group_id_column', 'groupid_column_', 'groupid_column'):
         assert not hasattr(learner, absent)
     if not hasattr(learner, 'row_group_column_name_'):
         # List every attribute so a rename is easy to spot in the report.
         available = ", ".join(sorted(dir(learner)))
         raise AssertionError("Attribute not found: {0}".format(available))
     assert learner.row_group_column_name_ == 'gr'

     # The label column is overwritten with a fake value before predicting.
     # NOTE(review): 'weight' is not a column of the frame built above, so
     # the second assignment adds a new column -- presumably carried over
     # from a sibling test that uses a weight; confirm whether it is needed.
     data['y'] = -5
     data['weight'] = -5
     scores = pipeline.predict(data)

     assert isinstance(scores, pandas.DataFrame)
     assert list(scores.columns) == ['Score']
     assert scores.shape == (5, 1)