Example #1
0
    def test_pipeline_info(self):
        """Check that get_fit_info reports the expected stage records,
        including the implicit TypeConverter that nimbusml inserts before
        MeanVarianceScaler.

        Fix: the original reused the name ``exp`` for both the Pipeline and
        the expected-value list; the second use now has its own name.
        """
        df = pandas.DataFrame(
            dict(education=['A', 'B', 'A', 'B', 'A'],
                 workclass=['X', 'X', 'Y', 'Y', 'Y'],
                 yy=[1.1, 2.2, 1.24, 3.4, 3.4]))

        pipeline = Pipeline([
            MeanVarianceScaler() << {
                'new_y': 'yy'
            },
            OneHotVectorizer() << ['workclass', 'education'],
            Drop() << 'yy',
            FastLinearRegressor() << {
                'Feature': ['workclass', 'education'],
                Role.Label: 'new_y'
            }
        ])

        infos = pipeline.get_fit_info(df)[0]
        for inf in infos:
            # 'operator' holds live estimator objects that cannot be
            # compared against the literal expectation below.
            if 'operator' in inf:
                del inf['operator']
        expected = [{
            'name': None,
            'schema_after': ['education', 'workclass', 'yy'],
            'type': 'start',
            'outputs': ['education', 'workclass', 'yy']
        }, {
            'name': 'TypeConverter',
            'inputs': ['yy'],
            'outputs': ['new_y'],
            'schema_after': ['education', 'workclass', 'yy', 'new_y'],
            'type': 'transform'
        }, {
            'name': 'MeanVarianceScaler',
            'inputs': ['new_y'],
            'type': 'transform',
            'outputs': ['new_y'],
            'schema_after': ['education', 'workclass', 'yy', 'new_y']
        }, {
            'name': 'OneHotVectorizer',
            'inputs': ['workclass', 'education'],
            'type': 'transform',
            'outputs': ['workclass', 'education'],
            'schema_after': ['education', 'workclass', 'yy', 'new_y']
        }, {
            'name': 'ColumnDropper',
            'type': 'transform',
            'schema_after': ['education', 'workclass', 'new_y'],
            'inputs': ['education', 'workclass', 'yy', 'new_y'],
            'outputs': ['education', 'workclass', 'new_y']
        }, {
            'name': 'FastLinearRegressor',
            'inputs': ['Feature:education,workclass', 'Label:new_y'],
            'type': 'regressor',
            'outputs': ['Score'],
            'schema_after': ['Score']
        }]
        if infos != expected:
            raise Exception(infos)
Example #2
0
    def test_syntax8_label(self):
        """Train with a renamed label ('yy' -> 'new_y') and verify the
        trained columns and the score range on prediction."""
        data = pandas.DataFrame(dict(education=['A', 'B', 'A', 'B', 'A'],
                                     workclass=['X', 'X', 'Y', 'Y', 'Y'],
                                     yy=[1.1, 2.2, 1.24, 3.4, 3.4]))
        features = data.drop('yy', axis=1)

        pipeline = Pipeline([
            MeanVarianceScaler() << {'new_y': 'yy'},
            OneHotVectorizer() << ['workclass', 'education'],
            Drop() << 'yy',
            FastLinearRegressor() << {'Feature': ['workclass', 'education'],
                                      Role.Label: 'new_y'}
        ])
        pipeline.fit(data, verbose=0)
        last = pipeline.nodes[-1]
        assert last.feature_column_ == 'Features'
        assert last.label_column_ == 'new_y'
        # The pipeline requires it now as it is transformed all along.
        features['yy'] = 0.0
        prediction = pipeline.predict(features, verbose=0)
        assert isinstance(prediction, pandas.DataFrame)
        assert list(prediction.columns) == ['Score']
        assert prediction.shape == (5, 1)
        scores = prediction['Score']
        if scores.min() < 0.4:
            raise Exception(prediction)
        if scores.max() > 2.00:
            raise Exception(prediction)
Example #3
0
    def test_syntax7_rename(self):
        # Error message are usually not informative enough.
        # Missing column --> no indication of other columns.
        # Error is (one transform should handle it)
        # 'The label column 'y' of the training data has a data type
        # not suitable for binary classification: Vec<Key<U4, 0-1>, 2>.
        # Type must be Bool, R4, R8 or Key with two classes.

        data = pandas.DataFrame(
            dict(
                education=['A', 'B', 'A', 'B', 'A'],
                workclass=['X', 'X', 'Y', 'Y', 'Y'],
                y=['red', 'white', 'red', 'white', 'white']))
        features = data.drop('y', axis=1)
        labels = data['y']

        pipeline = Pipeline([
            OneHotVectorizer() << 'y',
            OneHotVectorizer() << ['workclass', 'education'],
            TypeConverter(result_type='R4') << {'yi': 'y'},
            Drop() << 'y',
            FastLinearBinaryClassifier(max_iterations=1) << 'yi'
        ])
        pipeline.fit(features, labels, verbose=0)
        prediction = pipeline.predict(features)
        assert isinstance(prediction, pandas.DataFrame)
        assert list(prediction.columns) == ['Score']
        assert prediction.shape == (5, 1)
        assert prediction.min() > 0.01
        assert prediction.max() < 0.05
Example #4
0
    def test_syntax6_regular_expression(self):
        """Build three feature columns, concatenate them into 'Features',
        then drop everything else via the '~Features' regular expression."""
        data = pandas.DataFrame(
            dict(education=['A', 'B', 'A', 'B', 'A'],
                 workclass=['X', 'X', 'Y', 'Y', 'Y'],
                 y=[1, 0, 1, 0, 0]))
        features = data.drop('y', axis=1)
        labels = data['y']

        pipeline = Pipeline([
            OneHotVectorizer() << {'f1': 'education'},
            OneHotHashVectorizer() << {'f2': 'education'},
            OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
            Concat() << {'Features': ['f%d' % i for i in range(1, 4)]},
            Drop() << '~Features',
            FastLinearBinaryClassifier(max_iterations=1)
        ])
        pipeline.fit(features, labels)
        prediction = pipeline.predict(features)
        assert isinstance(prediction, pandas.DataFrame)
        expected_cols = ['PredictedLabel', 'Probability', 'Score']
        assert sorted(list(prediction.columns)) == expected_cols
        assert prediction.shape == (5, 3)
    def test_syntax6_change_role(self):
        # REVIEW: the pipeline drops all columns but one -->
        # nimbusml still thinks the Features are eduction, workclass
        # and does not automatically detects that the only remaining
        # columns should play that role
        # (maybe because the label column is here too even though
        # the only remaining column without a role is Features).
        data = pandas.DataFrame(dict(education=['A', 'B', 'A', 'B', 'A'],
                                     workclass=['X', 'X', 'Y', 'Y', 'Y'],
                                     y=[1, 0, 1, 0, 0]))
        features = data.drop('y', axis=1)
        labels = data['y']

        pipeline = Pipeline([
            OneHotVectorizer() << {'f1': 'education'},
            OneHotHashVectorizer() << {'f2': 'education'},
            OneHotVectorizer(max_num_terms=2) << {'f3': 'workclass'},
            Concat() << {'Features': ['f%d' % i for i in range(1, 4)]},
            Drop() << ['education', 'workclass', 'f1', 'f2', 'f3'],
            FastLinearBinaryClassifier(
                maximum_number_of_iterations=1) << ['Features']
        ])
        pipeline.fit(features, labels, verbose=0)
        prediction = pipeline.predict(features)
        assert isinstance(prediction, pandas.DataFrame)
        assert sorted(list(prediction.columns)) == [
            'PredictedLabel', 'Probability', 'Score']
        assert prediction.shape == (5, 3)
    def test_pipeline_exports(self):
        """Render a pipeline graph with graphviz and clean up after.

        Fix: ``gr.render(name)`` produces both the dot source file ``name``
        and the rendered output (``name`` plus a format extension, e.g.
        ``name + '.pdf'``); the original removed only ``name`` and leaked
        the rendered file. Cleanup now runs in ``finally`` and removes both.
        """
        import graphviz.backend
        df = pandas.DataFrame(
            dict(education=['A', 'B', 'A', 'B', 'A'],
                 workclass=['X', 'X', 'Y', 'Y', 'Y'],
                 yy=[1.1, 2.2, 1.24, 3.4, 3.4]))

        exp = Pipeline([
            MeanVarianceScaler() << {
                'new_y': 'yy'
            },
            OneHotVectorizer() << ['workclass', 'education'],
            Drop() << 'yy',
            FastLinearRegressor() << {
                'Feature': ['workclass', 'education'],
                Role.Label: 'new_y'
            }
        ])

        gr = img_export_pipeline(exp, df)
        name = next(tempfile._get_candidate_names())
        try:
            gr.render(name)
            assert os.path.exists(name)
        except graphviz.backend.ExecutableNotFound:
            warnings.warn('Graphviz is not installed.')
        finally:
            # render() may have written the dot source and a rendered
            # output file; remove whichever of them exists.
            for produced in (name, name + '.pdf'):
                if os.path.exists(produced):
                    os.remove(produced)
Example #7
0
    def test_pipeline_info_strategy_previous_drop(self):
        """get_fit_info after dropping 'education' should show the regressor
        trained only on 'workclass'.

        Fix: the original reused the name ``exp`` for both the Pipeline and
        the expected-value list; the second use now has its own name.
        """
        df = pandas.DataFrame(
            dict(education=['A', 'B', 'A', 'B', 'A'],
                 workclass=['X', 'X', 'Y', 'Y', 'Y'],
                 yy=[1.1, 2.2, 1.24, 3.4, 3.4]))
        X = df.drop('yy', axis=1)
        y = df['yy']

        pipeline = Pipeline([
            OneHotVectorizer() << ['workclass', 'education'],
            Drop() << ['education'],
            FastLinearRegressor()
        ])

        infos = pipeline.get_fit_info(X, y)[0]
        for inf in infos:
            # 'operator' holds live estimator objects that cannot be
            # compared against the literal expectation below.
            if 'operator' in inf:
                del inf['operator']
        expected = [{
            'name': None,
            'schema_after': ['education', 'workclass', 'yy'],
            'type': 'start',
            'outputs': ['education', 'workclass', 'yy']
        }, {
            'name': 'OneHotVectorizer',
            'inputs': ['workclass', 'education'],
            'type': 'transform',
            'outputs': ['workclass', 'education'],
            'schema_after': ['education', 'workclass', 'yy']
        }, {
            'name': 'ColumnDropper',
            'type': 'transform',
            'schema_after': ['workclass', 'yy'],
            'inputs': ['education', 'workclass', 'yy'],
            'outputs': ['workclass', 'yy']
        }, {
            'name': 'FastLinearRegressor',
            'inputs': ['Feature:workclass', 'Label:yy'],
            'type': 'regressor',
            'outputs': ['Score'],
            'schema_after': ['Score']
        }]
        assert infos == expected
Example #8
0
    def test_pipeline_exports(self):
        """Check the graphviz source emitted by dot_export_pipeline for a
        pipeline with a renamed label, comparing whitespace-insensitively.

        Fix: the original reused the name ``exp`` for both the Pipeline and
        the expected dot string; the second use now has its own name.
        """
        df = pandas.DataFrame(
            dict(education=['A', 'B', 'A', 'B', 'A'],
                 workclass=['X', 'X', 'Y', 'Y', 'Y'],
                 yy=[1.1, 2.2, 1.24, 3.4, 3.4]))

        pipeline = Pipeline([
            MeanVarianceScaler() << {
                'new_y': 'yy'
            },
            OneHotVectorizer() << ['workclass', 'education'],
            Drop() << 'yy',
            FastLinearRegressor() << {
                'Feature': ['workclass', 'education'],
                Role.Label: 'new_y'
            }
        ])

        for node in pipeline.nodes:
            if hasattr(node, 'label_column'):
                assert node.label_column == 'new_y'
        assert pipeline.nodes[-1].label_column == 'new_y'

        res = dot_export_pipeline(pipeline, df).strip("\n\r ")
        expected = """
                digraph{
                  orientation=portrait;
                  sch0[label="<f0> education|<f1> workclass|<f2> yy",
                  shape=record,fontsize=8];

                  node1[label="TypeConverter",shape=box,style="filled,
                  rounded",color=cyan,fontsize=12];
                  sch0:f2 -> node1;
                  sch1[label="<f0> new_y",shape=record,fontsize=8];
                  node1 -> sch1:f0;

                  node2[label="MeanVarianceScaler",shape=box,
                  style="filled,rounded",color=cyan,fontsize=12];
                  sch1:f0 -> node2;
                  sch2[label="<f0> new_y",shape=record,fontsize=8];
                  node2 -> sch2:f0;

                  node3[label="OneHotVectorizer",shape=box,
                  style="filled,rounded",color=cyan,fontsize=12];
                  sch0:f1 -> node3;
                  sch0:f0 -> node3;
                  sch3[label="<f0> workclass|<f1> education",
                  shape=record,fontsize=8];
                  node3 -> sch3:f0;
                  node3 -> sch3:f1;

                  node5[label="FastLinearRegressor",shape=box,
                  style="filled,rounded",color=yellow,fontsize=12];
                  sch3:f1 -> node5 [label="Feature",fontsize=8];
                  sch3:f0 -> node5 [label="Feature",fontsize=8];
                  sch2:f0 -> node5 [label="Label",fontsize=8];
                  sch5[label="<f0> Score",shape=record,fontsize=8];
                  node5 -> sch5:f0;
                }
                """.replace("                ", "").strip("\n\r ")
        # Compare ignoring all whitespace so formatting differences in the
        # exporter do not fail the test.
        if res.replace("\n", "").replace(" ", "") != expected.replace(
                "\n", "").replace(" ", ""):
            raise Exception(res)