Ejemplo n.º 1
0
    def test_word_embedding(self):
        """Export an NGramFeaturizer + WordEmbedding pipeline to DOT text.

        The featurizer runs on ``description`` with ``output_tokens=True``
        and the embedding consumes ``description_TransformedText``. The test
        passes when the exported text contains a schema label listing both
        ``description`` and ``description_TransformedText``.
        """
        # Each sample pairs a short review with its boolean "like" flag.
        samples = [
            ("This is great", True),
            ("I hate it", False),
            ("Love it", True),
            ("Do not like it", False),
            ("Really like it", True),
            ("I hate it", False),
            ("I like it a lot", True),
            ("I kind of hate it", False),
            ("I do like it", True),
            ("I really hate it", False),
            ("It is very good", True),
            ("I hate it a bunch", False),
            ("I love it a bunch", True),
            ("I hate it", False),
            ("I like it very much", True),
            ("I hate it very much.", False),
            ("I really do love it", True),
            ("I really do hate it", False),
            ("Love it!", True),
            ("Hate it!", False),
            ("I love it", True),
            ("I hate it", False),
            ("I love it", True),
            ("I hate it", False),
            ("I love it", True),
        ]
        reviews = [text for text, _ in samples]
        flags = [liked for _, liked in samples]
        ds_train = pandas.DataFrame(
            data={'description': reviews, 'like': flags})

        featurizer = NGramFeaturizer(
            columns=['description'], output_tokens=True)
        embedding = WordEmbedding(
            columns='description_TransformedText', model_kind='Sswe')

        dot_vis = dot_export_pipeline(
            Pipeline([featurizer, embedding]), ds_train)

        # Fragment of the schema node expected in the exported graph.
        expected_fragment = ('ch1[label="<f0> description|<f1> '
                             'description_TransformedText"')
        assert expected_fragment in dot_vis
Ejemplo n.º 2
0
    def test_plot_fitted_cloned_pipeline(self):
        """Check that fit info and DOT export do not change across fits.

        ``get_fit_info`` is read before fitting, after a first ``fit`` and
        after a second ``fit``. For every operator the three entries must
        expose the same keys, and values of plain Python types must be
        equal. The DOT export taken before fitting must equal the one taken
        after both fits.

        Raises:
            Exception: when a compared value differs between two snapshots;
                both entries are pretty-printed first.
        """
        frame = pd.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1.0, 3, 2, 3, 4],
        })
        steps = [
            OneHotVectorizer() << ['workclass', 'education'],
            FastLinearRegressor(feature=['workclass', 'education'], label='y'),
        ]
        pipeline = Pipeline(steps)

        info_unfitted = pipeline.get_fit_info(frame)[0]
        dot_before = dot_export_pipeline(pipeline, frame)
        assert dot_before is not None

        pipeline.fit(frame)
        info_first_fit = pipeline.get_fit_info(frame)[0]
        assert len(info_unfitted) == len(info_first_fit)

        pipeline.fit(frame)
        info_second_fit = pipeline.get_fit_info(frame)[0]
        assert len(info_unfitted) == len(info_second_fit)

        # Only values of these types are compared; anything else is skipped.
        comparable_types = (list, dict, str, int, float, tuple)
        snapshots = zip(info_unfitted, info_first_fit, info_second_fit)
        for op_index, (before, first, second) in enumerate(snapshots):
            keys = sorted(before)
            assert keys == sorted(first)
            assert keys == sorted(second)
            for key in keys:
                if not isinstance(before[key], comparable_types):
                    continue
                # Order matters for which failure is reported first:
                # first-fit vs second-fit, then unfitted vs first-fit.
                for left, right in ((first, second), (before, first)):
                    if left[key] != right[key]:
                        import pprint
                        pprint.pprint(left)
                        pprint.pprint(right)
                        message = (
                            "Issue with "
                            "op={0}\nk='{1}'\n---\n{2}\n---\n{3}")
                        raise Exception(message.format(
                            op_index, key, left[key], right[key]))

        dot_after = dot_export_pipeline(pipeline, frame)
        assert dot_after is not None
        assert dot_before == dot_after
Ejemplo n.º 3
0
    def test_pipeline_exports_complex(self):

        name = "test_pipeline_exports_complex.csv"
        with open(name, "w") as f:
            f.write(_sentiments)

        transform_1 = NGramFeaturizer() << {'transformed1': 'SentimentText'}
        transform_2 = OneHotVectorizer() << 'SentimentSource'
        transform_3 = ColumnConcatenator() << {
            'finalfeatures': ['transformed1', 'SentimentSource']
        }
        algo = FastTreesBinaryClassifier() << {
            Role.Feature: 'finalfeatures',
            Role.Label: "Positive"
        }

        exp = Pipeline([transform_1, transform_2, transform_3, algo])

        stream = FileDataStream.read_csv(name, sep="\t")
        res = dot_export_pipeline(exp, stream).strip("\n\r ")
        exp = """
                digraph{
                  orientation=portrait;
                  sch0[label="<f0> ItemID|<f1> Sentiment|<f2> \
SentimentSource|<f3> SentimentText|<f4> RowNum|<f5> \
Positive|<f6> Train|<f7> Small",shape=record,fontsize=8];

                  node1[label="NGramFeaturizer",shape=box,style="filled,\
rounded",color=cyan,fontsize=12];
                  sch0:f3 -> node1;
                  sch1[label="<f0> transformed1|<f1> \
transformed1_TransformedText",shape=record,fontsize=8];
                  node1 -> sch1:f0;
                  node1 -> sch1:f1;

                  node2[label="OneHotVectorizer",shape=box,\
style="filled,rounded",color=cyan,fontsize=12];
                  sch0:f2 -> node2;
                  sch2[label="<f0> SentimentSource",shape=record,\
fontsize=8];
                  node2 -> sch2:f0;

                  node3[label="ColumnConcatenator",shape=box,\
style="filled,rounded",color=cyan,fontsize=12];
                  sch1:f0 -> node3;
                  sch2:f0 -> node3;
                  sch3[label="<f0> finalfeatures",shape=record,fontsize=8];
                  node3 -> sch3:f0;

                  node4[label="FastTreesBinaryClassifier",shape=box,\
style="filled,rounded",color=yellow,fontsize=12];
                  sch3:f0 -> node4 [label="Feature",fontsize=8];
                  sch0:f5 -> node4 [label="Label",fontsize=8];
                  sch4[label="<f0> PredictedLabel|<f1> \
PredictedProba|<f2> Score",shape=record,fontsize=8];
                  node4 -> sch4:f0;
                  node4 -> sch4:f1;
                  node4 -> sch4:f2;
                }
                """.replace("                ", "").strip("\n\r ")
        assert res == exp
Ejemplo n.º 4
0
    def test_pipeline_exports(self):
        """Compare the DOT export of a regression pipeline with a fixed text.

        Before exporting, every node that has a ``label_column`` attribute,
        and the last node in particular, must report ``'new_y'``. The
        exported graph is then compared with the expected text after spaces
        and newlines are removed from both sides, because the expected
        literal is wrapped in the middle of quoted attribute values.

        Raises:
            Exception: carrying the actual export when the graphs differ.
        """
        frame = pandas.DataFrame({
            'education': ['A', 'B', 'A', 'B', 'A'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'yy': [1.1, 2.2, 1.24, 3.4, 3.4],
        })

        scaler = MeanVarianceScaler() << {
            'new_y': 'yy'
        }
        encoder = OneHotVectorizer() << ['workclass', 'education']
        dropper = Drop() << 'yy'
        regressor = FastLinearRegressor() << {
            'Feature': ['workclass', 'education'],
            Role.Label: 'new_y'
        }
        pipeline = Pipeline([scaler, encoder, dropper, regressor])

        assert all(node.label_column == 'new_y'
                   for node in pipeline.nodes
                   if hasattr(node, 'label_column'))
        assert pipeline.nodes[-1].label_column == 'new_y'

        actual = dot_export_pipeline(pipeline, frame).strip("\n\r ")
        expected = """
                digraph{
                  orientation=portrait;
                  sch0[label="<f0> education|<f1> workclass|<f2> yy",
                  shape=record,fontsize=8];

                  node1[label="TypeConverter",shape=box,style="filled,
                  rounded",color=cyan,fontsize=12];
                  sch0:f2 -> node1;
                  sch1[label="<f0> new_y",shape=record,fontsize=8];
                  node1 -> sch1:f0;

                  node2[label="MeanVarianceScaler",shape=box,
                  style="filled,rounded",color=cyan,fontsize=12];
                  sch1:f0 -> node2;
                  sch2[label="<f0> new_y",shape=record,fontsize=8];
                  node2 -> sch2:f0;

                  node3[label="OneHotVectorizer",shape=box,
                  style="filled,rounded",color=cyan,fontsize=12];
                  sch0:f1 -> node3;
                  sch0:f0 -> node3;
                  sch3[label="<f0> workclass|<f1> education",
                  shape=record,fontsize=8];
                  node3 -> sch3:f0;
                  node3 -> sch3:f1;

                  node5[label="FastLinearRegressor",shape=box,
                  style="filled,rounded",color=yellow,fontsize=12];
                  sch3:f1 -> node5 [label="Feature",fontsize=8];
                  sch3:f0 -> node5 [label="Feature",fontsize=8];
                  sch2:f0 -> node5 [label="Label",fontsize=8];
                  sch5[label="<f0> Score",shape=record,fontsize=8];
                  node5 -> sch5:f0;
                }
                """.replace("                ", "").strip("\n\r ")

        def squash(text):
            # Remove only newlines and spaces, nothing else.
            return text.replace("\n", "").replace(" ", "")

        if squash(actual) != squash(expected):
            raise Exception(actual)