Esempio n. 1
0
    def test_syntax9_slots_label(self):
        """Chain NGramFeaturizer and MutualInformationSelector on review text.

        A small frame with a text column ``review`` and a boolean column
        ``like`` is split into features (everything except ``like``) and
        a one-column label frame. The two transforms are composed in a
        Pipeline twice: once with a default MutualInformationSelector and
        once with ``slots_in_output=2``. Each time the test only checks
        that ``fit_transform`` returns something other than None.
        """
        reviews = [
            "This is great", "I hate it", "Love it", "Do not like it",
            "Really like it", "I hate it", "I like it a lot",
            "I kind of hate it", "I do like it", "I really hate it",
            "It is very good", "I hate it a bunch", "I love it a bunch",
            "I hate it", "I like it very much", "I hate it very much.",
            "I really do love it", "I really do hate it", "Love it!",
            "Hate it!", "I love it", "I hate it", "I love it", "I hate it",
            "I love it",
        ]
        likes = [
            True, False, True, False, True, False, True, False, True,
            False, True, False, True, False, True, False, True, False,
            True, False, True, False, True, False, True,
        ]
        frame = pandas.DataFrame(data={'review': reviews, 'like': likes})

        # Features are every column but 'like'; the label stays a DataFrame.
        is_feature_column = frame.columns != 'like'
        features = frame.loc[:, is_feature_column]
        labels = frame[['like']]

        # First run: selector built with its default arguments.
        default_pipeline = Pipeline([
            NGramFeaturizer(word_feature_extractor=n_gram()),
            MutualInformationSelector(),
        ])
        default_result = default_pipeline.fit_transform(features, labels)
        assert default_result is not None

        # Second run: same composition with slots_in_output=2. As in the
        # first run, no input/output column arguments are given to either
        # transform.
        sized_pipeline = Pipeline([
            NGramFeaturizer(word_feature_extractor=n_gram()),
            MutualInformationSelector(slots_in_output=2),
        ])
        sized_result = sized_pipeline.fit_transform(features, labels)
        assert sized_result is not None
Esempio n. 2
0
    def test_example_fails(self):
        """Check which MutualInformationSelector arguments are rejected.

        The first section builds a selector with ``feature=`` and
        ``label=`` keywords, which is accepted, and runs it through a
        Pipeline. The remaining sections verify that construction raises:

        * an unknown keyword (``feature2``) -> NameError mentioning
          "Parameter 'feature2' is not allowed";
        * ``columns`` given as a two-element list -> RuntimeError
          mentioning "use a dictionary";
        * ``columns`` given as a dict with two outputs -> RuntimeError
          mentioning "Output should contain only one output not".
        """
        like = [
            True, False, True, False, True, False, True, False, True, False,
            True, False, True, False, True, False, True, False, True, False,
            True, False, True, False, True
        ]
        x1 = [(5. if flag else 4.) for flag in like]
        x2 = [(-5. if flag else -4.) for flag in like]
        # Overwrite one entry of x1 and two entries of x2 with 50.
        x1[0] = 50
        x2[1] = 50
        x2[2] = 50
        # Each feature column gets its own data (x1 was previously built
        # but never used because the frame was created with x1=x2).
        train_data = pandas.DataFrame(data=dict(like=like, x1=x1, x2=x2),
                                      dtype=numpy.float32)

        # Accepted form: features and label passed by keyword.
        transform_2 = MutualInformationSelector(slots_in_output=1,
                                                feature=['x1', 'x2'],
                                                label='like')
        assert transform_2._allowed_roles == {'Label'}
        assert transform_2.label_column_name == 'like'
        # NOTE(review): the original author recorded that transform_2.input
        # and transform_2.output are both None in this configuration, so
        # they are not asserted here -- confirm against the library.
        pipe = Pipeline([transform_2])
        pipe.fit(train_data)
        res = pipe.transform(train_data)
        assert res is not None

        # Unknown keyword arguments must be rejected at construction time.
        try:
            MutualInformationSelector(slots_in_output=1,
                                      feature2=['x1', 'x2'],
                                      label='like')
        except NameError as e:
            assert "Parameter 'feature2' is not allowed" in str(e)
        else:
            raise AssertionError("feature2 not allowed")

        # A list of several columns is rejected.
        try:
            MutualInformationSelector(slots_in_output=2,
                                      columns=['x1', 'x2'],
                                      label='like')
        except RuntimeError as e:
            assert "use a dictionary" in str(e)
        else:
            raise AssertionError("only one output is allowed")

        # A dictionary that declares two outputs is rejected as well.
        try:
            MutualInformationSelector(slots_in_output=2,
                                      columns={
                                          'x1': 'x1',
                                          'x2': 'x2'
                                      },
                                      label='like')
        except RuntimeError as e:
            assert "Output should contain only one output not" in str(e)
        else:
            raise AssertionError("only one output is allowed")
Esempio n. 3
0
 'LightGbmRanker': LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
                                  label='rank',
                                  group_id='group'),
 'Loader': Loader(columns={'ImgPath': 'Path'}),
 'LpScaler': Pipeline([
     ColumnConcatenator() << {
         'concated_columns': [
             'Petal_Length',
             'Sepal_Width',
             'Sepal_Length']},
     LpScaler(columns={'normed_columns': 'concated_columns'})
 ]),
 'MutualInformationSelector': Pipeline([
     ColumnConcatenator(columns={'Features': ['Sepal_Width', 'Sepal_Length', 'Petal_Width']}),
     MutualInformationSelector(
         columns='Features',
         label='Label',
         slots_in_output=2)  # only accept one column
 ]),
 'NaiveBayesClassifier': NaiveBayesClassifier(feature=['Sepal_Width', 'Sepal_Length']),
 'NGramFeaturizer': NGramFeaturizer(word_feature_extractor=Ngram(),
                                    char_feature_extractor=Ngram(),
                                    keep_diacritics=True,
                                    columns={ 'features': ['SentimentText']}),
 'OneHotHashVectorizer': OneHotHashVectorizer(columns=['education_str']),
 'OneHotVectorizer': OneHotVectorizer(columns=['education_str']),
 'OneVsRestClassifier(AveragedPerceptronBinaryClassifier)': \
     OneVsRestClassifier(AveragedPerceptronBinaryClassifier(),
                         use_probabilities=True,
                         feature=['age',
                                  'education_str.0-5yrs',
                                  'education_str.6-11yrs',
Esempio n. 4
0
    def test_example_success(self):
        """Exercise the accepted ways of configuring MutualInformationSelector.

        Covered forms, each wrapped in a Pipeline and checked to produce a
        non-None result:

        * default constructor and ``slots_in_output=2``, fitted with
          separate ``X`` / ``y``;
        * ``<<`` with ``Role.Feature`` / ``Role.Label`` keys;
        * ``<<`` with a custom output name ("zoo") mapped to two columns
          and to a single column;
        * ``columns=['x1']`` with ``label='like'`` passed as keywords.

        For the role/column forms the test also asserts the resulting
        ``_allowed_roles``, ``label_column_name``, ``input`` and ``output``
        attributes.
        """
        like = [
            True, False, True, False, True, False, True, False, True, False,
            True, False, True, False, True, False, True, False, True, False,
            True, False, True, False, True
        ]
        x1 = [(5. if flag else 4.) for flag in like]
        x2 = [(-5. if flag else -4.) for flag in like]
        # Overwrite one entry of x1 and two entries of x2 with 50.
        x1[0] = 50
        x2[1] = 50
        x2[2] = 50
        # Each feature column gets its own data (x1 was previously built
        # but never used because the frame was created with x1=x2).
        train_data = pandas.DataFrame(data=dict(like=like, x1=x1, x2=x2),
                                      dtype=numpy.float32)

        # Features / label supplied separately to fit_transform.
        X = train_data.drop('like', axis=1)
        y = train_data[['like']]
        transform_2 = MutualInformationSelector()
        exp = Pipeline([transform_2])
        res = exp.fit_transform(X, y)
        assert res is not None

        transform_2 = MutualInformationSelector(slots_in_output=2)
        pipe = Pipeline([transform_2])
        res = pipe.fit_transform(X, y)
        assert res is not None

        # Roles assigned with the << operator; the label is read from the
        # frame itself, so fit_transform receives the full train_data.
        transform_2 = MutualInformationSelector() << {
            Role.Feature: ['x1', 'x2'],
            Role.Label: 'like'
        }
        assert transform_2._allowed_roles == {'Label'}
        assert transform_2.label_column_name == 'like'
        assert transform_2.input == ['x1', 'x2']
        assert transform_2.output == ['Feature']
        exp = Pipeline([transform_2])
        res = exp.fit_transform(train_data)
        assert res is not None

        # Custom output name mapped to two input columns.
        transform_2 = MutualInformationSelector() << {
            "zoo": ['x1', 'x2'],
            Role.Label: 'like'
        }
        assert transform_2._allowed_roles == {'Label'}
        assert transform_2.label_column_name == 'like'
        assert transform_2.input == ['x1', 'x2']
        assert transform_2.output == ['zoo']
        exp = Pipeline([transform_2])
        res = exp.fit_transform(train_data)
        assert res is not None

        # Custom output name mapped to a single input column.
        transform_2 = MutualInformationSelector() << {
            "zoo": ['x1'],
            Role.Label: 'like'
        }
        assert transform_2._allowed_roles == {'Label'}
        assert transform_2.label_column_name == 'like'
        assert transform_2.input == ['x1']
        assert transform_2.output == ['zoo']
        exp = Pipeline([transform_2])
        res = exp.fit_transform(train_data)
        assert res is not None

        # Keyword form: a one-element columns list keeps the column name
        # as both input and output.
        transform_2 = MutualInformationSelector(slots_in_output=1,
                                                columns=['x1'],
                                                label='like')
        assert transform_2._allowed_roles == {'Label'}
        assert transform_2.label_column_name == 'like'
        assert transform_2.input == ['x1']
        assert transform_2.output == ['x1']
        pipe = Pipeline([transform_2])
        pipe.fit(train_data)
        res = pipe.transform(train_data)
        assert res is not None
    data=dict(review=[
        "This is great", "I hate it", "Love it", "Do not like it",
        "Really like it", "I hate it", "I like it a lot", "I kind of hate it",
        "I do like it", "I really hate it", "It is very good",
        "I hate it a bunch", "I love it a bunch", "I hate it",
        "I like it very much", "I hate it very much.", "I really do love it",
        "I really do hate it", "Love it!", "Hate it!", "I love it",
        "I hate it", "I love it", "I hate it", "I love it"
    ],
              like=[
                  True, False, True, False, True, False, True, False, True,
                  False, True, False, True, False, True, False, True, False,
                  True, False, True, False, True, False, True
              ]))

# Split the frame: every column except 'like' is a feature, 'like' is the
# label (kept as a Series here).
feature_mask = train_reviews.columns != 'like'
X = train_reviews.loc[:, feature_mask]
y = train_reviews['like']

# Pipeline built from a plain list of transforms.
transform_1 = NGramFeaturizer(word_feature_extractor=Ngram())
transform_2 = MutualInformationSelector(slots_in_output=2)
pipeline = Pipeline([transform_1, transform_2])
selected = pipeline.fit_transform(X, y)
print(selected)

# Scikit compatibility (compose transforms inside a Scikit Pipeline):
# steps are given as (name, transform) pairs and neither transform
# receives {input, output} arguments.
transform_1 = NGramFeaturizer(word_feature_extractor=Ngram())
transform_2 = MutualInformationSelector(slots_in_output=2)
named_steps = [('text', transform_1), ('featureselect', transform_2)]
pipe = Pipeline(named_steps)
print(pipe.fit_transform(X, y))