Beispiel #1
0
    def test_syntax9_slots_label(self):
        """Feed NGramFeaturizer output into MutualInformationSelector with a label.

        The pipeline is fitted twice on the same 25 reviews: once with the
        selector's default settings and once with ``slots_in_output=2``.
        Each run only checks that ``fit_transform`` returns a result.
        """
        reviews = [
            "This is great", "I hate it", "Love it", "Do not like it",
            "Really like it", "I hate it", "I like it a lot",
            "I kind of hate it", "I do like it", "I really hate it",
            "It is very good", "I hate it a bunch", "I love it a bunch",
            "I hate it", "I like it very much", "I hate it very much.",
            "I really do love it", "I really do hate it", "Love it!",
            "Hate it!", "I love it", "I hate it", "I love it", "I hate it",
            "I love it",
        ]
        # One flag per review, alternating True/False and starting with True.
        likes = [position % 2 == 0 for position in range(len(reviews))]
        train_reviews = pandas.DataFrame(
            data=dict(review=reviews, like=likes))

        y = train_reviews[['like']]
        X = train_reviews.loc[:, train_reviews.columns != 'like']

        # First run: selector with default arguments.
        exp = Pipeline([
            NGramFeaturizer(word_feature_extractor=n_gram()),
            MutualInformationSelector(),
        ])
        res = exp.fit_transform(X, y)
        assert res is not None

        # Scikit compatibility (Compose transforms inside Scikit Pipeline).
        # In this scenario, we do not provide {input, output} arguments
        pipe = Pipeline([
            NGramFeaturizer(word_feature_extractor=n_gram()),
            MutualInformationSelector(slots_in_output=2),
        ])
        res = pipe.fit_transform(X, y)
        assert res is not None
Beispiel #2
0
    def test_lightgbmclassifier(self):
        """Train LightGbmClassifier on n-gram features of wiki_detox_train.

        The held-out accuracy must exceed 0.58.
        """
        np.random.seed(0)
        dataset_path = get_dataset('wiki_detox_train').as_filepath()
        train, label = get_X_y(dataset_path,
                               label_column='Sentiment',
                               sep='\t',
                               encoding="utf-8")
        text_column = train['SentimentText']
        X_train, X_test, y_train, y_test = train_test_split(
            text_column, label)

        # Turn the raw text into an n-gram feature matrix.
        texttransform = NGramFeaturizer(
            word_feature_extractor=n_gram(),
            vector_normalizer='None') << 'SentimentText'
        X_train = texttransform.fit_transform(X_train, max_slots=5000)
        X_test = texttransform.transform(X_test, max_slots=5000)

        fitted = LightGbmClassifier().fit(X_train, y_train, verbose=0)
        predictions = fitted.predict(X_test)

        hits = y_test.values.ravel() == predictions.values
        accuracy = np.mean(hits)
        threshold = 0.58
        assert_greater(
            accuracy,
            threshold,
            "accuracy should be greater than %s" % threshold)
    def test_pipeline_with_no_columns(self):
        """Fit an NGramFeaturizer + LightGbmClassifier pipeline without
        naming any columns.

        The pipeline is built twice: the first is inspected with
        ``get_fit_info`` and fitted with a pandas Series label, the second
        is fitted with the same label converted to a numpy array.
        """
        trainData = pd.DataFrame({
            "Sentiment": [0, 1, 1, 0, 1, 1],
            "SentimentText": [
                "this is train ", "review ", "sentence ", "an apple",
                "sentence 22", "another one one one"
            ]
        })

        def build_pipeline():
            # Both runs use the same two steps and the same learner settings.
            return Pipeline([
                NGramFeaturizer(word_feature_extractor=n_gram()),
                LightGbmClassifier(min_data_per_leaf=1, min_data_per_group=1)
            ])

        ppl = build_pipeline()
        assert ppl is not None

        # Bug 147697
        info = ppl.get_fit_info(trainData[["SentimentText"]],
                                trainData["Sentiment"])
        assert len(info) == 2
        assert len(info[0]) == 3
        ppl.fit(trainData[["SentimentText"]], trainData["Sentiment"])

        # Second run: fresh pipeline, label passed as a numpy array.
        ppl = build_pipeline()
        assert ppl is not None
        ppl.fit(trainData[["SentimentText"]], np.array(trainData["Sentiment"]))
    def test_naivebayesclassifier(self):
        """Train NaiveBayesClassifier on n-gram features of wiki_detox_train.

        The held-out accuracy must exceed 0.5.
        """
        np.random.seed(0)
        dataset_path = get_dataset("wiki_detox_train").as_filepath()
        train, label = get_X_y(dataset_path,
                               label_column='Sentiment',
                               sep='\t')
        X_train, X_test, y_train, y_test = train_test_split(
            train['SentimentText'], label)

        # Turn the raw text into an n-gram feature matrix.
        texttransform = NGramFeaturizer(
            word_feature_extractor=n_gram(),
            vector_normalizer='None') << 'SentimentText'
        X_train = texttransform.fit_transform(X_train)
        X_test = texttransform.transform(X_test)

        mymodel = NaiveBayesClassifier()
        mymodel.fit(X_train, y_train)

        predicted = list(mymodel.predict(X_test))
        # np.mean yields one value per label column; take the first.
        accuracy = np.mean(y_test == predicted)[0]
        threshold = 0.5
        assert_greater(
            accuracy,
            threshold,
            "accuracy should be greater than %s" % threshold)
Beispiel #5
0
def train_data_type_single(fit_X_type="dataframe",
                           fit_Y_type=None,
                           predict_X_type=None):
    """Fit and apply a default NGramFeaturizer on four fixed sentences.

    The sentences are converted with ``transform_data`` to ``fit_X_type``
    for fitting and to ``predict_X_type`` for the transform call.
    ``fit_Y_type`` is accepted but not used by this function.

    Returns whatever ``NGramFeaturizer.transform`` returns.
    """
    sentences = [
        "This is sentence 1", "Talk about second", "Thrid one",
        "Final example."
    ]
    featurizer = NGramFeaturizer()
    featurizer.fit(transform_data(sentences, fit_X_type))
    return featurizer.transform(transform_data(sentences, predict_X_type))
    def test_ngramfeaturizer_single(self):
        """Featurize the 'id' and 'education' text columns of the infert
        data set into one 'features' output and check the result shape."""
        schema_tokens = [
            'sep=,',
            'col=id:TX:0', 'col=education:TX:1', 'col=age:R4:2',
            'col=parity:R4:3', 'col=induced:R4:4', 'col=case:R4:5',
            'col=spontaneous:R4:6',
            'quote+', 'header=+',
        ]
        file_schema = ' '.join(schema_tokens)
        path = get_dataset('infert').as_filepath()
        data = FileDataStream(path, schema=file_schema)

        xf = NGramFeaturizer(word_feature_extractor=n_gram(),
                             columns={'features': ['id', 'education']})
        features = xf.fit_transform(data)

        expected_shape = (248, 652)
        assert features.shape == expected_shape
Beispiel #7
0
    def test_syntax9_multiple_inputs(self):
        """Map two input text columns to a single output column 'out1'
        with the ``<<`` dict syntax and check the transformed shape."""
        df = pandas.DataFrame(
            dict(education1=['A', 'B', 'A', 'B', 'A'],
                 education2=['c', 'd', 'c', 'd', 'c'],
                 workclass=['X', 'X', 'Y', 'Y', 'Y'],
                 y=[1, 0, 1, 0, 0]))
        X = df.drop('y', axis=1)

        column_mapping = {'out1': ['education1', 'education2']}
        featurizer = NGramFeaturizer(
            word_feature_extractor=n_gram()) << column_mapping
        transformed = featurizer.fit_transform(X)
        assert transformed.shape == (5, 13)
    def test_pipeline_with_no_columns_raise(self):
        """Fitting LightGbmClassifier with default settings on six rows
        must raise a RuntimeError.

        ``get_fit_info`` is still expected to describe both pipeline steps
        before the failing ``fit`` call.
        """
        trainData = pd.DataFrame({
            "Sentiment": [0, 1, 1, 0, 1, 1],
            "SentimentText": [
                "this is train ", "review ", "sentence ", "an apple",
                "sentence 22", "another one one one"
            ]
        })

        steps = [
            NGramFeaturizer(word_feature_extractor=n_gram()),
            LightGbmClassifier()
        ]
        ppl = Pipeline(steps)
        assert ppl is not None

        # Bug 147697
        info = ppl.get_fit_info(trainData[["SentimentText"]],
                                trainData["Sentiment"])
        assert len(info) == 2
        assert len(info[0]) == 3

        # Message recorded for the expected failure:
        # System.InvalidOperationException:
        # 'LightGBM Error, code is -1, error message is
        # 'Cannot construct Dataset since there are not useful features.
        # It should be at least two unique rows.
        # If the num_row (num_data) is small,
        # you can set min_data=1 and min_data_in_bin=1 to fix this.
        # Otherwise please make sure you are using the right dataset.'
        with self.assertRaises(RuntimeError):
            ppl.fit(trainData[["SentimentText"]], trainData["Sentiment"])
Beispiel #9
0
    def test_NGramFeaturizer_glove(self):
        """Grid-search ``maximum_number_of_iterations`` of the final linear
        classifier in an NGramFeaturizer -> WordEmbedding (GloVe50D)
        pipeline and check that 100 is selected."""
        np.random.seed(0)
        data = pd.DataFrame({
            'review': [
                "I like this movie", "I don't like this", "It is nice",
                "I like this movie", "I don't like this", "It is nice",
                "So boring"
            ],
            'sentiment': ['pos', 'neg', 'pos', 'pos', 'neg', 'pos', 'neg']
        })

        ngram_step = NGramFeaturizer(
            word_feature_extractor=Ngram(),
            output_tokens_column_name='review_TransformedText',
            columns='review')
        embedding_step = WordEmbedding(columns='review_TransformedText',
                                       model_kind='GloVe50D')
        learner_step = FastLinearBinaryClassifier(
            feature=['review', 'review_TransformedText'],
            number_of_threads=1,
            shuffle=False)
        # Only the named steps ('ng', 'lr') can be addressed in the grid.
        pipeline = Pipeline([
            ('ng', ngram_step),
            embedding_step,
            ('lr', learner_step)
        ])

        param_grid = dict(lr__maximum_number_of_iterations=[1, 100, 20])
        grid = GridSearchCV(pipeline, param_grid)

        is_positive = data['sentiment'] == 'pos'
        grid.fit(data['review'], 1 * is_positive)
        assert grid.best_params_['lr__maximum_number_of_iterations'] == 100
Beispiel #10
0
    def test_word_embedding(self):
        """Export an NGramFeaturizer -> WordEmbedding pipeline to DOT and
        check that the expected column node label appears in the output."""
        descriptions = [
            "This is great", "I hate it", "Love it", "Do not like it",
            "Really like it", "I hate it", "I like it a lot",
            "I kind of hate it", "I do like it", "I really hate it",
            "It is very good", "I hate it a bunch", "I love it a bunch",
            "I hate it", "I like it very much", "I hate it very much.",
            "I really do love it", "I really do hate it", "Love it!",
            "Hate it!", "I love it", "I hate it", "I love it", "I hate it",
            "I love it",
        ]
        # 25 alternating flags, starting and ending with True.
        likes = [True, False] * 12 + [True]
        ds_train = pandas.DataFrame(
            data=dict(description=descriptions, like=likes))

        ng = NGramFeaturizer(columns=['description'], output_tokens=True)
        we = WordEmbedding(columns='description_TransformedText',
                           model_kind='Sswe')

        dot_vis = dot_export_pipeline(Pipeline([ng, we]), ds_train)
        expected_node = ('ch1[label="<f0> description|<f1> '
                         'description_TransformedText"')
        assert expected_node in dot_vis
    def test_ngramfeaturizer(self):
        """Featurize the first 100 training rows of wiki_detox_train and
        compare the sum of all feature values with a recorded total."""
        np.random.seed(0)
        dataset_path = get_dataset('wiki_detox_train').as_filepath()
        train, label = get_X_y(dataset_path,
                               label_column='Sentiment',
                               sep='\t',
                               encoding="utf-8")
        # Only the training texts are used; the split is kept so the rows
        # come from the same shuffled partition as before.
        X_train, X_test, y_train, y_test = train_test_split(
            train['SentimentText'], label)

        # Turn the raw text into an n-gram feature matrix.
        texttransform = NGramFeaturizer(
            word_feature_extractor=n_gram(),
            vector_normalizer='None') << 'SentimentText'
        X_train = texttransform.fit_transform(X_train[:100])

        per_column = X_train.iloc[:].sum()
        total = per_column.sum()
        assert_equal(total, 30513, "sum of all features is incorrect!")
Beispiel #12
0
    def test_ngramfeaturizer_syntax_dict(self):
        """Featurize 'review' into a new 'outg' output via the ``<<`` dict
        syntax, then train and score a logistic regression on the result.

        The original 'review' column stays in the transformed frame and is
        dropped before training and before prediction.
        """
        train_texts = [
            "This is great", "I hate it", "Love it", "Do not like it",
            "Really like it", "I hate it", "I like it a lot",
            "I kind of hate it", "I do like it", "I really hate it",
            "It is very good", "I hate it a bunch", "I love it a bunch",
            "I hate it", "I like it very much", "I hate it very much.",
            "I really do love it", "I really do hate it", "Love it!",
            "Hate it!", "I love it", "I hate it", "I love it", "I hate it",
            "I love it",
        ]
        # 25 alternating flags, starting and ending with True.
        train_likes = [True, False] * 12 + [True]
        train_reviews = pandas.DataFrame(
            data=dict(review=train_texts, like=train_likes))

        test_texts = [
            "This is great", "I hate it", "Love it", "Really like it",
            "I hate it", "I like it a lot", "I love it", "I do like it",
            "I really hate it", "I love it",
        ]
        test_reviews = pandas.DataFrame(data=dict(review=test_texts))

        y = train_reviews['like']
        X = train_reviews.loc[:, train_reviews.columns != 'like']

        textt = NGramFeaturizer(
            word_feature_extractor=n_gram()) << {'outg': ['review']}
        X = textt.fit_transform(X)

        assert X.shape == (25, 117)
        # columns ordering changed between 0.22 and 0.23
        first_name, last_name = X.columns[0], X.columns[-1]
        assert first_name == 'review' or last_name == 'review'
        X = X.drop('review', axis=1)

        mymodel = LogisticRegressionBinaryClassifier().fit(X, y, verbose=0)
        X_test = textt.transform(test_reviews).drop('review', axis=1)
        scores = mymodel.predict(X_test)

        # One score per test review.
        assert scores.shape == (10, )
Beispiel #13
0
    def test_ngramfeaturizer(self):
        """Featurize the 'review' column in place, then train and score a
        logistic regression on the n-gram features.

        Checks the training feature shape, that the test set keeps its ten
        rows after the transform, and that one score is produced per test
        review.
        """
        train_reviews = pandas.DataFrame(data=dict(
            review=[
                "This is great", "I hate it", "Love it", "Do not like it",
                "Really like it", "I hate it", "I like it a lot",
                "I kind of hate it", "I do like it", "I really hate it",
                "It is very good", "I hate it a bunch", "I love it a bunch",
                "I hate it", "I like it very much", "I hate it very much.",
                "I really do love it", "I really do hate it", "Love it!",
                "Hate it!", "I love it", "I hate it", "I love it", "I hate it",
                "I love it"
            ],
            like=[
                True, False, True, False, True, False, True, False, True,
                False, True, False, True, False, True, False, True, False,
                True, False, True, False, True, False, True
            ]))

        test_reviews = pandas.DataFrame(data=dict(review=[
            "This is great", "I hate it", "Love it", "Really like it",
            "I hate it", "I like it a lot", "I love it", "I do like it",
            "I really hate it", "I love it"
        ]))

        y = train_reviews['like']
        X = train_reviews.loc[:, train_reviews.columns != 'like']

        textt = NGramFeaturizer(word_feature_extractor=n_gram()) << 'review'
        X = textt.fit_transform(X)

        assert X.shape == (25, 116)

        mymodel = LogisticRegressionBinaryClassifier().fit(X, y, verbose=0)
        # Transform the test set once and reuse it for prediction; the
        # previous version ran the same transform a second time.
        X_test = textt.transform(test_reviews)
        scores = mymodel.predict(X_test)

        # View the scores
        assert scores.shape == (10, )
        assert X_test.shape[0] == 10
 def test_column_list_or_string(self):
     """Check that a column can be named by string or by list (Bug 142794).

     The two dict-style ``<<`` constructions are only built, not fitted.
     ``columns=['SentimentText']`` and ``<< 'SentimentText'`` must give
     equal transformed frames.
     """
     data = pd.DataFrame({
         "Sentiment": [0, 1, 1, 0, 1, 1],
         "SentimentText": [
             "this is train ", "review ", "sentence ", "an apple",
             "sentence 22", "another one one one"
         ]
     })
     data['SentimentText'] = data['SentimentText'].astype(str)

     # Construction only: dict mapping to a plain string, then to a list.
     mapped_to_string = NGramFeaturizer(
         word_feature_extractor=n_gram()) << {"score": 'SentimentText'}
     mapped_to_list = NGramFeaturizer(
         word_feature_extractor=n_gram()) << {"score": ['SentimentText']}

     by_argument = NGramFeaturizer(word_feature_extractor=n_gram(),
                                   columns=['SentimentText'])
     res1 = by_argument.fit_transform(data)

     by_operator = NGramFeaturizer(
         word_feature_extractor=n_gram()) << 'SentimentText'
     res2 = by_operator.fit_transform(data)

     assert_frame_equal(res1, res2)
 def _test_sklearn_pipeline(self):
     """Run NGramFeaturizer, TruncatedSVD and LogisticRegression as named
     steps of one pipeline and check one prediction per training row.

     The leading underscore in the name presumably keeps the test runner
     from collecting this method -- TODO confirm.
     """
     train_reviews = pandas.DataFrame(data=dict(
         review=["This is great", "I hate it", "Love it", "Do not like it"],
         like=[True, False, True, False]))
     X = train_reviews.loc[:, train_reviews.columns != 'like']
     # The classifier is trained on integer labels rather than booleans.
     int_y = list(map(int, train_reviews['like']))

     steps = [
         ("featurizer", NGramFeaturizer(word_feature_extractor=n_gram())),
         ("svd", TruncatedSVD(random_state=1, n_components=5)),
         ("lr", sklearn.linear_model.LogisticRegression()),
     ]
     pipe1 = ppl(steps)
     pipe1.fit(X, int_y)

     pred = pipe1.predict(X)
     assert pred.shape == (4, )
Beispiel #16
0
    def test_ngramfeaturizer_multi(self):
        """Check that NGramFeaturizer rejects two output columns.

        Two outcomes are accepted: the constructor raises a TypeError whose
        text contains 'Only one output column is allowed', or construction
        succeeds and ``fit_transform`` raises a RuntimeError.

        Raises:
            AssertionError: if ``fit_transform`` completes without error.
        """
        path = get_dataset('infert').as_filepath()
        file_schema = 'sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 ' \
                      'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 ' \
                      'col=spontaneous:R4:6 quote+ header=+'
        data = FileDataStream(path, schema=file_schema)
        try:
            xf = NGramFeaturizer(word_feature_extractor=n_gram(),
                                 columns={
                                     'features': ['id'],
                                     'features2': ['education']
                                 })
        except TypeError as e:
            assert 'Only one output column is allowed' in str(e)
            return

        try:
            # System.InvalidCastException: 'Cannot cast
            # Newtonsoft.Json.Linq.JArray to Newtonsoft.Json.Linq.JToken.
            xf.fit_transform(data)
        except RuntimeError:
            pass
        else:
            # Raised explicitly: `assert False` is removed under
            # `python -O`, which would let an unexpected success pass.
            raise AssertionError(
                'fit_transform was expected to raise RuntimeError for two '
                'output columns')
Beispiel #17
0
    def test_automl_usecase(self):
        """Compare predictions of a learner pipeline run through the
        standard backend with the same pipeline exported to ONNX and run
        through the ORT data-frame tool.

        A featurization pipeline is fitted first and its model is embedded
        in the learner pipeline through ``DatasetTransformer``. The
        predicted label and both score columns must match between the two
        backends. Uses ``train_set`` and ``test_set`` defined outside this
        method.
        """
        # train featurization pipeline
        ngram_step = NGramFeaturizer(keep_diacritics=True,
                                     columns={'Features': ['SentimentText']})
        featurization_pipe = Pipeline([ngram_step])
        featurization_pipe.fit(train_set)

        # train learner pipeline
        frozen_featurizer = DatasetTransformer(featurization_pipe.model)
        classifier = OneVsRestClassifier(AveragedPerceptronBinaryClassifier(),
                                         feature=['Features'],
                                         label='Sentiment')
        learner_pipe = Pipeline([frozen_featurizer, classifier])
        learner_pipe.fit(train_set)

        # Export the learner pipeline to ONNX
        onnx_path = get_tmp_file('.onnx')
        learner_pipe.export_to_onnx(onnx_path,
                                    'com.microsoft.ml',
                                    onnx_version='Stable')

        # Perform the transform using the standard ML.Net backend
        start = time.time()
        result_standard = learner_pipe.predict(test_set)
        elapsed = round(time.time() - start, 3)
        print('%ss done transform using standard backend' % elapsed)

        # Perform the transform using the ORT backend
        df_tool = DFT(onnx_path)
        dataset = test_set.to_df()
        start = time.time()
        result_ort = df_tool.execute(
            dataset, ['PredictedLabel.output', 'Score.output'])
        elapsed = round(time.time() - start, 3)
        print('%ss done transform using ORT backend (excludes df load time)'
              % elapsed)

        # compare the results
        # NOTE(review): `check_less_precise` is deprecated in newer pandas
        # releases in favour of rtol/atol -- confirm the pandas version this
        # test targets before changing it.
        check_kwargs = {
            'check_names': False,
            'check_exact': False,
            'check_dtype': True,
            'check_less_precise': True
        }
        column_pairs = (
            ('PredictedLabel', 'PredictedLabel.output'),
            ('Score.0', 'Score.output.0'),
            ('Score.1', 'Score.output.1'),
        )
        for expected_name, ort_name in column_pairs:
            col_expected = result_standard.loc[:, expected_name]
            col_ort = result_ort.loc[:, ort_name]
            pd.testing.assert_series_equal(col_expected, col_ort,
                                           **check_kwargs)
Beispiel #18
0
    def test_word_embedding_example2(self):
        """Run NGramFeaturizer (with token output) followed by WordEmbedding
        on the infert data set and check the shape and one column name."""
        file_schema = ('sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 '
                       'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 '
                       'col=spontaneous:R4:6 header=+')
        path = get_dataset('infert').as_filepath()
        data = FileDataStream(path, schema=file_schema)

        ngram_step = NGramFeaturizer(
            word_feature_extractor=Ngram(),
            output_tokens=True,
            columns={'features': ['id', 'education']})
        embedding_step = WordEmbedding(columns='features_TransformedText')
        pipeline = Pipeline([ngram_step, embedding_step])

        features = pipeline.fit_transform(data)
        assert features.shape == (248, 802)
        column_names = list(features.columns)
        assert 'features_TransformedText.94' in column_names
Beispiel #19
0
def train_data_type_ppl(fit_X_type=None, fit_Y_type=None, predict_X_type=None):
    """Fit, test and predict with an NGramFeaturizer + LightGbmClassifier
    pipeline on four fixed sentences.

    The sentences and labels are converted with ``transform_data`` to
    ``fit_X_type`` / ``fit_Y_type`` for ``fit`` and ``test``; the sentences
    are converted to ``predict_X_type`` for ``predict``.

    Returns a tuple ``(predictions, scores, metrics)``.
    """
    sentences = [
        "This is sentence 1", "Talk about second", "Thrid one",
        "Final example."
    ]
    targets = [1, 0, 1, 1]
    steps = [
        NGramFeaturizer(),
        LightGbmClassifier(min_data_per_leaf=1, n_thread=1)
    ]
    model = Pipeline(steps)

    fit_X = transform_data(sentences, fit_X_type)
    fit_y = transform_data(targets, fit_Y_type)
    model.fit(fit_X, fit_y)
    metrics, scores = model.test(fit_X, fit_y, output_scores=True)

    predict_X = transform_data(sentences, predict_X_type)
    predictions = model.predict(predict_X)
    return predictions, scores, metrics
    def test_pipeline_name_error(self):
        """Constructing LightGbmClassifier with unknown parameter names
        must raise a NameError."""
        trainData = pd.DataFrame({
            "Sentiment": [0, 1, 1, 0, 1, 1],
            "SentimentText": [
                "this is train ", "review ", "sentence ", "an apple",
                "sentence 22", "another one one one"
            ]
        })
        # The featurizer is fitted first; its output is not used afterwards.
        text_only = trainData[["SentimentText"]]
        NGramFeaturizer(word_feature_extractor=n_gram()).fit_transform(
            text_only)

        msg = ("Parameters ['NumLeaves', 'min_data', 'min_data_in_bin', "
               "'minsplit'] are not allowed")
        constructor_kwargs = dict(min_data=1,
                                  min_data_in_bin=1,
                                  min_data_per_leaf=1,
                                  minsplit=1,
                                  NumLeaves=2)
        # NOTE(review): if `self` is a unittest.TestCase, `msg=` only sets
        # the text reported when no NameError is raised; it is not compared
        # with the exception message. Confirm whether a message match
        # (assertRaisesRegex) was intended.
        with self.assertRaises(NameError, msg=msg):
            LightGbmClassifier(**constructor_kwargs)
Beispiel #21
0
    def test_word_embedding_example_dict_newname(self):
        """Run WordEmbedding with a dict mapping that writes its result to
        a new column name and check the shape of the final frame."""
        schema_tokens = [
            'sep=,',
            'col=id:TX:0', 'col=education:TX:1', 'col=age:R4:2',
            'col=parity:R4:3', 'col=induced:R4:4', 'col=case:R4:5',
            'col=spontaneous:R4:6',
            'quote+', 'header=+',
        ]
        path = get_dataset('infert').as_filepath()
        data = FileDataStream(path, schema=' '.join(schema_tokens))

        ngram_step = NGramFeaturizer(
            word_feature_extractor=Ngram(),
            output_tokens_column_name='features_TransformedText',
            columns={'features': ['id', 'education']})

        # What is features_TransformedText?
        embedding_step = WordEmbedding(
            columns={'features_TransformedText2': 'features_TransformedText'})

        pipeline = Pipeline([ngram_step, embedding_step])
        features = pipeline.fit_transform(data)
        assert features.shape == (248, 409)
Beispiel #22
0
    def test_LightLda(self):
        """Run LightLda with three topics on n-gram features of eight short
        documents and compare the sum of the output with a recorded value."""
        documents = [
            "animals birds cats dogs fish horse",
            "horse birds house fish duck cats",
            "car truck driver bus pickup",
            "car truck driver bus pickup horse ",
            "car truck",
            "bus pickup",
            "space galaxy universe radiation",
            "radiation galaxy universe duck",
        ]
        topics = pandas.DataFrame(data=dict(review=documents))

        featurizer = NGramFeaturizer(word_feature_extractor=n_gram(),
                                     vector_normalizer='None') << 'review'
        pipeline = Pipeline([featurizer, LightLda(num_topic=3)])
        y = pipeline.fit_transform(topics)

        expected_total = 7.000000044
        assert_almost_equal(
            y.sum().sum(),
            expected_total,
            decimal=8,
            err_msg="Sum should be %s" % expected_total)
 def test_transform_only_pipeline_transform_method(self):
     """Fit a pipeline that contains only an NGramFeaturizer and check
     that ``transform`` yields the expected word column.

     Uses ``X`` defined outside this method.
     """
     featurizer = NGramFeaturizer(
         char_feature_extractor=None) << 'SentimentText'
     p = Pipeline([featurizer])
     p.fit(X)
     transformed = p.transform(X)
     assert 'SentimentText.==rude==' in transformed.columns
Beispiel #24
0
# Read the tab-separated wiki_detox_train file as a FileDataStream.
path = get_dataset('wiki_detox_train').as_filepath()
data = FileDataStream.read_csv(path, sep='\t')
print(data.head())
#   Sentiment                                      SentimentText
# 0          1  ==RUDE== Dude, you are rude upload that carl p...
# 1          1  == OK! ==  IM GOING TO VANDALIZE WILD ONES WIK...
# 2          1  Stop trolling, zapatancas, calling me a liar m...
# 3          1  ==You're cool==  You seem like a really cool g...
# 4          1  ::::: Why are you threatening me? I'm not bein...

# Two steps: n-gram features go to 'ngram', and the embedding step reads
# the 'ngram_TransformedText' column.
ngram_step = NGramFeaturizer(word_feature_extractor=Ngram(),
                             output_tokens=True,
                             columns={'ngram': ['SentimentText']})
embedding_step = WordEmbedding(columns='ngram_TransformedText')
pipeline = Pipeline([ngram_step, embedding_step])

# Fit both steps and transform the data in one call.
features = pipeline.fit_transform(data)

# Show the first rows of the result.
print(features.head())
#   Sentiment  ...       ngram.douchiest  ngram.award.
# 0          1 ...                   0.0           0.0
# 1          1 ...                   0.0           0.0
# 2          1 ...                   0.0           0.0
# 3          1 ...                   0.0           0.0
# 4          1 ...                   0.0           0.0
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.text import NGramFeaturizer
from nimbusml.feature_extraction.text.extractor import Ngram
from nimbusml.naive_bayes import NaiveBayesClassifier
from nimbusml.utils import get_X_y
from sklearn.model_selection import train_test_split

# Build train and test sets from the 'wiki_detox_train' data set.
# The file has two tab-separated columns, Sentiment and SentimentText, e.g.
#   1   ==RUDE== Dude, you are rude upload that carl picture back, or else.
#   1   == OK! ==  IM GOING TO VANDALIZE WILD ONES WIKI THEN!!!
np.random.seed(0)
train_file = get_dataset("wiki_detox_train").as_filepath()
train, label = get_X_y(train_file, label_column='Sentiment', sep='\t')

X_train, X_test, y_train, y_test = train_test_split(train, label)

# The featurizer rewrites 'SentimentText' as n-gram counts, which the
# classifier then uses as its feature column.
texttransform = NGramFeaturizer(
    word_feature_extractor=Ngram(),
    vector_normalizer='None') << 'SentimentText'
nb = NaiveBayesClassifier(feature=['SentimentText'])

ppl = Pipeline([texttransform, nb])
ppl.fit(X_train, y_train)

predictions = ppl.predict(X_test)
scores = predictions['PredictedLabel']

# Report the share of test rows whose predicted label matches.
print('Accuracy:', np.mean(y_test == list(scores)))
Beispiel #26
0
import pandas
from nimbusml import Pipeline
from nimbusml.feature_extraction.text import NGramFeaturizer, WordEmbedding
from nimbusml.feature_extraction.text.extractor import Ngram

# Seven short customer reviews in a single 'review' column.
review_texts = [
    "I really did not like the taste of it", "It was surprisingly quite good!",
    "I will never ever ever go to that place again!!",
    "The best ever!! It was amazingly good and super fast",
    "I wish I had gone earlier, it was that great",
    "somewhat dissapointing. I'd probably wont try again",
    "Never visit again... rascals!"
]
customer_reviews = pandas.DataFrame(data=dict(review=review_texts))

# The featurizer names its token output 'review_TransformedText', which is
# the column the embedding step is applied to.
ngram_step = NGramFeaturizer(
    word_feature_extractor=Ngram(),
    output_tokens_column_name='review_TransformedText')
embedding_step = WordEmbedding() << 'review_TransformedText'
pipeline = Pipeline([ngram_step, embedding_step])
y = pipeline.fit_transform(customer_reviews)

# Show the last three embedding columns for the first five reviews.
print(y.iloc[:5, -3:])
#    review_TransformedText.147  review_TransformedText.148  review_TransformedText.149
# 0                    1.918661                   -0.714531                    3.062141
# 1                    1.891922                   -0.248650                    1.706620
# 2                    1.601611                    0.309785                    3.379576
# 3                    1.970666                    1.477450                    3.110802
# 4                    2.521791                    0.122538                    3.129919
INSTANCES = {
    'LightGbmBinaryClassifier':
    LightGbmBinaryClassifier(minimum_example_count_per_group=1,
                             minimum_example_count_per_leaf=1),
    'LightGbmClassifier':
    LightGbmClassifier(minimum_example_count_per_group=1,
                       minimum_example_count_per_leaf=1),
    'LightGbmRegressor':
    LightGbmRegressor(minimum_example_count_per_group=1,
                      minimum_example_count_per_leaf=1),
    'LightGbmRanker':
    LightGbmRanker(minimum_example_count_per_group=1,
                   minimum_example_count_per_leaf=1),
    'NGramFeaturizer':
    NGramFeaturizer(word_feature_extractor=n_gram()),
    'SkipFilter':
    SkipFilter(count=5),
    'TakeFilter':
    TakeFilter(count=100000),
    'IidSpikeDetector':
    IidSpikeDetector(columns=['F0']),
    'IidChangePointDetector':
    IidChangePointDetector(columns=['F0']),
    'SsaSpikeDetector':
    SsaSpikeDetector(columns=['F0'], seasonal_window_size=2),
    'SsaChangePointDetector':
    SsaChangePointDetector(columns=['F0'], seasonal_window_size=2),
    'SsaForecaster':
    SsaForecaster(columns=['F0'],
                  window_size=2,
Beispiel #28
0
# Get schema from a fitted pipeline example.
import numpy as np
import pandas as pd
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.text import NGramFeaturizer
from nimbusml.feature_extraction.text.extractor import Ngram

# Read the tab-separated wiki_detox_train file as a FileDataStream.
path = get_dataset("wiki_detox_train").as_filepath()
data = FileDataStream.read_csv(path, sep='\t')
print(data.head())
#    Sentiment                                      SentimentText
# 0          1  ==RUDE== Dude, you are rude upload that carl p...
# 1          1  == OK! ==  IM GOING TO VANDALIZE WILD ONES WIK...
# 2          1  Stop trolling, zapatancas, calling me a liar m...
# 3          1  ==You're cool==  You seem like a really cool g...
# 4          1  ::::: Why are you threatening me? I'm not bein...

# Single-step pipeline: featurize 'SentimentText' into 'features'.
featurizer = NGramFeaturizer(word_feature_extractor=Ngram(),
                             columns={'features': ['SentimentText']})
pipe = Pipeline([featurizer])
pipe.fit(data)

# Ask the fitted pipeline for its output column names.
schema = pipe.get_output_columns()

print(schema[:5])
# ['Sentiment', 'SentimentText', 'features.Char.<␂>|=|=', 'features.Char.=|=|r', 'features.Char.=|r|u']
Beispiel #29
0
# Read the tab-separated wiki_detox_train file as a FileDataStream.
path = get_dataset('wiki_detox_train').as_filepath()
data = FileDataStream.read_csv(path, sep='\t')
print(data.head())
#   Sentiment                                      SentimentText
# 0          1  ==RUDE== Dude, you are rude upload that carl p...
# 1          1  == OK! ==  IM GOING TO VANDALIZE WILD ONES WIK...
# 2          1  Stop trolling, zapatancas, calling me a liar m...
# 3          1  ==You're cool==  You seem like a really cool g...
# 4          1  ::::: Why are you threatening me? I'm not bein...

# Tokens handed to the custom stop-words remover.
custom_stop_words = ["!", "$", "%", "&", "'", "'d"]
xf = NGramFeaturizer(
    word_feature_extractor=Ngram(),
    stop_words_remover=CustomStopWordsRemover(custom_stop_words),
    columns={'features': ['SentimentText']})

# Fit the featurizer and transform the data in one call.
features = xf.fit_transform(data)

# Show the first rows of the result.
print(features.head())

#   Sentiment   ...         features.douchiest  features.award.
# 0          1  ...                        0.0              0.0
# 1          1  ...                        0.0              0.0
# 2          1  ...                        0.0              0.0
# 3          1  ...                        0.0              0.0
Beispiel #30
0
         'concated_columns': [
             'Petal_Length',
             'Sepal_Width',
             'Sepal_Length']},
     LpScaler(columns={'normed_columns': 'concated_columns'})
 ]),
 'MutualInformationSelector': Pipeline([
     ColumnConcatenator(columns={'Features': ['Sepal_Width', 'Sepal_Length', 'Petal_Width']}),
     MutualInformationSelector(
         columns='Features',
         label='Label',
         slots_in_output=2)  # only accept one column
 ]),
 'NaiveBayesClassifier': NaiveBayesClassifier(feature=['Sepal_Width', 'Sepal_Length']),
 'NGramFeaturizer': NGramFeaturizer(word_feature_extractor=Ngram(),
                                    char_feature_extractor=Ngram(),
                                    keep_diacritics=True,
                                    columns={ 'features': ['SentimentText']}),
 'OneHotHashVectorizer': OneHotHashVectorizer(columns=['education_str']),
 'OneHotVectorizer': OneHotVectorizer(columns=['education_str']),
 'OneVsRestClassifier(AveragedPerceptronBinaryClassifier)': \
     OneVsRestClassifier(AveragedPerceptronBinaryClassifier(),
                         use_probabilities=True,
                         feature=['age',
                                  'education_str.0-5yrs',
                                  'education_str.6-11yrs',
                                  'education_str.12+ yrs'],
                         label='induced'),
 'OneVsRestClassifier(LinearSvmBinaryClassifier)': \
     OneVsRestClassifier(LinearSvmBinaryClassifier(),
                         use_probabilities=True,
                         feature=['age',