コード例 #1
0
ファイル: test_syntax.py プロジェクト: zyw400/NimbusML-1
    def test_syntax9_slots_label(self):
        """Chain NGramFeaturizer and MutualInformationSelector on labelled reviews.

        The pipeline is fitted twice: once with the selector's default
        settings and once with ``slots_in_output=2``.  Each run only checks
        that ``fit_transform`` returns a result.
        """
        review_texts = [
            "This is great", "I hate it", "Love it", "Do not like it",
            "Really like it", "I hate it", "I like it a lot",
            "I kind of hate it", "I do like it", "I really hate it",
            "It is very good", "I hate it a bunch", "I love it a bunch",
            "I hate it", "I like it very much", "I hate it very much.",
            "I really do love it", "I really do hate it", "Love it!",
            "Hate it!", "I love it", "I hate it", "I love it", "I hate it",
            "I love it",
        ]
        # Labels alternate True/False, starting with True, one per review.
        liked = [position % 2 == 0 for position in range(len(review_texts))]
        train_reviews = pandas.DataFrame(
            data={'review': review_texts, 'like': liked})

        features = train_reviews.loc[:, train_reviews.columns != 'like']
        label = train_reviews[['like']]

        # Selector with its default settings.
        default_selection = Pipeline([
            NGramFeaturizer(word_feature_extractor=n_gram()),
            MutualInformationSelector(),
        ])
        assert default_selection.fit_transform(features, label) is not None

        # Scikit compatibility (Compose transforms inside Scikit Pipeline).
        # In this scenario, we do not provide {input, output} arguments
        two_slot_selection = Pipeline([
            NGramFeaturizer(word_feature_extractor=n_gram()),
            MutualInformationSelector(slots_in_output=2),
        ])
        assert two_slot_selection.fit_transform(features, label) is not None
コード例 #2
0
    def test_lightgbmclassifier(self):
        """Train LightGbmClassifier on n-gram features of wiki_detox_train.

        The text column is featurized (capped at 5000 slots), the classifier
        is fitted on the training split and the accuracy on the held-out
        split must exceed 0.58.
        """
        np.random.seed(0)
        dataset_path = get_dataset('wiki_detox_train').as_filepath()
        train, label = get_X_y(dataset_path,
                               label_column='Sentiment',
                               sep='\t',
                               encoding="utf-8")
        X_train, X_test, y_train, y_test = train_test_split(
            train['SentimentText'], label)

        # map text reviews to vector space
        featurizer = NGramFeaturizer(word_feature_extractor=n_gram(),
                                     vector_normalizer='None')
        featurizer = featurizer << 'SentimentText'
        X_train = featurizer.fit_transform(X_train, max_slots=5000)
        X_test = featurizer.transform(X_test, max_slots=5000)

        classifier = LightGbmClassifier().fit(X_train, y_train, verbose=0)
        predictions = classifier.predict(X_test)

        expected_labels = y_test.values.ravel()
        accuracy = np.mean(expected_labels == predictions.values)
        threshold = 0.58
        assert_greater(accuracy, threshold,
                       "accuracy should be greater than %s" % threshold)
コード例 #3
0
    def test_pipeline_with_no_columns(self):
        """Fit a featurizer + LightGBM pipeline without explicit column roles.

        Checks the structure reported by ``get_fit_info`` and that fitting
        works with the label given either as a pandas Series or as a numpy
        array.
        """
        train_df = pd.DataFrame({
            "Sentiment": [0, 1, 1, 0, 1, 1],
            "SentimentText": [
                "this is train ", "review ", "sentence ", "an apple",
                "sentence 22", "another one one one"
            ]
        })

        def build_pipeline():
            # Fresh, unfitted pipeline; the tiny dataset needs the minimum
            # leaf/group sizes lowered to 1.
            return Pipeline([
                NGramFeaturizer(word_feature_extractor=n_gram()),
                LightGbmClassifier(min_data_per_leaf=1, min_data_per_group=1)
            ])

        pipeline = build_pipeline()
        assert pipeline is not None

        # Bug 147697
        fit_info = pipeline.get_fit_info(train_df[["SentimentText"]],
                                         train_df["Sentiment"])
        assert len(fit_info) == 2
        assert len(fit_info[0]) == 3
        pipeline.fit(train_df[["SentimentText"]], train_df["Sentiment"])

        # Same fit, but with the label supplied as a numpy array.
        pipeline = build_pipeline()
        assert pipeline is not None
        pipeline.fit(train_df[["SentimentText"]],
                     np.array(train_df["Sentiment"]))
コード例 #4
0
    def test_naivebayesclassifier(self):
        """Train NaiveBayesClassifier on n-gram features of wiki_detox_train.

        The text column is featurized with NGramFeaturizer, the classifier is
        fitted on the training split and the accuracy on the held-out split
        must exceed 0.5.
        """
        np.random.seed(0)
        train_file = get_dataset("wiki_detox_train").as_filepath()
        (train, label) = get_X_y(train_file, label_column='Sentiment',
                                 sep='\t')
        X_train, X_test, y_train, y_test = train_test_split(
            train['SentimentText'], label)

        # map text reviews to vector space
        texttransform = NGramFeaturizer(
            word_feature_extractor=n_gram(),
            vector_normalizer='None') << 'SentimentText'
        X_train = texttransform.fit_transform(X_train)
        X_test = texttransform.transform(X_test)

        mymodel = NaiveBayesClassifier()
        mymodel.fit(X_train, y_train)

        scores = mymodel.predict(X_test)
        # Compare flat arrays element-wise.  Comparing the label DataFrame
        # with a Python list and then indexing the resulting Series with [0]
        # depends on pandas' version-specific broadcasting and positional
        # fallback; this form matches the LightGBM test's accuracy
        # computation.
        accuracy = np.mean(
            y_test.values.ravel() == np.asarray(scores).ravel())
        assert_greater(
            accuracy,
            0.5,
            "accuracy should be greater than %s" %
            0.5)
コード例 #5
0
ファイル: test_text.py プロジェクト: zyw400/NimbusML-1
def train_data_type_single(fit_X_type="dataframe",
                           fit_Y_type=None,
                           predict_X_type=None):
    """Fit and apply an NGramFeaturizer on data converted to given types.

    The same four sentences are converted with ``transform_data`` to
    ``fit_X_type`` for fitting and to ``predict_X_type`` for transforming.
    ``fit_Y_type`` is accepted but not used by this helper.

    Returns the output of ``NGramFeaturizer.transform``.
    """
    sentences = [
        "This is sentence 1", "Talk about second", "Thrid one",
        "Final example."
    ]
    featurizer = NGramFeaturizer()
    featurizer.fit(transform_data(sentences, fit_X_type))
    return featurizer.transform(transform_data(sentences, predict_X_type))
コード例 #6
0
    def test_ngramfeaturizer_single(self):
        """Featurize two text columns of 'infert' into one output column.

        The dataset is read through a FileDataStream with an explicit schema
        and the shape of the featurized result is checked.
        """
        schema = (
            'sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 '
            'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 '
            'col=spontaneous:R4:6 quote+ header=+'
        )
        infert_path = get_dataset('infert').as_filepath()
        stream = FileDataStream(infert_path, schema=schema)

        featurizer = NGramFeaturizer(
            word_feature_extractor=n_gram(),
            columns={'features': ['id', 'education']})

        assert featurizer.fit_transform(stream).shape == (248, 652)
コード例 #7
0
    def test_syntax9_multiple_inputs(self):
        """Map two input columns onto one NGramFeaturizer output column."""
        frame = pandas.DataFrame({
            'education1': ['A', 'B', 'A', 'B', 'A'],
            'education2': ['c', 'd', 'c', 'd', 'c'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1, 0, 1, 0, 0],
        })
        inputs = frame.drop('y', axis=1)

        # The dict on the right of << maps the output column name to its
        # source columns.
        featurizer = NGramFeaturizer(word_feature_extractor=n_gram())
        featurizer = featurizer << {'out1': ['education1', 'education2']}

        assert featurizer.fit_transform(inputs).shape == (5, 13)
コード例 #8
0
    def test_pipeline_with_no_columns_raise(self):
        """Fitting default LightGBM settings on six rows must raise.

        ``get_fit_info`` is still expected to describe the pipeline; only the
        actual ``fit`` call is expected to fail with RuntimeError.
        """
        train_df = pd.DataFrame({
            "Sentiment": [0, 1, 1, 0, 1, 1],
            "SentimentText": [
                "this is train ", "review ", "sentence ", "an apple",
                "sentence 22", "another one one one"
            ]
        })

        featurizer = NGramFeaturizer(word_feature_extractor=n_gram())
        pipeline = Pipeline([featurizer, LightGbmClassifier()])
        assert pipeline is not None

        # Bug 147697
        fit_info = pipeline.get_fit_info(train_df[["SentimentText"]],
                                         train_df["Sentiment"])
        assert len(fit_info) == 2
        assert len(fit_info[0]) == 3

        # Message
        # System.InvalidOperationException:
        # 'LightGBM Error, code is -1, error message is
        # 'Cannot construct Dataset since there are not useful features.
        # It should be at least two unique rows.
        # If the num_row (num_data) is small,
        # you can set min_data=1 and min_data_in_bin=1 to fix this.
        # Otherwise please make sure you are using the right dataset.'
        with self.assertRaises(RuntimeError):
            pipeline.fit(train_df[["SentimentText"]], train_df["Sentiment"])
コード例 #9
0
    def test_NGramFeaturizer_glove(self):
        """Grid-search a text pipeline ending in FastLinearBinaryClassifier.

        The pipeline featurizes ``review`` with NGramFeaturizer, feeds the
        ``review_TransformedText`` column to a 'GloVe50D' WordEmbedding and
        trains a linear classifier on both columns.  GridSearchCV tries three
        values of ``maximum_number_of_iterations`` and the test expects 100
        to be selected.
        """
        # Grid search over the classifier's maximum_number_of_iterations
        # (pipeline step 'lr'), then check which value was selected.
        np.random.seed(0)
        data = pd.DataFrame({
            'review': [
                'I like this movie', 'I don\'t like this', 'It is nice',
                'I like this movie', 'I don\'t like this', 'It is nice',
                'So boring'
            ],
            'sentiment': ['pos', 'neg', 'pos', 'pos', 'neg', 'pos', 'neg']
        })
        pipeline = Pipeline([
            ('ng',
             NGramFeaturizer(
                 word_feature_extractor=Ngram(),
                 output_tokens_column_name='review_TransformedText',
                 columns='review')),
            WordEmbedding(columns='review_TransformedText',
                          model_kind='GloVe50D'),
            ('lr',
             FastLinearBinaryClassifier(
                 feature=['review', 'review_TransformedText'],
                 number_of_threads=1,
                 shuffle=False))
        ])

        # 'lr__' addresses the parameter of the step named 'lr'.
        param_grid = dict(lr__maximum_number_of_iterations=[1, 100, 20])
        grid = GridSearchCV(pipeline, param_grid)

        # The label is turned into 0/1 integers: 1 for 'pos', 0 otherwise.
        grid.fit(data['review'], 1 * (data['sentiment'] == 'pos'))
        assert grid.best_params_['lr__maximum_number_of_iterations'] == 100
コード例 #10
0
ファイル: test_exports.py プロジェクト: zyw400/NimbusML-1
    def test_word_embedding(self):
        """Export an NGramFeaturizer + WordEmbedding pipeline to DOT.

        The exported graph text must contain a node listing both the
        ``description`` and ``description_TransformedText`` columns.
        """
        descriptions = [
            "This is great", "I hate it", "Love it", "Do not like it",
            "Really like it", "I hate it", "I like it a lot",
            "I kind of hate it", "I do like it", "I really hate it",
            "It is very good", "I hate it a bunch", "I love it a bunch",
            "I hate it", "I like it very much", "I hate it very much.",
            "I really do love it", "I really do hate it", "Love it!",
            "Hate it!", "I love it", "I hate it", "I love it", "I hate it",
            "I love it",
        ]
        # Labels alternate True/False, starting with True.
        liked = [position % 2 == 0 for position in range(len(descriptions))]
        ds_train = pandas.DataFrame(
            data={'description': descriptions, 'like': liked})

        featurizer = NGramFeaturizer(columns=['description'],
                                     output_tokens=True)
        embedding = WordEmbedding(columns='description_TransformedText',
                                  model_kind='Sswe')

        dot_vis = dot_export_pipeline(Pipeline([featurizer, embedding]),
                                      ds_train)
        expected_node = ('ch1[label="<f0> description|<f1> '
                         'description_TransformedText"')
        assert expected_node in dot_vis
コード例 #11
0
    def test_ngramfeaturizer(self):
        """Featurize the first 100 training texts and check the feature total.

        With a fixed random seed, the sum over all produced feature values is
        compared with a known constant.
        """
        np.random.seed(0)
        dataset_path = get_dataset('wiki_detox_train').as_filepath()
        train, label = get_X_y(dataset_path,
                               label_column='Sentiment',
                               sep='\t',
                               encoding="utf-8")
        X_train, X_test, y_train, y_test = train_test_split(
            train['SentimentText'], label)

        # map text reviews to vector space
        featurizer = NGramFeaturizer(word_feature_extractor=n_gram(),
                                     vector_normalizer='None')
        featurizer = featurizer << 'SentimentText'
        X_train = featurizer.fit_transform(X_train[:100])

        # Column sums first, then the sum of those sums.
        feature_total = X_train.sum().sum()
        assert_equal(feature_total, 30513,
                     "sum of all features is incorrect!")
コード例 #12
0
    def test_ngramfeaturizer_syntax_dict(self):
        """Use the dict form of ``<<`` to name the featurizer's output column.

        After featurizing into 'outg', the original 'review' column is
        dropped, a logistic regression is trained on the rest and predictions
        for ten test reviews are checked for shape.
        """
        review_texts = [
            "This is great", "I hate it", "Love it", "Do not like it",
            "Really like it", "I hate it", "I like it a lot",
            "I kind of hate it", "I do like it", "I really hate it",
            "It is very good", "I hate it a bunch", "I love it a bunch",
            "I hate it", "I like it very much", "I hate it very much.",
            "I really do love it", "I really do hate it", "Love it!",
            "Hate it!", "I love it", "I hate it", "I love it", "I hate it",
            "I love it",
        ]
        # Labels alternate True/False, starting with True.
        liked = [position % 2 == 0 for position in range(len(review_texts))]
        train_reviews = pandas.DataFrame(
            data={'review': review_texts, 'like': liked})

        test_reviews = pandas.DataFrame(data={'review': [
            "This is great", "I hate it", "Love it", "Really like it",
            "I hate it", "I like it a lot", "I love it", "I do like it",
            "I really hate it", "I love it",
        ]})

        label = train_reviews['like']
        features = train_reviews.loc[:, train_reviews.columns != 'like']

        featurizer = NGramFeaturizer(word_feature_extractor=n_gram())
        featurizer = featurizer << {'outg': ['review']}
        features = featurizer.fit_transform(features)

        assert features.shape == (25, 117)
        # columns ordering changed between 0.22 and 0.23
        assert 'review' in (features.columns[0], features.columns[-1])
        features = features.drop('review', axis=1)

        classifier = LogisticRegressionBinaryClassifier().fit(
            features, label, verbose=0)
        test_features = featurizer.transform(test_reviews)
        test_features = test_features.drop('review', axis=1)
        predictions = classifier.predict(test_features)

        # View the scores
        assert predictions.shape == (10, )
コード例 #13
0
    def test_ngramfeaturizer(self):
        """Featurize reviews in place and train a classifier on the result.

        The 'review' column is featurized with NGramFeaturizer, a logistic
        regression is trained on the features and predictions for ten test
        reviews are checked for shape.
        """
        train_reviews = pandas.DataFrame(data=dict(
            review=[
                "This is great", "I hate it", "Love it", "Do not like it",
                "Really like it", "I hate it", "I like it a lot",
                "I kind of hate it", "I do like it", "I really hate it",
                "It is very good", "I hate it a bunch", "I love it a bunch",
                "I hate it", "I like it very much", "I hate it very much.",
                "I really do love it", "I really do hate it", "Love it!",
                "Hate it!", "I love it", "I hate it", "I love it", "I hate it",
                "I love it"
            ],
            like=[
                True, False, True, False, True, False, True, False, True,
                False, True, False, True, False, True, False, True, False,
                True, False, True, False, True, False, True
            ]))

        test_reviews = pandas.DataFrame(data=dict(review=[
            "This is great", "I hate it", "Love it", "Really like it",
            "I hate it", "I like it a lot", "I love it", "I do like it",
            "I really hate it", "I love it"
        ]))

        y = train_reviews['like']
        X = train_reviews.loc[:, train_reviews.columns != 'like']

        textt = NGramFeaturizer(word_feature_extractor=n_gram()) << 'review'
        X = textt.fit_transform(X)

        assert X.shape == (25, 116)

        mymodel = LogisticRegressionBinaryClassifier().fit(X, y, verbose=0)
        # Transform the test set once and reuse it for prediction instead of
        # running the featurizer over the same data a second time.
        X_test = textt.transform(test_reviews)
        scores = mymodel.predict(X_test)

        # View the scores
        assert scores.shape == (10, )
        assert X_test.shape[0] == 10
コード例 #14
0
 def test_column_list_or_string(self):
     """Column specs given as a string or as a list must be interchangeable.

     Construction with a dict mapping is exercised for both forms, then the
     ``columns=[...]`` keyword and the ``<< 'name'`` operator are compared on
     their transformed output.
     """
     # Bug 142794
     frame = pd.DataFrame({
         "Sentiment": [0, 1, 1, 0, 1, 1],
         "SentimentText": [
             "this is train ", "review ", "sentence ", "an apple",
             "sentence 22", "another one one one"
         ]
     })
     frame['SentimentText'] = frame['SentimentText'].astype(str)

     # Only construction is exercised here; the objects are discarded.
     for source in ('SentimentText', ['SentimentText']):
         _ = NGramFeaturizer(word_feature_extractor=n_gram()) << {
             "score": source
         }

     with_keyword = NGramFeaturizer(word_feature_extractor=n_gram(),
                                    columns=['SentimentText'])
     keyword_result = with_keyword.fit_transform(frame)

     with_operator = NGramFeaturizer(word_feature_extractor=n_gram())
     with_operator = with_operator << 'SentimentText'
     operator_result = with_operator.fit_transform(frame)

     assert_frame_equal(keyword_result, operator_result)
コード例 #15
0
 def _test_sklearn_pipeline(self):
     """Run NGramFeaturizer as the first step of a pipeline built with ``ppl``.

     The featurizer is followed by TruncatedSVD and a scikit-learn
     LogisticRegression; the prediction for the four training rows is checked
     for shape.
     """
     train_reviews = pandas.DataFrame(data={
         'review': ["This is great", "I hate it", "Love it",
                    "Do not like it"],
         'like': [True, False, True, False],
     })
     int_y = [int(flag) for flag in train_reviews['like']]
     X = train_reviews.loc[:, train_reviews.columns != 'like']

     steps = [
         ("featurizer", NGramFeaturizer(word_feature_extractor=n_gram())),
         ("svd", TruncatedSVD(random_state=1, n_components=5)),
         ("lr", sklearn.linear_model.LogisticRegression()),
     ]
     sk_pipeline = ppl(steps)
     sk_pipeline.fit(X, int_y)

     assert sk_pipeline.predict(X).shape == (4, )
コード例 #16
0
    def test_ngramfeaturizer_multi(self):
        """NGramFeaturizer must reject a mapping with two output columns.

        The rejection may surface at construction (TypeError whose text
        mentions the single-output restriction) or, if construction succeeds,
        at ``fit_transform`` (RuntimeError).  Both paths are accepted.
        """

        path = get_dataset('infert').as_filepath()
        file_schema = 'sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 ' \
                      'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 ' \
                      'col=spontaneous:R4:6 quote+ header=+'
        data = FileDataStream(path, schema=file_schema)
        try:
            xf = NGramFeaturizer(word_feature_extractor=n_gram(),
                                 columns={
                                     'features': ['id'],
                                     'features2': ['education']
                                 })
        except TypeError as e:
            # Construction-time rejection: check the message and stop here.
            assert 'Only one output column is allowed' in str(e)
            return

        try:
            # System.InvalidCastException: 'Cannot cast
            # Newtonsoft.Json.Linq.JArray to Newtonsoft.Json.Linq.JToken.
            xf.fit_transform(data)
            # Reaching this line means no error was raised.  The resulting
            # AssertionError is not a RuntimeError, so it is not swallowed
            # by the handler below and fails the test.
            assert False
        except RuntimeError:
            pass
コード例 #17
0
    def test_automl_usecase(self):
        """Compare predictions of a fitted pipeline with its ONNX export.

        A featurization pipeline is fitted and wrapped in a learner pipeline,
        which is exported to ONNX.  Predictions from ``learner_pipe.predict``
        and from the ``DFT`` tool on the exported file are compared column by
        column.
        """
        # train featurization pipeline
        featurizer = NGramFeaturizer(keep_diacritics=True,
                                     columns={'Features': ['SentimentText']})
        featurization_pipe = Pipeline([featurizer])
        featurization_pipe.fit(train_set)

        # train learner pipeline
        learner_pipe = Pipeline([
            DatasetTransformer(featurization_pipe.model),
            OneVsRestClassifier(AveragedPerceptronBinaryClassifier(),
                                feature=['Features'], label='Sentiment'),
        ])
        learner_pipe.fit(train_set)

        # Export the learner pipeline to ONNX
        onnx_path = get_tmp_file('.onnx')
        learner_pipe.export_to_onnx(onnx_path, 'com.microsoft.ml',
                                    onnx_version='Stable')

        # Perform the transform using the standard ML.Net backend
        started = time.time()
        result_standard = learner_pipe.predict(test_set)
        elapsed = round(time.time() - started, 3)
        print('%ss done transform using standard backend' % elapsed)

        # Perform the transform using the ORT backend
        df_tool = DFT(onnx_path)
        dataset = test_set.to_df()
        started = time.time()
        result_ort = df_tool.execute(
            dataset, ['PredictedLabel.output', 'Score.output'])
        elapsed = round(time.time() - started, 3)
        print('%ss done transform using ORT backend (excludes df load time)'
              % elapsed)

        # compare the results
        check_kwargs = {
            'check_names': False,
            'check_exact': False,
            'check_dtype': True,
            'check_less_precise': True
        }
        # (column in the standard result, column in the ORT result)
        column_pairs = (
            ('PredictedLabel', 'PredictedLabel.output'),
            ('Score.0', 'Score.output.0'),
            ('Score.1', 'Score.output.1'),
        )
        for expected_name, ort_name in column_pairs:
            col_expected = result_standard.loc[:, expected_name]
            col_ort = result_ort.loc[:, ort_name]
            pd.testing.assert_series_equal(col_expected, col_ort,
                                           **check_kwargs)
コード例 #18
0
    def test_word_embedding_example2(self):
        """Embed the token column produced by NGramFeaturizer on 'infert'.

        Checks the shape of the pipeline output and that an embedding column
        named 'features_TransformedText.94' is present.
        """
        schema = (
            'sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 '
            'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 '
            'col=spontaneous:R4:6 header=+'
        )
        infert_path = get_dataset('infert').as_filepath()
        stream = FileDataStream(infert_path, schema=schema)

        featurizer = NGramFeaturizer(
            word_feature_extractor=Ngram(),
            output_tokens=True,
            columns={'features': ['id', 'education']})
        embedding = WordEmbedding(columns='features_TransformedText')

        result = Pipeline([featurizer, embedding]).fit_transform(stream)

        assert result.shape == (248, 802)
        assert 'features_TransformedText.94' in list(result.columns)
コード例 #19
0
ファイル: test_text.py プロジェクト: zyw400/NimbusML-1
def train_data_type_ppl(fit_X_type=None, fit_Y_type=None, predict_X_type=None):
    """Fit, test and predict with a small pipeline on type-converted data.

    Four sentences and their labels are converted with ``transform_data`` to
    ``fit_X_type`` / ``fit_Y_type`` for fitting and testing, and the
    sentences are converted to ``predict_X_type`` for prediction.

    Returns a tuple ``(predictions, scores, metrics)``.
    """
    sentences = [
        "This is sentence 1", "Talk about second", "Thrid one",
        "Final example."
    ]
    labels = [1, 0, 1, 1]

    pipeline = Pipeline([
        NGramFeaturizer(),
        LightGbmClassifier(min_data_per_leaf=1, n_thread=1),
    ])

    fit_X = transform_data(sentences, fit_X_type)
    fit_y = transform_data(labels, fit_Y_type)
    pipeline.fit(fit_X, fit_y)
    metrics, scores = pipeline.test(fit_X, fit_y, output_scores=True)

    predictions = pipeline.predict(transform_data(sentences, predict_X_type))
    return predictions, scores, metrics
コード例 #20
0
    def test_pipeline_name_error(self):
        """LightGbmClassifier must raise NameError for unsupported arguments.

        The constructor is called with several keyword names listed in
        ``msg`` as not allowed, alongside ``min_data_per_leaf``.
        """
        trainData = pd.DataFrame({
            "Sentiment": [0, 1, 1, 0, 1, 1],
            "SentimentText": [
                "this is train ", "review ", "sentence ", "an apple",
                "sentence 22", "another one one one"
            ]
        })
        # Featurizer run on its own; the result is not used afterwards.
        NGramFeaturizer(word_feature_extractor=n_gram()).fit_transform(
            trainData[["SentimentText"]])

        msg = "Parameters ['NumLeaves', 'min_data', 'min_data_in_bin', " \
              "'minsplit'] are not allowed"
        # NOTE(review): assuming `self` is a unittest.TestCase, `msg=` is the
        # text reported when no NameError is raised; it is not compared with
        # the raised exception's message.  If the message text is meant to be
        # verified, assertRaisesRegex would be needed -- confirm intent.
        with self.assertRaises(NameError, msg=msg):
            LightGbmClassifier(min_data=1,
                               min_data_in_bin=1,
                               min_data_per_leaf=1,
                               minsplit=1,
                               NumLeaves=2)
コード例 #21
0
    def test_word_embedding_example_dict_newname(self):
        """Write the word embedding to a new column via the dict column form.

        NGramFeaturizer emits its tokens to 'features_TransformedText' and
        WordEmbedding maps that column to 'features_TransformedText2'; the
        shape of the pipeline output is checked.
        """
        schema = (
            'sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 '
            'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 '
            'col=spontaneous:R4:6 quote+ header=+'
        )
        infert_path = get_dataset('infert').as_filepath()
        stream = FileDataStream(infert_path, schema=schema)

        featurizer = NGramFeaturizer(
            word_feature_extractor=Ngram(),
            output_tokens_column_name='features_TransformedText',
            columns={'features': ['id', 'education']})

        # What is features_TransformedText?
        embedding = WordEmbedding(columns={
            'features_TransformedText2': 'features_TransformedText'})

        result = Pipeline([featurizer, embedding]).fit_transform(stream)
        assert result.shape == (248, 409)
コード例 #22
0
ファイル: test_lightlda.py プロジェクト: zyw400/NimbusML-1
    def test_LightLda(self):
        """Run LightLda with three topics on n-gram features of short texts.

        The sum over all values of the transformed output is compared with a
        known constant to eight decimals.
        """
        documents = [
            "animals birds cats dogs fish horse",
            "horse birds house fish duck cats",
            "car truck driver bus pickup",
            "car truck driver bus pickup horse ",
            "car truck",
            "bus pickup",
            "space galaxy universe radiation",
            "radiation galaxy universe duck",
        ]
        topics = pandas.DataFrame(data={'review': documents})

        featurizer = NGramFeaturizer(word_feature_extractor=n_gram(),
                                     vector_normalizer='None')
        featurizer = featurizer << 'review'
        pipeline = Pipeline([featurizer, LightLda(num_topic=3)])

        topic_weights = pipeline.fit_transform(topics)
        expected_total = 7.000000044
        assert_almost_equal(topic_weights.sum().sum(),
                            expected_total,
                            decimal=8,
                            err_msg="Sum should be %s" % expected_total)
コード例 #23
0
 def test_transform_only_pipeline_transform_method(self):
     """A pipeline holding only a transform must support fit then transform.

     The transformed output is expected to contain the word feature column
     'SentimentText.==rude=='.
     """
     featurizer = NGramFeaturizer(char_feature_extractor=None)
     featurizer = featurizer << 'SentimentText'
     pipeline = Pipeline([featurizer])

     pipeline.fit(X)
     transformed = pipeline.transform(X)

     assert 'SentimentText.==rude==' in transformed.columns
コード例 #24
0
ファイル: WordEmbedding.py プロジェクト: zyw400/NimbusML-1
# Example: chain NGramFeaturizer and WordEmbedding over the text column of
# the 'wiki_detox_train' dataset and print the head of the result.

# data input (as a FileDataStream)
path = get_dataset('wiki_detox_train').as_filepath()
data = FileDataStream.read_csv(path, sep='\t')
print(data.head())
#   Sentiment                                      SentimentText
# 0          1  ==RUDE== Dude, you are rude upload that carl p...
# 1          1  == OK! ==  IM GOING TO VANDALIZE WILD ONES WIK...
# 2          1  Stop trolling, zapatancas, calling me a liar m...
# 3          1  ==You're cool==  You seem like a really cool g...
# 4          1  ::::: Why are you threatening me? I'm not bein...

# transform usage
# The featurizer writes its features to 'ngram'.  WordEmbedding then reads
# 'ngram_TransformedText' -- presumably the token column requested with
# output_tokens=True; confirm against the NGramFeaturizer documentation.
pipeline = Pipeline([
    NGramFeaturizer(word_feature_extractor=Ngram(),
                    output_tokens=True,
                    columns={'ngram': ['SentimentText']}),
    WordEmbedding(columns='ngram_TransformedText')
])

# fit and transform
features = pipeline.fit_transform(data)

# print features
print(features.head())
#   Sentiment  ...       ngram.douchiest  ngram.award.
# 0          1 ...                   0.0           0.0
# 1          1 ...                   0.0           0.0
# 2          1 ...                   0.0           0.0
# 3          1 ...                   0.0           0.0
# 4          1 ...                   0.0           0.0
コード例 #25
0
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.text import NGramFeaturizer
from nimbusml.feature_extraction.text.extractor import Ngram
from nimbusml.naive_bayes import NaiveBayesClassifier
from nimbusml.utils import get_X_y
from sklearn.model_selection import train_test_split

# Example: train a NaiveBayesClassifier on n-gram features of the
# 'wiki_detox_train' text column and print the accuracy on a held-out split.
# NOTE(review): `np` and `Pipeline` are used below but do not appear in the
# import lines visible directly above this snippet -- confirm they are
# imported elsewhere.

# use 'wiki_detox_train' data set to create test and train data
# Sentiment  SentimentText
# 1          ==RUDE== Dude, you are rude upload that carl picture back, or else.
# 1          == OK! ==  IM GOING TO VANDALIZE WILD ONES WIKI THEN!!!
np.random.seed(0)
train_file = get_dataset("wiki_detox_train").as_filepath()
(train, label) = get_X_y(train_file, label_column='Sentiment', sep='\t')

# Split features and labels into train and test parts.
X_train, X_test, y_train, y_test = train_test_split(train, label)

# map text reviews to vector space
texttransform = NGramFeaturizer(word_feature_extractor=Ngram(),
                                vector_normalizer='None') << 'SentimentText'
# The classifier takes the featurized 'SentimentText' column as its feature.
nb = NaiveBayesClassifier(feature=['SentimentText'])

ppl = Pipeline([texttransform, nb])

ppl.fit(X_train, y_train)

# Keep only the predicted label column of the prediction output.
scores = ppl.predict(X_test)['PredictedLabel']

# evaluate the model
print('Accuracy:', np.mean(y_test == [i for i in scores]))
コード例 #26
0
ファイル: WordEmbedding_df.py プロジェクト: yazici/NimbusML
import pandas
from nimbusml import Pipeline
from nimbusml.feature_extraction.text import NGramFeaturizer, WordEmbedding
from nimbusml.feature_extraction.text.extractor import Ngram

# Example: compute word embeddings for a handful of in-memory reviews.

# create the data
customer_reviews = pandas.DataFrame(data=dict(review=[
    "I really did not like the taste of it", "It was surprisingly quite good!",
    "I will never ever ever go to that place again!!",
    "The best ever!! It was amazingly good and super fast",
    "I wish I had gone earlier, it was that great",
    "somewhat dissapointing. I'd probably wont try again",
    "Never visit again... rascals!"
]))

# The featurizer names its token output 'review_TransformedText', which is
# the column WordEmbedding is then applied to.
pipeline = Pipeline([
    NGramFeaturizer(word_feature_extractor=Ngram(),
                    output_tokens_column_name='review_TransformedText'),
    WordEmbedding() << 'review_TransformedText'
])
y = pipeline.fit_transform(customer_reviews)

# view a small subset of the review embeddings
# (first five rows, last three columns)
print(y.iloc[:5, -3:])
#    review_TransformedText.147  review_TransformedText.148  review_TransformedText.149
# 0                    1.918661                   -0.714531                    3.062141
# 1                    1.891922                   -0.248650                    1.706620
# 2                    1.601611                    0.309785                    3.379576
# 3                    1.970666                    1.477450                    3.110802
# 4                    2.521791                    0.122538                    3.129919
コード例 #27
0
INSTANCES = {
    'LightGbmBinaryClassifier':
    LightGbmBinaryClassifier(minimum_example_count_per_group=1,
                             minimum_example_count_per_leaf=1),
    'LightGbmClassifier':
    LightGbmClassifier(minimum_example_count_per_group=1,
                       minimum_example_count_per_leaf=1),
    'LightGbmRegressor':
    LightGbmRegressor(minimum_example_count_per_group=1,
                      minimum_example_count_per_leaf=1),
    'LightGbmRanker':
    LightGbmRanker(minimum_example_count_per_group=1,
                   minimum_example_count_per_leaf=1),
    'NGramFeaturizer':
    NGramFeaturizer(word_feature_extractor=n_gram()),
    'SkipFilter':
    SkipFilter(count=5),
    'TakeFilter':
    TakeFilter(count=100000),
    'IidSpikeDetector':
    IidSpikeDetector(columns=['F0']),
    'IidChangePointDetector':
    IidChangePointDetector(columns=['F0']),
    'SsaSpikeDetector':
    SsaSpikeDetector(columns=['F0'], seasonal_window_size=2),
    'SsaChangePointDetector':
    SsaChangePointDetector(columns=['F0'], seasonal_window_size=2),
    'SsaForecaster':
    SsaForecaster(columns=['F0'],
                  window_size=2,
コード例 #28
0
ファイル: Schema.py プロジェクト: yazici/NimbusML
# Get schema from a fitted pipeline example.
import numpy as np
import pandas as pd
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.text import NGramFeaturizer
from nimbusml.feature_extraction.text.extractor import Ngram

# Example: fit a single-transform pipeline and list the names of the columns
# it outputs.

# data input (as a FileDataStream)
path = get_dataset("wiki_detox_train").as_filepath()

data = FileDataStream.read_csv(path, sep='\t')
print(data.head())
#    Sentiment                                      SentimentText
# 0          1  ==RUDE== Dude, you are rude upload that carl p...
# 1          1  == OK! ==  IM GOING TO VANDALIZE WILD ONES WIK...
# 2          1  Stop trolling, zapatancas, calling me a liar m...
# 3          1  ==You're cool==  You seem like a really cool g...
# 4          1  ::::: Why are you threatening me? I'm not bein...

# Featurize 'SentimentText' into an output column named 'features'.
pipe = Pipeline([
    NGramFeaturizer(word_feature_extractor=Ngram(),
                    columns={'features': ['SentimentText']})
])

# The output columns are queried after the pipeline has been fitted.
pipe.fit(data)
schema = pipe.get_output_columns()

# Show the first five output column names.
print(schema[0:5])
# ['Sentiment', 'SentimentText', 'features.Char.<␂>|=|=', 'features.Char.=|=|r', 'features.Char.=|r|u']
コード例 #29
0
# Example: NGramFeaturizer configured with a CustomStopWordsRemover.

# data input (as a FileDataStream)
path = get_dataset('wiki_detox_train').as_filepath()

data = FileDataStream.read_csv(path, sep='\t')
print(data.head())
#   Sentiment                                      SentimentText
# 0          1  ==RUDE== Dude, you are rude upload that carl p...
# 1          1  == OK! ==  IM GOING TO VANDALIZE WILD ONES WIK...
# 2          1  Stop trolling, zapatancas, calling me a liar m...
# 3          1  ==You're cool==  You seem like a really cool g...
# 4          1  ::::: Why are you threatening me? I'm not bein...

# The list passed to CustomStopWordsRemover holds the stop words to use;
# the features of 'SentimentText' are written to a column named 'features'.
xf = NGramFeaturizer(word_feature_extractor=Ngram(),
                     stop_words_remover=CustomStopWordsRemover(['!',
                                                                '$',
                                                                '%',
                                                                '&',
                                                                '\'',
                                                                '\'d']),
                     columns={'features': ['SentimentText']})

# fit and transform
features = xf.fit_transform(data)

# print features
print(features.head())

#   Sentiment   ...         features.douchiest  features.award.
# 0          1  ...                        0.0              0.0
# 1          1  ...                        0.0              0.0
# 2          1  ...                        0.0              0.0
# 3          1  ...                        0.0              0.0
コード例 #30
0
         'concated_columns': [
             'Petal_Length',
             'Sepal_Width',
             'Sepal_Length']},
     LpScaler(columns={'normed_columns': 'concated_columns'})
 ]),
 'MutualInformationSelector': Pipeline([
     ColumnConcatenator(columns={'Features': ['Sepal_Width', 'Sepal_Length', 'Petal_Width']}),
     MutualInformationSelector(
         columns='Features',
         label='Label',
         slots_in_output=2)  # only accept one column
 ]),
 'NaiveBayesClassifier': NaiveBayesClassifier(feature=['Sepal_Width', 'Sepal_Length']),
 'NGramFeaturizer': NGramFeaturizer(word_feature_extractor=Ngram(),
                                    char_feature_extractor=Ngram(),
                                    keep_diacritics=True,
                                    columns={ 'features': ['SentimentText']}),
 'OneHotHashVectorizer': OneHotHashVectorizer(columns=['education_str']),
 'OneHotVectorizer': OneHotVectorizer(columns=['education_str']),
 'OneVsRestClassifier(AveragedPerceptronBinaryClassifier)': \
     OneVsRestClassifier(AveragedPerceptronBinaryClassifier(),
                         use_probabilities=True,
                         feature=['age',
                                  'education_str.0-5yrs',
                                  'education_str.6-11yrs',
                                  'education_str.12+ yrs'],
                         label='induced'),
 'OneVsRestClassifier(LinearSvmBinaryClassifier)': \
     OneVsRestClassifier(LinearSvmBinaryClassifier(),
                         use_probabilities=True,
                         feature=['age',