Esempi in Python per n_gram (nome completo: nimbusml.internal.entrypoints._ngramextractor_ngram.n_gram)

Esempio n. 1

0

Mostra file

File: test_pipeline_syntax.py Progetto: zyw400/NimbusML-1

    def test_pipeline_with_no_columns(self):
        """Fit an NGram + LightGBM pipeline when no columns are specified.

        The same pipeline definition is built and fitted twice on a six-row
        frame: first with the label passed as a pandas Series, then with the
        label passed as a numpy array.
        """
        train_df = pd.DataFrame({
            "Sentiment": [0, 1, 1, 0, 1, 1],
            "SentimentText": [
                "this is train ",
                "review ",
                "sentence ",
                "an apple",
                "sentence 22",
                "another one one one",
            ],
        })
        text_cols = ["SentimentText"]

        def build_pipeline():
            # Leaf/group minimums of 1 -- presumably required because the
            # frame only has six rows; confirm against LightGBM defaults.
            steps = [
                NGramFeaturizer(word_feature_extractor=n_gram()),
                LightGbmClassifier(min_data_per_leaf=1, min_data_per_group=1),
            ]
            return Pipeline(steps)

        # First run: label supplied as a pandas Series.
        first = build_pipeline()
        assert first is not None

        # Bug 147697
        info = first.get_fit_info(train_df[text_cols], train_df["Sentiment"])
        assert len(info) == 2
        assert len(info[0]) == 3
        first.fit(train_df[text_cols], train_df["Sentiment"])

        # Second run: label supplied as a numpy array.
        second = build_pipeline()
        assert second is not None
        second.fit(train_df[text_cols], np.array(train_df["Sentiment"]))

Esempio n. 2

0

Mostra file

File: test_syntax.py Progetto: zyw400/NimbusML-1

    def test_syntax9_slots_label(self):
        """Run NGramFeaturizer + MutualInformationSelector with a label.

        Two pipelines are fitted on the same reviews: one with a default
        selector and one with ``slots_in_output=2``. Each must return a
        non-None result from ``fit_transform``.
        """
        reviews = [
            "This is great", "I hate it", "Love it", "Do not like it",
            "Really like it", "I hate it", "I like it a lot",
            "I kind of hate it", "I do like it", "I really hate it",
            "It is very good", "I hate it a bunch", "I love it a bunch",
            "I hate it", "I like it very much", "I hate it very much.",
            "I really do love it", "I really do hate it", "Love it!",
            "Hate it!", "I love it", "I hate it", "I love it", "I hate it",
            "I love it"
        ]
        likes = [
            True, False, True, False, True, False, True, False, True,
            False, True, False, True, False, True, False, True, False,
            True, False, True, False, True, False, True
        ]
        train_reviews = pandas.DataFrame(data=dict(review=reviews,
                                                   like=likes))

        # Every column except the label is a feature; the label is kept as
        # a one-column frame.
        X = train_reviews.loc[:, train_reviews.columns != 'like']
        y = train_reviews[['like']]

        # The second configuration is the "Scikit compatibility" scenario
        # (compose transforms inside a pipeline); in both scenarios we do
        # not provide {input, output} arguments.
        for selector_kwargs in ({}, {'slots_in_output': 2}):
            featurizer = NGramFeaturizer(word_feature_extractor=n_gram())
            selector = MutualInformationSelector(**selector_kwargs)
            res = Pipeline([featurizer, selector]).fit_transform(X, y)
            assert res is not None

Esempio n. 3

0

Mostra file

    def test_lightgbmclassifier(self):
        """Train LightGbmClassifier on n-gram features and check accuracy.

        The wiki_detox_train text column is featurized with at most 5000
        slots, a classifier is fitted on the train split, and accuracy on
        the test split must exceed 0.58.
        """
        threshold = 0.58
        np.random.seed(0)

        dataset_path = get_dataset('wiki_detox_train').as_filepath()
        train, label = get_X_y(dataset_path,
                               label_column='Sentiment',
                               sep='\t',
                               encoding="utf-8")
        X_train, X_test, y_train, y_test = train_test_split(
            train['SentimentText'], label)

        # map text reviews to vector space
        featurizer = NGramFeaturizer(word_feature_extractor=n_gram(),
                                     vector_normalizer='None')
        texttransform = featurizer << 'SentimentText'
        X_train = texttransform.fit_transform(X_train, max_slots=5000)
        X_test = texttransform.transform(X_test, max_slots=5000)

        classifier = LightGbmClassifier().fit(X_train, y_train, verbose=0)
        predicted = classifier.predict(X_test)

        expected = y_test.values.ravel()
        accuracy = np.mean(expected == predicted.values)
        failure_text = "accuracy should be greater than %s" % threshold
        assert_greater(accuracy, threshold, failure_text)

Esempio n. 4

0

Mostra file

File: test_pipeline_syntax.py Progetto: zyw400/NimbusML-1

    def test_pipeline_with_no_columns_raise(self):
        """Expect a RuntimeError when LightGBM uses default settings here.

        The pipeline is built without column arguments and with a default
        ``LightGbmClassifier``; fitting it on the six-row frame must raise
        ``RuntimeError``.
        """
        train_df = pd.DataFrame({
            "Sentiment": [0, 1, 1, 0, 1, 1],
            "SentimentText": [
                "this is train ",
                "review ",
                "sentence ",
                "an apple",
                "sentence 22",
                "another one one one",
            ],
        })
        text_cols = ["SentimentText"]

        steps = [
            NGramFeaturizer(word_feature_extractor=n_gram()),
            LightGbmClassifier(),
        ]
        ppl = Pipeline(steps)
        assert ppl is not None

        # Bug 147697
        info = ppl.get_fit_info(train_df[text_cols], train_df["Sentiment"])
        assert len(info) == 2
        assert len(info[0]) == 3

        # Message carried by the expected failure:
        # System.InvalidOperationException:
        # 'LightGBM Error, code is -1, error message is
        # 'Cannot construct Dataset since there are not useful features.
        # It should be at least two unique rows.
        # If the num_row (num_data) is small,
        # you can set min_data=1 and min_data_in_bin=1 to fix this.
        # Otherwise please make sure you are using the right dataset.'
        with self.assertRaises(RuntimeError):
            ppl.fit(train_df[text_cols], train_df["Sentiment"])

Esempio n. 5

0

Mostra file

File: test_naivebayesclassifier.py Progetto: zyw400/NimbusML-1

    def test_naivebayesclassifier(self):
        """Train NaiveBayesClassifier on n-gram features and check accuracy.

        The wiki_detox_train text column is featurized, a classifier is
        fitted on the train split, and accuracy on the test split must
        exceed 0.5.
        """
        threshold = 0.5
        np.random.seed(0)

        dataset_path = get_dataset("wiki_detox_train").as_filepath()
        train, label = get_X_y(dataset_path,
                               label_column='Sentiment',
                               sep='\t')
        X_train, X_test, y_train, y_test = train_test_split(
            train['SentimentText'], label)

        # map text reviews to vector space
        featurizer = NGramFeaturizer(word_feature_extractor=n_gram(),
                                     vector_normalizer='None')
        texttransform = featurizer << 'SentimentText'
        X_train = texttransform.fit_transform(X_train)
        X_test = texttransform.transform(X_test)

        mymodel = NaiveBayesClassifier()
        mymodel.fit(X_train, y_train)
        scores = mymodel.predict(X_test)

        # The predictions are materialised as a plain list before the
        # comparison; element 0 of the resulting mean is the accuracy.
        matches = y_test == list(scores)
        accuracy = np.mean(matches)[0]
        failure_text = "accuracy should be greater than %s" % threshold
        assert_greater(accuracy, threshold, failure_text)

Esempio n. 6

0

Mostra file

File: test_ngramfeaturizer.py Progetto: yazici/NimbusML

    def test_ngramfeaturizer_single(self):
        """Featurize two text columns of 'infert' into one output column.

        The file is read through a ``FileDataStream`` with an explicit
        schema, and the transformed result must have shape (248, 652).
        """
        # Schema string: 'id' and 'education' are declared as TX, all
        # remaining columns as R4.
        file_schema = ('sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 '
                       'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 '
                       'col=spontaneous:R4:6 quote+ header=+')
        infert_path = get_dataset('infert').as_filepath()
        stream = FileDataStream(infert_path, schema=file_schema)

        column_map = {'features': ['id', 'education']}
        xf = NGramFeaturizer(word_feature_extractor=n_gram(),
                             columns=column_map)

        assert xf.fit_transform(stream).shape == (248, 652)

Esempio n. 7

0

Mostra file

    def test_syntax9_multiple_inputs(self):
        """Featurize two input columns into a single output column.

        Columns 'education1' and 'education2' are mapped to 'out1' via the
        ``<<`` operator; the transformed frame must have shape (5, 13).
        """
        frame = pandas.DataFrame({
            'education1': ['A', 'B', 'A', 'B', 'A'],
            'education2': ['c', 'd', 'c', 'd', 'c'],
            'workclass': ['X', 'X', 'Y', 'Y', 'Y'],
            'y': [1, 0, 1, 0, 0],
        })
        features = frame.drop('y', axis=1)

        column_map = {'out1': ['education1', 'education2']}
        ng4 = NGramFeaturizer(word_feature_extractor=n_gram()) << column_map

        transformed = ng4.fit_transform(features)
        assert transformed.shape == (5, 13)

Esempio n. 8

0

Mostra file

File: test_pipeline_syntax.py Progetto: zyw400/NimbusML-1

 def test_column_list_or_string(self):
     """Check that a column may be given either as a string or as a list.

     Four ways of naming the input column are constructed; the last two
     (``columns=[...]`` and ``<< 'name'``) are fitted and must produce
     equal frames.
     """
     # Bug 142794
     frame = pd.DataFrame({
         "Sentiment": [0, 1, 1, 0, 1, 1],
         "SentimentText": [
             "this is train ",
             "review ",
             "sentence ",
             "an apple",
             "sentence 22",
             "another one one one",
         ],
     })
     frame['SentimentText'] = frame['SentimentText'].astype(str)

     # These two forms are only constructed; their result is never fitted.
     _ = NGramFeaturizer(word_feature_extractor=n_gram()) << {
         "score": 'SentimentText'
     }
     _ = NGramFeaturizer(word_feature_extractor=n_gram()) << {
         "score": ['SentimentText']
     }

     from_list = NGramFeaturizer(word_feature_extractor=n_gram(),
                                 columns=['SentimentText'])
     res1 = from_list.fit_transform(frame)

     from_string = NGramFeaturizer(
         word_feature_extractor=n_gram()) << 'SentimentText'
     res2 = from_string.fit_transform(frame)

     assert_frame_equal(res1, res2)

Esempio n. 9

0

Mostra file

File: test_data_schema_syntax.py Progetto: zyw400/NimbusML-1

 def _test_sklearn_pipeline(self):
     """Chain NGramFeaturizer, TruncatedSVD and LogisticRegression.

     The three steps are combined through ``ppl`` and fitted on four
     reviews; the prediction must have shape (4,).
     """
     reviews = ["This is great", "I hate it", "Love it", "Do not like it"]
     likes = [True, False, True, False]
     train_reviews = pandas.DataFrame(data=dict(review=reviews, like=likes))

     # Labels are converted from bool to int before fitting.
     int_y = list(map(int, train_reviews['like']))
     X = train_reviews.loc[:, train_reviews.columns != 'like']

     steps = [
         ("featurizer", NGramFeaturizer(word_feature_extractor=n_gram())),
         ("svd", TruncatedSVD(random_state=1, n_components=5)),
         ("lr", sklearn.linear_model.LogisticRegression()),
     ]
     pipe1 = ppl(steps)
     pipe1.fit(X, int_y)

     assert pipe1.predict(X).shape == (4, )

Esempio n. 10

0

Mostra file

File: test_ngramfeaturizer.py Progetto: geeksperiments/NimbusML

    def test_ngramfeaturizer(self):
        """Check the grand total of n-gram features on wiki_detox_train.

        The first 100 rows of the train split of the 'SentimentText' column
        are featurized without vector normalization, and the sum over every
        cell of the result must equal 30513.
        """
        np.random.seed(0)
        train_file = get_dataset('wiki_detox_train').as_filepath()
        (train, label) = get_X_y(train_file,
                                 label_column='Sentiment',
                                 sep='\t',
                                 encoding="utf-8")
        X_train, X_test, y_train, y_test = train_test_split(
            train['SentimentText'], label)

        # map text reviews to vector space
        texttransform = NGramFeaturizer(
            word_feature_extractor=n_gram(),
            vector_normalizer='None') << 'SentimentText'
        X_train = texttransform.fit_transform(X_train[:100])

        # Column sums first, then the sum of those. The result is named
        # `total` so that the builtin `sum` is not shadowed.
        total = X_train.sum().sum()
        assert_equal(total, 30513, "sum of all features is incorrect!")

Esempio n. 11

0

Mostra file

File: test_ngramfeaturizer.py Progetto: yazici/NimbusML

    def test_ngramfeaturizer_syntax_dict(self):
        """Featurize 'review' into 'outg' using the dict form of ``<<``.

        With the dict syntax the original 'review' column stays in the
        output, so it is dropped before training and before prediction.
        """
        train_texts = [
            "This is great", "I hate it", "Love it", "Do not like it",
            "Really like it", "I hate it", "I like it a lot",
            "I kind of hate it", "I do like it", "I really hate it",
            "It is very good", "I hate it a bunch", "I love it a bunch",
            "I hate it", "I like it very much", "I hate it very much.",
            "I really do love it", "I really do hate it", "Love it!",
            "Hate it!", "I love it", "I hate it", "I love it", "I hate it",
            "I love it"
        ]
        train_likes = [
            True, False, True, False, True, False, True, False, True,
            False, True, False, True, False, True, False, True, False,
            True, False, True, False, True, False, True
        ]
        test_texts = [
            "This is great", "I hate it", "Love it", "Really like it",
            "I hate it", "I like it a lot", "I love it", "I do like it",
            "I really hate it", "I love it"
        ]
        train_reviews = pandas.DataFrame(
            data=dict(review=train_texts, like=train_likes))
        test_reviews = pandas.DataFrame(data=dict(review=test_texts))

        labels = train_reviews['like']
        raw_features = train_reviews.loc[:, train_reviews.columns != 'like']

        textt = NGramFeaturizer(word_feature_extractor=n_gram()) << {
            'outg': ['review']
        }
        featurized = textt.fit_transform(raw_features)
        assert featurized.shape == (25, 117)

        # columns ordering changed between 0.22 and 0.23
        edge_columns = (featurized.columns[0], featurized.columns[-1])
        assert 'review' in edge_columns
        featurized = featurized.drop('review', axis=1)

        mymodel = LogisticRegressionBinaryClassifier().fit(
            featurized, labels, verbose=0)
        test_featurized = textt.transform(test_reviews).drop('review', axis=1)
        scores = mymodel.predict(test_featurized)

        # One score per test review.
        assert scores.shape == (10, )

Esempio n. 12

0

Mostra file

File: test_pipeline_syntax.py Progetto: zyw400/NimbusML-1

    def test_pipeline_name_error(self):
        """Expect NameError for unknown LightGbmClassifier parameters.

        An NGramFeaturizer is first fitted on the text column (its result
        is discarded), then the classifier is constructed with keyword
        arguments that must be rejected with ``NameError``.
        """
        train_df = pd.DataFrame({
            "Sentiment": [0, 1, 1, 0, 1, 1],
            "SentimentText": [
                "this is train ",
                "review ",
                "sentence ",
                "an apple",
                "sentence 22",
                "another one one one",
            ],
        })
        featurizer = NGramFeaturizer(word_feature_extractor=n_gram())
        featurizer.fit_transform(train_df[["SentimentText"]])

        classifier_kwargs = dict(min_data=1,
                                 min_data_in_bin=1,
                                 min_data_per_leaf=1,
                                 minsplit=1,
                                 NumLeaves=2)
        msg = ("Parameters ['NumLeaves', 'min_data', 'min_data_in_bin', "
               "'minsplit'] are not allowed")
        # NOTE(review): assuming `self` is a unittest.TestCase, `msg` here is
        # only the text reported if no NameError is raised; it is not
        # compared with the exception's own message -- confirm the intent.
        with self.assertRaises(NameError, msg=msg):
            LightGbmClassifier(**classifier_kwargs)

Esempio n. 13

0

Mostra file

File: test_lightlda.py Progetto: zyw400/NimbusML-1

    def test_LightLda(self):
        """Run NGramFeaturizer followed by LightLda with three topics.

        The sum over every cell of the pipeline output must match
        7.000000044 to 8 decimal places.
        """
        expected_sum = 7.000000044
        documents = [
            "animals birds cats dogs fish horse",
            "horse birds house fish duck cats",
            "car truck driver bus pickup",
            "car truck driver bus pickup horse ",
            "car truck",
            "bus pickup",
            "space galaxy universe radiation",
            "radiation galaxy universe duck",
        ]
        topics = pandas.DataFrame(data=dict(review=documents))

        featurizer = NGramFeaturizer(word_feature_extractor=n_gram(),
                                     vector_normalizer='None') << 'review'
        pipeline = Pipeline([featurizer, LightLda(num_topic=3)])

        y = pipeline.fit_transform(topics)
        assert_almost_equal(y.sum().sum(),
                            expected_sum,
                            decimal=8,
                            err_msg="Sum should be %s" % expected_sum)

Esempio n. 14

0

Mostra file

File: test_ngramfeaturizer.py Progetto: yazici/NimbusML

    def test_ngramfeaturizer(self):
        """Featurize 'review' with the string form of ``<<`` and predict.

        The featurizer is fitted on 25 labelled reviews, a binary logistic
        regression is trained on the result, and the model must return one
        score for each of the 10 test reviews.
        """
        train_reviews = pandas.DataFrame(data=dict(
            review=[
                "This is great", "I hate it", "Love it", "Do not like it",
                "Really like it", "I hate it", "I like it a lot",
                "I kind of hate it", "I do like it", "I really hate it",
                "It is very good", "I hate it a bunch", "I love it a bunch",
                "I hate it", "I like it very much", "I hate it very much.",
                "I really do love it", "I really do hate it", "Love it!",
                "Hate it!", "I love it", "I hate it", "I love it", "I hate it",
                "I love it"
            ],
            like=[
                True, False, True, False, True, False, True, False, True,
                False, True, False, True, False, True, False, True, False,
                True, False, True, False, True, False, True
            ]))

        test_reviews = pandas.DataFrame(data=dict(review=[
            "This is great", "I hate it", "Love it", "Really like it",
            "I hate it", "I like it a lot", "I love it", "I do like it",
            "I really hate it", "I love it"
        ]))

        y = train_reviews['like']
        X = train_reviews.loc[:, train_reviews.columns != 'like']

        textt = NGramFeaturizer(word_feature_extractor=n_gram()) << 'review'
        X = textt.fit_transform(X)

        assert X.shape == (25, 116)

        mymodel = LogisticRegressionBinaryClassifier().fit(X, y, verbose=0)

        # Transform the test set once and reuse it for both the prediction
        # and the row-count check, instead of transforming it twice.
        X_test = textt.transform(test_reviews)
        scores = mymodel.predict(X_test)

        # View the scores
        assert scores.shape == (10, )
        assert X_test.shape[0] == 10

Esempio n. 15

0

Mostra file

File: test_ngramfeaturizer.py Progetto: yazici/NimbusML

    def test_ngramfeaturizer_multi(self):
        """Check that two output columns are rejected by NGramFeaturizer.

        Rejection is accepted at either of two points: a ``TypeError`` at
        construction time whose text mentions the one-output-column limit,
        or a ``RuntimeError`` from ``fit_transform``.
        """
        file_schema = ('sep=, col=id:TX:0 col=education:TX:1 col=age:R4:2 '
                       'col=parity:R4:3 col=induced:R4:4 col=case:R4:5 '
                       'col=spontaneous:R4:6 quote+ header=+')
        infert_path = get_dataset('infert').as_filepath()
        data = FileDataStream(infert_path, schema=file_schema)

        column_map = {
            'features': ['id'],
            'features2': ['education']
        }
        try:
            xf = NGramFeaturizer(word_feature_extractor=n_gram(),
                                 columns=column_map)
        except TypeError as err:
            # Rejected at construction time: nothing further to check.
            assert 'Only one output column is allowed' in str(err)
            return

        try:
            # System.InvalidCastException: 'Cannot cast
            # Newtonsoft.Json.Linq.JArray to Newtonsoft.Json.Linq.JToken.
            xf.fit_transform(data)
        except RuntimeError:
            pass
        else:
            # Reaching this point means nothing was raised at all.
            assert False

Esempio n. 16

0

Mostra file

File: test_estimator_checks.py Progetto: jorezy/NimbusML

INSTANCES = {
    'LightGbmBinaryClassifier':
    LightGbmBinaryClassifier(minimum_example_count_per_group=1,
                             minimum_example_count_per_leaf=1),
    'LightGbmClassifier':
    LightGbmClassifier(minimum_example_count_per_group=1,
                       minimum_example_count_per_leaf=1),
    'LightGbmRegressor':
    LightGbmRegressor(minimum_example_count_per_group=1,
                      minimum_example_count_per_leaf=1),
    'LightGbmRanker':
    LightGbmRanker(minimum_example_count_per_group=1,
                   minimum_example_count_per_leaf=1),
    'NGramFeaturizer':
    NGramFeaturizer(word_feature_extractor=n_gram()),
    'SkipFilter':
    SkipFilter(count=5),
    'TakeFilter':
    TakeFilter(count=100000),
    'IidSpikeDetector':
    IidSpikeDetector(columns=['F0']),
    'IidChangePointDetector':
    IidChangePointDetector(columns=['F0']),
    'SsaSpikeDetector':
    SsaSpikeDetector(columns=['F0'], seasonal_window_size=2),
    'SsaChangePointDetector':
    SsaChangePointDetector(columns=['F0'], seasonal_window_size=2),
    'SsaForecaster':
    SsaForecaster(columns=['F0'],
                  window_size=2,

Esempio n. 17

0

Mostra file

File: NGramFeaturizer_df.py Progetto: vdedyukhin/NimbusML

    data=dict(
        review=[
            "This is great",
            "I hate it",
            "Love it",
            "Really like it",
            "I hate it",
            "I like it a lot",
            "I love it",
            "I do like it",
            "I really hate it",
            "I love it"]))

# Split the training frame into the label column and everything else.
# NOTE(review): `train_reviews` and `test_reviews` are not defined in the
# visible part of this script -- presumably they are built earlier in the file.
y = train_reviews['like']
X = train_reviews.loc[:, train_reviews.columns != 'like']

# Featurize the 'review' column and replace X with the transformed result.
ngram = NGramFeaturizer(word_feature_extractor=n_gram()) << 'review'
X = ngram.fit_transform(X)

# view the transformed numerical values and column names
# print(X.head())

# Train a binary logistic regression on the featurized reviews.
mymodel = LogisticRegressionBinaryClassifier().fit(X, y)

# Apply the fitted featurizer to the test reviews.
X_test = ngram.transform(test_reviews)

# NOTE(review): the test reviews are transformed a second time here rather
# than reusing X_test, which is not read again in the visible lines.
scores = mymodel.predict(ngram.transform(test_reviews))

# view the scores
# print(scores.head())