Example #1
    def test_char_tokenizer(self):
        """CharTokenizer should run end-to-end on free-form review text."""
        reviews = pd.DataFrame(data=dict(review=[
            "I really did not like the taste of it",
            "It was surprisingly quite good!",
            "I will never ever ever go to that place again!!",
            "The best ever!! It was amazingly good and super fast",
            "I wish I had gone earlier, it was that great",
            "somewhat dissapointing. I'd probably wont try again",
            "Never visit again... rascals!"
        ]))

        # Concatenate the review column into 'features', then split it into
        # individual characters.
        concatenator = ColumnConcatenator() >> 'features' << [['review']]
        char_tokenizer = CharTokenizer(['review'])
        result = nimbusmlPipeline([concatenator, char_tokenizer]).fit_transform(reviews)
        assert result is not None
Example #2
    def test_ngramfeaturizer(self):
        """Character trigram counts should be correct for two short reviews."""
        data = pandas.DataFrame(data=dict(review=['one', 'two']))

        # Tokenize into characters, count trigrams only (all_lengths=False),
        # then drop the raw and intermediate columns.
        steps = [
            CharTokenizer(columns={'review_transform': 'review'}),
            NGramExtractor(ngram_length=3,
                           all_lengths=False,
                           columns={'ngrams': 'review_transform'}),
            ColumnDropper(columns=['review_transform', 'review']),
        ]
        out = Pipeline(steps).fit_transform(data)

        self.assertEqual(len(out.columns), 6)
        # Each review should light up only its own trigram columns.
        expected = {
            (0, 'ngrams.o|n|e'): 1.0,
            (1, 'ngrams.o|n|e'): 0.0,
            (0, 'ngrams.t|w|o'): 0.0,
            (1, 'ngrams.t|w|o'): 1.0,
        }
        for (row, column), value in expected.items():
            self.assertEqual(out.loc[row, column], value)
Example #3
    'EnsembleRegressor',
    'CharTokenizer',
    'WordTokenizer',
    'MutualInformationSelector',
    'NaiveBayesClassifier',
    'CountSelector',
    'KMeansPlusPlus',
    'ToKey',
    'ColumnSelector'
}

INSTANCES = {
    'AveragedPerceptronBinaryClassifier': AveragedPerceptronBinaryClassifier(
        feature=['education_str.0-5yrs', 'education_str.6-11yrs', 'education_str.12+ yrs']),
    'Binner': Binner(num_bins=3),
    'CharTokenizer': CharTokenizer(columns={'SentimentText_Transform': 'SentimentText'}),
    'ColumnConcatenator': ColumnConcatenator(columns={'Features': [
        'Sepal_Length',
        'Sepal_Width',
        'Petal_Length',
        'Petal_Width',
        'Setosa']}),
    'ColumnSelector': ColumnSelector(columns=['Sepal_Width', 'Sepal_Length']),
    'ColumnDuplicator': ColumnDuplicator(columns={'dup': 'Sepal_Width'}),
    'CountSelector': CountSelector(count=5, columns=['Sepal_Width']),
    'DateTimeSplitter': DateTimeSplitter(prefix='dt'),
    'FastForestBinaryClassifier': FastForestBinaryClassifier(feature=['Sepal_Width', 'Sepal_Length'],
                                                             label='Setosa'),
    'FastLinearBinaryClassifier': FastLinearBinaryClassifier(feature=['Sepal_Width', 'Sepal_Length'],
                                                             label='Setosa'),
    'FastTreesTweedieRegressor': FastTreesTweedieRegressor(label='Ozone'),
Example #4
            "This is great",
            "I hate it",
            "Love it",
            "Really like it",
            "I hate it",
            "I like it a lot",
            "I love it",
            "I do like it",
            "I really hate it",
            "I love it"]))

# Separate the label column from the feature columns.
label_column = 'like'
y = train_reviews[label_column]
X = train_reviews.loc[:, train_reviews.columns != label_column]

# Character-trigram featurization: tokenize each review into characters,
# count trigrams (all_lengths=False => trigrams only), then drop the raw
# and intermediate text columns.
steps = [
    CharTokenizer(columns={'review_transform': 'review'}),
    NGramExtractor(ngram_length=3, all_lengths=False, columns={'ngrams': 'review_transform'}),
    ColumnDropper(columns=['review_transform', 'review']),
]
pipeline = Pipeline(steps)
X = pipeline.fit_transform(X)

print(X.head())
#    ngrams.<␂>|T|h  ngrams.T|h|i  ngrams.h|i|s  ngrams.i|s|<␠>  ...  ngrams.i|t|!  ngrams.t|!|<␃>  ngrams.<␂>|H|a  ngrams.H|a|t
# 0             1.0           1.0           1.0             2.0  ...           0.0             0.0             0.0           0.0
# 1             0.0           0.0           0.0             0.0  ...           0.0             0.0             0.0           0.0
# 2             0.0           0.0           0.0             0.0  ...           0.0             0.0             0.0           0.0
# 3             0.0           0.0           0.0             0.0  ...           0.0             0.0             0.0           0.0
# 4             0.0           0.0           0.0             0.0  ...           0.0             0.0             0.0           0.0

# Train a binary classifier on the trigram counts.
model = LogisticRegressionBinaryClassifier().fit(X, y)
Example #5
###############################################################################
# CharTokenizer: split each free-form customer review into a sequence of
# individual characters. (No sentiment model is involved here — this is a
# plain tokenization transform.)
import pandas
from nimbusml import Pipeline, Role
from nimbusml.preprocessing.schema import ColumnConcatenator
from nimbusml.preprocessing.text import CharTokenizer

# Create a toy dataset of free-form customer reviews.
customer_reviews = pandas.DataFrame(data=dict(review=[
    "I really did not like the taste of it",
    "It was surprisingly quite good!",
    "I will never ever ever go to that place again!!",
    "The best ever!! It was amazingly good and super fast",
    "I wish I had gone earlier, it was that great",
    "somewhat dissapointing. I'd probably wont try again",
    "Never visit again... rascals!"]))

# Tokenize the 'review' column into characters, and concatenate the column
# into the Feature role for downstream steps.
tokenize = CharTokenizer() << ['review']
concat = ColumnConcatenator() << {Role.Feature: [['review']]}

pipeline = Pipeline([concat, tokenize])

# NOTE(review): running the full pipeline reportedly raised
# System.NullReferenceException, so the tokenizer is fit and applied on its
# own below — confirm whether the pipeline path still fails.
tokenize.fit(customer_reviews)
y = tokenize.transform(customer_reviews)

# Print the character tokens produced for each review.
print(y)