Example 1
    # Excerpt from a nimbusml test class; it relies on `import numpy`,
    # `import pandas`, and `from nimbusml.preprocessing import FromKey, ToKey`
    # at module level.
    def test_check_estimator_fromkey(self):
        text_df = pandas.DataFrame(
            data=dict(
                text=["cat", "dog", "fish", "orange",
                      "cat orange", "dog", "fish", "spider"],
                num=[1, 2, 3, 4, 5, 6, 7, 8]))

        tokey = ToKey() << ['text']
        data_idv = tokey.fit_transform(text_df)
        assert data_idv is not None
        assert len(data_idv) > 0
        assert str(sorted([str(dt) for dt in data_idv.dtypes])) == \
            "['category', 'int64']"
        fromkey = FromKey() << ['text']
        data = fromkey.fit_transform(data_idv)
        assert str(list(data_idv['text'])) == str(list(data['text']))
        t = numpy.unique(data_idv['text'].cat.codes)
        assert len(t) == 6
        assert list(data_idv['text'].cat.categories) == [
            "cat", "dog", "fish", "orange", "cat orange", "spider"]
Example 2
    # From the same test module; assert_raise_message is imported from
    # sklearn.utils.testing in nimbusml's test suite.
    def test_example_key_to_text_typeerror_u4(self):
        text_df = pandas.DataFrame(data=dict(text=[1, 2]), dtype=numpy.uint32)
        fromkey = FromKey() << 'text'
        # System.ArgumentOutOfRangeException: Source column 'text' has invalid
        # type ('U4'): Expected Key type of known cardinality.
        assert_raise_message(
            RuntimeError,
            "",
            lambda: fromkey.fit_transform(text_df))
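
A minimal sketch of the supported path (an assumption, not part of the original tests): ToKey accepts the raw uint32 column and builds the key type of known cardinality that FromKey expects, so chaining the two round-trips the values without raising.

import numpy
import pandas
from nimbusml.preprocessing import FromKey, ToKey

df = pandas.DataFrame(data=dict(text=[1, 2]), dtype=numpy.uint32)
keyed = (ToKey() << 'text').fit_transform(df)          # uint32 -> key type
restored = (FromKey() << 'text').fit_transform(keyed)  # key -> original values
print(restored)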
Example 3
###############################################################################
# FromKey

import pandas
from nimbusml.preprocessing import FromKey, ToKey
from pandas import Categorical

# Create the data
categorical_df = pandas.DataFrame(data=dict(
    key=Categorical.from_codes([0, 1, 2, 1, 2, 0], categories=['a', 'b', 'c']),
    text=['b', 'c', 'a', 'b', 'a', 'c']))

fromkey = FromKey(columns='key')
y = fromkey.fit_transform(categorical_df)
print(y)

tokey = ToKey(columns='text')
y = tokey.fit_transform(categorical_df)
y2 = fromkey.clone().fit_transform(y)
print(y2['text'] == categorical_df['text'])
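
The last print shows True for every row: FromKey inverts the key mapping that ToKey learned, so the 'text' column round-trips unchanged.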
Example 4
###############################################################################
# FromKey
from nimbusml import FileDataStream, Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing import FromKey, ToKey

# data input (as a FileDataStream)
path = get_dataset('topics').as_filepath()

# load data
data = FileDataStream.read_csv(path, sep=',')

# transform usage
pipeline = Pipeline([
    ToKey(columns=['review_reverse']),
    FromKey(columns=['review_reverse'])
])

# fit and transform
output = pipeline.fit_transform(data)
print(output.head())
#   label                              review                   review_reverse
# 0      1  animals birds cats dogs fish horse   radiation galaxy universe duck
# 1      0    horse birds house fish duck cats  space galaxy universe radiation
# 2      1         car truck driver bus pickup                       bus pickup
# 3      0   car truck driver bus pickup horse                        car truck
# 4      1     car truck  car truck driver bus                     pickup horse
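
To inspect the intermediate key column, a hedged variant (assumed, not part of the original example) reuses `data` from above and runs ToKey alone; the keys come back as a pandas category column, which FromKey then maps back to the original strings.

keyed = Pipeline([ToKey(columns=['review_reverse'])]).fit_transform(data)
print(keyed['review_reverse'].dtype)          # category: the learned key type
print(keyed['review_reverse'].cat.codes.head())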
Example 5
 # ... (fragment of an earlier dict entry, truncated in the source)
 'ColumnSelector': ColumnSelector(columns=['Sepal_Width', 'Sepal_Length']),
 'ColumnDuplicator': ColumnDuplicator(columns={'dup': 'Sepal_Width'}),
 'CountSelector': CountSelector(count=5, columns=['Sepal_Width']),
 'DateTimeSplitter': DateTimeSplitter(prefix='dt'),
 'FastForestBinaryClassifier': FastForestBinaryClassifier(feature=['Sepal_Width', 'Sepal_Length'],
                                                          label='Setosa'),
 'FastLinearBinaryClassifier': FastLinearBinaryClassifier(feature=['Sepal_Width', 'Sepal_Length'],
                                                          label='Setosa'),
 'FastTreesTweedieRegressor': FastTreesTweedieRegressor(label='Ozone'),
 'Filter': Filter(columns=['Petal_Length', 'Petal_Width']),
 'FromKey': Pipeline([
     ToKey(columns=['Sepal_Length']),
     FromKey(columns=['Sepal_Length'])
 ]),
 # GlobalContrastRowScaler currently requires a vector input to work
 'GlobalContrastRowScaler': Pipeline([
     ColumnConcatenator() << {
         'concated_columns': [
             'Petal_Length',
             'Sepal_Width',
             'Sepal_Length']},
     GlobalContrastRowScaler(columns={'normed_columns': 'concated_columns'})
 ]),
 'Handler': Handler(replace_with='Mean', columns={'NewVals': 'Petal_Length'}),
 'IidSpikeDetector': IidSpikeDetector(columns=['Sepal_Length']),
 'IidChangePointDetector': IidChangePointDetector(columns=['Sepal_Length']),
 'Indicator': Indicator(columns={'Has_Nan': 'Petal_Length'}),
 'KMeansPlusPlus': KMeansPlusPlus(n_clusters=3, feature=['Sepal_Width', 'Sepal_Length']),
 # ... (remaining entries truncated in the source)
Example 6
# Imports as in nimbusml's published example; `path` and `file_schema` are
# assumed to be defined earlier (the wiki detox sample shown below).
from nimbusml import FileDataStream, Pipeline
from nimbusml.feature_extraction.text import WordEmbedding
from nimbusml.preprocessing import FromKey
from nimbusml.preprocessing.schema import ColumnSelector
from nimbusml.preprocessing.text import CharTokenizer

data = FileDataStream(path, schema=file_schema)
print(data.head())

#    Sentiment                                      SentimentText
# 0        1.0  ==RUDE== Dude, you are rude upload that carl p...
# 1        1.0  == OK! ==  IM GOING TO VANDALIZE WILD ONES WIK...
# 2        1.0  Stop trolling, zapatancas, calling me a liar m...
# 3        1.0  ==You're cool==  You seem like a really cool g...
# 4        1.0  ::::: Why are you threatening me? I'm not bein...

# CharTokenizer turns each string into a vector of key-typed characters.
# Use FromKey to map the keys back to characters before feeding WordEmbedding.

pipe = Pipeline([
        CharTokenizer(columns={'SentimentText_Transform': 'SentimentText'}),
        FromKey(columns={'SentimentText_FromKey': 'SentimentText_Transform'}),
        WordEmbedding(model_kind='GloVe50D', columns={'Feature': 'SentimentText_FromKey'}),
        ColumnSelector(columns=['Sentiment', 'SentimentText', 'Feature'])
        ])

print(pipe.fit_transform(data).head())

#    Sentiment  ... Feature.149
# 0        1.0  ...     2.67440
# 1        1.0  ...     0.78858
# 2        1.0  ...     2.67440
# 3        1.0  ...     2.67440
# 4        1.0  ...     2.67440

# [5 rows x 152 columns]
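
A closing note on the width (the pooling detail is assumed from ML.NET's WordEmbedding documentation, not stated here): GloVe50D has 50 dimensions, and the transform concatenates min, average, and max pooling over the tokens, yielding 150 Feature.* columns; with Sentiment and SentimentText that accounts for the 152 columns shown.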