Esempio n. 1
0
    def test_check_estimator_fromkey(self):
        """Round-trip a text column through ToKey and then FromKey.

        Fits ToKey on the 'text' column, checks the dtypes of the result,
        passes that result through FromKey and verifies that the 'text'
        values, the number of distinct category codes and the category
        order are the expected ones.
        """
        words = ["cat", "dog", "fish", "orange",
                 "cat orange", "dog", "fish", "spider"]
        numbers = [1, 2, 3, 4, 5, 6, 7, 8]
        text_df = pandas.DataFrame(data={"text": words, "num": numbers})

        to_key = ToKey() << ['text']
        keyed = to_key.fit_transform(text_df)
        assert keyed is not None
        assert len(keyed) > 0
        # The two columns must come out as one categorical and one int64.
        dtype_names = sorted(str(dtype) for dtype in keyed.dtypes)
        assert str(dtype_names) == "['category', 'int64']"

        from_key = FromKey() << ['text']
        restored = from_key.fit_transform(keyed)
        # Compare the printed values so both columns are judged on content.
        assert str(list(keyed['text'])) == str(list(restored['text']))

        # Six distinct words in the input -> six distinct category codes.
        distinct_codes = numpy.unique(keyed['text'].cat.codes)
        assert len(distinct_codes) == 6
        expected_categories = [
            "cat", "dog", "fish", "orange", "cat orange", "spider"]
        assert list(keyed['text'].cat.categories) == expected_categories
Esempio n. 2
0
    def test_check_estimator_fromkey_categories(self):
        """Apply ToKey to a column that is already of category dtype.

        Despite the test name, only ToKey is exercised here; the check is
        that the transform returns a non-empty result whose 'text' column
        still has the category dtype.
        """
        words = ["cat", "dog", "fish", "orange",
                 "cat orange", "dog", "fish", "spider"]
        # dtype="category" makes the input column categorical up front.
        text_df = pandas.DataFrame(data={"text": words}, dtype="category")

        to_key = ToKey() << ['text']
        keyed = to_key.fit_transform(text_df)
        assert keyed is not None
        assert len(keyed) > 0
        assert keyed['text'].dtype == 'category'
Esempio n. 3
0
###############################################################################
# FromKey

import pandas
from nimbusml.preprocessing import FromKey, ToKey
from pandas import Categorical

# Build a frame with a ready-made categorical column ('key') and a plain
# string column ('text').
key_column = Categorical.from_codes([0, 1, 2, 1, 2, 0],
                                    categories=['a', 'b', 'c'])
categorical_df = pandas.DataFrame(
    data={'key': key_column, 'text': ['b', 'c', 'a', 'b', 'a', 'c']})

# Apply FromKey to the categorical 'key' column and show the result.
fromkey = FromKey(columns='key')
decoded = fromkey.fit_transform(categorical_df)
print(decoded)

# Run ToKey on 'text', then feed that output to a fresh clone of the
# FromKey transform and compare the 'text' column with the input.
# NOTE(review): the clone still targets 'key', not 'text' -- confirm that
# this is the intended round trip.
tokey = ToKey(columns='text')
keyed = tokey.fit_transform(categorical_df)
round_trip = fromkey.clone().fit_transform(keyed)
print(round_trip['text'] == categorical_df['text'])
Esempio n. 4
0
import numpy
from nimbusml import FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing import ToKey

# Locate the 'infert' sample dataset and open it as a FileDataStream.
dataset = get_dataset('infert')
path = dataset.as_filepath()

data = FileDataStream.read_csv(
    path,
    sep=',',
    numeric_dtype=numpy.float32,
    names={0: 'id'})
print(data.head())
#    age  case education   id  induced  parity  pooled.stratum  spontaneous ...
# 0  26.0   1.0    0-5yrs  1.0      1.0     6.0             3.0         2.0 ...
# 1  42.0   1.0    0-5yrs  2.0      1.0     1.0             1.0         0.0 ...
# 2  39.0   1.0    0-5yrs  3.0      2.0     6.0             4.0         0.0 ...
# 3  34.0   1.0    0-5yrs  4.0      2.0     4.0             2.0         0.0  ..
# 4  35.0   1.0   6-11yrs  5.0      1.0     3.0            32.0         1.0  ..

# ToKey configured with a mapping of new column names to source columns:
# 'id_1' is produced from 'id' and 'edu_1' from 'education'.
column_mapping = {'id_1': 'id', 'edu_1': 'education'}
xf = ToKey(columns=column_mapping)

# Fit the transform on the stream and show the first transformed rows.
features = xf.fit_transform(data)
print(features.head())
#    age  case    edu_1 education   id  id_1  induced  parity  ...
# 0  26.0   1.0   0-5yrs    0-5yrs  1.0     0      1.0     6.0 ...
# 1  42.0   1.0   0-5yrs    0-5yrs  2.0     1      1.0     1.0 ...
# 2  39.0   1.0   0-5yrs    0-5yrs  3.0     2      2.0     6.0 ...
# 3  34.0   1.0   0-5yrs    0-5yrs  4.0     3      2.0     4.0 ...
# 4  35.0   1.0  6-11yrs   6-11yrs  5.0     4      1.0     3.0 ...
Esempio n. 5
0
###############################################################################
# ToKey

import pandas
from nimbusml.preprocessing import ToKey

# Build a single-column frame of short text values (with repeats).
words = ["cat", "dog", "fish", "orange",
         "cat orange", "dog", "fish", "spider"]
text_df = pandas.DataFrame(data={"text": words})

# Select the 'text' column for ToKey via the << operator, then fit,
# transform and print the result.
tokey = ToKey() << 'text'
keyed = tokey.fit_transform(text_df)
print(keyed)