Ejemplo n.º 1
0
    def test_numeric_columns(self):
        """Smoke-test OneHotHashVectorizer on the 'infert' dataset.

        The file is read with ``numeric_dtype=np.float32`` and a vectorizer
        with ``number_of_bits=2`` is fitted twice over the same three
        columns: once with a dict ``columns`` spec and once with a plain
        list of column names.  Nothing is asserted about the output; the
        test passes when neither ``fit_transform`` call raises.
        """
        infert_csv = get_dataset('infert').as_filepath()
        stream = FileDataStream.read_csv(
            infert_csv, sep=',', numeric_dtype=np.float32)

        # Dict spec first, then the list spec -- same order as two
        # back-to-back fits on the same data stream.
        dict_spec = {
            'edu': 'education',
            'in': 'induced',
            'sp': 'spontaneous',
        }
        list_spec = ['education', 'induced', 'spontaneous']

        for column_spec in (dict_spec, list_spec):
            hasher = OneHotHashVectorizer(
                columns=column_spec, number_of_bits=2)
            hasher.fit_transform(stream)
Ejemplo n.º 2
0
                  True, False, True, False, True, False, True
              ]))

# Reviews used only at prediction time: a single 'review' text column.
test_reviews = pandas.DataFrame({
    'review': [
        "This is great",
        "I hate it",
        "Love it",
        "Really like it",
        "I hate it",
        "I like it a lot",
        "I love it",
        "I do like it",
        "I really hate it",
        "I love it",
    ],
})

# OneHotHashVectorizer transform: the entire string is treated as a category.
# If the output column name is the same as the input column name, the original
# input column values are replaced. number_of_bits=6 hashes each category into
# one of 2^6 = 64 slots (indices 0 to 2^6 - 1).

# Split the training frame: 'like' is the label, every other column is a
# feature.
y = train_reviews['like']
X = train_reviews.loc[:, train_reviews.columns != 'like']

# Hash-encode the 'review' column; fit the transform on the training
# features and replace X with the encoded result.
cat = OneHotHashVectorizer(number_of_bits=6) << 'review'
X = cat.fit_transform(X)

# view the transformed numerical values and column names
print(X)

mymodel = LogisticRegressionBinaryClassifier().fit(X, y)

# Encode the held-out reviews once with the already-fitted transform and
# reuse that result for scoring, instead of transforming the same frame a
# second time inside predict().
X_test = cat.transform(test_reviews)

scores = mymodel.predict(X_test)

# view the scores
print(scores)
Ejemplo n.º 3
0
###############################################################################
# OneHotHashVectorizer
from nimbusml import FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotHashVectorizer

# Load the 'infert' sample dataset as a FileDataStream.
path = get_dataset('infert').as_filepath()
# 'spontaneous' is read as a string column: numeric input raised an error
# with OneHotHashVectorizer (ohhv).
data = FileDataStream.read_csv(path, sep=',', dtype={'spontaneous': str})
print(data.head())
#    age  case education  induced  parity ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6 ...       1            2  ...
# 1   42     1    0-5yrs        1       1 ...       2            0  ...
# 2   39     1    0-5yrs        2       6 ...       3            0  ...
# 3   34     1    0-5yrs        2       4 ...       4            0  ...
# 4   35     1   6-11yrs        1       3 ...       5            1  ...

# Dict form of `columns`: the hashed 'education' and 'spontaneous' values
# show up in the output under the 'edu.<n>' and 'sp.<n>' column names.
xf = OneHotHashVectorizer(
    columns={
        'edu': 'education',
        'sp': 'spontaneous',
    })

# Fit the transform on the stream and apply it in a single call.
features = xf.fit_transform(data)

print(features.head())
#    age  case  edu.0   edu.1003   ...    sp.995    ...   spontaneous  stratum
# 0    26     1    0.0        0.0   ...       0.0   ...           2.0      1.0
# 1    42     1    0.0        0.0   ...       0.0   ...           0.0      2.0
# 2    39     1    0.0        0.0   ...       0.0   ...           0.0      3.0