Ejemplo n.º 1
0
    def test_types_convertable_to_r4_get_output_as_r4(self):
        """Check the CSR output dtype for ubyte/short/float32 input columns.

        Columns 'c1'..'c3' are cast to ubyte, short and float32; 'c4' is not
        cast and is removed by the ColumnDropper under test. The sparse
        result, once densified, is expected to be entirely float32.
        """
        train_data = {
            'c1': [1, 0, 0, 4],
            'c2': [2, 3, 0, 5],
            'c3': [3, 4, 5, 6],
            'c4': [4, 5, 6, 7]
        }
        train_df = pd.DataFrame(train_data).astype({
            'c1': np.ubyte,
            'c2': np.short,
            'c3': np.float32
        })

        xf = ColumnDropper(columns=['c4'])
        xf.fit(train_df)
        result = xf.transform(train_df, as_csr=True)

        # assertIsInstance reports the offending type when it fails, whereas
        # assertTrue(type(...) == ...) only reports "False is not true".
        self.assertIsInstance(result, csr_matrix)
        # c1, c2 and c3 hold 2 + 3 + 4 non-zero values respectively.
        self.assertEqual(result.nnz, 9)
        result = pd.DataFrame(result.todense())

        # The densified frame is compared against integer column labels.
        expected_data = {0: [1, 0, 0, 4], 1: [2, 3, 0, 5], 2: [3, 4, 5, 6]}
        expected_result = pd.DataFrame(expected_data).astype(np.float32)

        self.assertTrue(result.equals(expected_result))

        for column in (0, 1, 2):
            self.assertEqual(result.dtypes[column], np.float32)
Ejemplo n.º 2
0
    def test_ensemble_supports_get_fit_info(self):
        """Check the last node reported by get_fit_info for a VotingRegressor.

        The pipeline scales 'yy' into 'new_y', one-hot encodes the two
        categorical columns, drops 'yy' and ends with a VotingRegressor that
        averages three regressors sharing the same column roles.
        """
        df = pd.DataFrame(dict(education=['A', 'B', 'A', 'B', 'A'],
                               workclass=['X', 'X', 'Y', 'Y', 'Y'],
                               yy=[1.1, 2.2, 1.24, 3.4, 3.4]))

        # Column roles applied to every regressor in the ensemble.
        col_info = {'Feature': ['workclass', 'education'], Role.Label: 'new_y'}

        r1 = OrdinaryLeastSquaresRegressor(normalize="Yes") << col_info
        r2 = OnlineGradientDescentRegressor(normalize="Yes") << col_info
        r3 = LightGbmRegressor(normalize="Yes") << col_info

        pipeline = Pipeline([
            MeanVarianceScaler() << {'new_y': 'yy'},
            OneHotVectorizer() << ['workclass', 'education'],
            ColumnDropper() << 'yy',
            VotingRegressor(estimators=[r1, r2, r3], combiner='Average')
        ])

        info = pipeline.get_fit_info(df)

        # Last entry of the first element returned by get_fit_info; the
        # assertions below expect it to describe the VotingRegressor.
        last_info_node = info[0][-1]
        # Note the expected feature list is 'education,workclass', i.e. not in
        # the order used in col_info.
        self.assertEqual(last_info_node['inputs'],
                         ['Feature:education,workclass', 'Label:new_y'])
        self.assertEqual(last_info_node['name'], 'VotingRegressor')
        # assertIsInstance names the actual type on failure.
        self.assertIsInstance(last_info_node['operator'], VotingRegressor)
        self.assertEqual(last_info_node['outputs'], ['Score'])
        self.assertEqual(last_info_node['schema_after'], ['Score'])
        self.assertEqual(last_info_node['type'], 'regressor')
Ejemplo n.º 3
0
 def test_split_start(self):
     """Run check_cv with split_start='try_all' on a three-step pipeline."""
     # Two encodings of 'education', followed by removal of the raw column.
     one_hot = OneHotVectorizer(columns={'edu': 'education'})
     one_hot_hash = OneHotHashVectorizer(columns={'edu_hash': 'education'})
     drop_raw = ColumnDropper(columns='education')

     feature_columns = ['Features', 'edu', 'edu_hash']
     pipeline = self.pipeline(
         transforms=[one_hot, one_hot_hash, drop_raw],
         learner_arguments={'feature': feature_columns})

     check_cv(pipeline, self.data('Label'), split_start='try_all')
Ejemplo n.º 4
0
    def test_fit_transform_produces_expected_result(self):
        """Check fit_transform(as_csr=True) against the expected dense frame.

        A float32 frame with columns 'c1'..'c3' is passed through a
        ColumnDropper that removes 'c3'; the sparse result must match the
        remaining two columns.
        """
        train_data = {
            'c1': [1, 0, 0, 4],
            'c2': [2, 3, 0, 5],
            'c3': [3, 4, 5, 6]
        }
        train_df = pd.DataFrame(train_data).astype(np.float32)

        xf = ColumnDropper(columns=['c3'])
        result = xf.fit_transform(train_df, as_csr=True)

        # Verify the type before touching csr_matrix-specific attributes, so
        # a wrong return type fails with a clear assertion instead of an
        # AttributeError on `.nnz`.
        self.assertIsInstance(result, csr_matrix)
        # c1 and c2 hold 2 + 3 non-zero values respectively.
        self.assertEqual(result.nnz, 5)
        result = pd.DataFrame(result.todense())

        # The densified frame is compared against integer column labels.
        expected_data = {0: [1, 0, 0, 4], 1: [2, 3, 0, 5]}
        expected_result = pd.DataFrame(expected_data).astype(np.float32)

        self.assertTrue(result.equals(expected_result))
Ejemplo n.º 5
0
    def test_types_convertable_to_r8_get_output_as_r8(self):
        """Check the CSR output dtype when float64 and int64 columns are mixed in.

        Columns 'c1'..'c5' are cast to ubyte, short, float32, float64 and
        int64; 'c6' is not cast and is removed by the ColumnDropper under
        test. The sparse result, once densified, is expected to be entirely
        float64 and to preserve the large int64 value exactly.
        """
        # Below 2**53, so it is exactly representable as a float64.
        large_int64 = 372036854775807
        train_data = {
            'c1': [1, 0, 0, 4],
            'c2': [2, 3, 0, 5],
            'c3': [3, 0, 5, 0],
            'c4': [0, 5, 6, 7],
            'c5': [0, 5, 0, large_int64],
            'c6': [5, 6, 7, 8]
        }
        train_df = pd.DataFrame(train_data).astype({
            'c1': np.ubyte,
            'c2': np.short,
            'c3': np.float32,
            'c4': np.float64,
            'c5': np.int64
        })

        xf = ColumnDropper(columns=['c6'])
        xf.fit(train_df)
        result = xf.transform(train_df, as_csr=True)

        # assertIsInstance reports the offending type when it fails.
        self.assertIsInstance(result, csr_matrix)
        # c1..c5 hold 2 + 3 + 2 + 3 + 2 non-zero values respectively.
        self.assertEqual(result.nnz, 12)
        result = pd.DataFrame(result.todense())

        # The densified frame is compared against integer column labels.
        expected_data = {
            0: [1, 0, 0, 4],
            1: [2, 3, 0, 5],
            2: [3, 0, 5, 0],
            3: [0, 5, 6, 7],
            4: [0, 5, 0, large_int64]
        }
        expected_result = pd.DataFrame(expected_data).astype(np.float64)

        self.assertTrue(result.equals(expected_result))

        # Every surviving column is checked, including column 4 (the int64
        # input), which the per-column checks previously skipped.
        for column in (0, 1, 2, 3, 4):
            self.assertEqual(result.dtypes[column], np.float64)

        # The large value must survive the conversion unchanged.
        self.assertEqual(result.loc[3, 4], large_int64)
Ejemplo n.º 6
0
    def test_ngramfeaturizer(self):
        """Tokenize 'review' into characters and check the 3-gram columns."""
        reviews = pandas.DataFrame(data=dict(review=['one', 'two']))

        tokenizer = CharTokenizer(columns={'review_transform': 'review'})
        extractor = NGramExtractor(ngram_length=3,
                                   all_lengths=False,
                                   columns={'ngrams': 'review_transform'})
        dropper = ColumnDropper(columns=['review_transform', 'review'])
        pipeline = Pipeline([tokenizer, extractor, dropper])

        result = pipeline.fit_transform(reviews)
        self.assertEqual(len(result.columns), 6)

        # (column, expected value in row 0, expected value in row 1)
        expectations = (
            ('ngrams.o|n|e', 1.0, 0.0),
            ('ngrams.t|w|o', 0.0, 1.0),
        )
        for column, first_row, second_row in expectations:
            self.assertEqual(result.loc[0, column], first_row)
            self.assertEqual(result.loc[1, column], second_row)
Ejemplo n.º 7
0
    def test_fit_transform_with_idv(self):
        """Check ColumnDropper on data obtained with as_binary_data_stream=True.

        The 'infert' data is first featurized and returned as a binary data
        stream; a second pipeline then drops 'case' and 'row_num' from that
        stream and the resulting schema is inspected.
        """
        path = get_dataset('infert').as_filepath()
        data = FileDataStream.read_csv(path)

        featurization_pipeline = Pipeline(
            [OneHotVectorizer(columns={'education': 'education'})])
        featurization_pipeline.fit(data)
        featurized_data = featurization_pipeline.transform(
            data, as_binary_data_stream=True)

        schema = featurized_data.schema
        num_columns = len(schema)
        # Precondition: both columns exist before dropping. assertIn names
        # the missing member and the container when it fails.
        self.assertIn('case', schema)
        self.assertIn('row_num', schema)

        pipeline = Pipeline([ColumnDropper() << ['case', 'row_num']])
        pipeline.fit(featurized_data)
        result = pipeline.transform(featurized_data,
                                    as_binary_data_stream=True)

        schema = result.schema
        # Exactly the two dropped columns are gone.
        self.assertEqual(len(schema), num_columns - 2)
        self.assertNotIn('case', schema)
        self.assertNotIn('row_num', schema)
Ejemplo n.º 8
0
###############################################################################
# ColumnDropper
import numpy
from nimbusml import FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing.schema import ColumnDropper

# Read the 'infert' dataset's CSV file as a FileDataStream.
path = get_dataset('infert').as_filepath()
read_options = dict(sep=',', numeric_dtype=numpy.float32)
data = FileDataStream.read_csv(path, **read_options)

# Transform that removes the listed columns.
dropped_columns = ['education', 'age']
xf = ColumnDropper(columns=dropped_columns)

# Fit the transform and apply it in a single call.
features = xf.fit_transform(data)

# Show the first rows of what is left.
print(features.head())
#   case  induced  parity  pooled.stratum  row_num  spontaneous  stratum
# 0   1.0      1.0     6.0             3.0      1.0          2.0      1.0
# 1   1.0      1.0     1.0             1.0      2.0          0.0      2.0
# 2   1.0      2.0     6.0             4.0      3.0          0.0      3.0
# 3   1.0      2.0     4.0             2.0      4.0          0.0      4.0
# 4   1.0      1.0     3.0            32.0      5.0          1.0      5.0
Ejemplo n.º 9
0
# Pipeline is used below but was not imported by this example, which made the
# script fail with a NameError.
from nimbusml import Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.linear_model import LogisticRegressionClassifier
from nimbusml.preprocessing.schema import ColumnDropper
from nimbusml.preprocessing.schema import PrefixColumnConcatenator
from sklearn.model_selection import train_test_split

# use 'iris' data set to create test and train data
#    Sepal_Length  Sepal_Width  Petal_Length  Petal_Width Label Species  Setosa
# 0           5.1          3.5           1.4          0.2     0  setosa     1.0
# 1           4.9          3.0           1.4          0.2     0  setosa     1.0
df = get_dataset("iris").as_df()

# Features are every column except 'Label'; 'Label' is the target.
# NOTE(review): no random_state is passed, so the split (and therefore the
# printed metrics) can differ between runs.
X_train, X_test, y_train, y_test = \
    train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])

# Two prefix-based concatenations, one for 'Sepal_' and one for 'Petal_'.
concat = PrefixColumnConcatenator() << {'Sepal': 'Sepal_'}
concat1 = PrefixColumnConcatenator() << {'Petal': 'Petal_'}
# Drop the four measurement columns plus 'Setosa' and 'Species' before the
# learner runs.
dropcols = ColumnDropper() << [
    'Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Setosa',
    'Species'
]

pipeline = Pipeline(
    [concat, concat1, dropcols,
     LogisticRegressionClassifier()])
pipeline.fit(X_train, y_train)

# Evaluate the model
metrics, scores = pipeline.test(X_test, y_test, output_scores=True)
print(metrics)
# 'iris' data set used to create the train and test splits:
#    Sepal_Length  Sepal_Width  Petal_Length  Petal_Width Label Species  Setosa
# 0           5.1          3.5           1.4          0.2     0  setosa     1.0
# 1           4.9          3.0           1.4          0.2     0  setosa     1.0
np.random.seed(0)
df = get_dataset("iris").as_df()

# Features are every column except 'Label'; 'Label' is the target.
X_train, X_test, y_train, y_test = \
    train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])

mycols = [
    'Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Setosa'
]

# Experiment 1: drop the 'Species' column using the ColumnDropper transform,
# then select mycols for training using the ColumnConcatenator transform.
dropcols = ColumnDropper() << 'Species'
concat = ColumnConcatenator() << {Role.Feature: mycols}

pipeline = Pipeline([dropcols, concat, LogisticRegressionClassifier()])
pipeline.fit(X_train, y_train)
scores1 = pipeline.predict(X_test)

# Experiment 2: select mycols using the ColumnSelector transform.
# The second pipeline has to be fit and queried itself; re-fitting and
# predicting with the first pipeline would compare experiment 1 against
# itself and make the check below meaningless.
select = ColumnSelector() << mycols
pipeline2 = Pipeline([select, LogisticRegressionClassifier()])
pipeline2.fit(X_train, y_train)
scores2 = pipeline2.predict(X_test)

# Verify that we get identical results in both Experiments
print(scores1.head())
print(scores2.head())
Ejemplo n.º 11
0
# Read the tab-separated file at `path` and preview it.
data = FileDataStream.read_csv(path, sep='\t')
print(data.head())
#   Sentiment                                      SentimentText
# 0          1  ==RUDE== Dude, you are rude upload that carl p...
# 1          1  == OK! ==  IM GOING TO VANDALIZE WILD ONES WIK...
# 2          1  Stop trolling, zapatancas, calling me a liar m...
# 3          1  ==You're cool==  You seem like a really cool g...
# 4          1  ::::: Why are you threatening me? I'm not bein...

# Pipeline steps: character tokens -> 1-grams -> drop everything that is not
# an n-gram column.
tokenizer = CharTokenizer(
    columns={'SentimentText_Transform': 'SentimentText'})
unigrams = NGramExtractor(
    ngram_length=1,
    all_lengths=False,
    columns={'Ngrams': 'SentimentText_Transform'})
dropper = ColumnDropper(
    columns=['SentimentText_Transform', 'SentimentText', 'Sentiment'])
pipe = Pipeline([tokenizer, unigrams, dropper])

# Fit the pipeline and transform the data in one call.
features = pipe.fit_transform(data)

print(features.head())
#    Ngrams.<␂>  Ngrams.=  Ngrams.R  Ngrams.U  Ngrams.D  Ngrams.E  ...
# 0         1.0       4.0       1.0       1.0       2.0       1.0  ...
# 1         1.0       4.0       0.0       0.0       2.0       3.0  ...
# 2         1.0       0.0       0.0       0.0       0.0       0.0  ...
# 3         1.0       4.0       0.0       0.0       0.0       0.0  ...
# 4         1.0       0.0       0.0       0.0       0.0       0.0  ...
Ejemplo n.º 12
0
            "Love it",
            "Really like it",
            "I hate it",
            "I like it a lot",
            "I love it",
            "I do like it",
            "I really hate it",
            "I love it"]))

# Split the training frame into the 'like' label and the remaining columns.
y = train_reviews['like']
is_feature_column = train_reviews.columns != 'like'
X = train_reviews.loc[:, is_feature_column]

# Featurization: character tokens -> 3-grams -> drop the intermediate and raw
# text columns.
tokenizer = CharTokenizer(columns={'review_transform': 'review'})
trigrams = NGramExtractor(
    ngram_length=3,
    all_lengths=False,
    columns={'ngrams': 'review_transform'})
dropper = ColumnDropper(columns=['review_transform', 'review'])
pipeline = Pipeline([tokenizer, trigrams, dropper])
X = pipeline.fit_transform(X)

print(X.head())
#    ngrams.<␂>|T|h  ngrams.T|h|i  ngrams.h|i|s  ngrams.i|s|<␠>  ...  ngrams.i|t|!  ngrams.t|!|<␃>  ngrams.<␂>|H|a  ngrams.H|a|t
# 0             1.0           1.0           1.0             2.0  ...           0.0             0.0             0.0           0.0
# 1             0.0           0.0           0.0             0.0  ...           0.0             0.0             0.0           0.0
# 2             0.0           0.0           0.0             0.0  ...           0.0             0.0             0.0           0.0
# 3             0.0           0.0           0.0             0.0  ...           0.0             0.0             0.0           0.0
# 4             0.0           0.0           0.0             0.0  ...           0.0             0.0             0.0           0.0

# Train the classifier on the featurized training data.
classifier = LogisticRegressionBinaryClassifier()
model = classifier.fit(X, y)

# Featurize the test reviews with the already-fitted pipeline and predict.
X_test = pipeline.transform(test_reviews)
result = model.predict(X_test)