def test_holidays(self):
    df = pandas.DataFrame(data=dict(
        tokens1=[1, 2, 3, 157161600],
        tokens2=[10, 11, 12, 13]
    ))

    cols_to_drop = [
        'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter',
        'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear',
        'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel',
        'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff'
    ]

    dts = DateTimeSplitter(prefix='dt', country='Canada') << 'tokens1'
    pipeline = Pipeline([dts, ColumnSelector(drop_columns=cols_to_drop)])
    y = pipeline.fit_transform(df)

    self.assertEqual(y.loc[3, 'dtHolidayName'], 'Christmas Day')
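# Why 'Christmas Day': the token values are read as seconds since the Unix
# epoch, and 157161600 falls exactly on 1974-12-25. A minimal sanity check
# (not part of the test itself, shown only to ground the assertion):
from datetime import datetime, timezone

print(datetime.fromtimestamp(157161600, tz=timezone.utc))
# 1974-12-25 00:00:00+00:00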
import pandas
from nimbusml import Pipeline
from nimbusml.preprocessing import DateTimeSplitter
from nimbusml.preprocessing.schema import ColumnSelector

df = pandas.DataFrame(data=dict(
    tokens1=[1, 2, 3, 157161600],
    tokens2=[10, 11, 12, 13]
))

cols_to_drop = [
    'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter',
    'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear',
    'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel',
    'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff'
]

dts = DateTimeSplitter(prefix='dt', country='Canada') << 'tokens1'
pipeline = Pipeline([dts, ColumnSelector(drop_columns=cols_to_drop)])
y = pipeline.fit_transform(df)

# view all of the output columns
pandas.set_option('display.max_columns', None)
pandas.set_option('display.width', 1000)
print(y)

#      tokens1  tokens2  dtYear  dtMonth  dtDay  dtHour  dtMinute  dtSecond  dtAmPm   dtHolidayName
# 0          1       10    1970        1      1       0         0         1       0  New Year's Day
# 1          2       11    1970        1      1       0         0         2       0  New Year's Day
# 2          3       12    1970        1      1       0         0         3       0  New Year's Day
# 3  157161600       13    1974       12     25       0         0         0       0   Christmas Day
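# The '<<' operator above is nimbusml shorthand for naming the input column.
# A sketch of the equivalent construction, assuming DateTimeSplitter accepts
# the standard columns= keyword that the other transforms in these examples
# use:
dts_equivalent = DateTimeSplitter(prefix='dt', country='Canada',
                                  columns='tokens1')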
    'ToKey',
    'ColumnSelector'
}

INSTANCES = {
    'AveragedPerceptronBinaryClassifier': AveragedPerceptronBinaryClassifier(
        feature=['education_str.0-5yrs', 'education_str.6-11yrs',
                 'education_str.12+ yrs']),
    'Binner': Binner(num_bins=3),
    'CharTokenizer': CharTokenizer(
        columns={'SentimentText_Transform': 'SentimentText'}),
    'ColumnConcatenator': ColumnConcatenator(columns={'Features': [
        'Sepal_Length', 'Sepal_Width', 'Petal_Length',
        'Petal_Width', 'Setosa']}),
    'ColumnSelector': ColumnSelector(columns=['Sepal_Width', 'Sepal_Length']),
    'ColumnDuplicator': ColumnDuplicator(columns={'dup': 'Sepal_Width'}),
    'CountSelector': CountSelector(count=5, columns=['Sepal_Width']),
    'DateTimeSplitter': DateTimeSplitter(prefix='dt'),
    'FastForestBinaryClassifier': FastForestBinaryClassifier(
        feature=['Sepal_Width', 'Sepal_Length'], label='Setosa'),
    'FastLinearBinaryClassifier': FastLinearBinaryClassifier(
        feature=['Sepal_Width', 'Sepal_Length'], label='Setosa'),
    'FastTreesTweedieRegressor': FastTreesTweedieRegressor(label='Ozone'),
    'Filter': Filter(columns=['Petal_Length', 'Petal_Width']),
    'FromKey': Pipeline([
        ToKey(columns=['Sepal_Length']),
        FromKey(columns=['Sepal_Length'])
    ]),
    # GlobalContrastRowScaler currently requires a vector input to work
    'GlobalContrastRowScaler': Pipeline([
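# A hypothetical sketch (not from the original test file) of how an INSTANCES
# mapping like the one above is typically consumed: components with an entry
# get their pre-configured instance, everything else falls back to a default
# constructor. `default_constructors` is assumed here for illustration.
def get_instance(name, default_constructors):
    if name in INSTANCES:
        return INSTANCES[name]
    return default_constructors[name]()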
###############################################################################
# ColumnSelector
from nimbusml import FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing.schema import ColumnSelector

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path, sep=',')

# transform usage
xf = ColumnSelector(columns=['education', 'age'])

# fit and transform
features = xf.fit_transform(data)

# print features
print(features.head())
#    age education
# 0   26    0-5yrs
# 1   42    0-5yrs
# 2   39    0-5yrs
# 3   34    0-5yrs
# 4   35   6-11yrs
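# ColumnSelector can also drop columns instead of keeping them, as the
# DateTimeSplitter examples above do with drop_columns; a minimal sketch on
# the same data:
xf_drop = ColumnSelector(drop_columns=['education', 'age'])
print(xf_drop.fit_transform(data).head())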
data = FileDataStream(path, schema=file_schema)
print(data.head())
#    Sentiment                                      SentimentText
# 0        1.0  ==RUDE== Dude, you are rude upload that carl p...
# 1        1.0  == OK! == IM GOING TO VANDALIZE WILD ONES WIK...
# 2        1.0  Stop trolling, zapatancas, calling me a liar m...
# 3        1.0  ==You're cool== You seem like a really cool g...
# 4        1.0  ::::: Why are you threatening me? I'm not bein...

# CharTokenizer converts the text into a vector of Key-typed characters.
# Use FromKey to map the keys back to their values before sending them
# to WordEmbedding.
pipe = Pipeline([
    CharTokenizer(columns={'SentimentText_Transform': 'SentimentText'}),
    FromKey(columns={'SentimentText_FromKey': 'SentimentText_Transform'}),
    WordEmbedding(model_kind='GloVe50D',
                  columns={'Feature': 'SentimentText_FromKey'}),
    ColumnSelector(columns=['Sentiment', 'SentimentText', 'Feature'])
])
print(pipe.fit_transform(data).head())
#    Sentiment  ...  Feature.149
# 0        1.0  ...      2.67440
# 1        1.0  ...      0.78858
# 2        1.0  ...      2.67440
# 3        1.0  ...      2.67440
# 4        1.0  ...      2.67440
# [5 rows x 152 columns]
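# Note on the shape: 152 columns = 'Sentiment' + 'SentimentText' + Feature.0
# through Feature.149. WordEmbedding expands the 50-dimensional GloVe vectors
# to 150 features, consistent with per-dimension min/average/max aggregation
# over the tokens.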
#    Sepal_Length  Sepal_Width  Petal_Length  Petal_Width  Label Species  Setosa
# 0           5.1          3.5           1.4          0.2      0  setosa     1.0
# 1           4.9          3.0           1.4          0.2      0  setosa     1.0
np.random.seed(0)
df = get_dataset("iris").as_df()

X_train, X_test, y_train, y_test = \
    train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])

mycols = [
    'Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Setosa'
]

# drop the 'Species' column using the ColumnDropper transform,
# then select mycols for training using the ColumnConcatenator transform
dropcols = ColumnDropper() << 'Species'
concat = ColumnConcatenator() << {Role.Feature: mycols}

pipeline = Pipeline([dropcols, concat, LogisticRegressionClassifier()])
pipeline.fit(X_train, y_train)
scores1 = pipeline.predict(X_test)

# select mycols using the ColumnSelector transform
select = ColumnSelector() << mycols
pipeline2 = Pipeline([select, LogisticRegressionClassifier()])
pipeline2.fit(X_train, y_train)
scores2 = pipeline2.predict(X_test)

# verify that we get identical results in both experiments
print(scores1.head())
print(scores2.head())
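# To make the "identical results" check executable rather than visual, one
# option (an addition here, not part of the original example) is pandas'
# testing helper, which raises if the two prediction frames differ:
from pandas.testing import assert_frame_equal

assert_frame_equal(scores1, scores2)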