def test_holidays(self):
    df = pandas.DataFrame(data=dict(
        tokens1=[1, 2, 3, 157161600],
        tokens2=[10, 11, 12, 13]
    ))

    cols_to_drop = [
        'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter',
        'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear',
        'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel',
        'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff'
    ]

    dts = DateTimeSplitter(prefix='dt', country='Canada') << 'tokens1'
    pipeline = Pipeline([dts, ColumnSelector(drop_columns=cols_to_drop)])
    y = pipeline.fit_transform(df)

    self.assertEqual(y.loc[3, 'dtHolidayName'], 'Christmas Day')
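# Why 'Christmas Day': the token values are read as seconds since the Unix
# epoch, and 157161600 falls exactly on 1974-12-25. A minimal sanity check
# (not part of the test itself, shown only to ground the assertion):
from datetime import datetime, timezone

print(datetime.fromtimestamp(157161600, tz=timezone.utc))
# 1974-12-25 00:00:00+00:00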
import pandas
from nimbusml import Pipeline
from nimbusml.preprocessing import DateTimeSplitter
from nimbusml.preprocessing.schema import ColumnSelector

df = pandas.DataFrame(data=dict(
    tokens1=[1, 2, 3, 157161600],
    tokens2=[10, 11, 12, 13]
))

cols_to_drop = [
    'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter',
    'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear',
    'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel',
    'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff'
]

dts = DateTimeSplitter(prefix='dt', country='Canada') << 'tokens1'
pipeline = Pipeline([dts, ColumnSelector(drop_columns=cols_to_drop)])
y = pipeline.fit_transform(df)

# view all of the output columns
pandas.set_option('display.max_columns', None)
pandas.set_option('display.width', 1000)
print(y)

#      tokens1  tokens2  dtYear  dtMonth  dtDay  dtHour  dtMinute  dtSecond  dtAmPm   dtHolidayName
# 0          1       10    1970        1      1       0         0         1       0  New Year's Day
# 1          2       11    1970        1      1       0         0         2       0  New Year's Day
# 2          3       12    1970        1      1       0         0         3       0  New Year's Day
# 3  157161600       13    1974       12     25       0         0         0       0   Christmas Day
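# The '<<' operator above is nimbusml shorthand for naming the input column.
# A sketch of the equivalent construction, assuming DateTimeSplitter accepts
# the standard columns= keyword that the other transforms in these examples
# use:
dts_equivalent = DateTimeSplitter(prefix='dt', country='Canada',
                                  columns='tokens1')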
    'ToKey',
    'ColumnSelector'
}

INSTANCES = {
    'AveragedPerceptronBinaryClassifier': AveragedPerceptronBinaryClassifier(
        feature=['education_str.0-5yrs', 'education_str.6-11yrs',
                 'education_str.12+ yrs']),
    'Binner': Binner(num_bins=3),
    'CharTokenizer': CharTokenizer(
        columns={'SentimentText_Transform': 'SentimentText'}),
    'ColumnConcatenator': ColumnConcatenator(columns={'Features': [
        'Sepal_Length', 'Sepal_Width', 'Petal_Length',
        'Petal_Width', 'Setosa']}),
    'ColumnSelector': ColumnSelector(columns=['Sepal_Width', 'Sepal_Length']),
    'ColumnDuplicator': ColumnDuplicator(columns={'dup': 'Sepal_Width'}),
    'CountSelector': CountSelector(count=5, columns=['Sepal_Width']),
    'DateTimeSplitter': DateTimeSplitter(prefix='dt'),
    'FastForestBinaryClassifier': FastForestBinaryClassifier(
        feature=['Sepal_Width', 'Sepal_Length'], label='Setosa'),
    'FastLinearBinaryClassifier': FastLinearBinaryClassifier(
        feature=['Sepal_Width', 'Sepal_Length'], label='Setosa'),
    'FastTreesTweedieRegressor': FastTreesTweedieRegressor(label='Ozone'),
    'Filter': Filter(columns=['Petal_Length', 'Petal_Width']),
    'FromKey': Pipeline([
        ToKey(columns=['Sepal_Length']),
        FromKey(columns=['Sepal_Length'])
    ]),
    # GlobalContrastRowScaler currently requires a vector input to work
    'GlobalContrastRowScaler': Pipeline([
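# A hypothetical sketch (not from the original test file) of how an INSTANCES
# mapping like the one above is typically consumed: components with an entry
# get their pre-configured instance, everything else falls back to a default
# constructor. `default_constructors` is assumed here for illustration.
def get_instance(name, default_constructors):
    if name in INSTANCES:
        return INSTANCES[name]
    return default_constructors[name]()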
###############################################################################
# ColumnSelector
from nimbusml import FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing.schema import ColumnSelector

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()
data = FileDataStream.read_csv(path, sep=',')

# transform usage
xf = ColumnSelector(columns=['education', 'age'])

# fit and transform
features = xf.fit_transform(data)

# print features
print(features.head())
#    age education
# 0   26    0-5yrs
# 1   42    0-5yrs
# 2   39    0-5yrs
# 3   34    0-5yrs
# 4   35   6-11yrs
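# ColumnSelector can also drop columns instead of keeping them, as the
# DateTimeSplitter examples above do with drop_columns; a minimal sketch on
# the same data:
xf_drop = ColumnSelector(drop_columns=['education', 'age'])
print(xf_drop.fit_transform(data).head())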
data = FileDataStream(path, schema=file_schema)
print(data.head())
#    Sentiment                                      SentimentText
# 0        1.0  ==RUDE== Dude, you are rude upload that carl p...
# 1        1.0  == OK! == IM GOING TO VANDALIZE WILD ONES WIK...
# 2        1.0  Stop trolling, zapatancas, calling me a liar m...
# 3        1.0  ==You're cool== You seem like a really cool g...
# 4        1.0  ::::: Why are you threatening me? I'm not bein...

# CharTokenizer converts the text into a vector of Key-typed characters.
# Use FromKey to map the keys back to their values before sending them
# to WordEmbedding.
pipe = Pipeline([
    CharTokenizer(columns={'SentimentText_Transform': 'SentimentText'}),
    FromKey(columns={'SentimentText_FromKey': 'SentimentText_Transform'}),
    WordEmbedding(model_kind='GloVe50D',
                  columns={'Feature': 'SentimentText_FromKey'}),
    ColumnSelector(columns=['Sentiment', 'SentimentText', 'Feature'])
])
print(pipe.fit_transform(data).head())
#    Sentiment  ...  Feature.149
# 0        1.0  ...      2.67440
# 1        1.0  ...      0.78858
# 2        1.0  ...      2.67440
# 3        1.0  ...      2.67440
# 4        1.0  ...      2.67440
# [5 rows x 152 columns]
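# Note on the shape: 152 columns = 'Sentiment' + 'SentimentText' + Feature.0
# through Feature.149. WordEmbedding expands the 50-dimensional GloVe vectors
# to 150 features, consistent with per-dimension min/average/max aggregation
# over the tokens.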
#    Sepal_Length  Sepal_Width  Petal_Length  Petal_Width  Label Species  Setosa
# 0           5.1          3.5           1.4          0.2      0  setosa     1.0
# 1           4.9          3.0           1.4          0.2      0  setosa     1.0
np.random.seed(0)
df = get_dataset("iris").as_df()

X_train, X_test, y_train, y_test = \
    train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])

mycols = [
    'Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Setosa'
]

# drop the 'Species' column using the ColumnDropper transform,
# then select mycols for training using the ColumnConcatenator transform
dropcols = ColumnDropper() << 'Species'
concat = ColumnConcatenator() << {Role.Feature: mycols}

pipeline = Pipeline([dropcols, concat, LogisticRegressionClassifier()])
pipeline.fit(X_train, y_train)
scores1 = pipeline.predict(X_test)

# select mycols using the ColumnSelector transform
select = ColumnSelector() << mycols
pipeline2 = Pipeline([select, LogisticRegressionClassifier()])
pipeline2.fit(X_train, y_train)
scores2 = pipeline2.predict(X_test)

# verify that we get identical results in both experiments
print(scores1.head())
print(scores2.head())
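# To make the "identical results" check executable rather than visual, one
# option (an addition here, not part of the original example) is pandas'
# testing helper, which raises if the two prediction frames differ:
from pandas.testing import assert_frame_equal

assert_frame_equal(scores1, scores2)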