def test_ensemble_supports_cv_with_user_defined_transforms(self):
    path = get_dataset("airquality").as_filepath()
    schema = DataSchema.read_schema(path)
    data = FileDataStream(path, schema)

    ind_args = {'Ozone_ind': 'Ozone', 'Solar_R_ind': 'Solar_R'}
    handler_args = {'Solar_R': 'Solar_R', 'Ozone': 'Ozone'}
    lgbm_args = {
        'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
        'label': 'Wind',
        'normalize': 'Yes'
    }
    ols_args = {
        'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
        'label': 'Wind',
        'normalize': 'Yes'
    }
    ogd_args = {
        'feature': ['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
        'label': 'Wind',
        'shuffle': False,
        'normalize': 'Yes'
    }

    for split_start in ['before_transforms', 'after_transforms']:
        pipeline_steps = [
            Indicator() << ind_args,
            Handler(replace_with='Mean') << handler_args,
            LightGbmRegressor(**lgbm_args)
        ]
        cv_results = CV(pipeline_steps).fit(data, split_start=split_start)
        l2_avg_lgbm = cv_results['metrics_summary'].loc['Average', 'L2(avg)']

        r1 = OrdinaryLeastSquaresRegressor(**ols_args)
        r2 = OnlineGradientDescentRegressor(**ogd_args)
        r3 = LightGbmRegressor(**lgbm_args)

        data = FileDataStream(path, schema)
        pipeline_steps = [
            Indicator() << ind_args,
            Handler(replace_with='Mean') << handler_args,
            VotingRegressor(estimators=[r1, r2, r3], combiner='Average')
        ]
        cv_results = CV(pipeline_steps).fit(data, split_start=split_start)
        l2_avg_ensemble = cv_results['metrics_summary'].loc['Average', 'L2(avg)']

        self.assertTrue(l2_avg_ensemble < l2_avg_lgbm)
def test_datetime_column_parsed_from_string(self):
    dates = ["2018-01-02", "2018-02-01"]
    df = pd.DataFrame({'c1': dates, 'c2': [3, 4]})

    file_name = get_temp_file('.csv')
    df.to_csv(file_name)
    df = pd.read_csv(file_name, parse_dates=['c1'], index_col=0)
    self.assertEqual(df.dtypes[0], np.dtype('datetime64[ns]'))

    pipeline = Pipeline(steps=[Handler(columns={'c2': 'c2'})])
    result = pipeline.fit_transform(df)

    self.assertEqual(result.loc[0, 'c1'].year, 2018)
    self.assertEqual(result.loc[0, 'c1'].month, 1)
    self.assertEqual(result.loc[0, 'c1'].day, 2)
    self.assertEqual(result.loc[0, 'c1'].hour, 0)
    self.assertEqual(result.loc[0, 'c1'].minute, 0)
    self.assertEqual(result.loc[0, 'c1'].second, 0)

    self.assertEqual(result.loc[1, 'c1'].year, 2018)
    self.assertEqual(result.loc[1, 'c1'].month, 2)
    self.assertEqual(result.loc[1, 'c1'].day, 1)
    self.assertEqual(result.loc[1, 'c1'].hour, 0)
    self.assertEqual(result.loc[1, 'c1'].minute, 0)
    self.assertEqual(result.loc[1, 'c1'].second, 0)

    self.assertEqual(len(result), 2)
    self.assertEqual(result.dtypes[0], np.dtype('datetime64[ns]'))

    os.remove(file_name)
def test_input_types(self):
    df = DataFrame(data=dict(
        Label=[1, 2, 3, 4, 5],
        f=[1.1, 2.2, 3.3, np.nan, 5.5],
        f1=[2.2, np.nan, 4.4, 5.5, 6.6]))

    h = Handler(replace_with='Mean')
    ft = FastLinearRegressor(shuffle=False, number_of_threads=1)
    p = Pipeline([h, ft])
    p.fit(df[['f', 'f1']].values, df['Label'])

    res = p.predict(df[['f', 'f1']].values)
    print(res)
    print(p.summary())
    assert_allclose(
        res['Score'].values,
        [4.965541, 0.519701, 4.992831, 3.877400, 5.020121],
        rtol=1e-4)
def test_input_conversion_to_float_retains_other_column_types(self):
    data = {
        'f0': [0, 1, 2, 3],
        'f1': ['2', '3', '4', '5'],
        'f2': [4, 5, np.nan, 9]
    }
    data = DataFrame(data).astype({
        'f0': np.int32,
        'f1': str,
        'f2': np.float64
    })

    # Check Indicator
    xf = Indicator(columns={'f2.ind': 'f2'})
    result = xf.fit_transform(data)
    assert_equal(result.dtypes['f0'], np.int32)
    assert_equal(result.dtypes['f1'], np.object)
    assert_equal(result.dtypes['f2'], np.float64)
    assert_equal(result.dtypes['f2.ind'], np.bool)
    assert_equal(result.loc[2, 'f2.ind'], True)
    assert_equal(len(result), 4)

    # Check Filter
    xf = Filter(columns=['f2'])
    result = xf.fit_transform(data)
    assert_equal(len(result), 3)
    assert_equal(result.loc[2, 'f2'], 9.0)
    assert_equal(result.dtypes['f0'], np.int32)
    assert_equal(result.dtypes['f1'], np.object)
    assert_equal(result.dtypes['f2'], np.float32)

    xf = Filter(columns=['f1'])
    result = xf.fit_transform(data)
    assert_equal(len(result), 4)
    assert_equal(result.loc[3, 'f2'], 9.0)
    assert_equal(result.dtypes['f0'], np.int32)
    assert_equal(result.dtypes['f1'], np.float32)
    assert_equal(result.dtypes['f2'], np.float64)

    # Check Handler
    xf = Handler(columns=['f2'], replace_with='Mean')
    result = xf.fit_transform(data)
    assert_equal(len(result), 4)
    assert_equal(result.loc[2, 'f2.f2'], 6.0)
    assert_equal(result.dtypes['f0'], np.int32)
    assert_equal(result.dtypes['f1'], np.object)
    assert_equal(result.dtypes['f2.f2'], np.float32)
def test_dprep_datastream(self):
    import azureml.dataprep as dprep

    dates = ["2018-01-02 00:00:00", "2018-02-01 10:00:00"]
    col2 = ['0', '1']
    label_array = np.repeat([0], 2)
    train_df = pd.DataFrame({
        'col1': dates,
        'col2': col2,
        'label': label_array
    })

    pipeline = Pipeline(steps=[
        Handler(columns={'2': 'col2'},
                concat=False,
                impute_by_slot=True,
                replace_with='Mean')
    ])

    file_name = get_temp_file('.csv')
    train_df.to_csv(file_name)
    dataflow = dprep.read_csv(file_name, infer_column_types=True)
    dprepDataStream = DprepDataStream(dataflow)

    result = pipeline.fit_transform(dprepDataStream)

    self.assertEqual(result.loc[:, 'col1'].dtype, np.dtype('datetime64[ns]'))

    self.assertEqual(result.loc[0, 'col1'].year, 2018)
    self.assertEqual(result.loc[0, 'col1'].month, 1)
    self.assertEqual(result.loc[0, 'col1'].day, 2)
    self.assertEqual(result.loc[0, 'col1'].hour, 0)
    self.assertEqual(result.loc[0, 'col1'].minute, 0)
    self.assertEqual(result.loc[0, 'col1'].second, 0)

    self.assertEqual(result.loc[1, 'col1'].year, 2018)
    self.assertEqual(result.loc[1, 'col1'].month, 2)
    self.assertEqual(result.loc[1, 'col1'].day, 1)
    self.assertEqual(result.loc[1, 'col1'].hour, 10)
    self.assertEqual(result.loc[1, 'col1'].minute, 0)
    self.assertEqual(result.loc[1, 'col1'].second, 0)

    os.remove(file_name)
def test_negative_values(self):
    milliseconds_in_year = 365 * 24 * 60 * 60 * 1000
    data = [i * milliseconds_in_year for i in [-1, -2, -3, -3.3]]
    df = pd.DataFrame({'c1': data, 'c2': [3, 4, 5, 6]})
    df = df.astype({'c1': np.dtype('datetime64[ms]')})

    pipeline = Pipeline(steps=[Handler(columns={'c2': 'c2'})])
    result = pipeline.fit_transform(df)

    self.assertTrue(result.loc[:, 'c1'].equals(df.loc[:, 'c1']))
    self.assertEqual(result.loc[:, 'c1'].dtype, np.dtype('datetime64[ns]'))

    self.assertEqual(result.loc[0, 'c1'].year, 1969)
    self.assertEqual(result.loc[0, 'c1'].hour, 0)
    self.assertEqual(result.loc[0, 'c1'].minute, 0)
    self.assertEqual(result.loc[0, 'c1'].second, 0)

    self.assertEqual(result.loc[3, 'c1'].year, 1966)
def test_input_conversion_to_float(self):
    data = {
        'f0': [0, 1, 2, 3],
        'f1': [1, 2, 3, 4],
        'f2': [1, 2, 3, 4],
        'f3': [1, 2, 3, 4],
        'f4': ['2', '3', '4', '5'],
        'f5': [4, 5, np.nan, 9]
    }
    data = DataFrame(data).astype({
        'f0': np.int8,
        'f1': np.int16,
        'f2': np.int32,
        'f3': np.int64,
        'f4': str,
        'f5': np.float64
    })

    # Check Indicator
    xf = Indicator()
    result = xf.fit_transform(data)
    assert_equal(result.loc[2, 'f5'], True)
    result.loc[2, 'f5'] = False
    result = ~result
    for val in result.all().tolist():
        self.assertTrue(val)

    # Check Filter
    xf = Filter()
    result = xf.fit_transform(data)
    assert_equal(len(result), 3)
    assert_equal(result.loc[2, 'f5'], 9.0)

    # Check Handler
    xf = Handler(replace_with='Mean')
    result = xf.fit_transform(data)
    assert_equal(len(result), 4)
    assert_equal(result.loc[2, 'f5.f5'], 6.0)
    assert_equal(result.loc[2, 'f5.IsMissing.f5'], 1.0)
def test_timestamp_boundaries(self):
    # Here are the current min and max for a Pandas Timestamp:
    # 1677-09-21 00:12:43.145225
    # 2262-04-11 23:47:16.854775807
    data = [pd.Timestamp(1677, 9, 22, 1), pd.Timestamp.max]
    df = pd.DataFrame({'c1': data, 'c2': [3, 4]})
    df = df.astype({'c1': np.dtype('datetime64[ms]')})

    pipeline = Pipeline(steps=[Handler(columns={'c2': 'c2'})])
    result = pipeline.fit_transform(df)

    self.assertTrue(result.loc[:, 'c1'].equals(df.loc[:, 'c1']))
    self.assertEqual(result.dtypes[0], np.dtype('datetime64[ns]'))

    self.assertEqual(result.loc[0, 'c1'].year, 1677)
    self.assertEqual(result.loc[0, 'c1'].month, 9)
    self.assertEqual(result.loc[0, 'c1'].day, 22)

    self.assertEqual(result.loc[1, 'c1'].year, 2262)
    self.assertEqual(result.loc[1, 'c1'].month, 4)
    self.assertEqual(result.loc[1, 'c1'].day, 11)
def test_split_start_with_transforms_with_presteps(self):
    path = get_dataset("airquality").as_filepath()
    schema = DataSchema.read_schema(path)
    data = FileDataStream(path, schema)

    pipeline_steps = [
        Indicator() << {
            'Ozone_ind': 'Ozone',
            'Solar_R_ind': 'Solar_R'
        },
        Handler(replace_with='Mean') << {
            'Solar_R': 'Solar_R',
            'Ozone': 'Ozone'
        },
        LightGbmRegressor(
            feature=['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
            label='Wind')
    ]

    results = CV(pipeline_steps).fit(data,
                                     split_start='after_transforms',
                                     dry_run=True)
    results = json.loads(results)

    node_names = [ep['Name'] for ep in results['nodes']]
    cv_node = [
        ep for ep in results['nodes']
        if 'Models.CrossValidator' in ep['Name']
    ][0]
    cv_sub_node_names = [ep['Name'] for ep in cv_node['Inputs']['Nodes']]

    self.assertTrue('Transforms.MissingValueHandler' in node_names)
    self.assertTrue(
        'Transforms.MissingValueHandler' not in cv_sub_node_names)
    self.assertTrue('Transforms.ModelCombiner' in node_names)
def test_performance_syntax(self):
    train_file = get_dataset('uciadult_train').as_filepath()
    test_file = get_dataset('uciadult_test').as_filepath()
    file_schema = 'sep=, col=label:R4:0 col=Features:R4:9-14 ' \
                  'col=workclass:TX:1 col=education:TX:2 ' \
                  'col=marital-status:TX:3 col=occupation:TX:4 ' \
                  'col=relationship:TX:5 col=ethnicity:TX:6 ' \
                  'col=sex:TX:7 col=native-country-region:TX:8 header+'
    categorical_columns = [
        'workclass', 'education', 'marital-status', 'occupation',
        'relationship', 'ethnicity', 'sex', 'native-country-region'
    ]
    label_column = 'label'
    na_columns = ['Features']
    feature_columns_idv = na_columns + categorical_columns

    # First pipeline: columns specified via constructor arguments
    exp = Pipeline([
        OneHotHashVectorizer(columns=categorical_columns),
        Handler(columns=na_columns),
        FastLinearBinaryClassifier(feature=feature_columns_idv,
                                   label=label_column)
    ])

    train_data = FileDataStream(train_file, schema=file_schema)
    exp.fit(train_data, label_column, verbose=0)
    print("train time %s" % exp._run_time)

    test_data = FileDataStream(test_file, schema=file_schema)
    out_data = exp.predict(test_data)
    print("predict time %s" % exp._run_time)

    (test, label_test) = get_X_y(test_file, label_column, sep=',')
    (acc1, auc1) = evaluate_binary_classifier(
        label_test.iloc[:, 0].values,
        out_data.loc[:, 'PredictedLabel'].values,
        out_data.loc[:, 'Probability'].values)
    print('ACC %s, AUC %s' % (acc1, auc1))

    # Second pipeline: columns specified via the << operator
    exp = Pipeline([
        OneHotHashVectorizer() << categorical_columns,
        Handler() << na_columns,
        FastLinearBinaryClassifier() << feature_columns_idv
    ])

    train_data = FileDataStream(train_file, schema=file_schema)
    exp.fit(train_data, label_column, verbose=0)
    print("train time %s" % exp._run_time)

    test_data = FileDataStream(test_file, schema=file_schema)
    out_data = exp.predict(test_data)
    print("predict time %s" % exp._run_time)

    (test, label_test) = get_X_y(test_file, label_column, sep=',')
    (acc2, auc2) = evaluate_binary_classifier(
        label_test.iloc[:, 0].values,
        out_data.loc[:, 'PredictedLabel'].values,
        out_data.loc[:, 'Probability'].values)
    print('ACC %s, AUC %s' % (acc2, auc2))

    # Both syntaxes should produce comparable results
    assert abs(acc1 - acc2) < 0.02
    assert abs(auc1 - auc2) < 0.02
'FastTreesTweedieRegressor': FastTreesTweedieRegressor(label='Ozone'),
'Filter': Filter(columns=['Petal_Length', 'Petal_Width']),
'FromKey': Pipeline([
    ToKey(columns=['Sepal_Length']),
    FromKey(columns=['Sepal_Length'])
]),
# GlobalContrastRowScaler currently requires a vector input to work
'GlobalContrastRowScaler': Pipeline([
    ColumnConcatenator() << {
        'concated_columns': [
            'Petal_Length', 'Sepal_Width', 'Sepal_Length']},
    GlobalContrastRowScaler(columns={'normed_columns': 'concated_columns'})
]),
'Handler': Handler(replace_with='Mean',
                   columns={'NewVals': 'Petal_Length'}),
'IidSpikeDetector': IidSpikeDetector(columns=['Sepal_Length']),
'IidChangePointDetector': IidChangePointDetector(columns=['Sepal_Length']),
'Indicator': Indicator(columns={'Has_Nan': 'Petal_Length'}),
'KMeansPlusPlus': KMeansPlusPlus(n_clusters=3,
                                 feature=['Sepal_Width', 'Sepal_Length']),
'LightGbmRanker': LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
                                 label='rank',
                                 group_id='group'),
'Loader': Loader(columns={'ImgPath': 'Path'}),
'LpScaler': Pipeline([
    ColumnConcatenator() << {
        'concated_columns': [
            'Petal_Length', 'Sepal_Width', 'Sepal_Length']},
    LpScaler(columns={'normed_columns': 'concated_columns'})
cm = cv_results['confusion_matrix']
print(cm[cm.Fold == 1])

# Case 2: Using CV with the split_start option
path = get_dataset("airquality").as_filepath()
schema = DataSchema.read_schema(path)
data = FileDataStream(path, schema)

# CV also accepts the list of pipeline steps directly as input
pipeline_steps = [
    Indicator() << {
        'Ozone_ind': 'Ozone',
        'Solar_R_ind': 'Solar_R'
    },
    Handler(replace_with='Mean') << {
        'Solar_R': 'Solar_R',
        'Ozone': 'Ozone'
    },
    FastLinearRegressor(
        feature=['Ozone', 'Solar_R', 'Ozone_ind', 'Solar_R_ind', 'Temp'],
        label='Wind')
]

# Since the Indicator and Handler transforms don't learn from data,
# they can be run once before splitting the data into folds instead of
# being repeated once per fold. The 'split_start=after_transforms' option
# achieves this optimization.
cv_results = CV(pipeline_steps).fit(data, split_start='after_transforms')

# Results can be accessed the same way as in Case 1 above.
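# A minimal sketch of that access pattern (not part of the original example,
# but consistent with the 'metrics_summary' key used in the other snippets
# in this document): the averaged L2 metric across folds can be read from
# the summary table.
print(cv_results['metrics_summary'].loc['Average', 'L2(avg)'])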
import numpy as np
import pandas as pd
from nimbusml import FileDataStream
from nimbusml.preprocessing.missing_values import Handler

with_nans = pd.DataFrame(
    data=dict(
        Sepal_Length=[2.5, np.nan, 2.1, 1.0],
        Sepal_Width=[.75, .9, .8, .76],
        Petal_Length=[np.nan, 2.5, 2.6, 2.4],
        Petal_Width=[.8, .7, .9, 0.7],
        Species=["setosa", "viginica", "", 'versicolor']))

# write NaNs to file to show how this transform works
tmpfile = 'tmpfile_with_nans.csv'
with_nans.to_csv(tmpfile, index=False)

data = FileDataStream.read_csv(tmpfile, sep=',', numeric_dtype=np.float32)

# transform usage
xf = Handler(columns={'PL': 'Petal_Length'})

# fit and transform
features = xf.fit_transform(data)

# print features
print(features.head())
#    PL.IsMissing.Petal_Length  PL.Petal_Length  Petal_Length  Petal_Width ...
# 0                        1.0              0.0           NaN          0.8 ...
# 1                        0.0              2.5           2.5          0.7 ...
# 2                        0.0              2.6           2.6          0.9 ...
# 3                        0.0              2.4           2.4          0.7 ...
import numpy as np
import pandas as pd
from nimbusml import FileDataStream
from nimbusml.preprocessing.missing_values import Handler

with_nans = pd.DataFrame(
    data=dict(
        Sepal_Length=[2.5, np.nan, 2.1, 1.0],
        Sepal_Width=[.75, .9, .8, .76],
        Petal_Length=[np.nan, 2.5, 2.6, 2.4],
        Petal_Width=[.8, .7, .9, 0.7],
        Species=["setosa", "viginica", "", 'versicolor']))

# Write NaNs to file to see how the transform works
tmpfile = 'tmpfile_with_nans.csv'
with_nans.to_csv(tmpfile, index=False)

# schema for reading directly from text files
schema = 'sep=, col=Petal_Length:R4:0 col=Petal_Width:R4:1 ' \
         'col=Sepal_Length:R4:2 col=Sepal_Width:R4:3 col=Species:TX:4 header+'
data = FileDataStream.read_csv(tmpfile, collapse=True)
print(data.schema)

# Handler creates 2 new columns:
# - 'NewVals.Sepal_Length' containing the imputed values
# - 'NewVals.IsMissing.Sepal_Length', a flag indicating whether the value was imputed
# replace_with is one of ['Mean', 'Max', 'Min', 'Def']
nahandle = Handler(replace_with='Mean') << {'NewVals': 'Sepal_Length'}
print(with_nans)
data = FileDataStream(tmpfile, schema)
print('NAHandle\n', nahandle.fit_transform(data))
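# For illustration only (a minimal sketch, not part of the original example):
# the imputation statistic is controlled by replace_with, so the same column
# can instead be filled with, for example, its minimum value.
nahandle_min = Handler(replace_with='Min') << {'NewVals': 'Sepal_Length'}
print('NAHandle (Min)\n', nahandle_min.fit_transform(data))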