def test_check_series(self):
    """Scaling column 'a' must give identical results whether the input
    is the whole DataFrame or only that column passed as a Series."""
    frame = pd.DataFrame(
        data=dict(a=[1.2, 2, 3], b=[2, 3, 4], c=[1, 2, 3], d=[2, 2, 2]))
    scaler = MinMaxScaler() << "a"
    from_frame = scaler.fit_transform(frame)
    from_series = scaler.fit_transform(frame['a'])
    assert_array_equal(from_frame['a'].values, from_series['a'].values)
def test_minmaxscaler_float_order_noint(self):
    """Scaling two float columns must preserve the frame's shape and the
    original (ordered) column order."""
    source = pandas.DataFrame(
        data=OrderedDict(xpetal=[-1.1, -2.2, -3.3], ipetal=[1.0, 2.0, 3.0]))
    pipeline = Pipeline([MinMaxScaler() << ['xpetal', 'ipetal']])
    result = pipeline.fit_transform(source, verbose=0)
    assert_equal(result.shape, (3, 2))
    assert_equal(list(result.columns), list(source.columns))
def test_minmaxscaler_int(self):
    """MinMaxScaler must accept integer input columns; for this data the
    largest-magnitude entry of each column is expected to map to +/-1."""
    in_df = pandas.DataFrame(
        data=dict(xpetal=[-1, -2, -3], ipetal=[1, 2, 3]))
    normed = MinMaxScaler() << ['xpetal', 'ipetal']
    pipeline = Pipeline([normed])
    out_df = pipeline.fit_transform(in_df, verbose=0)
    assert_equal(out_df.shape, (3, 2))
    # Use assert_equal, consistent with the surrounding checks, instead of
    # a hand-rolled `if ... raise Exception`; the failure message still
    # includes the transformed frame for debugging.
    assert_equal(out_df.loc[2, 'xpetal'], -1, "Unexpected:\n" + str(out_df))
    assert_equal(out_df.loc[2, 'ipetal'], 1)
def test_fit_transform(self):
    """The same pipeline fed a DataPrep stream and a FileDataStream built
    from the same file must produce identical columns and values."""
    import azureml.dataprep as dprep
    path = get_dataset('infert').as_filepath()

    # Two views over the same CSV: one via azureml.dataprep, one direct.
    dprep_data = DprepDataStream(dprep.auto_read_file(path=path))
    file_data = FileDataStream.read_csv(path)

    pipe = Pipeline(
        [MinMaxScaler(columns={'in': 'induced', 'sp': 'spontaneous'})])
    from_file = pipe.fit_transform(file_data)
    from_dprep = pipe.fit_transform(dprep_data)

    assert_array_equal(from_file.columns, from_dprep.columns)
    assert_2d_array_equal(from_file.values, from_dprep.values)
def transform_data():
    """Fit-transform the module-level ``data`` twice with a MinMaxScaler
    pipeline — once as a binary data stream, once as a DataFrame — and
    return both results in that order."""
    pipe = Pipeline(
        [MinMaxScaler(columns={'in': 'induced', 'sp': 'spontaneous'})])
    as_binary = pipe.fit_transform(data, as_binary_data_stream=True)
    as_df = pipe.fit_transform(data)
    return as_binary, as_df
# Generate the train and test data np.random.seed(0) x = np.arange(100, step=0.1) y = x * 10 + (np.random.standard_normal(len(x)) * 10) train_data = {'c1': x, 'c2': y} train_df = pd.DataFrame(train_data).astype({ 'c1': np.float32, 'c2': np.float32 }) test_data = {'c1': [2.5, 30.5], 'c2': [1, 1]} test_df = pd.DataFrame(test_data).astype({'c1': np.float32, 'c2': np.float32}) # Fit a MinMaxScaler Pipeline r1 = Pipeline([MinMaxScaler()]) r1.fit(train_df) # Export the pipeline to ONNX onnx_path = get_tmp_file('.onnx') r1.export_to_onnx(onnx_path, 'com.microsoft.ml', onnx_version='Stable') # Perform the transform using the standard ML.Net backend result_standard = r1.transform(test_df) print(result_standard) # c1 c2 # 0 0.025025 0.000998 # 1 0.305305 0.000998 # Perform the transform using the ONNX backend. # Note, the extra columns and column name differences
# FIX: `numpy` is referenced below (numeric_dtype=numpy.float32) but was
# never imported, so this example raised NameError; import it here.
import numpy
from nimbusml import FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.preprocessing.normalization import MinMaxScaler

# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()

data = FileDataStream.read_csv(
    path, sep=',',
    numeric_dtype=numpy.float32)  # Error with integer input
print(data.head())
#    age  case education  induced  parity  pooled.stratum  row_num ...
# 0  26.0   1.0    0-5yrs      1.0     6.0             3.0      1.0 ...
# 1  42.0   1.0    0-5yrs      1.0     1.0             1.0      2.0 ...
# 2  39.0   1.0    0-5yrs      2.0     6.0             4.0      3.0 ...
# 3  34.0   1.0    0-5yrs      2.0     4.0             2.0      4.0 ...
# 4  35.0   1.0   6-11yrs      1.0     3.0            32.0      5.0 ...

# transform usage: scale 'induced' into 'in' and 'spontaneous' into 'sp'
xf = MinMaxScaler(columns={'in': 'induced', 'sp': 'spontaneous'})

# fit and transform
features = xf.fit_transform(data)

# print features
print(features.head())
#    age  case education   in ...  pooled.stratum  row_num   sp  ...
# 0  26.0   1.0    0-5yrs  0.5 ...             3.0      1.0  1.0 ...
# 1  42.0   1.0    0-5yrs  0.5 ...             1.0      2.0  0.0 ...
# 2  39.0   1.0    0-5yrs  1.0 ...             4.0      3.0  0.0 ...
# 3  34.0   1.0    0-5yrs  1.0 ...             2.0      4.0  0.0 ...
# 4  35.0   1.0   6-11yrs  0.5 ...            32.0      5.0  0.5 ...
###############################################################################
# MinMaxScaler
import pandas as pd
from nimbusml.preprocessing.normalization import MinMaxScaler

# A small iris-like frame mixing numeric columns with a string column.
in_df = pd.DataFrame(
    data=dict(
        Sepal_Length=[2.5, 1, 2.1, 1.0],
        Sepal_Width=[.75, .9, .8, .76],
        Petal_Length=[0, 2.5, 2.6, 2.4],
        Species=["setosa", "viginica", "setosa", 'versicolor']))

# generate two new Columns - Petal_Normed and Sepal_Normed
# (the << operator maps each output column name to its input column)
normed = MinMaxScaler() << {
    'Petal_Normed': 'Petal_Length',
    'Sepal_Normed': 'Sepal_Width'
}

out_df = normed.fit_transform(in_df)
print('MinMaxScaler\n', (out_df))