def test_filter(self):
    """Filter on every column of a CSV-backed stream keeps only complete rows.

    Writes a small frame containing missing values to a temporary CSV
    (missing cells rendered as ``?``), reads it back through a
    ``FileDataStream`` with an explicit schema, and checks that filtering
    on all four columns leaves two rows and four columns.
    """
    with_nans = pd.DataFrame(
        data=dict(Sepal_Length=[2.5, np.nan, 2.1, 1.0],
                  Sepal_Width=[.75, .9, .8, .76],
                  Petal_Length=[np.nan, 2.5, 2.6, 2.4],
                  Petal_Width=[.8, .7, .9, 0.7]))

    tmpfile = 'tmpfile_with_nans.csv'
    with_nans.to_csv(tmpfile, index=False, na_rep='?')

    # Everything that needs the file lives inside the try block so the
    # temporary CSV is removed even when an assertion fails.
    try:
        file_schema = 'sep=, col=Petal_Length:R4:0 col=Petal_Width:R4:1 ' \
                      'col=Sepal_Length:R4:2 col=Sepal_Width:R4:3 header+'
        data = FileDataStream(tmpfile, schema=file_schema)

        xf = Filter(columns=[
            'Petal_Length',
            'Petal_Width',
            'Sepal_Length',
            'Sepal_Width',
        ])
        features = xf.fit_transform(data)

        assert features.shape == (2, 4)
        print(features.columns)
        # columns ordering changed between 0.22 and 0.23, so only the set
        # of column names is compared, not their order
        assert set(features.columns) == {
            'Petal_Length',
            'Petal_Width',
            'Sepal_Length',
            'Sepal_Width',
        }
    finally:
        os.remove(tmpfile)
def test_inf(self):
    """Infinite values must not be treated as missing by Filter.

    Filters on ``f0`` only and asserts that the ``+inf`` entry of ``f0``
    (row 0) and the ``-inf`` entry of ``f1`` (row 2) are both still
    present in the output.
    """
    # np.inf is used instead of the np.Infinity alias, which was removed
    # in NumPy 2.0.
    data = DataFrame(data=dict(f0=[np.inf, 1, 2, 3, 4, 5, 6],
                               f1=[1, 2, -np.inf, 3, 4, 5, 6]))

    xf = Filter(columns=['f0'])
    filtered = xf.fit_transform(data)

    assert_equal(filtered['f0'][0], np.inf)
    assert_equal(filtered['f1'][2], -np.inf)
def test_missing(self):
    """Filtering on one column keeps exactly that column's non-missing rows.

    For each column in turn, the number of rows returned by Filter is
    compared with the number of entries in that column that are either
    strings or non-NaN numbers.
    """
    frame = DataFrame(data=dict(
        f0=[np.nan, 1, 2, 3, 4, 5, 6],
        f1=[1, 2, np.nan, 3, 4, 5, 6],
        f2=[np.nan, 1, np.nan, 2, 3, np.nan, 4],
    ))

    for name in frame.columns:
        kept = Filter(columns=[name]).fit_transform(frame)
        # Strings are checked first so isnan() is only applied to numbers.
        expected_rows = sum(
            1 for value in frame[name]
            if isinstance(value, str) or not isnan(value)
        )
        assert_equal(kept.shape[0], expected_rows)
def test_check_estimator_filter(self):
    """Smoke test: Filter built with ``<<`` returns a non-empty result."""
    columns = dict(
        Sepal_Length=[2.5, np.nan, 2.1, 1.0],
        Sepal_Width=[.75, .9, .8, .76],
        Petal_Length=[np.nan, 2.5, 2.6, 2.4],
        Petal_Width=[.8, .7, .9, 0.7],
        Species=["setosa", "virginica", "", 'versicolor'],
    )
    train_frame = pd.DataFrame(data=columns)

    # Columns are attached with the ``<<`` operator rather than ``columns=``.
    na_filter = Filter() << ["Sepal_Length", "Petal_Length"]
    transformed = na_filter.fit_transform(train_frame)

    assert transformed is not None
    assert len(transformed) > 0
def test_filter_no_renaming(self):
    """Filter must reject dictionary (renaming) column specifications.

    Two mappings are tried, one mapping a column onto itself and one
    mapping it onto a new name. When a ``TypeError`` is raised, its
    message must mention that dictionaries are not allowed.

    NOTE(review): as in the original, the test still passes when no
    ``TypeError`` is raised at all. If raising is mandatory, switch to
    an explicit "must raise" assertion once the library behaviour is
    confirmed.
    """
    # Local import: this block needs os for cleanup and cannot rely on a
    # module-level import being present.
    import os

    with_nans = pd.DataFrame(
        data=dict(Sepal_Length=[2.5, np.nan, 2.1, 1.0],
                  Sepal_Width=[.75, .9, .8, .76],
                  Petal_Length=[np.nan, 2.5, 2.6, 2.4],
                  Petal_Width=[.8, .7, .9, 0.7],
                  Species=["setosa", "viginica", "", 'versicolor']))

    tmpfile = 'tmpfile_with_nans.csv'
    with_nans.to_csv(tmpfile, index=False)

    # The temporary CSV used to be left behind; remove it whatever the
    # outcome of the checks below.
    try:
        file_schema = 'sep=, col=Petal_Length:R4:0 col=Petal_Width:R4:1 ' \
                      'col=Sepal_Length:R4:2 col=Sepal_Width:R4:3 ' \
                      'col=Species:TX:4 header+'
        data = FileDataStream(tmpfile, schema=file_schema)

        renamings = (
            {'Petal_Length': 'Petal_Length'},
            {'Petal_Length2': 'Petal_Length'},
        )
        for mapping in renamings:
            try:
                xf = Filter(columns=mapping)
                xf.fit(data)
            except TypeError as e:
                assert 'Dictionaries are not allowed to specify input ' \
                       'columns.' in str(e)
    finally:
        os.remove(tmpfile)
def test_input_conversion_to_float_retains_other_column_types(self):
    """Missing-value transforms must leave untouched columns' dtypes alone.

    Runs Indicator, Filter and Handler against a frame with an int32, a
    string and a float64 column, and checks both the transformed values
    and the dtype of every column in each result.
    """
    data = {
        'f0': [0, 1, 2, 3],
        'f1': ['2', '3', '4', '5'],
        'f2': [4, 5, np.nan, 9]
    }
    data = DataFrame(data).astype({
        'f0': np.int32,
        'f1': str,
        'f2': np.float64
    })

    # The builtin ``object`` and ``bool`` are used for dtype comparisons:
    # the ``np.object`` / ``np.bool`` aliases were deprecated in NumPy 1.20
    # and removed in 1.24.

    # Check Indicator
    xf = Indicator(columns={'f2.ind': 'f2'})
    result = xf.fit_transform(data)

    assert_equal(result.dtypes['f0'], np.int32)
    assert_equal(result.dtypes['f1'], object)
    assert_equal(result.dtypes['f2'], np.float64)
    assert_equal(result.dtypes['f2.ind'], bool)
    assert_equal(result.loc[2, 'f2.ind'], True)
    assert_equal(len(result), 4)

    # Check Filter
    xf = Filter(columns=['f2'])
    result = xf.fit_transform(data)

    assert_equal(len(result), 3)
    assert_equal(result.loc[2, 'f2'], 9.0)
    assert_equal(result.dtypes['f0'], np.int32)
    assert_equal(result.dtypes['f1'], object)
    assert_equal(result.dtypes['f2'], np.float32)

    # Filtering on the string column: no row is dropped, and the filtered
    # column itself is asserted to come back as float32.
    xf = Filter(columns=['f1'])
    result = xf.fit_transform(data)

    assert_equal(len(result), 4)
    assert_equal(result.loc[3, 'f2'], 9.0)
    assert_equal(result.dtypes['f0'], np.int32)
    assert_equal(result.dtypes['f1'], np.float32)
    assert_equal(result.dtypes['f2'], np.float64)

    # Check Handler
    xf = Handler(columns=['f2'], replace_with='Mean')
    result = xf.fit_transform(data)

    assert_equal(len(result), 4)
    # 6.0 is the mean of the non-missing f2 values (4, 5, 9).
    assert_equal(result.loc[2, 'f2.f2'], 6.0)
    assert_equal(result.dtypes['f0'], np.int32)
    assert_equal(result.dtypes['f1'], object)
    assert_equal(result.dtypes['f2.f2'], np.float32)
def test_get_fit_info_fastl(self):
    """``get_fit_info`` reports the start and Filter steps of a pipeline.

    Only the first two entries of the returned step list are compared,
    after the ``operator`` key has been stripped from every entry.
    """
    train_file = get_dataset("airquality").as_filepath()
    data = FileDataStream(train_file, DataSchema.read_schema(train_file))

    steps = [
        Filter(columns=['Ozone']),
        FastLinearRegressor(feature=['Solar_R', 'Temp'], label='Ozone'),
    ]
    info = Pipeline(steps).get_fit_info(data)

    # Full column list; the expected schema is the same before and after
    # the Filter step.
    all_columns = ['Unnamed0', 'Ozone', 'Solar_R', 'Wind', 'Temp',
                   'Month', 'Day']
    start_step = {
        'name': None,
        'outputs': list(all_columns),
        'schema_after': list(all_columns),
        'type': 'start',
    }
    filter_step = {
        'inputs': ['Ozone'],
        'name': 'Filter',
        'outputs': ['Ozone'],
        'schema_after': list(all_columns),
        'type': 'transform',
    }
    expected = [start_step, filter_step]

    # Drop the 'operator' entries so the comparison only covers the
    # plain-data fields listed above.
    for entry in info[0]:
        entry.pop('operator', None)

    self.assertEqual(expected, info[0][:2])
def test_input_conversion_to_float(self):
    """Missing-value transforms accept int, string and float input columns.

    Indicator, Filter and Handler are each run with default columns on a
    frame mixing int8/int16/int32/int64, string and float64 columns; only
    ``f5`` contains a missing value (row 2).
    """
    raw_values = {
        'f0': [0, 1, 2, 3],
        'f1': [1, 2, 3, 4],
        'f2': [1, 2, 3, 4],
        'f3': [1, 2, 3, 4],
        'f4': ['2', '3', '4', '5'],
        'f5': [4, 5, np.nan, 9],
    }
    column_types = {
        'f0': np.int8,
        'f1': np.int16,
        'f2': np.int32,
        'f3': np.int64,
        'f4': str,
        'f5': np.float64,
    }
    data = DataFrame(raw_values).astype(column_types)

    # Check Indicator
    indicated = Indicator().fit_transform(data)
    assert_equal(indicated.loc[2, 'f5'], True)

    # Clear the single expected flag, then invert: every column must now
    # be all-True, i.e. no other cell was flagged as missing.
    indicated.loc[2, 'f5'] = False
    inverted = ~indicated
    for column_all_true in inverted.all().tolist():
        self.assertTrue(column_all_true)

    # Check Filter
    filtered = Filter().fit_transform(data)
    assert_equal(len(filtered), 3)
    assert_equal(filtered.loc[2, 'f5'], 9.0)

    # Check Handler
    handled = Handler(replace_with='Mean').fit_transform(data)
    assert_equal(len(handled), 4)
    assert_equal(handled.loc[2, 'f5.f5'], 6.0)
    assert_equal(handled.loc[2, 'f5.IsMissing.f5'], 1.0)
'ColumnConcatenator': ColumnConcatenator(columns={'Features': [ 'Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width', 'Setosa']}), 'ColumnSelector': ColumnSelector(columns=['Sepal_Width', 'Sepal_Length']), 'ColumnDuplicator': ColumnDuplicator(columns={'dup': 'Sepal_Width'}), 'CountSelector': CountSelector(count=5, columns=['Sepal_Width']), 'DateTimeSplitter': DateTimeSplitter(prefix='dt'), 'FastForestBinaryClassifier': FastForestBinaryClassifier(feature=['Sepal_Width', 'Sepal_Length'], label='Setosa'), 'FastLinearBinaryClassifier': FastLinearBinaryClassifier(feature=['Sepal_Width', 'Sepal_Length'], label='Setosa'), 'FastTreesTweedieRegressor': FastTreesTweedieRegressor(label='Ozone'), 'Filter': Filter(columns=[ 'Petal_Length', 'Petal_Width']), 'FromKey': Pipeline([ ToKey(columns=['Sepal_Length']), FromKey(columns=['Sepal_Length']) ]), # GlobalContrastRowScaler currently requires a vector input to work 'GlobalContrastRowScaler': Pipeline([ ColumnConcatenator() << { 'concated_columns': [ 'Petal_Length', 'Sepal_Width', 'Sepal_Length']}, GlobalContrastRowScaler(columns={'normed_columns': 'concated_columns'}) ]), 'Handler': Handler(replace_with='Mean', columns={'NewVals': 'Petal_Length'}), 'IidSpikeDetector': IidSpikeDetector(columns=['Sepal_Length']),
###############################################################################
# OrdinaryLeastSquaresRegressor
from nimbusml import Pipeline, FileDataStream, Role
from nimbusml.datasets import get_dataset
from nimbusml.linear_model import OrdinaryLeastSquaresRegressor
from nimbusml.preprocessing.missing_values import Filter

# The built-in 'airquality' data set is used for both training and testing.
#    Unnamed: 0  Ozone  Solar_R  Wind  Temp  Month  Day
# 0           1   41.0    190.0   7.4    67      5    1
# 1           2   36.0    118.0   8.0    72      5    2
train_file = get_dataset("airquality").as_filepath()

# Explicit schema: every column is read as R4 under a short lowercase name.
schema = (
    "col=none:R4:0 col=ozone:R4:1 col=solar:R4:2 col=wind:R4:3 "
    "col=temp:R4:4 col=month:R4:5 col=day:R4:6 sep=, header=+"
)
fds = FileDataStream(train_file, schema=schema)

# Pipeline steps: filter on the label column first, then regress on the
# remaining measurements.
drop_missing_label = Filter() << ['ozone']
regressor = OrdinaryLeastSquaresRegressor() << {
    Role.Label: 'ozone',
    Role.Feature: ['solar', 'wind', 'temp', 'month', 'day'],
}
pipe = Pipeline([drop_missing_label, regressor])

# Train, then evaluate on the same stream.
fitted = pipe.fit(fds)
metrics, scores = fitted.test(fds, "ozone", output_scores=True)
print(metrics)
###############################################################################
# Filter
import numpy as np
import pandas as pd
from nimbusml import FileDataStream
from nimbusml.preprocessing.missing_values import Filter

with_nans = pd.DataFrame(
    data=dict(Sepal_Length=[2.5, np.nan, 2.1, 1.0],
              Sepal_Width=[.75, .9, .8, .76],
              Petal_Length=[np.nan, 2.5, 2.6, 2.4],
              Petal_Width=[.8, .7, .9, 0.7],
              Species=["setosa", "viginica", "", 'versicolor']))

# write NaNs to file to see how transforms work
tmpfile = 'tmpfile_with_nans.csv'
with_nans.to_csv(tmpfile, index=False)

# schema for reading directly from text files
# (the two halves of the string must be separated by a space, otherwise the
# Petal_Width and Sepal_Length column specs run together)
# NOTE: this schema is shown for illustration only; it is not passed to the
# reader below, and the schema actually in use is the one printed from
# ``data.schema``.
schema = 'sep=, col=Petal_Length:R4:0 col=Petal_Width:R4:1 ' \
         'col=Sepal_Length:R4:2 col=Sepal_Width:R4:3 col=Species:TX:4 header+'

data = FileDataStream.read_csv(tmpfile)
print(data.schema)

# filter out rows where Sepal_Length or Petal_Length is NaN
nafilter = Filter() << ['Sepal_Length', 'Petal_Length']

print(with_nans)
print('NAFilter\n', nafilter.fit_transform(data))
###############################################################################
# Filter
import numpy as np
import pandas as pd
from nimbusml import FileDataStream
from nimbusml.preprocessing.missing_values import Filter

# Small frame in which rows 0 and 1 each contain one missing value.
measurements = {
    'Sepal_Length': [2.5, np.nan, 2.1, 1.0],
    'Sepal_Width': [.75, .9, .8, .76],
    'Petal_Length': [np.nan, 2.5, 2.6, 2.4],
    'Petal_Width': [.8, .7, .9, 0.7],
}
with_nans = pd.DataFrame(data=measurements)

# write NaNs to file to show how this transform works
tmpfile = 'tmpfile_with_nans.csv'
with_nans.to_csv(tmpfile, index=False)

data = FileDataStream.read_csv(tmpfile, sep=',', numeric_dtype=np.float32)

# transform usage: filter on every column
filter_columns = [
    'Petal_Length',
    'Petal_Width',
    'Sepal_Length',
    'Sepal_Width',
]
xf = Filter(columns=filter_columns)

# fit and transform
features = xf.fit_transform(data)

# print features
print(features.head())
#    Petal_Length  Petal_Width  Sepal_Length  Sepal_Width
# 0           2.4          0.7           1.0         0.76