def fit(self, data):
    """
    Fit the scaler selected by ``self.scale_param.method`` on the input data.

    The schema header of ``data`` is remembered on ``self.header`` before
    scaling and written back to the schema of the returned data.

    Parameters
    ----------
    data: data_instance, input data

    Returns
    ----------
    data: data_instance, whatever the selected scaler's fit returned, or the
          input itself when the method is neither min-max nor standard scale
    scale_value_results: list, ``[cols_scale_value]`` for min-max scale,
          ``[mean, std]`` for standard scale, empty list otherwise
    """
    LOGGER.info("Start scale data fit ...")
    self.header = data.schema.get('header')

    params = self.scale_param
    method = params.method

    if method == consts.MINMAXSCALE:
        scaler = MinMaxScaler(
            mode=params.mode,
            area=params.area,
            feat_upper=params.feat_upper,
            feat_lower=params.feat_lower,
            out_upper=params.out_upper,
            out_lower=params.out_lower,
        )
        data, cols_scale_value = scaler.fit(data)
        scale_value_results = [cols_scale_value]
        self.cols_scale_value = cols_scale_value
    elif method == consts.STANDARDSCALE:
        scaler = StandardScaler(
            with_mean=params.with_mean,
            with_std=params.with_std,
        )
        data, mean, std = scaler.fit(data)
        scale_value_results = [mean, std]
        self.mean = mean
        self.std = std
    else:
        # Unknown method: data passes through untouched, nothing to report.
        scale_value_results = []
        LOGGER.info("Scale method is {}, do nothing and return!".format(method))

    # Put the header captured before scaling onto the data being returned.
    data.schema['header'] = self.header
    LOGGER.info("End fit data ...")
    return data, scale_value_results
def test_transform1(self):
    """Transform with the fitted mean/std must reproduce the fit output
    (with_mean=True, with_std=True)."""
    scaler = StandardScaler(with_mean=True, with_std=True)
    fitted, mean, std = scaler.fit(self.table_instance)
    transformed = scaler.transform(self.table_instance, mean, std)

    transformed_features = self.get_table_instance_feature(transformed)
    fitted_features = self.get_table_instance_feature(fitted)
    self.assertListEqual(transformed_features, fitted_features)
def test_cols_select_fit_and_transform_repeat(self):
    """Column-wise standard scaling when the requested column indices
    contain duplicates.

    The expected matrix is the reference scaler's output (rounded to 4
    decimals) on the columns reported back by fit, and the raw test data on
    every other column. Both fit and transform must produce that matrix.
    """
    requested_idx = [1, 1, 2, 2, 4, 5, 5]
    standard_scaler = StandardScaler(
        area='col',
        scale_column_idx=requested_idx,
        with_mean=True,
        with_std=True,
    )
    fit_data, scale_conf = standard_scaler.fit(self.table_instance)
    mean = scale_conf[0]
    std = scale_conf[1]
    # From here on use the column indices reported by fit, not the request.
    scale_column_idx = scale_conf[2]

    # Reference result; SSL is presumably sklearn's StandardScaler - confirm
    # against the file's imports.
    reference = SSL(with_mean=True, with_std=True)
    reference.fit(self.test_data)
    scaled_rows = np.around(reference.transform(self.test_data), 4).tolist()

    # Keep the scaled value only for selected columns; fall back to the raw
    # input value everywhere else.
    expected = []
    for row_no, scaled_row in enumerate(scaled_rows):
        expected.append([
            value if col_no in scale_column_idx
            else self.test_data[row_no][col_no]
            for col_no, value in enumerate(scaled_row)
        ])

    self.assertListEqual(self.get_table_instance_feature(fit_data), expected)

    transformed = standard_scaler.transform(
        self.table_instance, mean, std, scale_column_idx)
    self.assertListEqual(
        self.get_table_instance_feature(transformed), expected)
def test_transform4(self):
    """Transform with the fitted configuration must reproduce the fit output
    (with_mean=False, with_std=False)."""
    scaler = StandardScaler(with_mean=False, with_std=False)
    fitted, scale_conf = scaler.fit(self.table_instance)
    mean = scale_conf[0]
    std = scale_conf[1]
    scale_column_idx = scale_conf[2]

    transformed = scaler.transform(
        self.table_instance, mean, std, scale_column_idx)

    transformed_features = self.get_table_instance_feature(transformed)
    fitted_features = self.get_table_instance_feature(fitted)
    self.assertListEqual(transformed_features, fitted_features)
def test_fit4(self):
    """Fit with with_mean=False and with_std=False.

    The fitted features must equal the reference scaler's output rounded to
    4 decimals, the returned mean must be all zeros and the returned std all
    ones, one entry per element of the first test_data row.
    """
    scaler = StandardScaler(with_mean=False, with_std=False)
    fitted, mean, std = scaler.fit(self.table_instance)

    # SSL is presumably sklearn's StandardScaler - confirm against imports.
    reference = SSL(with_mean=False, with_std=False)
    reference.fit(self.test_data)

    self.assertListEqual(
        self.get_table_instance_feature(fitted),
        np.around(reference.transform(self.test_data), 4).tolist())

    self.assertEqual(mean, [0] * len(self.test_data[0]))
    self.assertEqual(std, [1] * len(self.test_data[0]))
def test_fit3(self):
    """Fit with with_mean=True and with_std=False.

    The fitted features must equal the reference scaler's output rounded to
    4 decimals, the returned mean must match the reference ``mean_`` to 4
    decimals, and the returned std must round to all ones.
    """
    scaler = StandardScaler(with_mean=True, with_std=False)
    fitted, mean, std = scaler.fit(self.table_instance)

    # SSL is presumably sklearn's StandardScaler - confirm against imports.
    reference = SSL(with_std=False)
    reference.fit(self.test_data)

    self.assertListEqual(
        self.get_table_instance_feature(fitted),
        np.around(reference.transform(self.test_data), 4).tolist())

    rounded_mean = list(np.around(mean, 4))
    reference_mean = list(np.around(reference.mean_, 4))
    self.assertListEqual(rounded_mean, reference_mean)

    rounded_std = list(np.around(std, 4))
    self.assertListEqual(rounded_std, [1] * len(std))
def test_fit2(self):
    """Fit with with_mean=False and with_std=True.

    The fitted features must equal the reference scaler's output rounded to
    4 decimals, the returned mean must round to all zeros, and the returned
    std must match the reference ``scale_`` to 4 decimals.
    """
    scaler = StandardScaler(with_mean=False, with_std=True)
    fitted, scale_conf = scaler.fit(self.table_instance)
    mean = scale_conf[0]
    std = scale_conf[1]

    # SSL is presumably sklearn's StandardScaler - confirm against imports.
    reference = SSL(with_mean=False)
    reference.fit(self.test_data)

    self.assertListEqual(
        self.get_table_instance_feature(fitted),
        np.around(reference.transform(self.test_data), 4).tolist())

    rounded_mean = list(np.around(mean, 4))
    self.assertListEqual(rounded_mean, [0] * len(mean))

    rounded_std = list(np.around(std, 4))
    reference_std = list(np.around(reference.scale_, 4))
    self.assertListEqual(rounded_std, reference_std)
def test_fit6(self):
    """Fit with area='col', with_mean=True and with_std=True.

    The fitted features must equal the reference scaler's output rounded to
    4 decimals, and the returned mean/std must match the reference ``mean_``
    and ``scale_`` to 4 decimals.
    """
    scaler = StandardScaler(area='col', with_mean=True, with_std=True)
    fitted, scale_conf = scaler.fit(self.table_instance)
    mean = scale_conf[0]
    std = scale_conf[1]
    # Not asserted on below, but still read so a fit result with fewer than
    # three entries fails here.
    _scale_column_idx = scale_conf[2]

    # SSL is presumably sklearn's StandardScaler - confirm against imports.
    reference = SSL()
    reference.fit(self.test_data)

    self.assertListEqual(
        self.get_table_instance_feature(fitted),
        np.around(reference.transform(self.test_data), 4).tolist())

    rounded_mean = list(np.around(mean, 4))
    reference_mean = list(np.around(reference.mean_, 4))
    self.assertListEqual(rounded_mean, reference_mean)

    rounded_std = list(np.around(std, 4))
    reference_std = list(np.around(reference.scale_, 4))
    self.assertListEqual(rounded_std, reference_std)
def fit(self, data):
    """
    Fit the scaler selected by ``self.scale_param.method`` and scale the data.

    The schema header of ``data`` is remembered on ``self.header`` before
    scaling and written back to the schema of the returned data.

    Parameters
    ----------
    data: data_instance, input data

    Returns
    ----------
    data: data_instance, whatever the selected scaler's fit returned, or the
          input itself when the method is neither min-max nor standard scale
    cols_scale_res: the current value of ``self.cols_scale_res``; for min-max
          and standard scale this is the fit information returned by the
          scaler
    """
    LOGGER.info("Start scale data fit ...")
    self.header = data.schema.get('header')

    params = self.scale_param
    method = params.method

    if method == consts.MINMAXSCALE:
        scaler = MinMaxScaler(
            mode=params.mode,
            area=params.area,
            scale_column_idx=params.scale_column_idx,
            feat_upper=params.feat_upper,
            feat_lower=params.feat_lower,
            out_upper=params.out_upper,
            out_lower=params.out_lower,
        )
        data, fit_info = scaler.fit(data)
        self.cols_scale_res = fit_info
    elif method == consts.STANDARDSCALE:
        scaler = StandardScaler(
            area=params.area,
            scale_column_idx=params.scale_column_idx,
            with_mean=params.with_mean,
            with_std=params.with_std,
        )
        data, fit_info = scaler.fit(data)
        # The fit information is indexed as (mean, std, scaled column idx).
        self.mean = fit_info[0]
        self.std = fit_info[1]
        self.std_scale_column_idx = fit_info[2]
        self.cols_scale_res = fit_info
    else:
        # NOTE(review): self.cols_scale_res is not assigned on this path, so
        # the return below relies on it having been set elsewhere - confirm
        # it is initialised in the constructor.
        LOGGER.info("Scale method is {}, do nothing and return!".format(method))

    # Put the header captured before scaling onto the data being returned.
    data.schema['header'] = self.header
    LOGGER.info("End fit data ...")
    return data, self.cols_scale_res