def test_fit1(self):
    """Fit MinMaxScale with bounds feat_lower=1 / feat_upper=2 and compare
    the result with the reference scaler ``MMS``.

    The fixture ``self.test_data`` is clamped to [1, 2] in place before the
    reference scaler is fitted, so both sides see the same bounded values.
    The test then checks the fitted features, the per-column min/max, and
    that ``transform`` on the same table reproduces the ``fit`` output.
    """
    lower_bound, upper_bound = 1, 2

    param = self.get_scale_param()
    # NOTE(review): the other tests in this file configure columns through
    # `scale_col_indexes`; confirm `scale_column_idx` is still read by
    # MinMaxScale and is not a leftover attribute name.
    param.scale_column_idx = []
    param.feat_upper = upper_bound
    param.feat_lower = lower_bound

    scale_obj = MinMaxScale(param)
    fitted = scale_obj.fit(self.table_instance)
    fitted_min = scale_obj.column_min_value
    fitted_max = scale_obj.column_max_value

    # Clamp every fixture value into [lower_bound, upper_bound] in place.
    for row in self.test_data:
        for col, value in enumerate(row):
            if value > upper_bound:
                row[col] = upper_bound
            elif value < lower_bound:
                row[col] = lower_bound

    reference = MMS()
    reference.fit(self.test_data)
    expected_features = np.around(reference.transform(self.test_data), 6).tolist()
    self.assertListEqual(self.get_table_instance_feature(fitted), expected_features)

    # Per-column extrema recorded by both scalers must agree.
    self.assertListEqual(fitted_min, list(reference.data_min_))
    self.assertListEqual(fitted_max, list(reference.data_max_))

    # transform() on the training table must reproduce the fit() output.
    transformed = scale_obj.transform(self.table_instance)
    self.assertListEqual(
        self.get_table_instance_feature(fitted),
        self.get_table_instance_feature(transformed),
    )
def test_fit5(self):
    """Check MinMaxScale in "cap" mode on a subset of columns.

    Columns are selected both by name (``scale_names``) and by index
    (``scale_col_indexes``); the test expects columns 1, 2 and 4 to be
    scaled and every other column to keep its raw value.
    ``self.test_data`` is capped in place with hard-coded per-column bounds
    before the reference scaler ``MMS`` is fitted on it.
    """
    # Columns the test expects to be scaled.
    # NOTE(review): presumably the union of the header positions of
    # 'fid1'/'fid2' and the explicit indexes [2, 4], with 'fid1000' not
    # present in the header -- confirm against the fixture's schema.
    scale_column_idx = [1, 2, 4]
    scale_names = ['fid1', 'fid2', 'fid1000']

    scale_param = self.get_scale_param()
    scale_param.mode = "cap"
    scale_param.feat_upper = 0.8
    scale_param.feat_lower = 0.2
    scale_param.scale_names = scale_names
    scale_param.scale_col_indexes = [2, 4]

    scale_obj = MinMaxScale(scale_param)
    fit_instance = scale_obj.fit(self.table_instance)
    column_min_value = scale_obj.column_min_value
    column_max_value = scale_obj.column_max_value

    # Keep the untouched values: the fixture is capped in place below.
    raw_data = copy.deepcopy(self.test_data)

    # Hard-coded expected cap values, one entry per column.
    # NOTE(review): presumably derived from feat_lower=0.2 / feat_upper=0.8
    # on the fixture -- confirm if the fixture changes.
    gt_cap_lower_list = [0, 2, 2, 2, 3, 1]
    gt_cap_upper_list = [1, 8, 8, 8, 7, 8]
    for row in self.test_data:
        for j, value in enumerate(row):
            if value > gt_cap_upper_list[j]:
                row[j] = gt_cap_upper_list[j]
            elif value < gt_cap_lower_list[j]:
                row[j] = gt_cap_lower_list[j]

    scaler = MMS()
    scaler.fit(self.test_data)
    sklearn_transform_data = np.around(scaler.transform(self.test_data), 6).tolist()

    # Columns that are not scaled must keep their raw values. The original
    # repeated this loop after the assertion, where it had no effect.
    for i, row in enumerate(sklearn_transform_data):
        for j in range(len(row)):
            if j not in scale_column_idx:
                row[j] = raw_data[i][j]

    fit_data = np.round(self.get_table_instance_feature(fit_instance), 6).tolist()
    self.assertListEqual(fit_data, sklearn_transform_data)

    # Per-column extrema recorded by both scalers must agree.
    data_min = list(scaler.data_min_)
    data_max = list(scaler.data_max_)
    self.assertListEqual(column_min_value, data_min)
    self.assertListEqual(column_max_value, data_max)

    # transform() on the training table must reproduce the fit() output.
    transform_data = scale_obj.transform(self.table_instance)
    self.assertListEqual(self.get_table_instance_feature(fit_instance),
                         self.get_table_instance_feature(transform_data))
def fit(self, data):
    """
    Fit the configured scaler on the input data and return the scaled data

    Parameters
    ----------
    data: data_instance, input data

    Returns
    ----------
    fit_data: data_instance, the scaled data carrying the input's schema;
              the input itself when no scale object is available
    """
    LOGGER.info("Start scale data fit ...")

    # Pick the scaler implementation that matches the configured method.
    method = self.model_param.method
    if method == consts.MINMAXSCALE:
        self.scale_obj = MinMaxScale(self.model_param)
    elif method == consts.STANDARDSCALE:
        self.scale_obj = StandardScale(self.model_param)
    else:
        LOGGER.warning("Scale method is {}, do nothing and return!".format(method))

    if not self.scale_obj:
        # No scaler available: hand the data back untouched.
        fit_data = data
    else:
        fit_data = self.scale_obj.fit(data)
        fit_data.schema = data.schema

        # Report which scale method was applied.
        scale_meta = MetricMeta(name="scale",
                                metric_type="SCALE",
                                extra_metas={"method": self.model_param.method})
        self.callback_meta(metric_name="scale",
                           metric_namespace="train",
                           metric_meta=scale_meta)

    LOGGER.info("End fit data ...")
    return fit_data
def test_fit4(self):
    """Check MinMaxScale with bounds [1, 2] applied to columns 1, 2 and 4.

    Only the selected columns of ``self.test_data`` are clamped (in place)
    before the reference scaler ``MMS`` is fitted; every other column is
    expected to keep its raw value in the scaled output.
    """
    scale_column_idx = [1, 2, 4]

    scale_param = self.get_scale_param()
    scale_param.feat_upper = 2
    scale_param.feat_lower = 1
    scale_param.scale_col_indexes = scale_column_idx

    scale_obj = MinMaxScale(scale_param)
    fit_instance = scale_obj.fit(self.table_instance)
    column_min_value = scale_obj.column_min_value
    column_max_value = scale_obj.column_max_value

    # Keep the untouched values: the fixture is clamped in place below.
    raw_data = copy.deepcopy(self.test_data)

    # Clamp only the selected columns into [1, 2].
    for row in self.test_data:
        for j, value in enumerate(row):
            if j in scale_column_idx:
                if value > 2:
                    row[j] = 2
                elif value < 1:
                    row[j] = 1

    scaler = MMS()
    scaler.fit(self.test_data)
    sklearn_transform_data = np.around(scaler.transform(self.test_data), 6).tolist()

    # Columns that are not scaled must keep their raw values. The original
    # repeated this loop after the assertion, where it had no effect.
    for i, row in enumerate(sklearn_transform_data):
        for j in range(len(row)):
            if j not in scale_column_idx:
                row[j] = raw_data[i][j]

    self.assertListEqual(self.get_table_instance_feature(fit_instance),
                         sklearn_transform_data)

    # Per-column extrema recorded by both scalers must agree.
    data_min = list(scaler.data_min_)
    data_max = list(scaler.data_max_)
    self.assertListEqual(column_min_value, data_min)
    self.assertListEqual(column_max_value, data_max)

    # transform() on the training table must reproduce the fit() output.
    transform_data = scale_obj.transform(self.table_instance)
    self.assertListEqual(self.get_table_instance_feature(fit_instance),
                         self.get_table_instance_feature(transform_data))
def export_model(self):
    """
    Export the model of the current scale object

    When no scale object exists yet, one is created first: MinMaxScale if
    the configured method equals consts.MINMAXSCALE, StandardScale for any
    other method.

    Returns
    ----------
    The result of the scale object's export_model, called with self.need_run
    """
    if not self.scale_obj:
        uses_min_max = self.model_param.method == consts.MINMAXSCALE
        scale_cls = MinMaxScale if uses_min_max else StandardScale
        self.scale_obj = scale_cls(self.model_param)

    return self.scale_obj.export_model(self.need_run)
def test_fit_instance_default(self):
    """Fit MinMaxScale with scale_col_indexes=-1 and compare the result
    with the reference scaler ``MMS`` fitted on the unmodified fixture.

    Checks the fitted features, the per-column min/max, and that
    ``transform`` on the same table reproduces the ``fit`` output.
    """
    param = self.get_scale_param()
    param.scale_col_indexes = -1

    scale_obj = MinMaxScale(param)
    fitted = scale_obj.fit(self.table_instance)
    fitted_min = scale_obj.column_min_value
    fitted_max = scale_obj.column_max_value

    reference = MMS()
    reference.fit(self.test_data)
    expected_features = np.around(reference.transform(self.test_data), 6).tolist()
    self.assertListEqual(self.get_table_instance_feature(fitted), expected_features)

    # Per-column extrema recorded by both scalers must agree.
    self.assertListEqual(fitted_min, list(reference.data_min_))
    self.assertListEqual(fitted_max, list(reference.data_max_))

    # transform() on the training table must reproduce the fit() output.
    transformed = scale_obj.transform(self.table_instance)
    self.assertListEqual(
        self.get_table_instance_feature(fitted),
        self.get_table_instance_feature(transformed),
    )
def transform(self, data, fit_config=None):
    """
    Scale the input data with the values stored on this component

    Parameters
    ----------
    data: data_instance, input data
    fit_config: accepted for interface compatibility; not read by this method

    Returns
    ----------
    transform_data: data_instance, the scaled data carrying the input's
                    schema; the input itself when no scale object is available
    """
    LOGGER.info("Start scale data transform ...")

    # Rebuild the scaler that matches the configured method.
    method = self.model_param.method
    if method == consts.MINMAXSCALE:
        self.scale_obj = MinMaxScale(self.model_param)
    elif method == consts.STANDARDSCALE:
        self.scale_obj = StandardScale(self.model_param)
        self.scale_obj.set_param(self.mean, self.std)
    else:
        LOGGER.info("DataTransform method is {}, do nothing and return!".format(method))

    if not self.scale_obj:
        # No scaler available: hand the data back untouched.
        transform_data = data
    else:
        # Hand the stored header, column selection and column ranges
        # to the freshly built scaler before applying it.
        scaler = self.scale_obj
        scaler.header = self.header
        scaler.scale_column_idx = self.scale_column_idx
        scaler.set_column_range(self.column_max_value, self.column_min_value)

        transform_data = scaler.transform(data)
        transform_data.schema = data.schema

        # Report which scale method was applied.
        scale_meta = MetricMeta(name="scale",
                                metric_type="SCALE",
                                extra_metas={"method": self.model_param.method})
        self.callback_meta(metric_name="scale",
                           metric_namespace="train",
                           metric_meta=scale_meta)

    LOGGER.info("End transform data.")
    return transform_data