def convert_feature_to_bin(self, data_instances, split_points=None):
    is_sparse = data_overview.is_sparse_data(data_instances)
    schema = data_instances.schema

    if split_points is None:
        split_points = self.bin_results.all_split_points
    else:
        for col_name, sp in split_points.items():
            self.bin_results.put_col_split_points(col_name, sp)

    if is_sparse:
        f = functools.partial(self._convert_sparse_data,
                              bin_inner_param=self.bin_inner_param,
                              bin_results=self.bin_results,
                              abnormal_list=self.abnormal_list,
                              convert_type='bin_num')
    else:
        f = functools.partial(self._convert_dense_data,
                              bin_inner_param=self.bin_inner_param,
                              bin_results=self.bin_results,
                              abnormal_list=self.abnormal_list,
                              convert_type='bin_num')
    new_data = data_instances.mapValues(f)
    new_data.schema = schema

    header = get_header(data_instances)
    bin_sparse = self.get_sparse_bin(
        self.bin_inner_param.transform_bin_indexes, split_points, header)
    split_points_result = self.bin_results.get_split_points_array(
        self.bin_inner_param.transform_bin_names)
    return new_data, split_points_result, bin_sparse
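# --- Example (illustrative, not part of the module) ---
# A minimal sketch of the core mapping behind convert_feature_to_bin: each raw
# feature value is replaced by the index of the first split point that is not
# smaller than the value. The helper name below is hypothetical.
import bisect

def value_to_bin_num(value, split_points):
    """Return the bin index of `value` given sorted split points."""
    # Values above the last split point fall into the final bin.
    return min(bisect.bisect_left(split_points, value), len(split_points) - 1)

assert value_to_bin_num(0.15, [0.1, 0.2, 0.3]) == 1
assert value_to_bin_num(9.0, [0.1, 0.2, 0.3]) == 2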
def fit(self, data_instances):
    if self.sample_weight_name is None and self.class_weight is None:
        return data_instances

    self.header = data_overview.get_header(data_instances)

    if self.class_weight:
        self.weight_mode = "class weight"

    if self.sample_weight_name and self.class_weight:
        LOGGER.warning("Both 'sample_weight_name' and 'class_weight' provided. "
                       "Only weight from 'sample_weight_name' is used.")

    new_schema = copy.deepcopy(data_instances.schema)
    new_schema["sample_weight"] = "weight"
    weight_loc = None
    if self.sample_weight_name:
        self.weight_mode = "sample weight name"
        weight_loc = SampleWeight.get_weight_loc(data_instances, self.sample_weight_name)
        if weight_loc is not None:
            new_schema["header"].pop(weight_loc)
        else:
            raise ValueError(f"Cannot find weight column of given sample_weight_name "
                             f"'{self.sample_weight_name}'.")
    result_instances = self.transform_weighted_instance(data_instances, weight_loc)
    result_instances.schema = new_schema
    self.callback_info()
    if result_instances.mapPartitions(check_negative_sample_weight).reduce(lambda x, y: x or y):
        LOGGER.warning("Negative weight found in weighted instances.")
    return result_instances
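# --- Example (illustrative, not part of the module) ---
# A minimal sketch of a "balanced" class-weight rule: each class gets weight
# n_samples / (n_classes * class_count), so rare classes are up-weighted. This
# is an assumption about the rule; the exact formula used by
# transform_weighted_instance may differ.
import numpy as np

def balanced_class_weights(labels):
    classes, counts = np.unique(labels, return_counts=True)
    weights = len(labels) / (len(classes) * counts)
    return dict(zip(classes.tolist(), weights.tolist()))

print(balanced_class_weights([0, 0, 0, 1]))  # {0: 0.666..., 1: 2.0}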
def fit(self, data_instances, validate_data=None):
    if not self.need_run:
        return
    # check if empty table
    LOGGER.info("Enter Local Baseline fit")
    abnormal_detection.empty_table_detection(data_instances)
    abnormal_detection.empty_feature_detection(data_instances)

    # get model
    model = self.get_model()
    # get header
    self.header = data_overview.get_header(data_instances)

    X_table = data_instances.mapValues(lambda v: v.features)
    y_table = data_instances.mapValues(lambda v: v.label)

    X = np.array([v[1] for v in list(X_table.collect())])
    y = np.array([v[1] for v in list(y_table.collect())])
    w = None
    if data_overview.with_weight(data_instances):
        LOGGER.info("Input Data with Weight. Weight will be used to fit model.")
        weight_table = data_instances.mapValues(lambda v: v.weight)
        w = np.array([v[1] for v in list(weight_table.collect())])
    self.model_fit = model.fit(X, y, w)
    self.need_one_vs_rest = len(self.model_fit.classes_) > 2
    self.set_summary(self.get_model_summary())
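# --- Example (illustrative, not part of the module) ---
# A minimal, self-contained sketch of what the local baseline does once the
# distributed table has been collected: fit a plain scikit-learn model locally,
# optionally with per-sample weights. The data below is fabricated.
import numpy as np
from sklearn.linear_model import LogisticRegression

X = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0], [0.0, 0.0]])
y = np.array([0, 1, 1, 0])
w = np.array([1.0, 2.0, 1.0, 1.0])  # optional sample weights

clf = LogisticRegression().fit(X, y, sample_weight=w)
print(clf.classes_)  # one-vs-rest is only needed when more than two classes appear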
def _init_cols(self, data_instances):
    # Already initialized
    if len(self.cols_dict) != 0:
        return

    header = data_overview.get_header(data_instances)
    self.header = header
    if self.cols_index == -1:
        self.cols = header
        self.cols_index = [i for i in range(len(header))]
    else:
        cols = []
        for idx in self.cols_index:
            try:
                idx = int(idx)
            except ValueError:
                raise ValueError(
                    "In binning module, selected index: {} is not an integer".format(idx))
            if idx >= len(header):
                raise ValueError(
                    "In binning module, selected index: {} exceeds length of data dimension".format(idx))
            cols.append(header[idx])
        self.cols = cols

    self.cols_dict = {}
    for col in self.cols:
        col_index = header.index(col)
        self.cols_dict[col] = col_index
def __transform_replace(self, data, transform_value, replace_area,
                        output_format, skip_cols):
    skip_cols = [get_header(data).index(v) for v in skip_cols]
    if replace_area == 'all':
        if output_format is not None:
            f = functools.partial(
                Imputer.replace_missing_value_with_replace_value_format,
                replace_value=transform_value,
                missing_value_list=self.abnormal_value_list,
                output_format=output_format)
        else:
            f = functools.partial(
                Imputer.replace_missing_value_with_replace_value,
                replace_value=transform_value,
                missing_value_list=self.abnormal_value_list)
    elif replace_area == 'col':
        if output_format is not None:
            f = functools.partial(
                Imputer.replace_missing_value_with_cols_transform_value_format,
                transform_list=transform_value,
                missing_value_list=self.abnormal_value_list,
                output_format=output_format,
                skip_cols=set(skip_cols))
        else:
            f = functools.partial(
                Imputer.replace_missing_value_with_cols_transform_value,
                transform_list=transform_value,
                missing_value_list=self.abnormal_value_list,
                skip_cols=set(skip_cols))
    else:
        raise ValueError("Unknown replace area {} in Imputer".format(replace_area))
    return data.mapValues(f)
def __get_cols_transform_value(self, data, replace_method, quantile=None):
    summary_obj = MultivariateStatisticalSummary(
        data, -1, abnormal_list=self.missing_value_list)
    header = get_header(data)

    if replace_method == consts.MIN:
        cols_transform_value = summary_obj.get_min()
    elif replace_method == consts.MAX:
        cols_transform_value = summary_obj.get_max()
    elif replace_method == consts.MEAN:
        cols_transform_value = summary_obj.get_mean()
    elif replace_method == consts.MEDIAN:
        cols_transform_value = summary_obj.get_median()
    elif replace_method == consts.QUANTILE:
        if quantile is None or quantile > 1 or quantile < 0:
            raise ValueError(
                "quantile should be between 0 and 1, but got: {}".format(quantile))
        cols_transform_value = summary_obj.get_quantile_point(quantile)
    else:
        raise ValueError("Unknown replace method: {}".format(replace_method))

    cols_transform_value = [round(cols_transform_value[key], 6) for key in header]
    return cols_transform_value
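# --- Example (illustrative, not part of the module) ---
# A minimal sketch of quantile-based imputation values, using numpy in place of
# MultivariateStatisticalSummary (an assumption for illustration only).
import numpy as np

column = np.array([1.0, 2.0, 4.0, 8.0])
quantile = 0.5
fill_value = round(float(np.quantile(column, quantile)), 6)
print(fill_value)  # 3.0: the median would replace missing entries in this column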
def _get_scale_column_idx(self, data):
    data_shape = self._get_data_shape(data)
    if self.param_scale_col_indexes != -1:
        if isinstance(self.param_scale_col_indexes, list):
            if len(self.param_scale_col_indexes) > 0:
                max_col_idx = max(self.param_scale_col_indexes)
                if max_col_idx >= data_shape:
                    raise ValueError(
                        "max column index in area is: {}, should be less than data shape: {}".format(
                            max_col_idx, data_shape))
            scale_column_idx = self.param_scale_col_indexes

            header = data_overview.get_header(data)
            scale_names = set(header).intersection(set(self.param_scale_names))
            idx_from_name = list(map(lambda n: header.index(n), scale_names))

            scale_column_idx = scale_column_idx + idx_from_name
            scale_column_idx = list(set(scale_column_idx))
            scale_column_idx.sort()
        else:
            LOGGER.warning(
                "parameter scale_column_idx should be a list, but is: {}; "
                "setting scale columns to all columns".format(type(self.param_scale_col_indexes)))
            scale_column_idx = [i for i in range(data_shape)]
    else:
        scale_column_idx = [i for i in range(data_shape)]

    return scale_column_idx
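# --- Example (illustrative, not part of the module) ---
# A minimal sketch of how index-based and name-based column selections are
# merged into one sorted, de-duplicated index list (names are fabricated).
header = ["x0", "x1", "x2", "x3"]
by_index = [1, 3]
by_name = ["x2", "x3"]

merged = sorted(set(by_index) | {header.index(n) for n in by_name if n in header})
print(merged)  # [1, 2, 3]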
def _setup_bin_inner_param(self, data_instances, params: FeatureBinningParam):
    if self.schema is not None:
        return

    self.header = get_header(data_instances)
    LOGGER.debug("_setup_bin_inner_param, get header: {}".format(self.header))

    self.schema = data_instances.schema
    self.bin_inner_param.set_header(self.header)
    if params.bin_indexes == -1:
        self.bin_inner_param.set_bin_all()
    else:
        self.bin_inner_param.add_bin_indexes(params.bin_indexes)
        self.bin_inner_param.add_bin_names(params.bin_names)
    self.bin_inner_param.add_category_indexes(params.category_indexes)
    self.bin_inner_param.add_category_names(params.category_names)

    if params.transform_param.transform_cols == -1:
        self.bin_inner_param.set_transform_all()
    else:
        self.bin_inner_param.add_transform_bin_indexes(
            params.transform_param.transform_cols)
        self.bin_inner_param.add_transform_bin_names(
            params.transform_param.transform_names)
    # LOGGER.debug("After _setup_bin_inner_param: {}".format(self.bin_inner_param.__dict__))
    self.binning_obj.set_bin_inner_param(self.bin_inner_param)
    LOGGER.debug("After _setup_bin_inner_param, header: {}".format(self.header))
def _init_cols(self, data_instances):
    self.schema = data_instances.schema
    header = get_header(data_instances)
    self.original_header = copy.deepcopy(header)
    LOGGER.debug("When init, original_header: {}".format(self.original_header))

    if self.cols_index == -1:
        self.cols = [i for i in range(len(header))]
    else:
        cols = []
        for idx in self.cols_index:
            try:
                idx = int(idx)
            except ValueError:
                raise ValueError(
                    "In binning module, selected index: {} is not an integer".format(idx))
            if idx >= len(header):
                raise ValueError(
                    "In binning module, selected index: {} exceeds length of data dimension".format(idx))
            cols.append(idx)
        self.cols = cols

    # self.left_col_names = self.cols.copy()
    # Mark all columns as kept at the beginning.
    self.left_cols_index = [i for i in range(len(header))]
    for col_idx in self.cols:
        self.left_cols[col_idx] = True
    self.header = header
def _parse_cols(self, data_instances):
    if self.header is not None and len(self.header) != 0:
        return

    LOGGER.debug("Before Binning, schema is: {}".format(data_instances.schema))
    header = get_header(data_instances)
    self.schema = data_instances.schema
    self.header = header

    if self.cols_index == -1:
        if header is None:
            raise RuntimeError('Cannot get feature header, please check input data')
        self.cols = [i for i in range(len(header))]
    else:
        self.cols = self.cols_index

    if self.transform_cols_idx == -1:
        self.transform_cols_idx = self.cols

    self.cols_dict = {}
    for col in self.cols:
        col_name = header[col]
        self.cols_dict[col_name] = col
def fit_split_points(self, data_instances):
    """
    Apply the binning method

    Parameters
    ----------
    data_instances : DTable
        The input data

    Returns
    -------
    split_points : dict
        Each value represents the split points of a feature; each element
        is one split point, e.g.
        split_points = {'x1': [0.1, 0.2, 0.3, 0.4, ...],  # The first feature
                        'x2': [1, 2, 3, 4, ...],          # The second feature
                        ...}                              # Other features
    """
    header = data_overview.get_header(data_instances)
    self._default_setting(header)
    percent_value = 1.0 / self.bin_num

    # calculate the split points
    percentile_rate = [i * percent_value for i in range(1, self.bin_num)]
    percentile_rate.append(1.0)
    is_sparse = data_overview.is_sparse_data(data_instances)
    self._fit_split_point(data_instances, is_sparse, percentile_rate)

    self.fit_category_features(data_instances)
    return self.bin_results.all_split_points
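# --- Example (illustrative, not part of the module) ---
# A minimal sketch of the percentile grid built above: for bin_num bins, the
# candidate quantile levels are 1/bin_num, 2/bin_num, ..., (bin_num-1)/bin_num, 1.0.
bin_num = 4
percent_value = 1.0 / bin_num
percentile_rate = [i * percent_value for i in range(1, bin_num)] + [1.0]
print(percentile_rate)  # [0.25, 0.5, 0.75, 1.0]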
def _setup_bin_inner_param(self, data_instances, params):
    if self.bin_inner_param is not None:
        return

    self.bin_inner_param = BinInnerParam()
    header = get_header(data_instances)
    LOGGER.debug("_setup_bin_inner_param, get header length: {}".format(len(header)))

    self.schema = data_instances.schema
    self.bin_inner_param.set_header(header)
    if params.bin_indexes == -1:
        self.bin_inner_param.set_bin_all()
    else:
        self.bin_inner_param.add_bin_indexes(params.bin_indexes)
        self.bin_inner_param.add_bin_names(params.bin_names)
    self.bin_inner_param.add_category_indexes(params.category_indexes)
    self.bin_inner_param.add_category_names(params.category_names)

    if params.transform_param.transform_cols == -1:
        self.bin_inner_param.set_transform_all()
    else:
        self.bin_inner_param.add_transform_bin_indexes(
            params.transform_param.transform_cols)
        self.bin_inner_param.add_transform_bin_names(
            params.transform_param.transform_names)
    self.set_bin_inner_param(self.bin_inner_param)
def _init_cols(self, data_instances):
    header = get_header(data_instances)
    if self.cols == -1:
        self.cols = header

    for col in self.cols:
        col_index = header.index(col)
        self.cols_dict[col] = col_index
def fit_split_points(self, data_instances):
    """
    Apply the binning method

    Parameters
    ----------
    data_instances : DTable
        The input data

    Returns
    -------
    split_points : dict
        Each value represents the split points of a feature; each element
        is one split point, e.g.
        split_points = {'x1': [0.1, 0.2, 0.3, 0.4, ...],  # The first feature
                        'x2': [1, 2, 3, 4, ...],          # The second feature
                        ...}                              # Other features
    """
    header = data_overview.get_header(data_instances)
    self._default_setting(header)
    percent_value = 1.0 / self.bin_num

    # calculate the split points
    percentile_rate = [i * percent_value for i in range(1, self.bin_num)]
    percentile_rate.append(1.0)
    is_sparse = data_overview.is_sparse_data(data_instances)

    if self.summary_dict is None:
        f = functools.partial(self.approxi_quantile,
                              params=self.params,
                              abnormal_list=self.abnormal_list,
                              cols_dict=self.bin_inner_param.bin_cols_map,
                              header=self.header,
                              is_sparse=is_sparse)
        summary_dict = data_instances.mapPartitions(f)
        summary_dict = summary_dict.reduce(self.merge_summary_dict)
        if is_sparse:
            total_count = data_instances.count()
            for _, summary_obj in summary_dict.items():
                summary_obj.set_total_count(total_count)
        self.summary_dict = summary_dict
    else:
        summary_dict = self.summary_dict

    for col_name, summary in summary_dict.items():
        split_point = []
        for percen_rate in percentile_rate:
            s_p = summary.query(percen_rate)
            if s_p not in split_point:
                split_point.append(s_p)
        self.bin_results.put_col_split_points(col_name, split_point)

    self.fit_category_features(data_instances)
    return self.bin_results.all_split_points
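# --- Example (illustrative, not part of the module) ---
# A minimal stand-in for the query-and-deduplicate loop above, assuming numpy
# percentiles in place of the distributed quantile summaries.
import numpy as np

column = np.array([1, 1, 1, 2, 3, 9, 9, 9], dtype=float)
rates = [0.25, 0.5, 0.75, 1.0]
split_point = []
for rate in rates:
    s_p = float(np.quantile(column, rate))
    if s_p not in split_point:  # duplicate quantiles collapse into one bin edge
        split_point.append(s_p)
print(split_point)  # [1.0, 2.5, 9.0]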
def init_bucket(self, data_instances):
    header = data_overview.get_header(data_instances)
    self._default_setting(header)

    init_bucket_param = copy.deepcopy(self.params)
    init_bucket_param.bin_num = self.optimal_param.init_bin_nums
    if self.optimal_param.init_bucket_method == consts.QUANTILE:
        init_binning_obj = QuantileBinningTool(param_obj=init_bucket_param,
                                               allow_duplicate=False)
    else:
        init_binning_obj = BucketBinning(params=init_bucket_param)
    init_binning_obj.set_bin_inner_param(self.bin_inner_param)
    init_split_points = init_binning_obj.fit_split_points(data_instances)
    is_sparse = data_overview.is_sparse_data(data_instances)

    bucket_dict = dict()
    for col_name, sps in init_split_points.items():
        bucket_list = []
        for idx, sp in enumerate(sps):
            bucket = bucket_info.Bucket(idx, self.adjustment_factor, right_bound=sp)
            if idx == 0:
                bucket.left_bound = -math.inf
                bucket.set_left_neighbor(None)
            else:
                bucket.left_bound = sps[idx - 1]
            bucket.event_total = self.event_total
            bucket.non_event_total = self.non_event_total
            bucket_list.append(bucket)
        bucket_list[-1].set_right_neighbor(None)
        bucket_dict[col_name] = bucket_list

    convert_func = functools.partial(self.convert_data_to_bucket,
                                     split_points=init_split_points,
                                     headers=self.header,
                                     bucket_dict=copy.deepcopy(bucket_dict),
                                     is_sparse=is_sparse,
                                     get_bin_num_func=self.get_bin_num)
    bucket_table = data_instances.mapReducePartitions(convert_func, self.merge_bucket_list)
    return bucket_table
def __get_cols_transform_value(self, data, replace_method, replace_value=None):
    """
    Parameters
    ----------
    data : input data
    replace_method : dict of (column name, replace_method_name) pairs

    Returns
    -------
    list of transform values, one per column, with length equal to the
    feature count of the input data
    """
    summary_obj = MultivariateStatisticalSummary(
        data, -1, abnormal_list=self.abnormal_value_list)
    header = get_header(data)
    cols_transform_value = {}
    if isinstance(replace_value, list):
        if len(replace_value) != len(header):
            raise ValueError(
                f"replace value {replace_value} length does not match with header {header}, "
                f"please check.")

    for i, feature in enumerate(header):
        if replace_method[feature] is None:
            transform_value = 0
        elif replace_method[feature] == consts.MIN:
            transform_value = summary_obj.get_min()[feature]
        elif replace_method[feature] == consts.MAX:
            transform_value = summary_obj.get_max()[feature]
        elif replace_method[feature] == consts.MEAN:
            transform_value = summary_obj.get_mean()[feature]
        elif replace_method[feature] == consts.MEDIAN:
            transform_value = summary_obj.get_median()[feature]
        elif replace_method[feature] == consts.DESIGNATED:
            if isinstance(replace_value, list):
                transform_value = replace_value[i]
            else:
                transform_value = replace_value
            LOGGER.debug(f"replace value for feature {feature} is: {transform_value}")
        else:
            raise ValueError(
                "Unknown replace method: {}".format(replace_method[feature]))
        cols_transform_value[feature] = transform_value

    LOGGER.debug(f"cols_transform_value is: {cols_transform_value}")
    cols_transform_value = [cols_transform_value[key] for key in header]
    LOGGER.debug(f"cols_transform_value is: {cols_transform_value}")
    return cols_transform_value
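# --- Example (illustrative, not part of the module) ---
# A minimal sketch of per-column imputation with mixed methods; the column
# names, statistics, and designated values are fabricated, and plain dict
# lookups stand in for MultivariateStatisticalSummary.
header = ["age", "income"]
replace_method = {"age": "median", "income": "designated"}
stats = {"median": {"age": 30.0, "income": 50000.0}}
designated = {"income": 0.0}

fill = [stats["median"][c] if replace_method[c] == "median" else designated[c]
        for c in header]
print(fill)  # [30.0, 0.0]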
def __init_cols(self, data_instances, cols_index, stat_order, bias):
    header = data_overview.get_header(data_instances)
    self.header = header
    if cols_index == -1:
        self.cols_index = [i for i in range(len(header))]
    else:
        self.cols_index = cols_index
    LOGGER.debug(f"cols_index: {cols_index}, self.cols_index: {self.cols_index}")
    self.cols_dict = {header[idx]: idx for idx in self.cols_index}
    self.summary_statistics = SummaryStatistics(length=len(self.cols_index),
                                                abnormal_list=self.abnormal_list,
                                                stat_order=stat_order,
                                                bias=bias)
def fit_split_points(self, data_instances):
    header = data_overview.get_header(data_instances)
    self._default_setting(header)

    if self.event_total is None or self.non_event_total is None:
        self.event_total, self.non_event_total = self.get_histogram(data_instances)
    LOGGER.debug("In fit split points, event_total: {}, non_event_total: {}".format(
        self.event_total, self.non_event_total))

    bucket_table = self.init_bucket(data_instances)
    sample_count = data_instances.count()
    self.fit_buckets(bucket_table, sample_count)
    self.fit_category_features(data_instances)
def fit_split_points(self, data_instances):
    """
    Apply the binning method

    Parameters
    ----------
    data_instances : DTable
        The input data

    Returns
    -------
    split_points : dict
        Each value represents the split points of a feature; each element
        is one split point, e.g.
        split_points = {'x1': [0.1, 0.2, 0.3, 0.4, ...],  # The first feature
                        'x2': [1, 2, 3, 4, ...],          # The second feature
                        ...}                              # Other features
    """
    header = data_overview.get_header(data_instances)
    self._default_setting(header)
    # Note: Bucket Binning has not supported sparse data yet.

    statistics = MultivariateStatisticalSummary(data_instances,
                                                self.bin_inner_param.bin_indexes,
                                                abnormal_list=self.abnormal_list)
    max_dict = statistics.get_max()
    min_dict = statistics.get_min()
    for col_name, max_value in max_dict.items():
        min_value = min_dict.get(col_name)
        split_points = []
        L = (max_value - min_value) / (self.bin_num - 1)
        split_points.append(min_value - 1)
        for k in range(self.bin_num - 2):
            s_p = min_value + (k + 1) * L
            split_points.append(s_p)
        split_points.append(max_value)
        self.bin_results.put_col_split_points(col_name, split_points)
    self.fit_category_features(data_instances)
    return self.bin_results.all_split_points
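# --- Example (illustrative, not part of the module) ---
# A minimal sketch of the equal-width split-point layout built above: the first
# edge sits below the minimum, interior edges are evenly spaced, and the last
# edge is the maximum (bin_num edges in total). Values are fabricated.
min_value, max_value, bin_num = 0.0, 10.0, 5
width = (max_value - min_value) / (bin_num - 1)
split_points = [min_value - 1] \
    + [min_value + (k + 1) * width for k in range(bin_num - 2)] \
    + [max_value]
print(split_points)  # [-1.0, 2.5, 5.0, 7.5, 10.0]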
def _parse_cols(self, data_instances):
    if self.header is not None and len(self.header) != 0:
        return

    header = get_header(data_instances)
    self.header = header

    if self.cols == -1:
        if header is None:
            raise RuntimeError('Cannot get feature header, please check input data')
        self.cols = header

    self.cols_dict = {}
    for col in self.cols:
        col_index = header.index(col)
        self.cols_dict[col] = col_index
def _init_cols(self, data_instances):
    # Already initialized
    if self.header is not None:
        return
    if data_instances is None:
        return

    header = get_header(data_instances)
    self.header = header
    if self.cols == -1:
        self.cols = [x for x in range(len(header))]

    for col_index in self.cols:
        col_name = header[col_index]
        self.cols_dict[col_name] = col_index
@staticmethod
def __get_cols_transform_method(data, replace_method, col_replace_method):
    # no `self` in the original signature, so this is presumably a staticmethod
    header = get_header(data)
    if col_replace_method:
        replace_method_per_col = {
            col_name: col_replace_method.get(col_name, replace_method)
            for col_name in header
        }
    else:
        replace_method_per_col = {col_name: replace_method for col_name in header}
    skip_cols = [v for v in header if replace_method_per_col[v] is None]

    return replace_method_per_col, skip_cols
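# --- Example (illustrative, not part of the module) ---
# A minimal sketch of the per-column method resolution: a column-specific
# override wins, otherwise the global method applies; columns resolved to None
# are skipped entirely (column names are fabricated).
header = ["x0", "x1", "x2"]
replace_method = "mean"
col_replace_method = {"x1": "median", "x2": None}

per_col = {c: col_replace_method.get(c, replace_method) for c in header}
skip_cols = [c for c in header if per_col[c] is None]
print(per_col)    # {'x0': 'mean', 'x1': 'median', 'x2': None}
print(skip_cols)  # ['x2']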
def __init__(self, data_instances, cols):
    self.finish_fit = False
    self.summary_statistics = []
    self.medians = None
    self.data_instances = data_instances

    header = get_header(data_instances)
    if cols == -1:
        self.cols = header
    else:
        self.cols = cols

    self.cols_dict = {}
    for col in self.cols:
        col_index = header.index(col)
        self.cols_dict[col] = col_index
def fit(self, data):
    """
    Apply standard scaling to the input data

    Parameters
    ----------
    data : data_instance
        The input data

    Returns
    -------
    data : data_instance
        The data after scaling
    mean : list
        The mean of each column
    std : list
        The standard deviation of each column
    """
    if not self.with_mean and not self.with_std:
        shape = data_overview.get_features_shape(data)
        mean = [0 for _ in range(shape)]
        std = [1 for _ in range(shape)]
        return data, mean, std
    else:
        summary_obj = MultivariateStatisticalSummary(data, -1)
        mean = None
        std = None
        header = get_header(data)

        if self.with_mean:
            mean = summary_obj.get_mean()
            mean = [mean[key] for key in header]

        if self.with_std:
            std = summary_obj.get_std_variance()
            std = [std[key] for key in header]

        if not mean and std:
            mean = [0 for _ in std]
        elif mean and not std:
            std = [1 for _ in mean]

        if not mean or not std:
            raise ValueError("mean or std is None")

        f = functools.partial(self.__scale, mean=mean, std=std)
        data = data.mapValues(f)

        return data, mean, std
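# --- Example (illustrative, not part of the module) ---
# A minimal sketch of the column-wise transform applied by __scale:
# z = (x - mean) / std for every scaled column (fabricated data, numpy stand-in).
import numpy as np

X = np.array([[1.0, 10.0], [3.0, 30.0]])
mean = X.mean(axis=0)    # [ 2. 20.]
std = X.std(axis=0)      # [ 1. 10.]
print((X - mean) / std)  # [[-1. -1.] [ 1.  1.]]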
def _init_select_params(self, data_instances):
    if self.schema is not None:
        return
    self.schema = data_instances.schema
    header = get_header(data_instances)
    self.curt_select_properties.set_header(header)
    self.curt_select_properties.set_last_left_col_indexes(
        [x for x in range(len(header))])
    if self.model_param.select_col_indexes == -1:
        self.curt_select_properties.set_select_all_cols()
    else:
        self.curt_select_properties.add_select_col_indexes(
            self.model_param.select_col_indexes)
    self.curt_select_properties.add_select_col_names(self.model_param.select_names)
    self.completed_selection_result.set_header(header)
    self.completed_selection_result.set_select_col_names(
        self.curt_select_properties.select_col_names)
def _init_params(self, data_instances):
    if len(self.schema) == 0:
        self.schema = data_instances.schema

    if self.inner_param is not None:
        return
    self.inner_param = OneHotInnerParam()
    LOGGER.debug("In _init_params, schema is: {}".format(self.schema))
    header = get_header(data_instances)
    self.add_summary("original_dimension", len(header))
    self.inner_param.set_header(header)

    if self.model_param.transform_col_indexes == -1:
        self.inner_param.set_transform_all()
    else:
        self.inner_param.add_transform_indexes(self.model_param.transform_col_indexes)
        self.inner_param.add_transform_names(self.model_param.transform_col_names)
def fit(self, data_instances, validate_data=None):
    if not self.need_run:
        return
    # check if empty table
    abnormal_detection.empty_table_detection(data_instances)
    abnormal_detection.empty_feature_detection(data_instances)

    # get model
    model = self.get_model()
    # get header
    self.header = data_overview.get_header(data_instances)

    X_table = data_instances.mapValues(lambda v: v.features)
    y_table = data_instances.mapValues(lambda v: v.label)

    X = np.array([v[1] for v in list(X_table.collect())])
    y = np.array([v[1] for v in list(y_table.collect())])

    self.model_fit = model.fit(X, y)
def _init_param(self, data_instances):
    if self.schema is None or len(self.schema) == 0:
        self.schema = data_instances.schema

    if self.inner_param is not None:
        return
    self.inner_param = StatisticInnerParam()
    LOGGER.debug("In _init_params, schema is: {}".format(self.schema))
    header = get_header(data_instances)
    self.inner_param.set_header(header)

    if self.model_param.column_indexes == -1:
        self.inner_param.set_static_all()
    else:
        self.inner_param.add_static_indices(self.model_param.column_indexes)
        self.inner_param.add_static_names(self.model_param.column_names)
    LOGGER.debug(f"column_indexes: {self.model_param.column_indexes}, "
                 f"inner_param static_indices: {self.inner_param.static_indices}")
    return self
def _init_cols(self, data_instances):
    self.schema = data_instances.schema
    header = get_header(data_instances)
    if self.cols_index == -1:
        to_select_cols_all = header
    else:
        to_select_cols_all = []
        for idx in self.cols_index:
            try:
                idx = int(idx)
            except ValueError:
                raise ValueError(
                    "In binning module, selected index: {} is not an integer".format(idx))
            if idx >= len(header):
                raise ValueError(
                    "In binning module, selected index: {} exceeds length of data dimension".format(idx))
            to_select_cols_all.append(header[idx])

    self.filter_result = SelfFilterResult(header=header,
                                          to_select_cols_all=to_select_cols_all)
    self.header = header
def _init_cols(self, data_instances):
    header = get_header(data_instances)
    self.schema = data_instances.schema
    self.header = header
    if self.cols_index == -1:
        self.cols = header
    else:
        cols = []
        for idx in self.cols_index:
            try:
                idx = int(idx)
            except ValueError:
                raise ValueError(
                    "In binning module, selected index: {} is not an integer".format(idx))
            if idx >= len(header):
                raise ValueError(
                    "In binning module, selected index: {} exceeds length of data dimension".format(idx))
            cols.append(header[idx])
        self.cols = cols