Example #1
    def convert_feature_to_bin(self, data_instances, split_points=None):
        is_sparse = data_overview.is_sparse_data(data_instances)
        schema = data_instances.schema

        if split_points is None:
            split_points = self.bin_results.all_split_points
        else:
            for col_name, sp in split_points.items():
                self.bin_results.put_col_split_points(col_name, sp)

        if is_sparse:
            f = functools.partial(self._convert_sparse_data,
                                  bin_inner_param=self.bin_inner_param,
                                  bin_results=self.bin_results,
                                  abnormal_list=self.abnormal_list,
                                  convert_type='bin_num')
            new_data = data_instances.mapValues(f)
        else:
            f = functools.partial(self._convert_dense_data,
                                  bin_inner_param=self.bin_inner_param,
                                  bin_results=self.bin_results,
                                  abnormal_list=self.abnormal_list,
                                  convert_type='bin_num')
            new_data = data_instances.mapValues(f)
        new_data.schema = schema
        header = get_header(data_instances)
        bin_sparse = self.get_sparse_bin(
            self.bin_inner_param.transform_bin_indexes, split_points, header)
        split_points_result = self.bin_results.get_split_points_array(
            self.bin_inner_param.transform_bin_names)

        return new_data, split_points_result, bin_sparse
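A minimal standalone sketch of the transform this method produces, assuming split points act as right-hand bin bounds; to_bin_indices and the sample arrays are illustrative, not part of the library.

    import numpy as np

    def to_bin_indices(values, split_points):
        # For each value, find the first split point >= the value;
        # that position is the value's bin number.
        return np.searchsorted(split_points, values, side="left")

    values = np.array([0.05, 0.25, 0.9])
    split_points = np.array([0.1, 0.2, 0.3, 1.0])
    print(to_bin_indices(values, split_points))  # [0 2 3]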
Example #2
    def fit(self, data_instances):
        if self.sample_weight_name is None and self.class_weight is None:
            return data_instances

        self.header = data_overview.get_header(data_instances)

        if self.class_weight:
            self.weight_mode = "class weight"

        if self.sample_weight_name and self.class_weight:
            LOGGER.warning(f"Both 'sample_weight_name' and 'class_weight' provided. "
                           f"Only weight from 'sample_weight_name' is used.")

        new_schema = copy.deepcopy(data_instances.schema)
        new_schema["sample_weight"] = "weight"
        weight_loc = None
        if self.sample_weight_name:
            self.weight_mode = "sample weight name"
            weight_loc = SampleWeight.get_weight_loc(data_instances, self.sample_weight_name)
            if weight_loc is not None:
                new_schema["header"].pop(weight_loc)
            else:
                raise ValueError(f"Cannot find weight column of given sample_weight_name '{self.sample_weight_name}'.")
        result_instances = self.transform_weighted_instance(data_instances, weight_loc)
        result_instances.schema = new_schema

        self.callback_info()
        if result_instances.mapPartitions(check_negative_sample_weight).reduce(lambda x, y: x or y):
            LOGGER.warning(f"Negative weight found in weighted instances.")
        return result_instances
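As a hedged illustration of the "class weight" mode above, here is how sklearn-style "balanced" weights can be derived from label counts; balanced_class_weights is an assumed helper, not the library's own implementation.

    from collections import Counter

    def balanced_class_weights(labels):
        counts = Counter(labels)
        n, k = len(labels), len(counts)
        # weight = n_samples / (n_classes * class_count), scikit-learn's rule
        return {cls: n / (k * cnt) for cls, cnt in counts.items()}

    print(balanced_class_weights([0, 0, 0, 1]))  # {0: 0.666..., 1: 2.0}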
Example #3
    def fit(self, data_instances, validate_data=None):
        if not self.need_run:
            return
        # check if empty table
        LOGGER.info("Enter Local Baseline fit")
        abnormal_detection.empty_table_detection(data_instances)
        abnormal_detection.empty_feature_detection(data_instances)
        # get model
        model = self.get_model()
        # get header
        self.header = data_overview.get_header(data_instances)

        X_table = data_instances.mapValues(lambda v: v.features)
        y_table = data_instances.mapValues(lambda v: v.label)

        X = np.array([v[1] for v in list(X_table.collect())])
        y = np.array([v[1] for v in list(y_table.collect())])

        w = None
        if data_overview.with_weight(data_instances):
            LOGGER.info(
                f"Input Data with Weight. Weight will be used to fit model.")
            weight_table = data_instances.mapValues(lambda v: v.weight)
            w = np.array([v[1] for v in list(weight_table.collect())])

        self.model_fit = model.fit(X, y, w)
        self.need_one_vs_rest = len(self.model_fit.classes_) > 2
        self.set_summary(self.get_model_summary())
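Once the tables are collected, the local baseline above reduces to plain scikit-learn; a minimal sketch with made-up data, mirroring the classes_ check at the end of the method.

    import numpy as np
    from sklearn.linear_model import LogisticRegression

    X = np.array([[0.0], [1.0], [2.0], [3.0]])
    y = np.array([0, 0, 1, 1])
    model = LogisticRegression().fit(X, y)
    # Same check as above: more than two classes means one-vs-rest is needed
    print(len(model.classes_) > 2)  # False for this binary toy task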
Example #4
    def _init_cols(self, data_instances):

        # Already initialized
        if len(self.cols_dict) != 0:
            return

        header = data_overview.get_header(data_instances)
        self.header = header
        if self.cols_index == -1:
            self.cols = header
            self.cols_index = [i for i in range(len(header))]
        else:
            cols = []
            for idx in self.cols_index:
                try:
                    idx = int(idx)
                except ValueError:
                    raise ValueError(
                        "In binning module, selected index: {} is not an integer".format(idx))

                if idx >= len(header):
                    raise ValueError(
                        "In binning module, selected index: {} exceeds data dimension".format(idx))
                cols.append(header[idx])
            self.cols = cols

        self.cols_dict = {}
        for col in self.cols:
            col_index = header.index(col)
            self.cols_dict[col] = col_index
Example #5
    def __transform_replace(self, data, transform_value, replace_area,
                            output_format, skip_cols):
        skip_cols = [get_header(data).index(v) for v in skip_cols]
        if replace_area == 'all':
            if output_format is not None:
                f = functools.partial(
                    Imputer.replace_missing_value_with_replace_value_format,
                    replace_value=transform_value,
                    missing_value_list=self.abnormal_value_list,
                    output_format=output_format)
            else:
                f = functools.partial(
                    Imputer.replace_missing_value_with_replace_value,
                    replace_value=transform_value,
                    missing_value_list=self.abnormal_value_list)
        elif replace_area == 'col':
            if output_format is not None:
                f = functools.partial(
                    Imputer.
                    replace_missing_value_with_cols_transform_value_format,
                    transform_list=transform_value,
                    missing_value_list=self.abnormal_value_list,
                    output_format=output_format,
                    skip_cols=set(skip_cols))
            else:
                f = functools.partial(
                    Imputer.replace_missing_value_with_cols_transform_value,
                    transform_list=transform_value,
                    missing_value_list=self.abnormal_value_list,
                    skip_cols=set(skip_cols))
        else:
            raise ValueError(
                "Unknown replace area {} in Imputer".format(replace_area))

        return data.mapValues(f)
Example #6
    def __get_cols_transform_value(self, data, replace_method, quantile=None):
        summary_obj = MultivariateStatisticalSummary(
            data, -1, abnormal_list=self.missing_value_list)
        header = get_header(data)

        if replace_method == consts.MIN:
            cols_transform_value = summary_obj.get_min()
        elif replace_method == consts.MAX:
            cols_transform_value = summary_obj.get_max()
        elif replace_method == consts.MEAN:
            cols_transform_value = summary_obj.get_mean()
        elif replace_method == consts.MEDIAN:
            cols_transform_value = summary_obj.get_median()
        elif replace_method == consts.QUANTILE:
            if quantile > 1 or quantile < 0:
                raise ValueError(
                    "quantile should be between 0 and 1, but got: {}".format(
                        quantile))
            cols_transform_value = summary_obj.get_quantile_point(quantile)
        else:
            raise ValueError(
                "Unknown replace method:{}".format(replace_method))

        cols_transform_value = [
            round(cols_transform_value[key], 6) for key in header
        ]
        return cols_transform_value
Example #7
    def _get_scale_column_idx(self, data):
        data_shape = self._get_data_shape(data)
        if self.param_scale_col_indexes != -1:
            if isinstance(self.param_scale_col_indexes, list):
                if len(self.param_scale_col_indexes) > 0:
                    max_col_idx = max(self.param_scale_col_indexes)
                    if max_col_idx >= data_shape:
                        raise ValueError(
                            "max column index is {}, should be less than data shape {}"
                            .format(max_col_idx, data_shape))
                scale_column_idx = self.param_scale_col_indexes

                header = data_overview.get_header(data)

                scale_names = set(header).intersection(
                    set(self.param_scale_names))
                idx_from_name = list(
                    map(lambda n: header.index(n), scale_names))

                scale_column_idx = scale_column_idx + idx_from_name
                scale_column_idx = list(set(scale_column_idx))
                scale_column_idx.sort()
            else:
                LOGGER.warning(
                    "parameter scale_col_indexes should be a list, got {}; scaling all columns"
                    .format(type(self.param_scale_col_indexes)))
                scale_column_idx = [i for i in range(data_shape)]
        else:
            scale_column_idx = [i for i in range(data_shape)]

        return scale_column_idx
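A compact sketch of the index/name merge performed above, assuming the usual list-of-strings header; merge_column_selection is an illustrative name.

    def merge_column_selection(header, indexes, names):
        # Union of directly given indexes and indexes resolved from names
        from_names = [header.index(n) for n in set(header) & set(names)]
        return sorted(set(indexes) | set(from_names))

    header = ["x0", "x1", "x2", "x3"]
    print(merge_column_selection(header, [0, 2], ["x2", "x3"]))  # [0, 2, 3]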
Example #8
    def _setup_bin_inner_param(self, data_instances,
                               params: FeatureBinningParam):
        if self.schema is not None:
            return

        self.header = get_header(data_instances)
        LOGGER.debug("_setup_bin_inner_param, get header: {}".format(
            self.header))

        self.schema = data_instances.schema
        self.bin_inner_param.set_header(self.header)
        if params.bin_indexes == -1:
            self.bin_inner_param.set_bin_all()
        else:
            self.bin_inner_param.add_bin_indexes(params.bin_indexes)
            self.bin_inner_param.add_bin_names(params.bin_names)

        self.bin_inner_param.add_category_indexes(params.category_indexes)
        self.bin_inner_param.add_category_names(params.category_names)

        if params.transform_param.transform_cols == -1:
            self.bin_inner_param.set_transform_all()
        else:
            self.bin_inner_param.add_transform_bin_indexes(
                params.transform_param.transform_cols)
            self.bin_inner_param.add_transform_bin_names(
                params.transform_param.transform_names)
        self.binning_obj.set_bin_inner_param(self.bin_inner_param)
        LOGGER.debug("After _setup_bin_inner_param, header: {}".format(
            self.header))
Example #9
    def _init_cols(self, data_instances):
        self.schema = data_instances.schema
        header = get_header(data_instances)
        self.original_header = copy.deepcopy(header)
        LOGGER.debug("When init, original_header: {}".format(self.original_header))
        if self.cols_index == -1:
            self.cols = [i for i in range(len(header))]
        else:
            cols = []
            for idx in self.cols_index:
                try:
                    idx = int(idx)
                except ValueError:
                    raise ValueError("In binning module, selected index: {} is not an integer".format(idx))

                if idx >= len(header):
                    raise ValueError(
                        "In binning module, selected index: {} exceeds data dimension".format(idx))
                cols.append(idx)
            self.cols = cols

        # Mark all columns as kept at the beginning.
        self.left_cols_index = [i for i in range(len(header))]
        for col_idx in self.cols:
            self.left_cols[col_idx] = True
        self.header = header
Example #10
    def _parse_cols(self, data_instances):
        if self.header is not None and len(self.header) != 0:
            return

        LOGGER.debug("Before Binning, schema is: {}".format(
            data_instances.schema))
        header = get_header(data_instances)
        self.schema = data_instances.schema
        self.header = header

        if self.cols_index == -1:
            if header is None:
                raise RuntimeError(
                    'Cannot get feature header, please check input data')
            self.cols = [i for i in range(len(header))]
        else:
            self.cols = self.cols_index

        if self.transform_cols_idx == -1:
            self.transform_cols_idx = self.cols

        self.cols_dict = {}
        for col in self.cols:
            col_name = header[col]
            self.cols_dict[col_name] = col
Example #11
    def fit_split_points(self, data_instances):
        """
        Apply the binning method

        Parameters
        ----------
        data_instances : DTable
            The input data

        Returns
        -------
        split_points : dict
            Each value is the list of split points for one feature; each element
            in the list is one split point.
            e.g.
            split_points = {'x1': [0.1, 0.2, 0.3, 0.4 ...],    # The first feature
                            'x2': [1, 2, 3, 4, ...],           # The second feature
                            ...                         # Other features
                            }
        """
        header = data_overview.get_header(data_instances)
        self._default_setting(header)
        percent_value = 1.0 / self.bin_num

        # calculate the split points
        percentile_rate = [i * percent_value for i in range(1, self.bin_num)]
        percentile_rate.append(1.0)
        is_sparse = data_overview.is_sparse_data(data_instances)

        self._fit_split_point(data_instances, is_sparse, percentile_rate)

        self.fit_category_features(data_instances)
        return self.bin_results.all_split_points
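On a single dense column, the percentile rates built above boil down to one numpy quantile call; a sketch with assumed names, not the distributed implementation.

    import numpy as np

    def quantile_split_points(column, bin_num):
        # Rates 1/bin_num, 2/bin_num, ..., (bin_num-1)/bin_num, plus 1.0
        rates = [i / bin_num for i in range(1, bin_num)] + [1.0]
        return np.quantile(column, rates)

    print(quantile_split_points(np.arange(100.0), 4))
    # [24.75 49.5  74.25 99.  ]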
Example #12
    def _setup_bin_inner_param(self, data_instances, params):
        if self.bin_inner_param is not None:
            return
        self.bin_inner_param = BinInnerParam()

        header = get_header(data_instances)
        LOGGER.debug("_setup_bin_inner_param, get header length: {}".format(
            len(header)))

        self.schema = data_instances.schema
        self.bin_inner_param.set_header(header)
        if params.bin_indexes == -1:
            self.bin_inner_param.set_bin_all()
        else:
            self.bin_inner_param.add_bin_indexes(params.bin_indexes)
            self.bin_inner_param.add_bin_names(params.bin_names)

        self.bin_inner_param.add_category_indexes(params.category_indexes)
        self.bin_inner_param.add_category_names(params.category_names)

        if params.transform_param.transform_cols == -1:
            self.bin_inner_param.set_transform_all()
        else:
            self.bin_inner_param.add_transform_bin_indexes(
                params.transform_param.transform_cols)
            self.bin_inner_param.add_transform_bin_names(
                params.transform_param.transform_names)
        self.set_bin_inner_param(self.bin_inner_param)
Example #13
    def _init_cols(self, data_instances):
        header = get_header(data_instances)
        if self.cols == -1:
            self.cols = header

        for col in self.cols:
            col_index = header.index(col)
            self.cols_dict[col] = col_index
Example #14
    def fit_split_points(self, data_instances):
        """
        Apply the binning method

        Parameters
        ----------
        data_instances : DTable
            The input data

        Returns
        -------
        split_points : dict
            Each value is the list of split points for one feature; each element
            in the list is one split point.
            e.g.
            split_points = {'x1': [0.1, 0.2, 0.3, 0.4 ...],    # The first feature
                            'x2': [1, 2, 3, 4, ...],           # The second feature
                            ...                                # Other features
                            }

        """
        header = data_overview.get_header(data_instances)
        self._default_setting(header)
        percent_value = 1.0 / self.bin_num

        # calculate the split points
        percentile_rate = [i * percent_value for i in range(1, self.bin_num)]
        percentile_rate.append(1.0)
        is_sparse = data_overview.is_sparse_data(data_instances)

        if self.summary_dict is None:
            f = functools.partial(self.approxi_quantile,
                                  params=self.params,
                                  abnormal_list=self.abnormal_list,
                                  cols_dict=self.bin_inner_param.bin_cols_map,
                                  header=self.header,
                                  is_sparse=is_sparse)
            summary_dict = data_instances.mapPartitions(f)
            summary_dict = summary_dict.reduce(self.merge_summary_dict)
            if is_sparse:
                total_count = data_instances.count()
                for _, summary_obj in summary_dict.items():
                    summary_obj.set_total_count(total_count)

            self.summary_dict = summary_dict
        else:
            summary_dict = self.summary_dict
        for col_name, summary in summary_dict.items():
            split_point = []
            for percen_rate in percentile_rate:
                s_p = summary.query(percen_rate)
                if s_p not in split_point:
                    split_point.append(s_p)
            self.bin_results.put_col_split_points(col_name, split_point)

        self.fit_category_features(data_instances)
        return self.bin_results.all_split_points
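The dedup loop at the end above drops repeated quantile answers so flat value regions do not create empty bins; a toy sketch with an assumed query function standing in for the quantile summary.

    def dedup_splits(query, rates):
        points = []
        for r in rates:
            p = query(r)
            if p not in points:  # skip duplicates from flat value regions
                points.append(p)
        return points

    data = sorted([1, 1, 1, 2, 3])
    query = lambda r: data[min(int(r * len(data)), len(data) - 1)]
    print(dedup_splits(query, [0.25, 0.5, 0.75, 1.0]))  # [1, 2, 3]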
Example #15
    def init_bucket(self, data_instances):
        header = data_overview.get_header(data_instances)
        self._default_setting(header)

        init_bucket_param = copy.deepcopy(self.params)
        init_bucket_param.bin_num = self.optimal_param.init_bin_nums
        if self.optimal_param.init_bucket_method == consts.QUANTILE:
            init_binning_obj = QuantileBinningTool(param_obj=init_bucket_param,
                                                   allow_duplicate=False)
        else:
            init_binning_obj = BucketBinning(params=init_bucket_param)
        init_binning_obj.set_bin_inner_param(self.bin_inner_param)
        init_split_points = init_binning_obj.fit_split_points(data_instances)
        is_sparse = data_overview.is_sparse_data(data_instances)

        bucket_dict = dict()
        for col_name, sps in init_split_points.items():

            bucket_list = []
            for idx, sp in enumerate(sps):
                bucket = bucket_info.Bucket(idx,
                                            self.adjustment_factor,
                                            right_bound=sp)
                if idx == 0:
                    bucket.left_bound = -math.inf
                    bucket.set_left_neighbor(None)
                else:
                    bucket.left_bound = sps[idx - 1]
                bucket.event_total = self.event_total
                bucket.non_event_total = self.non_event_total
                bucket_list.append(bucket)
            bucket_list[-1].set_right_neighbor(None)
            bucket_dict[col_name] = bucket_list

        convert_func = functools.partial(
            self.convert_data_to_bucket,
            split_points=init_split_points,
            headers=self.header,
            bucket_dict=copy.deepcopy(bucket_dict),
            is_sparse=is_sparse,
            get_bin_num_func=self.get_bin_num)
        bucket_table = data_instances.mapReducePartitions(
            convert_func, self.merge_bucket_list)

        return bucket_table
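An illustrative stand-in for the bucket chaining above (the real bucket_info.Bucket also carries event counts and neighbor links); it shows how each left bound follows the previous split point, starting from -inf.

    import math
    from dataclasses import dataclass

    @dataclass
    class Bucket:
        idx: int
        left_bound: float
        right_bound: float

    def make_buckets(split_points):
        bounds = [-math.inf] + list(split_points)
        return [Bucket(i, bounds[i], bounds[i + 1])
                for i in range(len(split_points))]

    print(make_buckets([0.5, 1.5]))
    # [Bucket(idx=0, left_bound=-inf, right_bound=0.5),
    #  Bucket(idx=1, left_bound=0.5, right_bound=1.5)]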
Example #16
    def __get_cols_transform_value(self,
                                   data,
                                   replace_method,
                                   replace_value=None):
        """

        Parameters
        ----------
        data: input data
        replace_method: dict of (column name, replace method name) pairs
        replace_value: a single value, or a list with one value per column,
            used by the designated replace method

        Returns
        -------
        list of transform value for each column, length equal to feature count of input data

        """
        summary_obj = MultivariateStatisticalSummary(
            data, -1, abnormal_list=self.abnormal_value_list)
        header = get_header(data)
        cols_transform_value = {}
        if isinstance(replace_value, list):
            if len(replace_value) != len(header):
                raise ValueError(
                    f"replace value {replace_value} length does not match with header {header}, please check."
                )
        for i, feature in enumerate(header):
            if replace_method[feature] is None:
                transform_value = 0
            elif replace_method[feature] == consts.MIN:
                transform_value = summary_obj.get_min()[feature]
            elif replace_method[feature] == consts.MAX:
                transform_value = summary_obj.get_max()[feature]
            elif replace_method[feature] == consts.MEAN:
                transform_value = summary_obj.get_mean()[feature]
            elif replace_method[feature] == consts.MEDIAN:
                transform_value = summary_obj.get_median()[feature]
            elif replace_method[feature] == consts.DESIGNATED:
                if isinstance(replace_value, list):
                    transform_value = replace_value[i]
                else:
                    transform_value = replace_value
                LOGGER.debug(
                    f"replace value for feature {feature} is: {transform_value}"
                )
            else:
                raise ValueError(
                    "Unknown replace method: {}".format(replace_method[feature]))
            cols_transform_value[feature] = transform_value

        LOGGER.debug(f"cols_transform value is: {cols_transform_value}")
        cols_transform_value = [cols_transform_value[key] for key in header]
        LOGGER.debug(f"cols_transform value is: {cols_transform_value}")
        return cols_transform_value
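A pure-Python sketch of the per-column dispatch above, with the statistics module standing in for the distributed summary object; fill_value and the method strings are assumptions for illustration.

    import statistics

    def fill_value(column, method, designated=None):
        dispatch = {
            None: lambda c: 0,          # skipped column defaults to 0
            "min": min,
            "max": max,
            "mean": statistics.mean,
            "median": statistics.median,
            "designated": lambda c: designated,
        }
        if method not in dispatch:
            raise ValueError(f"Unknown replace method: {method}")
        return dispatch[method](column)

    print(fill_value([1, 2, 10], "median"))  # 2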
Example #17
 def __init_cols(self, data_instances, cols_index, stat_order, bias):
     header = data_overview.get_header(data_instances)
     self.header = header
     if cols_index == -1:
         self.cols_index = [i for i in range(len(header))]
     else:
         self.cols_index = cols_index
     LOGGER.debug(f"col_index: {cols_index}, self.col_index: {self.cols_index}")
      self.cols_dict = {header[idx]: idx for idx in self.cols_index}
     self.summary_statistics = SummaryStatistics(length=len(self.cols_index),
                                                 abnormal_list=self.abnormal_list,
                                                 stat_order=stat_order,
                                                 bias=bias)
Example #18
    def fit_split_points(self, data_instances):
        header = data_overview.get_header(data_instances)
        self._default_setting(header)

        if self.event_total is None or self.non_event_total is None:
            self.event_total, self.non_event_total = self.get_histogram(data_instances)
        LOGGER.debug("In fit split points, event_total: {}, non_event_total: {}".format(self.event_total,
                                                                                        self.non_event_total))

        bucket_table = self.init_bucket(data_instances)
        sample_count = data_instances.count()
        self.fit_buckets(bucket_table, sample_count)
        self.fit_category_features(data_instances)
Example #19
    def fit_split_points(self, data_instances):
        """
        Apply the binning method

        Parameters
        ----------
        data_instances : DTable
            The input data

        Returns
        -------
        split_points : dict
            Each value is the list of split points for one feature; each element
            in the list is one split point.
            e.g.
            split_points = {'x1': [0.1, 0.2, 0.3, 0.4 ...],    # The first feature
                            'x2': [1, 2, 3, 4, ...],           # The second feature
                            ...                                # Other features
                            }

        """
        header = data_overview.get_header(data_instances)
        self._default_setting(header)

        statistics = MultivariateStatisticalSummary(
            data_instances,
            self.bin_inner_param.bin_indexes,
            abnormal_list=self.abnormal_list)
        max_dict = statistics.get_max()
        min_dict = statistics.get_min()
        for col_name, max_value in max_dict.items():
            min_value = min_dict.get(col_name)
            split_points = []
            # bin_num points in total: one pinned below min_value, then
            # bin_num - 2 interior points spaced L apart, then max_value
            L = (max_value - min_value) / (self.bin_num - 1)
            split_points.append(min_value - 1)
            for k in range(self.bin_num - 2):
                s_p = min_value + (k + 1) * L
                split_points.append(s_p)
            split_points.append(max_value)
            self.bin_results.put_col_split_points(col_name, split_points)
        self.fit_category_features(data_instances)
        return self.bin_results.all_split_points
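The arithmetic above, isolated: one split point pinned below the minimum, bin_num - 2 interior points spaced (max - min) / (bin_num - 1) apart, and the maximum last; a standalone sketch.

    def bucket_split_points(min_value, max_value, bin_num):
        width = (max_value - min_value) / (bin_num - 1)
        points = [min_value - 1]  # catch-all bound below the minimum
        points += [min_value + (k + 1) * width for k in range(bin_num - 2)]
        points.append(max_value)
        return points

    print(bucket_split_points(0.0, 9.0, 4))  # [-1.0, 3.0, 6.0, 9.0]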
Example #20
 def _parse_cols(self, data_instances):
     if self.header is not None and len(self.header) != 0:
         return
     header = get_header(data_instances)
     self.header = header
     if self.cols == -1:
         if header is None:
             raise RuntimeError('Cannot get feature header, please check input data')
         self.cols = header
     self.cols_dict = {}
     for col in self.cols:
         col_index = header.index(col)
         self.cols_dict[col] = col_index
Example #21
    def _init_cols(self, data_instances):
        # Already initialized
        if self.header is not None:
            return
        if data_instances is None:
            return

        header = get_header(data_instances)
        self.header = header
        if self.cols == -1:
            self.cols = [x for x in range(len(header))]

        for col_index in self.cols:
            col_name = header[col_index]
            self.cols_dict[col_name] = col_index
Example #22
    def __get_cols_transform_method(data, replace_method, col_replace_method):
        header = get_header(data)
        if col_replace_method:
            replace_method_per_col = {
                col_name: col_replace_method.get(col_name, replace_method)
                for col_name in header
            }
        else:
            replace_method_per_col = {
                col_name: replace_method
                for col_name in header
            }
        skip_cols = [v for v in header if replace_method_per_col[v] is None]

        return replace_method_per_col, skip_cols
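The resolution rule above in isolation: per-column overrides win over the global method, and None marks a skipped column; resolve_methods is an illustrative name.

    def resolve_methods(header, global_method, overrides=None):
        overrides = overrides or {}
        per_col = {c: overrides.get(c, global_method) for c in header}
        skip = [c for c in header if per_col[c] is None]
        return per_col, skip

    print(resolve_methods(["a", "b"], "mean", {"b": None}))
    # ({'a': 'mean', 'b': None}, ['b'])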
Example #23
    def __init__(self, data_instances, cols):
        self.finish_fit = False
        self.summary_statistics = []
        self.medians = None
        self.data_instances = data_instances

        header = get_header(data_instances)

        if cols == -1:
            self.cols = header
        else:
            self.cols = cols

        self.cols_dict = {}
        for col in self.cols:
            col_index = header.index(col)
            self.cols_dict[col] = col_index
Example #24
    def fit(self, data):
        """
         Apply standard scale for input data
         Parameters
         ----------
         data: data_instance, input data

         Returns
         ----------
         data:data_instance, data after scale
         mean: list, each column mean value
         std: list, each column standard deviation
         """
        if not self.with_mean and not self.with_std:
            shape = data_overview.get_features_shape(data)
            mean = [0 for _ in range(shape)]
            std = [1 for _ in range(shape)]
            return data, mean, std

        else:
            summary_obj = MultivariateStatisticalSummary(data, -1)
            mean = None
            std = None
            header = get_header(data)

            if self.with_mean:
                mean = summary_obj.get_mean()
                mean = [mean[key] for key in header]

            if self.with_std:
                std = summary_obj.get_std_variance()
                std = [std[key] for key in header]

            if mean is None and std is not None:
                mean = [0 for _ in std]
            elif mean is not None and std is None:
                std = [1 for _ in mean]

            if not mean or not std:
                raise ValueError("mean or std is None")

            f = functools.partial(self.__scale, mean=mean, std=std)
            data = data.mapValues(f)

            return data, mean, std
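An in-memory sketch of the same fit on a numpy matrix, assuming per-column mean/std and the usual guard for constant columns; standard_scale is not the library's API.

    import numpy as np

    def standard_scale(X, with_mean=True, with_std=True):
        mean = X.mean(axis=0) if with_mean else np.zeros(X.shape[1])
        std = X.std(axis=0) if with_std else np.ones(X.shape[1])
        std = np.where(std == 0, 1, std)  # constant columns scale by 1
        return (X - mean) / std, mean, std

    scaled, mean, std = standard_scale(np.array([[1.0, 10.0], [3.0, 10.0]]))
    print(scaled)  # [[-1.  0.] [ 1.  0.]]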
Example #25
 def _init_select_params(self, data_instances):
     if self.schema is not None:
         return
     self.schema = data_instances.schema
     header = get_header(data_instances)
     self.curt_select_properties.set_header(header)
     self.curt_select_properties.set_last_left_col_indexes(
         [x for x in range(len(header))])
     if self.model_param.select_col_indexes == -1:
         self.curt_select_properties.set_select_all_cols()
     else:
         self.curt_select_properties.add_select_col_indexes(
             self.model_param.select_col_indexes)
     self.curt_select_properties.add_select_col_names(
         self.model_param.select_names)
     self.completed_selection_result.set_header(header)
     self.completed_selection_result.set_select_col_names(
         self.curt_select_properties.select_col_names)
Example #26
    def _init_params(self, data_instances):
        if len(self.schema) == 0:
            self.schema = data_instances.schema

        if self.inner_param is not None:
            return
        self.inner_param = OneHotInnerParam()
        LOGGER.debug("In _init_params, schema is: {}".format(self.schema))
        header = get_header(data_instances)
        self.add_summary("original_dimension", len(header))
        self.inner_param.set_header(header)

        if self.model_param.transform_col_indexes == -1:
            self.inner_param.set_transform_all()
        else:
            self.inner_param.add_transform_indexes(self.model_param.transform_col_indexes)
            self.inner_param.add_transform_names(self.model_param.transform_col_names)
Example #27
    def fit(self, data_instances, validate_data=None):
        if not self.need_run:
            return
        # check if empty table
        abnormal_detection.empty_table_detection(data_instances)
        abnormal_detection.empty_feature_detection(data_instances)
        # get model
        model = self.get_model()
        # get header
        self.header = data_overview.get_header(data_instances)

        X_table = data_instances.mapValues(lambda v: v.features)
        y_table = data_instances.mapValues(lambda v: v.label)

        X = np.array([v[1] for v in list(X_table.collect())])
        y = np.array([v[1] for v in list(y_table.collect())])

        self.model_fit = model.fit(X, y)
Example #28
    def _init_param(self, data_instances):
        if self.schema is None or len(self.schema) == 0:
            self.schema = data_instances.schema

        if self.inner_param is not None:
            return
        self.inner_param = StatisticInnerParam()
        LOGGER.debug("In _init_param, schema is: {}".format(self.schema))
        header = get_header(data_instances)
        self.inner_param.set_header(header)
        if self.model_param.column_indexes == -1:
            self.inner_param.set_static_all()
        else:
            self.inner_param.add_static_indices(self.model_param.column_indexes)
            self.inner_param.add_static_names(self.model_param.column_names)
        LOGGER.debug(f"column_indexes: {self.model_param.column_indexes}, inner_param"
                     f" static_indices: {self.inner_param.static_indices}")

        return self
Example #29
    def _init_cols(self, data_instances):
        self.schema = data_instances.schema
        header = get_header(data_instances)

        if self.cols_index == -1:
            to_select_cols_all = header
        else:
            to_select_cols_all = []
            for idx in self.cols_index:
                try:
                    idx = int(idx)
                except ValueError:
                    raise ValueError("In binning module, selected index: {} is not an integer".format(idx))

                if idx >= len(header):
                    raise ValueError(
                        "In binning module, selected index: {} exceeds data dimension".format(idx))
                to_select_cols_all.append(header[idx])

        self.filter_result = SelfFilterResult(header=header, to_select_cols_all=to_select_cols_all)
        self.header = header
Example #30
    def _init_cols(self, data_instances):
        header = get_header(data_instances)
        self.schema = data_instances.schema
        self.header = header
        if self.cols_index == -1:
            self.cols = header
        else:
            cols = []
            for idx in self.cols_index:
                try:
                    idx = int(idx)
                except ValueError:
                    raise ValueError(
                        "In binning module, selected index: {} is not an integer".format(idx))

                if idx >= len(header):
                    raise ValueError(
                        "In binning module, selected index: {} exceeds data dimension".format(idx))
                cols.append(header[idx])
            self.cols = cols
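The validation pattern repeated across these _init_cols variants, condensed into one assumed helper: coerce each index to int, bound-check it, then resolve it to a column name.

    def select_columns(header, cols_index):
        if cols_index == -1:
            return list(header)
        cols = []
        for idx in cols_index:
            idx = int(idx)  # raises ValueError for non-integer input
            if idx >= len(header):
                raise ValueError(
                    "selected index: {} exceeds data dimension".format(idx))
            cols.append(header[idx])
        return cols

    print(select_columns(["x0", "x1", "x2"], [0, 2]))  # ['x0', 'x2']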