Ejemplo n.º 1
0
class HeteroBinningGuestWorkflow(WorkFlow):
    """Guest-side workflow that fits or applies a hetero feature binning model.

    The workflow builds a ``HeteroFeatureBinningGuest`` model from the runtime
    configuration and, in :meth:`run`, either fits it on the training table or
    loads a saved model and transforms the table.
    """

    def _initialize(self, config_path):
        """Set role/mode, build the model, then load the workflow parameters."""
        self._initialize_role_and_mode()
        self._initialize_model(config_path)
        self._initialize_workflow_param(config_path)

    def _initialize_role_and_mode(self):
        """Mark this workflow as the guest party of a hetero task."""
        self.role = consts.GUEST
        self.mode = consts.HETERO

    def _initialize_intersect(self, config):
        """Intentionally a no-op: this workflow sets up no intersection step."""

    def _initialize_model(self, runtime_conf_path):
        """Parse and validate the binning parameters and create the model.

        :param runtime_conf_path: path handed to
            ``ParamExtract.parse_param_from_config``.
        """
        self.binning_param = ParamExtract.parse_param_from_config(
            FeatureBinningParam(), runtime_conf_path)
        FeatureBinningParamChecker.check_param(self.binning_param)
        self.model = HeteroFeatureBinningGuest(self.binning_param)
        LOGGER.debug("Guest model started")

    def save_binning_result(self):
        """Save the model under the configured table/namespace.

        :return: whatever ``self.model.save_model`` returns.
        """
        params = self.workflow_param
        return self.model.save_model(params.model_table,
                                     params.model_namespace)

    @status_tracer_decorator.status_trace
    def run(self):
        """Execute the binning task described by the workflow parameters.

        :raises TypeError: if ``workflow_param.method`` is not ``"binning"``.
        """
        self._init_argument()

        # Reject unsupported methods up front; only "binning" is handled.
        if self.workflow_param.method != "binning":
            raise TypeError("method %s is not support yet" %
                            (self.workflow_param.method))

        fitting = self.binning_param.process_method == 'fit'
        data = self.gen_data_instance(
            self.workflow_param.train_input_table,
            self.workflow_param.train_input_namespace,
            mode='fit' if fitting else 'transform')

        if fitting:
            if self.binning_param.local_only:
                self.model.fit_local(data)
            else:
                self.model.fit(data)
        else:
            # Transforming needs a previously saved model.
            self.load_model()
            if self.binning_param.local_only:
                self.model.transform_local(data)
            else:
                self.model.transform(data)

        # Both the fit and the transform path persist the model afterwards.
        self.save_binning_result()

        LOGGER.info("Task end")
class HeteroBinningGuestWorkflow(WorkFlow):
    """Guest-side hetero binning workflow followed by one-hot encoding.

    :meth:`run` fits (or loads and applies) a ``HeteroFeatureBinningGuest``
    model, passes the data through the one-hot encoder helpers and finally
    logs a sample of the resulting rows.
    """

    def _initialize(self, config_path):
        """Set role/mode, build the model, then load the workflow parameters."""
        self._initialize_role_and_mode()
        self._initialize_model(config_path)
        self._initialize_workflow_param(config_path)

    def _initialize_role_and_mode(self):
        """Mark this workflow as the guest party of a hetero task."""
        self.role = consts.GUEST
        self.mode = consts.HETERO

    def _initialize_intersect(self, config):
        """Intentionally a no-op: this workflow sets up no intersection step."""

    def _initialize_model(self, runtime_conf_path):
        """Parse and validate the binning parameters and create the model.

        :param runtime_conf_path: path handed to
            ``ParamExtract.parse_param_from_config``.
        """
        self.binning_param = ParamExtract.parse_param_from_config(FeatureBinningParam(), runtime_conf_path)
        FeatureBinningParamChecker.check_param(self.binning_param)
        self.model = HeteroFeatureBinningGuest(self.binning_param)
        LOGGER.debug("Guest model started")

    def save_binning_result(self):
        """Save the model under the configured table/namespace.

        :return: whatever ``self.model.save_model`` returns.
        """
        params = self.workflow_param
        return self.model.save_model(params.model_table, params.model_namespace)

    @status_tracer_decorator.status_trace
    def run(self):
        """Execute the binning task and show a sample of the encoded data.

        :raises TypeError: if ``workflow_param.method`` is not ``"binning"``.
        """
        self._init_argument()

        # Reject unsupported methods up front; only "binning" is handled.
        if self.workflow_param.method != "binning":
            raise TypeError("method %s is not support yet" % (self.workflow_param.method))

        is_fit = self.binning_param.process_method == 'fit'
        data = self.gen_data_instance(self.workflow_param.train_input_table,
                                      self.workflow_param.train_input_namespace,
                                      mode='fit' if is_fit else 'transform')
        LOGGER.debug("After dataio, header is : {}".format(data.schema))

        if is_fit:
            if self.binning_param.local_only:
                self.model.fit_local(data)
            else:
                LOGGER.debug("Start model fit")
                self.model.fit(data)
            self.model.save_model(self.workflow_param.model_table, self.workflow_param.model_namespace)
            data = self.one_hot_encoder_fit_transform(data)
        else:
            # Transforming needs a previously saved model.
            self.model.load_model(self.workflow_param.model_table, self.workflow_param.model_namespace)
            if self.binning_param.local_only:
                self.model.transform_local(data)
            else:
                self.model.transform(data)
            self.save_binning_result()
            data = self.one_hot_encoder_transform(data)

        self._show_data(data)

        LOGGER.info("Task end")

    def _show_data(self, data_instances):
        """Log the schema and the features of at most 20 collected rows.

        :param data_instances: table exposing ``collect()`` (yielding
            key/value pairs) and ``schema``.
        """
        rows = data_instances.collect()
        LOGGER.debug("data header: {}".format(data_instances.schema))
        for shown, (_, instance) in enumerate(rows, start=1):
            LOGGER.debug("new data is :{}".format(instance.features))
            if shown >= 20:
                break
Ejemplo n.º 3
0
class HeteroFeatureSelectionGuest(BaseHeteroFeatureSelection):
    def __init__(self, params):
        """Create a guest-side selector with empty selection state.

        :param params: selection parameters; ``local_only`` is read here and
            ``bin_param`` is read through ``self.params`` after the base
            initialiser has run.
        """
        super(HeteroFeatureSelectionGuest, self).__init__(params)

        # Configuration taken from the parameters.
        self.local_only = params.local_only
        self.bin_param = self.params.bin_param
        self.flowid = ''

        # Column indices still selected, for this party and for the host.
        self.left_cols = None
        self.host_left_cols = None

        # IV attributes produced by binning, for this party and for the host.
        self.guest_iv_attrs = None
        self.host_iv_attrs = None

        # Lazily created helpers and bookkeeping.
        self.binning_model = None
        self.static_obj = None
        self.send_times = 0
        self.results = []

    def fit(self, data_instances):
        """Run every configured filter method over ``data_instances``.

        Starts from a copy of ``self.cols`` and applies each entry of
        ``self.filter_method`` in order, stopping early (with a warning) as
        soon as no column is left.

        :param data_instances: input table; its ``schema`` must provide
            ``get('header')``.
        """
        self._abnormal_detection(data_instances)
        schema = data_instances.schema
        self.header = schema.get('header')  # ['x1', 'x2', 'x3' ... ]

        self._parse_cols(data_instances)
        self.left_cols = self.cols.copy()

        for method in self.filter_method:
            self.filter_one_method(data_instances, method)
            if len(self.left_cols) > 0:
                continue
            # Nothing survived: further filters would have nothing to do.
            LOGGER.warning(
                "After filter methods, none of feature left. Please check your filter parameters"
            )
            break

    def fit_local(self, data_instances):
        """Select features using only this party's data.

        Delegates the filtering to a fresh ``FeatureSelection`` object built
        from ``self.params`` and copies its outcome (``left_cols``,
        ``results`` and, when ``self.cols`` is still ``-1``, ``select_cols``)
        onto this instance.

        :param data_instances: input table; its ``schema`` must provide
            ``get('header')``.
        """
        self._abnormal_detection(data_instances)
        self.header = data_instances.schema.get(
            'header')  # ['x1', 'x2', 'x3' ... ]

        feature_selection_obj = FeatureSelection(self.params)
        # Run the filter exactly once. The previous version called it twice in
        # a row on the same selector, repeating the whole pass; if the
        # selector accumulates per-call results this would also have
        # duplicated ``results`` -- TODO confirm against FeatureSelection.
        self.left_cols = feature_selection_obj.filter(data_instances)
        if self.cols == -1:
            # -1 means "all columns"; adopt the concrete list the selector holds.
            self.cols = feature_selection_obj.select_cols

        self.results = feature_selection_obj.results

    def fit_local_transform(self, data_instances):
        """Fit locally, then return ``data_instances`` reduced to the kept columns.

        :param data_instances: input table; its ``schema`` must provide
            ``get('header')``.
        :return: the table produced by :meth:`transform`, with
            ``schema['header']`` set to ``self.header``.
        """
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)

        schema = data_instances.schema
        self.header = schema.get('header')  # ['x1', 'x2', 'x3' ... ]

        self.fit_local(data_instances)
        selected = self.transform(data_instances)
        selected.schema['header'] = self.header
        return selected

    def transform(self, data_instances):
        """Return ``data_instances`` mapped through the current column selection.

        :param data_instances: input table; its ``schema`` must provide
            ``get('header')``.
        :return: the table produced by ``_transfer_data``, with
            ``schema['header']`` set to ``self.header`` as it stands after
            ``_transfer_data`` has run.
        """
        self._abnormal_detection(data_instances)
        self._parse_cols(data_instances)

        schema = data_instances.schema
        self.header = schema.get('header')  # ['x1', 'x2', 'x3' ... ]

        selected = self._transfer_data(data_instances)
        selected.schema['header'] = self.header
        return selected

    def fit_transform(self, data_instances):
        """Fit the selection, then return the transformed table.

        :param data_instances: input table; its ``schema`` must provide
            ``get('header')``.
        :return: the table produced by :meth:`transform`, with
            ``schema['header']`` set to ``self.header``.
        """
        self._abnormal_detection(data_instances)

        schema = data_instances.schema
        self.header = schema.get('header')  # ['x1', 'x2', 'x3' ... ]

        self.fit(data_instances)
        selected = self.transform(data_instances)
        selected.schema['header'] = self.header
        return selected

    def filter_one_method(self, data_instances, method):
        """Apply one filter method and update the surviving columns.

        Each recognised ``method`` constant builds the matching filter from
        ``feature_selection``, narrows ``self.left_cols`` and appends the
        filter's result to ``self.results``. A ``method`` that matches none of
        the constants below leaves the state untouched.

        :param data_instances: input table handed to the filters / binning.
        :param method: one of the ``consts`` filter identifiers checked below.
        """
        if method == consts.IV_VALUE_THRES:
            self._calculates_iv_attrs(data_instances, flowid_postfix='iv_value')
            value_filter = feature_selection.IVValueSelectFilter(
                self.params.iv_param, self.left_cols, self.guest_iv_attrs)
            kept_cols = value_filter.filter()
            self.results.append(value_filter.to_result())

            # Keep the IV attributes aligned with the reduced column list.
            self.guest_iv_attrs = self._renew_iv_attrs(
                kept_cols, self.left_cols, self.guest_iv_attrs)
            self.left_cols = kept_cols

            if not self.local_only:
                self._filter_host_iv_value()
            LOGGER.info(
                "Finish iv value threshold filter. Current left cols are: {}".format(self.left_cols))

        if method == consts.IV_PERCENTILE:
            self._calculates_iv_attrs(data_instances, flowid_postfix='iv_percentile')
            percentile_filter = feature_selection.IVPercentileFilter(self.params.iv_param)
            percentile_filter.add_attrs(self.guest_iv_attrs, self.left_cols)
            if not self.local_only:
                percentile_filter.add_attrs(self.host_iv_attrs, self.host_left_cols)
            party_cols = percentile_filter.filter_multiple_parties()
            kept_cols = party_cols[0]
            self.results.append(percentile_filter.to_result())

            # Keep the IV attributes aligned with the reduced column list.
            self.guest_iv_attrs = self._renew_iv_attrs(
                kept_cols, self.left_cols, self.guest_iv_attrs)
            self.left_cols = kept_cols

            # A second entry means the host took part: update and notify it.
            if len(party_cols) > 1:
                kept_host_cols = party_cols[1]
                self.host_iv_attrs = self._renew_iv_attrs(
                    kept_host_cols, self.host_left_cols, self.host_iv_attrs)
                self.host_left_cols = kept_host_cols
                self._send_host_result_cols()
            LOGGER.info(
                "Finish iv percentile filter. Current left cols are: {}".format(self.left_cols))

        if method == consts.COEFFICIENT_OF_VARIATION_VALUE_THRES:
            variation_filter = feature_selection.CoeffOfVarValueFilter(
                self.params.coe_param, self.left_cols, self.static_obj)
            self.left_cols = variation_filter.filter(data_instances)
            # Remember the statistics object so later filters receive it.
            self.static_obj = variation_filter.statics_obj
            self.results.append(variation_filter.to_result())

            LOGGER.info(
                "Finish coeffiecient_of_variation value threshold filter. Current left cols are: {}".format(
                    self.left_cols))

        if method == consts.UNIQUE_VALUE:
            uniqueness_filter = feature_selection.UniqueValueFilter(
                self.params.unique_param, self.left_cols, self.static_obj)
            self.left_cols = uniqueness_filter.filter(data_instances)
            # Remember the statistics object so later filters receive it.
            self.static_obj = uniqueness_filter.statics_obj
            self.results.append(uniqueness_filter.to_result())

            LOGGER.info(
                "Finish unique value filter. Current left cols are: {}".format(self.left_cols))

        if method == consts.OUTLIER_COLS:
            outlier_filter = feature_selection.OutlierFilter(
                self.params.outlier_param, self.left_cols)
            self.left_cols = outlier_filter.filter(data_instances)
            self.results.append(outlier_filter.to_result())
            LOGGER.info(
                "Finish outlier cols filter. Current left cols are: {}".format(self.left_cols))

    def _transfer_data(self, data_instances):
        """Map every value of ``data_instances`` through ``self.select_cols``.

        When ``self.left_cols`` is ``-1`` it is first expanded to every feature
        index reported by ``get_features_shape``.

        :param data_instances: input table exposing ``mapValues``.
        :return: the mapped table.
        :raises RuntimeError: if the feature shape cannot be determined.
        """
        if self.left_cols == -1:
            n_features = get_features_shape(data_instances)
            if n_features is None:
                raise RuntimeError(
                    'Cannot get feature shape, please check input data')
            self.left_cols = list(range(n_features))

        selector = functools.partial(self.select_cols, left_cols=self.left_cols)
        transformed = data_instances.mapValues(selector)

        self._reset_header()
        return transformed

    def _calculates_iv_attrs(self, data_instances, flowid_postfix=''):
        """Run binning to obtain IV attributes for the current ``left_cols``.

        In local-only mode the attributes are computed once and reused on
        later calls. Otherwise the binning model's ``fit`` is called and its
        ``'local'`` / ``'remote'`` entries become the guest and host
        attributes.

        :param data_instances: input table handed to the binning model.
        :param flowid_postfix: suffix appended to ``self.flowid`` to build the
            binning model's flow id.
        """
        # Local-only results do not change between filters; reuse them.
        if self.local_only and self.guest_iv_attrs is not None:
            return

        bin_flow_id = self.flowid + flowid_postfix
        self.bin_param.cols = self.left_cols
        if self.binning_model is not None:
            self.binning_model.reset(self.bin_param, flowid=bin_flow_id)
        else:
            self.binning_model = HeteroFeatureBinningGuest(self.bin_param)
            self.binning_model.set_flowid(bin_flow_id)

        if not self.local_only:
            iv_attrs = self.binning_model.fit(data_instances)
            self.guest_iv_attrs = iv_attrs.get('local')
            self.host_iv_attrs = iv_attrs.get('remote')
            # Host columns are addressed by position in the received attributes.
            self.host_left_cols = list(range(len(self.host_iv_attrs)))
            LOGGER.debug("Host left cols: {}".format(self.host_left_cols))
        elif self.guest_iv_attrs is None:
            self.guest_iv_attrs = self.binning_model.fit_local(
                data_instances=data_instances)
        LOGGER.info("Finish federated binning with host.")

    def _send_host_result_cols(self):
        """Send ``self.host_left_cols`` to the host and bump ``send_times``.

        The transfer tag is generated from ``send_times``, which is
        incremented after every send.
        """
        transfer = self.transfer_variable
        transfer_tag = transfer.generate_transferid(transfer.result_left_cols,
                                                    self.send_times)
        federation.remote(self.host_left_cols,
                          name=transfer.result_left_cols.name,
                          tag=transfer_tag,
                          role=consts.HOST,
                          idx=0)
        self.send_times += 1
        LOGGER.info(
            "Sent result cols from guest to host, result cols are: {}".format(self.host_left_cols))

    def _filter_host_iv_value(self):
        """Filter the host's columns by the IV threshold the host supplies.

        Fetches the threshold via ``federation.get``, applies an
        ``IVValueSelectFilter`` to the host's columns/attributes, stores the
        reduced state and sends the surviving columns back to the host.
        """
        threshold_var = self.transfer_variable.host_iv_threshold
        threshold_tag = self.transfer_variable.generate_transferid(threshold_var)
        host_threshold = federation.get(name=threshold_var.name,
                                        tag=threshold_tag,
                                        idx=0)
        LOGGER.info("Received iv threshold from host, threshold is :{}".format(host_threshold))

        host_filter = feature_selection.IVValueSelectFilter(
            IVSelectionParam(value_threshold=host_threshold),
            self.host_left_cols,
            self.host_iv_attrs)
        kept_host_cols = host_filter.filter()

        # Keep the host IV attributes aligned with the reduced column list.
        self.host_iv_attrs = self._renew_iv_attrs(
            kept_host_cols, self.host_left_cols, self.host_iv_attrs)
        self.host_left_cols = kept_host_cols

        self._send_host_result_cols()

    def _renew_iv_attrs(self, new_left_cols, pre_left_cols, iv_attrs):
        new_iv_list = []
        for left_col in new_left_cols:
            idx = pre_left_cols.index(left_col)
            new_iv_list.append(iv_attrs[idx])
        return new_iv_list

    def _parse_cols(self, data_instances):
        """Expand ``self.cols == -1`` into an explicit list of feature indices.

        Does nothing when ``self.cols`` is anything other than ``-1``.

        :param data_instances: input table handed to ``get_features_shape``.
        :raises RuntimeError: if the feature shape cannot be determined.
        """
        if self.cols != -1:
            return

        n_features = get_features_shape(data_instances)
        if n_features is None:
            raise RuntimeError(
                'Cannot get feature shape, please check input data')
        self.cols = list(range(n_features))