Example #1
 def _display_result(self, block_num=None):
     if block_num is None:
         block_num = self.block_num
     self.callback_metric(metric_name=self.metric_name,
                          metric_namespace=self.metric_namespace,
                          metric_data=[
                              Metric("Coverage", self.coverage),
                              Metric("Block number", block_num)
                          ])
     self.tracker.set_metric_meta(
         metric_namespace=self.metric_namespace,
         metric_name=self.metric_name,
         metric_meta=MetricMeta(self.metric_name,
                                metric_type="INTERSECTION"))
Example #2
    def fit(self, data):
        self.__init_intersect_method()

        if self.model_param.repeated_id_process:
            if self.model_param.intersect_cache_param.use_cache is True and self.model_param.intersect_method == consts.RSA:
                raise ValueError(
                    "Cache module is not supported with repeated id process.")

            if len(self.host_party_id_list) > 1 and \
                    self.model_param.repeated_id_owner != consts.GUEST:
                raise ValueError(
                    "With multiple hosts, repeated_id_owner should be guest.")

            proc_obj = RepeatedIDIntersect(
                repeated_id_owner=self.model_param.repeated_id_owner,
                role=self.role)
            data = proc_obj.run(data=data)

        if self.model_param.allow_info_share:
            if (self.model_param.intersect_method == consts.RSA and self.model_param.info_owner == consts.GUEST) \
                    or (self.model_param.intersect_method == consts.RAW and self.model_param.join_role == self.model_param.info_owner):
                self.model_param.sync_intersect_ids = False

        self.intersect_ids = self.intersection_obj.run(data)

        if self.model_param.allow_info_share:
            self.intersect_ids = self.__share_info(data)

        LOGGER.info("Finish intersection")

        if self.intersect_ids:
            self.intersect_num = self.intersect_ids.count()
            self.intersect_rate = self.intersect_num * 1.0 / data.count()

        self.set_summary(self.get_model_summary())

        self.callback_metric(metric_name=self.metric_name,
                             metric_namespace=self.metric_namespace,
                             metric_data=[
                                 Metric("intersect_count", self.intersect_num),
                                 Metric("intersect_rate", self.intersect_rate)
                             ])
        self.tracker.set_metric_meta(metric_namespace=self.metric_namespace,
                                     metric_name=self.metric_name,
                                     metric_meta=MetricMeta(
                                         name=self.metric_name,
                                         metric_type=self.metric_type))
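The closing bookkeeping in this fit is plain arithmetic over distributed tables: intersect_rate is the fraction of input ids that survived the intersection. A tiny sketch with a hypothetical in-memory table, where only .count() matters:

class ToyTable:                     # hypothetical stand-in for the table API
    def __init__(self, n):
        self._n = n

    def count(self):
        return self._n

data, intersect_ids = ToyTable(1000), ToyTable(640)
intersect_num = intersect_ids.count()
intersect_rate = intersect_num * 1.0 / data.count()
print(intersect_num, intersect_rate)    # 640 0.64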
Example #3
def callback(keyword="missing_impute",
             value_list=None,
             tracker=None):
    # tracker = Tracking("abc", "123")
    metric_type = None
    """
    if keyword.endswith("ratio"):
        metric_list = []
        for i in range(len(value_list)):
            metric_list.append(Metric(i, value_list[i]))

        tracker.log_metric_data(keyword, "DATAIO", metric_list)

        metric_type = "DATAIO_TABLE"
    """
    metric_list = []
    for i, value in enumerate(value_list or []):  # guard against value_list=None
        metric_list.append(Metric(value, i))

    tracker.log_metric_data(keyword, "DATAIO", metric_list)

    metric_type = "DATAIO_TEXT"

    tracker.set_metric_meta(keyword,
                            "DATAIO",
                            MetricMeta(name=keyword,
                                       metric_type=metric_type))
Example #4
 def run(self, component_parameters=None, args=None):
     self.parameters = component_parameters["DownloadParam"]
     self.parameters["role"] = component_parameters["role"]
     self.parameters["local"] = component_parameters["local"]
     table_name, namespace = dtable_utils.get_table_info(config=self.parameters,
                                                         create=False)
     job_id = self.taskid.split("_")[0]
     session.init(job_id, self.parameters["work_mode"])
     with open(os.path.abspath(self.parameters["output_path"]), "w") as fout:
         data_table = session.get_data_table(name=table_name, namespace=namespace)
         count = data_table.count()
         LOGGER.info('===== begin to export data =====')
         lines = 0
         for key, value in data_table.collect():
             if not value:
                 fout.write(key + "\n")
             else:
                 fout.write(key + self.parameters.get("delimitor", ",") + value + "\n")
             lines += 1
             if lines % 2000 == 0:
                 LOGGER.info("===== export {} lines =====".format(lines))
             if lines % 10000 == 0:
                 job_info = {'f_progress': lines/count*100//1}
                 self.update_job_status(self.parameters["local"]['role'], self.parameters["local"]['party_id'],
                                        job_info)
         self.update_job_status(self.parameters["local"]['role'],
                                self.parameters["local"]['party_id'], {'f_progress': 100})
         self.callback_metric(metric_name='data_access',
                              metric_namespace='download',
                              metric_data=[Metric("count", data_table.count())])
         LOGGER.info("===== export {} lines totally =====".format(lines))
         LOGGER.info('===== export data finish =====')
         LOGGER.info('===== export data file path:{} ====='.format(os.path.abspath(self.parameters["output_path"])))
Example #5
 def save_data_table(self, dst_table_name, dst_table_namespace, head=True, job_id=None):
     input_file = self.parameters["file"]
     count = self.get_count(input_file)
     with open(input_file, 'r') as fin:
         lines_count = 0
         if head is True:
             data_head = fin.readline()
             count -= 1
             self.save_data_header(data_head, dst_table_name, dst_table_namespace)
         data_table = None  # stays None if the file yields no data lines
         while True:
             data = list()
             lines = fin.readlines(self.MAX_BYTES)
             if lines:
                 for line in lines:
                     values = line.replace("\n", "").replace("\t", ",").split(",")
                     data.append((values[0], self.list_to_str(values[1:])))
                 lines_count += len(data)
                 f_progress = lines_count/count*100//1
                 job_info = {'f_progress': f_progress}
                 self.update_job_status(self.parameters["local"]['role'], self.parameters["local"]['party_id'],
                                        job_info)
                 data_table = session.save_data(data, name=dst_table_name, namespace=dst_table_namespace,
                                                partition=self.parameters["partition"])
             else:
                 table_count = data_table.count() if data_table is not None else 0
                 self.tracker.save_data_view(role=self.parameters["local"]['role'],
                                             party_id=self.parameters["local"]['party_id'],
                                             data_info={'f_table_name': dst_table_name,
                                                        'f_table_namespace': dst_table_namespace,
                                                        'f_partition': self.parameters["partition"],
                                                        'f_table_create_count': table_count
                                                        })
                 self.callback_metric(metric_name='data_access',
                                      metric_namespace='upload',
                                      metric_data=[Metric("count", table_count)])
                 return table_count
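The upload examples stream their input with fin.readlines(self.MAX_BYTES): the size hint makes each call return whole lines totalling roughly that many bytes, so large files are read in bounded memory, and lines_count / count * 100 // 1 floors the progress to a whole percent. A sketch of just that loop, with illustrative names and a deliberately tiny hint:

import io

MAX_BYTES = 32                                   # tiny on purpose: forces two chunks
fin = io.StringIO("".join(f"id{i},f{i}\n" for i in range(10)))
total, lines_count = 10, 0
while True:
    lines = fin.readlines(MAX_BYTES)             # whole lines, ~MAX_BYTES per call
    if not lines:
        break
    lines_count += len(lines)
    f_progress = lines_count / total * 100 // 1  # floored percent, as in the snippets
    print(f"chunk of {len(lines)} lines, progress={f_progress}")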
Example #6
    def save_data_table(self,
                        dst_table_name,
                        dst_table_namespace,
                        head=True,
                        in_version=False):
        input_file = self.parameters["file"]
        count = self.get_count(input_file)
        with open(input_file, 'r') as fin:
            lines_count = 0
            if head is True:
                data_head = fin.readline()
                count -= 1
                self.save_data_header(data_head, dst_table_name,
                                      dst_table_namespace)
                self.table_info["cols"] = data_head
            data_table = None  # stays None if the file yields no data lines
            while True:
                data = list()
                lines = fin.readlines(self.MAX_BYTES)
                if lines:
                    for line in lines:
                        values = line.replace("\n", "").replace("\t",
                                                                ",").split(",")
                        data.append((values[0], self.list_to_str(values[1:])))
                    lines_count += len(data)
                    f_progress = lines_count / count * 100 // 1
                    job_info = {'f_progress': f_progress}
                    self.update_job_status(
                        self.parameters["local"]['role'],
                        self.parameters["local"]['party_id'], job_info)
                    data_table = session.save_data(
                        data,
                        name=dst_table_name,
                        namespace=dst_table_namespace,
                        partition=self.parameters["partition"])

                    self.table_info["v_len"] = data_table_count
                else:
                    table_count = data_table.count() if data_table is not None else 0
                    self.tracker.save_data_view(
                        role=self.parameters["local"]['role'],
                        party_id=self.parameters["local"]['party_id'],
                        data_info={
                            'f_table_name': dst_table_name,
                            'f_table_namespace': dst_table_namespace,
                            'f_partition': self.parameters["partition"],
                            'f_table_count_actual': table_count,
                            'f_table_count_upload': count
                        })
                    self.callback_metric(
                        metric_name='data_access',
                        metric_namespace='upload',
                        metric_data=[Metric("count", table_count)])
                    if in_version:
                        version_log = "[AUTO] save data at %s." % datetime.datetime.now()
                        version_control.save_version(
                            name=dst_table_name,
                            namespace=dst_table_namespace,
                            version_log=version_log)
                    return table_count
Example #7
    def fit(self, data_inst, validate_data=None):
        validation_strategy = self.init_validation_strategy(
            data_inst, validate_data)
        self._build_model()
        self.prepare_batch_data(self.batch_generator, data_inst)
        if not self.input_shape:
            self.model.set_empty()

        self._set_loss_callback_info()
        cur_epoch = 0
        while cur_epoch < self.epochs:
            LOGGER.debug("cur epoch is {}".format(cur_epoch))
            epoch_loss = 0

            for batch_idx in range(len(self.data_x)):
                self.model.train(self.data_x[batch_idx],
                                 self.data_y[batch_idx], cur_epoch, batch_idx)

                self.reset_flowid()
                metrics = self.model.evaluate(self.data_x[batch_idx],
                                              self.data_y[batch_idx],
                                              cur_epoch, batch_idx)
                self.recovery_flowid()

                LOGGER.debug("metrics is {}".format(metrics))
                batch_loss = metrics["loss"]

                epoch_loss += batch_loss

            epoch_loss /= len(self.data_x)

            LOGGER.debug("epoch {}' loss is {}".format(cur_epoch, epoch_loss))

            self.callback_metric("loss", "train",
                                 [Metric(cur_epoch, epoch_loss)])

            self.history_loss.append(epoch_loss)

            if validation_strategy:
                validation_strategy.validate(self, cur_epoch)

            is_converge = self.converge_func.is_converge(epoch_loss)
            self.transfer_variable.is_converge.remote(is_converge,
                                                      role=consts.HOST,
                                                      idx=0,
                                                      suffix=(cur_epoch, ))

            if is_converge:
                LOGGER.debug(
                    "Training process is converged in epoch {}".format(
                        cur_epoch))
                break

            cur_epoch += 1

        if cur_epoch == self.epochs:
            LOGGER.debug(
                "Training process reach max training epochs {} and not converged"
                .format(self.epochs))
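The stopping rule above delegates to self.converge_func.is_converge(epoch_loss); other examples in this listing build that object with converge_func_factory("diff", self.tol). A hedged sketch of such a diff-style check, a simplified stand-in rather than the framework's class: stop once the loss moves by less than tol between epochs.

class DiffConverge:                 # simplified stand-in, assumed semantics
    def __init__(self, tol=1e-4):
        self.tol = tol
        self.pre_loss = None

    def is_converge(self, loss):
        converged = (self.pre_loss is not None
                     and abs(self.pre_loss - loss) < self.tol)
        self.pre_loss = loss
        return converged

check = DiffConverge(tol=0.01)
for epoch_loss in [1.0, 0.5, 0.495, 0.494]:
    print(epoch_loss, check.is_converge(epoch_loss))   # converges at 0.495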
Example #8
 def __save_single_value(self, result, metric_name, metric_namespace,
                         eval_name):
     self.tracker.log_metric_data(
         metric_namespace, metric_name,
         [Metric(eval_name, np.round(result, self.round_num))])
     self.tracker.set_metric_meta(
         metric_namespace, metric_name,
         MetricMeta(name=metric_name, metric_type="EVALUATION_SUMMARY"))
Example #9
    def fit(self, data_inst, validate_data=None):
        self.validation_strategy = self.init_validation_strategy(data_inst, validate_data)
        self._build_model()
        self.prepare_batch_data(self.batch_generator, data_inst)
        if not self.input_shape:
            self.model.set_empty()

        self._set_loss_callback_info()
        cur_epoch = 0
        while cur_epoch < self.epochs:
            LOGGER.debug("cur epoch is {}".format(cur_epoch))
            epoch_loss = 0

            for batch_idx in range(len(self.data_x)):
                batch_loss = self.model.train(self.data_x[batch_idx], self.data_y[batch_idx], cur_epoch, batch_idx)

                epoch_loss += batch_loss

            epoch_loss /= len(self.data_x)

            LOGGER.debug("epoch {}' loss is {}".format(cur_epoch, epoch_loss))

            self.callback_metric("loss",
                                 "train",
                                 [Metric(cur_epoch, epoch_loss)])

            self.history_loss.append(epoch_loss)

            if self.validation_strategy:
                self.validation_strategy.validate(self, cur_epoch)
                if self.validation_strategy.need_stop():
                    LOGGER.debug('early stopping triggered')
                    break

            if self.hetero_nn_param.selector_param.method:
                # when using selective bp, the loss convergence check is disabled
                is_converge = False
            else:
                is_converge = self.converge_func.is_converge(epoch_loss)
            self._summary_buf["is_converged"] = is_converge
            self.transfer_variable.is_converge.remote(is_converge,
                                                      role=consts.HOST,
                                                      idx=0,
                                                      suffix=(cur_epoch,))

            if is_converge:
                LOGGER.debug("Training process is converged in epoch {}".format(cur_epoch))
                break

            cur_epoch += 1

        if cur_epoch == self.epochs:
            LOGGER.debug("Training process reach max training epochs {} and not converged".format(self.epochs))

        if self.validation_strategy and self.validation_strategy.has_saved_best_model():
            self.load_model(self.validation_strategy.cur_best_model)

        self.set_summary(self._get_model_summary())
Example #10
 def read_metric_data(self,
                      metric_namespace: str,
                      metric_name: str,
                      job_level=False):
     metrics = []
     for k, v in self.read_metrics_from_db(metric_namespace, metric_name, 1,
                                           job_level):
         metrics.append(Metric(key=k, value=v))
     return metrics
Example #11
    def record_step_best(self, step_best, host_mask, guest_mask,
                         data_instances, model):
        metas = {
            "host_mask": host_mask.tolist(),
            "guest_mask": guest_mask.tolist(),
            "score_name": self.score_name,
            "number_in": int(sum(host_mask) + sum(guest_mask)),
            "direction": self.direction,
            "n_count": int(self.n_count)
        }

        host_party_id = model.component_properties.host_party_idlist[0]
        guest_party_id = model.component_properties.guest_partyid
        metas["host_features_anonym"] = [
            f"host_{host_party_id}_{i}" for i in range(len(host_mask))
        ]
        metas["guest_features_anonym"] = [
            f"guest_{guest_party_id}_{i}" for i in range(len(guest_mask))
        ]

        model_info = self.models_trained[step_best]
        loss = model_info.get_loss()
        ic_val = model_info.get_score()
        metas["loss"] = loss
        metas["current_ic_val"] = ic_val
        metas["fit_intercept"] = model.fit_intercept

        model_key = model_info.get_key()
        model_dict = self._get_model(model_key)

        if self.role != consts.ARBITER:
            all_features = data_instances.schema.get('header')
            metas["all_features"] = all_features
            metas["to_enter"] = self.get_to_enter(host_mask, guest_mask,
                                                  all_features)
            model_param = list(model_dict.get('model').values())[0].get(
                model.model_param_name)
            param_dict = MessageToDict(model_param)
            metas["intercept"] = param_dict.get("intercept", None)
            metas["weight"] = param_dict.get("weight", {})
            metas["header"] = param_dict.get("header", [])
            if self.n_step == 0 and self.direction == "forward":
                metas["intercept"] = self.intercept

        metric_name = f"stepwise_{self.n_step}"
        metric = [Metric(metric_name, float(self.n_step))]
        model.callback_metric(metric_name=metric_name,
                              metric_namespace=self.metric_namespace,
                              metric_data=metric)
        model.tracker.set_metric_meta(metric_name=metric_name,
                                      metric_namespace=self.metric_namespace,
                                      metric_meta=MetricMeta(
                                          name=metric_name,
                                          metric_type=self.metric_type,
                                          extra_metas=metas))
        LOGGER.info(f"metric_name: {metric_name}, metas: {metas}")
        return
Example #12
    def __save_single_value(self, result, metric_name, metric_namespace, eval_name):

        metric_type = 'EVALUATION_SUMMARY'
        if eval_name in consts.ALL_CLUSTER_METRICS:
            metric_type = 'CLUSTERING_EVALUATION_SUMMARY'

        self.tracker.log_metric_data(metric_namespace, metric_name,
                                     [Metric(eval_name, np.round(result, self.round_num))])
        self.tracker.set_metric_meta(metric_namespace, metric_name,
                                     MetricMeta(name=metric_name, metric_type=metric_type))
Example #13
    def __save_curve_data(self, x_axis_list, y_axis_list, metric_name, metric_namespace):
        points = []
        for i, value in enumerate(x_axis_list):
            if isinstance(value, float):
                value = np.round(value, self.round_num)
            points.append((value, np.round(y_axis_list[i], self.round_num)))
        points.sort(key=lambda x: x[0])

        metric_points = [Metric(point[0], point[1]) for point in points]
        self.tracker.log_metric_data(metric_namespace, metric_name, metric_points)
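__save_curve_data rounds float coordinates and sorts the points by x before logging, so the tracker receives a monotone curve regardless of input order. The same steps worked on toy values:

import numpy as np

round_num = 3
x_axis_list = [0.30000007, 0.1, 0.2]
y_axis_list = [0.91, 0.5, 0.75]

points = []
for i, value in enumerate(x_axis_list):
    if isinstance(value, float):
        value = np.round(value, round_num)
    points.append((value, np.round(y_axis_list[i], round_num)))
points.sort(key=lambda x: x[0])
print(points)    # sorted by x: (0.1, 0.5), (0.2, 0.75), (0.3, 0.91)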
Example #14
 def callback(self, metas):
     metric = [Metric(self.metric_name, 0)]
     self.callback_metric(metric_name=self.metric_name,
                          metric_namespace=self.metric_namespace,
                          metric_data=metric)
     self.tracker.set_metric_meta(metric_name=self.metric_name,
                                  metric_namespace=self.metric_namespace,
                                  metric_meta=MetricMeta(
                                      name=self.metric_name,
                                      metric_type=self.metric_type,
                                      extra_metas=metas))
Example #15
    def callback_loss(self, iter_num, loss):
        metric_meta = MetricMeta(name='train',
                                 metric_type=MetricType.LOSS,
                                 extra_metas={
                                     "unit_name": "iters",
                                 })

        self.callback_meta(metric_name='loss', metric_namespace='train', metric_meta=metric_meta)
        self.callback_metric(metric_name='loss',
                             metric_namespace='train',
                             metric_data=[Metric(iter_num, loss)])
Example #16
    def fit(self, data):
        self.__init_intersect_method()
        self.intersect_ids = self.intersection_obj.run(data)
        LOGGER.info("Finish intersection")

        if self.intersect_ids:
            self.intersect_num = self.intersect_ids.count()
            self.intersect_rate = self.intersect_num * 1.0 / data.count()

        self.callback_metric(metric_name=self.metric_name,
                             metric_namespace=self.metric_namespace,
                             metric_data=[
                                 Metric("intersect_count", self.intersect_num),
                                 Metric("intersect_rate", self.intersect_rate)
                             ])
        self.tracker.set_metric_meta(metric_namespace=self.metric_namespace,
                                     metric_name=self.metric_name,
                                     metric_meta=MetricMeta(
                                         name=self.metric_name,
                                         metric_type=self.metric_type))
Example #17
    def save_meta(self, dst_table_namespace, dst_table_name, table_count):
        self.tracker.log_output_data_info(data_name='upload',
                                          table_namespace=dst_table_namespace,
                                          table_name=dst_table_name)

        self.tracker.log_metric_data(metric_namespace="upload",
                                     metric_name="data_access",
                                     metrics=[Metric("count", table_count)])
        self.tracker.set_metric_meta(metric_namespace="upload",
                                     metric_name="data_access",
                                     metric_meta=MetricMeta(
                                         name='upload', metric_type='UPLOAD'))
Example #18
    def fit(self, data_inst, valid_inst=None):

        self.federated_binning()
        # initializing
        self.feature_num = self.sync_feature_num()
        self.tree_dim = 1

        if self.task_type == consts.CLASSIFICATION:
            label_mapping = self.label_alignment()
            LOGGER.debug('label mapping is {}'.format(label_mapping))
            self.tree_dim = len(label_mapping) if len(label_mapping) > 2 else 1

        if self.n_iter_no_change:
            self.check_convergence_func = converge_func_factory(
                "diff", self.tol)

        LOGGER.debug('begin to fit a boosting tree')
        for epoch_idx in range(self.num_trees):

            for t_idx in range(self.tree_dim):
                valid_feature = self.sample_valid_feature()
                self.send_valid_features(valid_feature, epoch_idx, t_idx)
                flow_id = self.generate_flowid(epoch_idx, t_idx)
                new_tree = HomoDecisionTreeArbiter(self.tree_param,
                                                   valid_feature=valid_feature,
                                                   epoch_idx=epoch_idx,
                                                   flow_id=flow_id,
                                                   tree_idx=t_idx)
                new_tree.fit()

            global_loss = self.aggregator.aggregate_loss(suffix=(epoch_idx, ))
            self.global_loss_history.append(global_loss)
            LOGGER.debug('cur epoch global loss is {}'.format(global_loss))

            self.callback_metric("loss", "train",
                                 [Metric(epoch_idx, global_loss)])

            if self.n_iter_no_change:
                should_stop = self.aggregator.broadcast_converge_status(
                    self.check_convergence, (global_loss, ),
                    suffix=(epoch_idx, ))
                LOGGER.debug('stop flag sent')
                if should_stop:
                    break

        self.callback_meta(
            "loss", "train",
            MetricMeta(name="train",
                       metric_type="LOSS",
                       extra_metas={"Best": min(self.global_loss_history)}))

        LOGGER.debug('fitting homo decision tree done')
Example #19
    def callback_dbi(self, iter_num, dbi):
        metric_meta = MetricMeta(name='train',
                                 metric_type="DBI",
                                 extra_metas={
                                     "unit_name": "iters",
                                 })

        self.callback_meta(metric_name='DBI',
                           metric_namespace='train',
                           metric_meta=metric_meta)
        self.callback_metric(metric_name='DBI',
                             metric_namespace='train',
                             metric_data=[Metric(iter_num, dbi)])
Example #20
    def fit(self, data_inst, validate_data=None):

        # init aggregator
        self.aggregator = HomoBoostArbiterAggregator()
        self.binning_obj = HomoFeatureBinningServer()

        self.federated_binning()
        # initializing
        self.feature_num = self.sync_feature_num()

        if self.task_type == consts.CLASSIFICATION:
            label_mapping = HomoLabelEncoderArbiter().label_alignment()
            LOGGER.info('label mapping is {}'.format(label_mapping))
            self.booster_dim = len(
                label_mapping) if len(label_mapping) > 2 else 1

        if self.n_iter_no_change:
            self.check_convergence_func = converge_func_factory(
                "diff", self.tol)

        LOGGER.info('begin to fit a boosting tree')
        for epoch_idx in range(self.boosting_round):

            LOGGER.info('cur epoch idx is {}'.format(epoch_idx))

            for class_idx in range(self.booster_dim):
                model = self.fit_a_booster(epoch_idx, class_idx)

            global_loss = self.aggregator.aggregate_loss(suffix=(epoch_idx, ))
            self.history_loss.append(global_loss)
            LOGGER.debug('cur epoch global loss is {}'.format(global_loss))

            self.callback_metric("loss", "train",
                                 [Metric(epoch_idx, global_loss)])

            if self.n_iter_no_change:
                should_stop = self.aggregator.broadcast_converge_status(
                    self.check_convergence, (global_loss, ),
                    suffix=(epoch_idx, ))
                LOGGER.debug('stop flag sent')
                if should_stop:
                    break

        self.callback_meta(
            "loss", "train",
            MetricMeta(name="train",
                       metric_type="LOSS",
                       extra_metas={"Best": min(self.history_loss)}))

        self.set_summary(self.generate_summary())
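Both homo-boosting fits size the per-round booster count from the label mapping: one booster per round for binary tasks, K for a K-class task. The rule in isolation:

def booster_dim(label_mapping):
    # one booster per round for binary tasks, K for K-class tasks
    return len(label_mapping) if len(label_mapping) > 2 else 1

print(booster_dim({"no": 0, "yes": 1}))        # 1
print(booster_dim({"a": 0, "b": 1, "c": 2}))   # 3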
Example #21
    def callback_loss(self, iter_num, loss):
        # noinspection PyTypeChecker
        metric_meta = MetricMeta(name='train',
                                 metric_type="LOSS",
                                 extra_metas={
                                     "unit_name": "iters",
                                 })

        self.callback_meta(metric_name='loss', metric_namespace='train', metric_meta=metric_meta)
        self.callback_metric(metric_name='loss',
                             metric_namespace='train',
                             metric_data=[Metric(iter_num, loss)])

        self._summary["loss_history"].append(loss)
Example #22
 def run(self, component_parameters=None, args=None):
     self.parameters = component_parameters["DownloadParam"]
     self.parameters["role"] = component_parameters["role"]
     self.parameters["local"] = component_parameters["local"]
     name, namespace = self.parameters.get("name"), self.parameters.get(
         "namespace")
     with open(os.path.abspath(self.parameters["output_path"]),
               "w") as fout:
         with storage.Session.build(
                 session_id=job_utils.generate_session_id(
                     self.tracker.task_id,
                     self.tracker.task_version,
                     self.tracker.role,
                     self.tracker.party_id,
                     suffix="storage",
                     random_end=True),
                 name=name,
                 namespace=namespace) as storage_session:
             data_table = storage_session.get_table()
             count = data_table.count()
             LOGGER.info('===== begin to export data =====')
             lines = 0
             job_info = {}
             job_info["job_id"] = self.tracker.job_id
             job_info["role"] = self.tracker.role
             job_info["party_id"] = self.tracker.party_id
             for key, value in data_table.collect():
                 if not value:
                     fout.write(key + "\n")
                 else:
                     fout.write(key +
                                self.parameters.get("delimiter", ",") +
                                str(value) + "\n")
                 lines += 1
                 if lines % 2000 == 0:
                     LOGGER.info(
                         "===== export {} lines =====".format(lines))
                 if lines % 10000 == 0:
                     job_info["progress"] = lines / count * 100 // 1
                     ControllerClient.update_job(job_info=job_info)
             job_info["progress"] = 100
             ControllerClient.update_job(job_info=job_info)
             self.callback_metric(
                 metric_name='data_access',
                 metric_namespace='download',
                 metric_data=[Metric("count", data_table.count())])
         LOGGER.info("===== export {} lines totally =====".format(lines))
         LOGGER.info('===== export data finish =====')
         LOGGER.info('===== export data file path:{} ====='.format(
             os.path.abspath(self.parameters["output_path"])))
Example #23
    def callback_loss(self, iter_num, loss):
        # noinspection PyTypeChecker
        metric_meta = MetricMeta(
            name="train",
            metric_type="LOSS",
            extra_metas={
                "unit_name": "iters",
            },
        )

        self.callback_meta(metric_name="loss",
                           metric_namespace="train",
                           metric_meta=metric_meta)
        self.callback_metric(
            metric_name="loss",
            metric_namespace="train",
            metric_data=[Metric(iter_num, loss)],
        )
Example #24
    def callback_loss(self, iter_num, loss):
        """
        call back function of loss and metrics.
        :param iter_num: iter number.
        :param loss: loss type.
        :return:
        """
        metric_meta = MetricMeta(name='train',
                                 metric_type="LOSS",
                                 extra_metas={
                                     "unit_name": "iters",
                                 })

        self.callback_meta(metric_name='loss',
                           metric_namespace='train',
                           metric_meta=metric_meta)
        self.callback_metric(metric_name='loss',
                             metric_namespace='train',
                             metric_data=[Metric(iter_num, loss)])
Example #25
    def callback_info(self):
        class_weight = None
        classes = None
        if self.class_weight:
            class_weight = {str(k): v for k, v in self.class_weight.items()}
            classes = sorted([str(k) for k in self.class_weight.keys()])
        # LOGGER.debug(f"callback class weight is: {class_weight}")

        metric_meta = MetricMeta(name='train',
                                 metric_type=self.metric_type,
                                 extra_metas={
                                     "weight_mode": self.weight_mode,
                                     "class_weight": class_weight,
                                     "classes": classes,
                                     "sample_weight_name":
                                     self.sample_weight_name
                                 })

        self.callback_metric(metric_name=self.metric_name,
                             metric_namespace=self.metric_namespace,
                             metric_data=[Metric(self.metric_name, 0)])
        self.tracker.set_metric_meta(metric_namespace=self.metric_namespace,
                                     metric_name=self.metric_name,
                                     metric_meta=metric_meta)
Example #26
    def fit(self, data_inst, validate_data=None):

        LOGGER.info('begin to fit a hetero boosting model, model is {}'.format(
            self.model_name))

        self.data_inst = data_inst

        self.data_bin, self.bin_split_points, self.bin_sparse_points = self.prepare_data(
            data_inst)

        self.y = self.get_label(self.data_bin)

        self.classes_, self.num_classes, self.booster_dim = self.check_label()

        LOGGER.info('class index is {}'.format(self.classes_))

        self.loss = self.get_loss_function()

        self.sync_booster_dim()

        self.y_hat, self.init_score = self.get_init_score(
            self.y, self.num_classes)

        self.generate_encrypter()

        self.callback_meta(
            "loss", "train",
            MetricMeta(name="train",
                       metric_type="LOSS",
                       extra_metas={"unit_name": "iters"}))

        self.validation_strategy = self.init_validation_strategy(
            data_inst, validate_data)

        for epoch_idx in range(self.boosting_round):

            LOGGER.info('cur epoch idx is {}'.format(epoch_idx))

            for class_idx in range(self.booster_dim):

                # fit a booster
                model = self.fit_a_booster(epoch_idx, class_idx)

                booster_meta, booster_param = model.get_model()

                if booster_meta is not None and booster_param is not None:
                    self.booster_meta = booster_meta
                    self.boosting_model_list.append(booster_param)

                # update predict score
                cur_sample_weights = model.get_sample_weights()
                self.y_hat = self.get_new_predict_score(self.y_hat,
                                                        cur_sample_weights,
                                                        dim=class_idx)

            # compute loss
            loss = self.compute_loss(self.y_hat, self.y)
            self.history_loss.append(loss)
            LOGGER.info("round {} loss is {}".format(epoch_idx, loss))
            self.callback_metric("loss", "train", [Metric(epoch_idx, loss)])

            if self.validation_strategy:
                self.validation_strategy.validate(
                    self,
                    epoch_idx,
                    use_precomputed_train=True,
                    train_scores=self.score_to_predict_result(
                        data_inst, self.y_hat))

            should_stop_a, should_stop_b = False, False
            if self.validation_strategy is not None:
                if self.validation_strategy.need_stop():
                    should_stop_a = True

            if self.n_iter_no_change and self.check_convergence(loss):
                should_stop_b = True
                self.is_converged = True

            self.sync_stop_flag(self.is_converged, epoch_idx)

            if should_stop_a or should_stop_b:
                break

        self.callback_meta(
            "loss", "train",
            MetricMeta(name="train",
                       metric_type="LOSS",
                       extra_metas={"Best": min(self.history_loss)}))

        if self.validation_strategy and self.validation_strategy.has_saved_best_model():
            LOGGER.info('best model exported')
            self.load_model(self.validation_strategy.cur_best_model)

        # get summary
        self.set_summary(self.generate_summary())
Example #27
    def fit(self, data_instances):
        if not self.need_run:
            return data_instances

        self.init_schema(data_instances)
        LOGGER.debug("Before trainning, self.header: {}".format(self.header))
        self._abnormal_detection(data_instances)

        self.__init_parameters(data_instances)

        w = self.__init_model(data_instances)

        for iter_num in range(self.max_iter):
            # mini-batch
            LOGGER.debug("In iter: {}".format(iter_num))
            batch_data_generator = self.mini_batch_obj.mini_batch_data_generator()
            batch_num = 0
            total_loss = 0

            for batch_data in batch_data_generator:
                f = functools.partial(self.gradient_operator.compute,
                                      coef=self.coef_,
                                      intercept=self.intercept_,
                                      fit_intercept=self.fit_intercept)

                grad_loss = batch_data.mapPartitions(f)

                n = batch_data.count()
                if not self.use_encrypt:
                    grad, loss = grad_loss.reduce(
                        self.aggregator.aggregate_grad_loss)
                    grad = np.array(grad)
                    grad /= n
                    loss /= n
                    if self.updater is not None:
                        loss_norm = self.updater.loss_norm(self.coef_)
                        total_loss += loss + loss_norm

                    # if not self.use_loss:
                    #     total_loss = np.linalg.norm(self.coef_)

                    if not self.need_one_vs_rest:
                        metric_meta = MetricMeta(
                            name='train',
                            metric_type="LOSS",
                            extra_metas={"unit_name": "iters"})
                        metric_name = self.get_metric_name('loss')

                        self.callback_meta(metric_name=metric_name,
                                           metric_namespace='train',
                                           metric_meta=metric_meta)
                        self.callback_metric(
                            metric_name=metric_name,
                            metric_namespace='train',
                            metric_data=[Metric(iter_num, total_loss)])

                else:
                    grad, _ = grad_loss.reduce(self.aggregator.aggregate_grad)
                    grad = np.array(grad)
                    grad /= n

                self.update_model(grad)
                w = self.merge_model()

                batch_num += 1
                if self.use_encrypt and batch_num % self.re_encrypt_batches == 0:
                    to_encrypt_model_id = self.transfer_variable.generate_transferid(
                        self.transfer_variable.to_encrypt_model, iter_num,
                        batch_num)

                    federation.remote(
                        w,
                        name=self.transfer_variable.to_encrypt_model.name,
                        tag=to_encrypt_model_id,
                        role=consts.ARBITER,
                        idx=0)

                    re_encrypted_model_id = self.transfer_variable.generate_transferid(
                        self.transfer_variable.re_encrypted_model, iter_num,
                        batch_num)
                    LOGGER.debug("re_encrypted_model_id: {}".format(
                        re_encrypted_model_id))
                    w = federation.get(
                        name=self.transfer_variable.re_encrypted_model.name,
                        tag=re_encrypted_model_id,
                        idx=0)

                    w = np.array(w)
                    self.set_coef_(w)

            model_transfer_id = self.transfer_variable.generate_transferid(
                self.transfer_variable.host_model, iter_num)
            federation.remote(w,
                              name=self.transfer_variable.host_model.name,
                              tag=model_transfer_id,
                              role=consts.ARBITER,
                              idx=0)

            if not self.use_encrypt:
                loss_transfer_id = self.transfer_variable.generate_transferid(
                    self.transfer_variable.host_loss, iter_num)

                federation.remote(total_loss,
                                  name=self.transfer_variable.host_loss.name,
                                  tag=loss_transfer_id,
                                  role=consts.ARBITER,
                                  idx=0)

            LOGGER.debug("model and loss sent")

            final_model_id = self.transfer_variable.generate_transferid(
                self.transfer_variable.final_model, iter_num)

            w = federation.get(name=self.transfer_variable.final_model.name,
                               tag=final_model_id,
                               idx=0)

            w = np.array(w)
            self.set_coef_(w)

            converge_flag_id = self.transfer_variable.generate_transferid(
                self.transfer_variable.converge_flag, iter_num)

            converge_flag = federation.get(
                name=self.transfer_variable.converge_flag.name,
                tag=converge_flag_id,
                idx=0)

            self.n_iter_ = iter_num
            LOGGER.debug("converge_flag: {}".format(converge_flag))
            if converge_flag:
                break
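In the example above each partition returns a partial (gradient sum, loss sum) pair; reduce adds the partials, then both are averaged by the batch size n. A sketch with plain tuples standing in for the distributed table; aggregate_grad_loss here is an assumption modeled on the call site:

import functools

import numpy as np

def aggregate_grad_loss(a, b):      # assumed: element-wise add of partials
    return np.add(a[0], b[0]), a[1] + b[1]

partials = [((1.0, 2.0), 0.5), ((3.0, 4.0), 0.7)]   # (grad_sum, loss_sum) per partition
grad, loss = functools.reduce(aggregate_grad_loss, partials)
n = 4                                               # batch size
grad, loss = np.array(grad) / n, loss / n
print(grad, loss)                                   # [1.  1.5] 0.3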
Example #28
    def fit(self, data):
        LOGGER.debug(f"fit receives data is {data}")
        if not isinstance(data, dict):
            raise ValueError(
                "Union module must receive more than one table as input.")
        empty_count = 0
        combined_table = None
        combined_schema = None
        metrics = []

        for (key, local_table) in data.items():
            LOGGER.debug("table to combine name: {}".format(key))
            num_data = local_table.count()
            LOGGER.debug("table count: {}".format(num_data))
            metrics.append(Metric(key, num_data))
            self.add_summary(key, num_data)

            if num_data == 0:
                LOGGER.warning("Table {} is empty.".format(key))
                if combined_table is None:
                    combined_table = local_table
                    combined_schema = local_table.schema
                empty_count += 1
                continue

            local_is_data_instance = self.check_is_data_instance(local_table)
            if combined_table is None:
                self.is_data_instance = local_is_data_instance
            LOGGER.debug(f"self.is_data_instance is {self.is_data_instance}, "
                         f"local_is_data_instance is {local_is_data_instance}")
            if self.is_data_instance != local_is_data_instance:
                raise ValueError(
                    f"Cannot combine DataInstance and non-DataInstance object. Union aborted."
                )

            if self.is_data_instance:
                self.is_empty_feature = data_overview.is_empty_feature(
                    local_table)
                if self.is_empty_feature:
                    LOGGER.warning("Table {} has empty feature.".format(key))
                else:
                    self.check_schema_content(local_table.schema)

            if combined_table is None:
                # first table to combine
                combined_table = local_table
                combined_schema = local_table.schema
            else:
                self.check_id(local_table, combined_table)
                self.check_label_name(local_table, combined_table)
                self.check_header(local_table, combined_table)
                if self.keep_duplicate:
                    repeated_ids = combined_table.join(local_table,
                                                       lambda v1, v2: 1)
                    self.repeated_ids = [v[0] for v in repeated_ids.collect()]
                    self.key = key
                    local_table = local_table.flatMap(self._renew_id)

                combined_table = combined_table.union(local_table,
                                                      self._keep_first)

                combined_table.schema = combined_schema

            # only check feature length if not empty
            if self.is_data_instance and not self.is_empty_feature:
                self.feature_count = len(combined_schema.get("header"))
                LOGGER.debug("feature count: {}".format(self.feature_count))
                combined_table.mapValues(self.check_feature_length)

        if combined_table is None:
            num_data = 0
            LOGGER.warning(
                "All tables provided are empty or have empty features.")
        else:
            num_data = combined_table.count()
        metrics.append(Metric("Total", num_data))
        self.add_summary("Total", num_data)

        self.callback_metric(metric_name=self.metric_name,
                             metric_namespace=self.metric_namespace,
                             metric_data=metrics)
        self.tracker.set_metric_meta(metric_namespace=self.metric_namespace,
                                     metric_name=self.metric_name,
                                     metric_meta=MetricMeta(
                                         name=self.metric_name,
                                         metric_type=self.metric_type))

        LOGGER.info(
            "Union operation finished. Total {} empty tables encountered.".
            format(empty_count))

        return combined_table
Example #29
    def save_data_table(self,
                        job_id,
                        dst_table_name,
                        dst_table_namespace,
                        head=True):
        input_file = self.parameters["file"]
        input_feature_count = self.get_count(input_file)
        with open(input_file, 'r') as fin:
            lines_count = 0
            if head is True:
                data_head = fin.readline()
                input_feature_count -= 1
                self.table.get_meta().update_metas(
                    schema=data_utils.get_header_schema(
                        header_line=data_head,
                        id_delimiter=self.parameters["id_delimiter"]))
            n = 0
            while True:
                data = list()
                lines = fin.readlines(self.MAX_BYTES)
                if lines:
                    for line in lines:
                        values = line.rstrip().split(
                            self.parameters["id_delimiter"])
                        data.append((
                            values[0],
                            data_utils.list_to_str(
                                values[1:],
                                id_delimiter=self.parameters["id_delimiter"])))
                    lines_count += len(data)
                    save_progress = lines_count / input_feature_count * 100 // 1
                    job_info = {
                        'progress': save_progress,
                        "job_id": job_id,
                        "role": self.parameters["local"]['role'],
                        "party_id": self.parameters["local"]['party_id']
                    }
                    ControllerClient.update_job(job_info=job_info)
                    self.table.put_all(data)
                    if n == 0:
                        self.table.get_meta().update_metas(part_of_data=data)
                else:
                    table_count = self.table.count()
                    self.table.get_meta().update_metas(
                        count=table_count,
                        partitions=self.parameters["partition"])
                    self.tracker.log_output_data_info(
                        data_name='upload',
                        table_namespace=dst_table_namespace,
                        table_name=dst_table_name)

                    self.tracker.log_metric_data(
                        metric_namespace="upload",
                        metric_name="data_access",
                        metrics=[Metric("count", table_count)])
                    self.tracker.set_metric_meta(metric_namespace="upload",
                                                 metric_name="data_access",
                                                 metric_meta=MetricMeta(
                                                     name='upload',
                                                     metric_type='UPLOAD'))
                    return table_count
                n += 1
Example #30
    def fit(self, data_inst, validate_data=None):
        LOGGER.info("begin to train secureboosting guest model")
        self.gen_feature_fid_mapping(data_inst.schema)
        data_inst = self.data_alignment(data_inst)
        self.convert_feature_to_bin(data_inst)
        self.set_y()
        self.update_f_value()
        self.generate_encrypter()

        self.sync_tree_dim()

        self.callback_meta(
            "loss", "train",
            MetricMeta(name="train",
                       metric_type="LOSS",
                       extra_metas={"unit_name": "iters"}))

        validation_strategy = self.init_validation_strategy(
            data_inst, validate_data)

        for i in range(self.num_trees):
            self.compute_grad_and_hess()
            for tidx in range(self.tree_dim):
                tree_inst = HeteroDecisionTreeGuest(self.tree_param)

                tree_inst.set_inputinfo(self.data_bin,
                                        self.get_grad_and_hess(tidx),
                                        self.bin_split_points,
                                        self.bin_sparse_points)

                valid_features = self.sample_valid_features()
                tree_inst.set_valid_features(valid_features)
                tree_inst.set_encrypter(self.encrypter)
                tree_inst.set_encrypted_mode_calculator(
                    self.encrypted_calculator)
                tree_inst.set_flowid(self.generate_flowid(i, tidx))
                tree_inst.set_host_party_idlist(
                    self.component_properties.host_party_idlist)
                tree_inst.set_runtime_idx(
                    self.component_properties.local_partyid)

                tree_inst.fit()

                tree_meta, tree_param = tree_inst.get_model()
                self.trees_.append(tree_param)
                if self.tree_meta is None:
                    self.tree_meta = tree_meta
                self.update_f_value(new_f=tree_inst.predict_weights, tidx=tidx)
                self.update_feature_importance(
                    tree_inst.get_feature_importance())

            loss = self.compute_loss()
            self.history_loss.append(loss)
            LOGGER.info("round {} loss is {}".format(i, loss))

            LOGGER.debug("type of loss is {}".format(type(loss).__name__))
            self.callback_metric("loss", "train", [Metric(i, loss)])

            if validation_strategy:
                validation_strategy.validate(self, i)

            if self.n_iter_no_change is True:
                if self.check_convergence(loss):
                    self.sync_stop_flag(True, i)
                    break
                else:
                    self.sync_stop_flag(False, i)

        LOGGER.debug("history loss is {}".format(min(self.history_loss)))
        self.callback_meta(
            "loss", "train",
            MetricMeta(name="train",
                       metric_type="LOSS",
                       extra_metas={"Best": min(self.history_loss)}))

        LOGGER.info("end to train secureboosting guest model")