Example #1
    def callback_info(self):
        class_weight = None
        classes = None
        if self.class_weight_dict:
            class_weight = {
                str(k): v
                for k, v in self.class_weight_dict.items()
            }
            classes = sorted([str(k) for k in self.class_weight_dict.keys()])
        # LOGGER.debug(f"callback class weight is: {class_weight}")

        metric_meta = MetricMeta(name='train',
                                 metric_type=self.metric_type,
                                 extra_metas={
                                     "weight_mode": self.weight_mode,
                                     "class_weight": class_weight,
                                     "classes": classes,
                                     "sample_weight_name":
                                     self.sample_weight_name
                                 })

        self.callback_metric(metric_name=self.metric_name,
                             metric_namespace=self.metric_namespace,
                             metric_data=[Metric(self.metric_name, 0)])
        self.tracker.set_metric_meta(metric_namespace=self.metric_namespace,
                                     metric_name=self.metric_name,
                                     metric_meta=metric_meta)
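All of these snippets follow the same FATE tracker pattern: data points go out as Metric objects (via callback_metric or tracker.log_metric_data), and a MetricMeta carrying metric_type and extra_metas is attached afterwards so the board knows how to render them. Below is a minimal, self-contained sketch of that pattern; the Metric, MetricMeta and Tracker classes here are simplified stand-ins for illustration, not the real FATE implementations.

# Minimal sketch of the Metric / MetricMeta callback pattern shared by the
# examples on this page. These classes are simplified stand-ins, not FATE's.
class Metric:
    def __init__(self, key, value):
        self.key = key
        self.value = value


class MetricMeta:
    def __init__(self, name, metric_type, extra_metas=None):
        self.name = name
        self.metric_type = metric_type
        self.extra_metas = extra_metas or {}


class Tracker:
    def log_metric_data(self, namespace, name, metrics):
        print("data:", namespace, name, [(m.key, m.value) for m in metrics])

    def set_metric_meta(self, namespace, name, meta):
        print("meta:", namespace, name, meta.metric_type, meta.extra_metas)


tracker = Tracker()
tracker.log_metric_data("train", "loss", [Metric(0, 0.69), Metric(1, 0.52)])
tracker.set_metric_meta("train", "loss",
                        MetricMeta(name="train", metric_type="LOSS",
                                   extra_metas={"unit_name": "iters"}))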
Example #2
    def __save_curve_meta(self,
                          metric_name,
                          metric_namespace,
                          metric_type,
                          unit_name=None,
                          ordinate_name=None,
                          curve_name=None,
                          best=None,
                          pair_type=None,
                          thresholds=None):
        extra_metas = {}
        metric_type = "_".join([metric_type, "EVALUATION"])

        key_list = [
            "unit_name", "ordinate_name", "curve_name", "best", "pair_type",
            "thresholds"
        ]
        for key in key_list:
            value = locals()[key]
            if value:
                if key == "thresholds":
                    value = np.round(value, self.round_num).tolist()
                extra_metas[key] = value

        self.tracker.set_metric_meta(
            metric_namespace, metric_name,
            MetricMeta(name=metric_name,
                       metric_type=metric_type,
                       extra_metas=extra_metas))
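The locals() lookup above is a compact way to collect the optional keyword arguments into extra_metas, but note that "if value:" also drops falsy values (False, 0, an empty list), not just missing ones. A hedged, explicit equivalent is sketched below; the argument names are taken from the snippet, round_num stands in for the instance's rounding precision, and it keeps any non-None value rather than any truthy one.

# Sketch of an explicit alternative to the locals() lookup above; keeps any
# non-None argument, slightly looser than the original truthiness test.
import numpy as np


def collect_extra_metas(round_num, unit_name=None, ordinate_name=None,
                        curve_name=None, best=None, pair_type=None,
                        thresholds=None):
    candidates = {
        "unit_name": unit_name,
        "ordinate_name": ordinate_name,
        "curve_name": curve_name,
        "best": best,
        "pair_type": pair_type,
        "thresholds": thresholds,
    }
    extra_metas = {k: v for k, v in candidates.items() if v is not None}
    if "thresholds" in extra_metas:
        extra_metas["thresholds"] = np.round(extra_metas["thresholds"],
                                             round_num).tolist()
    return extra_metas


print(collect_extra_metas(3, unit_name="threshold", thresholds=[0.1234, 0.5678]))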
Example #3
    def _callback(self):

        self.tracker.set_metric_meta(
            metric_namespace="statistic",
            metric_name="correlation",
            metric_meta=MetricMeta(name="pearson",
                                   metric_type="CORRELATION_GRAPH"),
        )
Example #4
    def _display_result(self, block_num=None):
        if block_num is None:
            self.callback_metric(metric_name=self.metric_name,
                                 metric_namespace=self.metric_namespace,
                                 metric_data=[Metric("Coverage", self.coverage),
                                              Metric("Block number", self.block_num)])
            self.tracker.set_metric_meta(metric_namespace=self.metric_namespace,
                                         metric_name=self.metric_name,
                                         metric_meta=MetricMeta(self.metric_name, metric_type="INTERSECTION"))
        else:
            self.callback_metric(metric_name=self.metric_name,
                                 metric_namespace=self.metric_namespace,
                                 metric_data=[Metric("Coverage", self.coverage),
                                              Metric("Block number", block_num)])
            self.tracker.set_metric_meta(metric_namespace=self.metric_namespace,
                                         metric_name=self.metric_name,
                                         metric_meta=MetricMeta(self.metric_name, metric_type="INTERSECTION"))
Example #5
    def _callback_leaf_id_mapping(self, mapping):

        metric_namespace = 'sbt_transformer'
        metric_name = 'leaf_mapping'
        self.tracker.set_metric_meta(
            metric_namespace, metric_name,
            MetricMeta(name=metric_name,
                       metric_type=metric_name,
                       extra_metas=mapping))
Example #6
    def record_step_best(self, step_best, host_mask, guest_mask, data_instances, model):
        metas = {"host_mask": host_mask.tolist(), "guest_mask": guest_mask.tolist(),
                 "score_name": self.score_name}
        metas["number_in"] = int(sum(host_mask) + sum(guest_mask))
        metas["direction"] = self.direction
        metas["n_count"] = int(self.n_count)

        host_anonym = [
            anonymous_generator.generate_anonymous(fid=i, role='host', model=model)
            for i in range(len(host_mask))
        ]
        guest_anonym = [
            anonymous_generator.generate_anonymous(fid=i, role='guest', model=model)
            for i in range(len(guest_mask))
        ]
        metas["host_features_anonym"] = host_anonym
        metas["guest_features_anonym"] = guest_anonym

        model_info = self.models_trained[step_best]
        loss = model_info.get_loss()
        ic_val = model_info.get_score()
        metas["loss"] = loss
        metas["current_ic_val"] = ic_val
        metas["fit_intercept"] = model.fit_intercept

        model_key = model_info.get_key()
        model_dict = self._get_model(model_key)

        if self.role != consts.ARBITER:
            all_features = data_instances.schema.get('header')
            metas["all_features"] = all_features
            metas["to_enter"] = self.get_to_enter(host_mask, guest_mask, all_features)
            model_param = list(model_dict.get('model').values())[0].get(
                model.model_param_name)
            param_dict = MessageToDict(model_param)
            metas["intercept"] = param_dict.get("intercept", None)
            metas["weight"] = param_dict.get("weight", {})
            metas["header"] = param_dict.get("header", [])
            if self.n_step == 0 and self.direction == "forward":
                metas["intercept"] = self.intercept
            self.update_summary_client(model, host_mask, guest_mask, all_features, host_anonym, guest_anonym)
        else:
            self.update_summary_arbiter(model, loss, ic_val)
        metric_name = f"stepwise_{self.n_step}"
        metric = [Metric(metric_name, float(self.n_step))]
        model.callback_metric(metric_name=metric_name, metric_namespace=self.metric_namespace, metric_data=metric)
        model.tracker.set_metric_meta(metric_name=metric_name, metric_namespace=self.metric_namespace,
                                      metric_meta=MetricMeta(name=metric_name, metric_type=self.metric_type,
                                                             extra_metas=metas))
        LOGGER.info(f"metric_name: {metric_name}, metas: {metas}")
        return
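host_mask and guest_mask above act as boolean arrays over the candidate features, so number_in is simply the count of features currently selected on both sides. A tiny illustration with made-up masks:

# Illustration of the mask bookkeeping in record_step_best; the masks are
# made-up values, not output of the stepwise procedure.
import numpy as np

host_mask = np.array([True, False, True])
guest_mask = np.array([True, True, False, False])

metas = {"host_mask": host_mask.tolist(), "guest_mask": guest_mask.tolist()}
metas["number_in"] = int(sum(host_mask) + sum(guest_mask))  # 4 selected features
print(metas)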
Example #7
    def fit(self, data_inst, validate_data=None):

        # init binning obj
        self.aggregator = HomoBoostArbiterAggregator()
        self.binning_obj = HomoFeatureBinningServer()

        self.federated_binning()
        # initializing
        self.feature_num = self.sync_feature_num()

        if self.task_type == consts.CLASSIFICATION:
            label_mapping = HomoLabelEncoderArbiter().label_alignment()
            LOGGER.info('label mapping is {}'.format(label_mapping))
            self.booster_dim = len(
                label_mapping) if len(label_mapping) > 2 else 1

        if self.n_iter_no_change:
            self.check_convergence_func = converge_func_factory(
                "diff", self.tol)

        # sync start round and end round
        self.sync_start_round_and_end_round()

        LOGGER.info('begin to fit a boosting tree')
        self.preprocess()
        for epoch_idx in range(self.start_round, self.boosting_round):

            LOGGER.info('cur epoch idx is {}'.format(epoch_idx))

            for class_idx in range(self.booster_dim):
                model = self.fit_a_learner(epoch_idx, class_idx)

            global_loss = self.aggregator.aggregate_loss(suffix=(epoch_idx, ))
            self.history_loss.append(global_loss)
            LOGGER.debug('cur epoch global loss is {}'.format(global_loss))

            self.callback_metric("loss", "train",
                                 [Metric(epoch_idx, global_loss)])

            if self.n_iter_no_change:
                should_stop = self.aggregator.broadcast_converge_status(
                    self.check_convergence, (global_loss, ),
                    suffix=(epoch_idx, ))
                LOGGER.debug('stop flag sent')
                if should_stop:
                    break

        self.callback_meta(
            "loss", "train",
            MetricMeta(name="train",
                       metric_type="LOSS",
                       extra_metas={"Best": min(self.history_loss)}))
        self.postprocess()
        self.callback_list.on_train_end()
        self.set_summary(self.generate_summary())
Example #8
    def callback_loss(self, iter_num, loss):
        metric_meta = MetricMeta(name='train',
                                 metric_type="LOSS",
                                 extra_metas={
                                     "unit_name": "iters",
                                 })

        self.callback_meta(metric_name='loss', metric_namespace='train', metric_meta=metric_meta)
        self.callback_metric(metric_name='loss',
                             metric_namespace='train',
                             metric_data=[Metric(iter_num, loss)])
Example #9
    def callback(self, metas):
        metric = [Metric(self.metric_name, 0)]
        self.callback_metric(metric_name=self.metric_name,
                             metric_namespace=self.metric_namespace,
                             metric_data=metric)
        self.tracker.set_metric_meta(metric_name=self.metric_name,
                                     metric_namespace=self.metric_namespace,
                                     metric_meta=MetricMeta(
                                         name=self.metric_name,
                                         metric_type=self.metric_type,
                                         extra_metas=metas))
Example #10
    def callback_ovr_metric_data(self, eval_results):

        for model_name, eval_rs in eval_results.items():

            train_callback_meta = defaultdict(dict)
            validate_callback_meta = defaultdict(dict)
            split_list = model_name.split('_')
            label = split_list[-1]
            # drop the trailing "class" marker and label index tokens
            origin_model_name = '_'.join(split_list[:-2])

            for rs_dict in eval_rs:
                for metric_name, metric_rs in rs_dict.items():
                    if metric_name == consts.KS:
                        metric_rs = [
                            metric_rs[0], metric_rs[1][0]
                        ]  # ks value only, curve data is not needed
                    metric_namespace = metric_rs[0]
                    if metric_namespace == 'train':
                        callback_meta = train_callback_meta
                    else:
                        callback_meta = validate_callback_meta
                    callback_meta[label][metric_name] = metric_rs[1]

            self.tracker.set_metric_meta(
                "train", model_name + '_' + 'ovr',
                MetricMeta(name=origin_model_name,
                           metric_type='ovr',
                           extra_metas=train_callback_meta))
            self.tracker.set_metric_meta(
                "validate", model_name + '_' + 'ovr',
                MetricMeta(name=origin_model_name,
                           metric_type='ovr',
                           extra_metas=validate_callback_meta))

            LOGGER.debug('callback data {} {}'.format(train_callback_meta,
                                                      validate_callback_meta))
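The name handling above assumes one-vs-rest model names of the form "origin model name"_class_"label", so dropping the last two '_'-separated tokens recovers the original name. A small worked example with an illustrative name (the exact naming scheme is an assumption):

# Worked example of the OvR model-name parsing above; the name is illustrative.
model_name = "hetero_lr_class_2"
split_list = model_name.split('_')             # ['hetero', 'lr', 'class', '2']
label = split_list[-1]                         # '2'
origin_model_name = '_'.join(split_list[:-2])  # 'hetero_lr'
tracker_key = model_name + '_' + 'ovr'         # 'hetero_lr_class_2_ovr'
print(label, origin_model_name, tracker_key)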
Example #11
    def callback_info(self):
        metric_meta = MetricMeta(
            name='train',
            metric_type=self.metric_type,
            extra_metas={"label_encoder": self.label_encoder})

        self.callback_metric(metric_name=self.metric_name,
                             metric_namespace=self.metric_namespace,
                             metric_data=[Metric(self.metric_name, 0)])
        self.tracker.set_metric_meta(metric_namespace=self.metric_namespace,
                                     metric_name=self.metric_name,
                                     metric_meta=metric_meta)
Example #12
def callback(tracker,
             method,
             callback_metrics,
             other_metrics=None,
             summary_dict=None):
    LOGGER.debug("callback: method is {}".format(method))
    if method == "random":
        tracker.log_metric_data("sample_count", "random", callback_metrics)

        tracker.set_metric_meta(
            "sample_count", "random",
            MetricMeta(name="sample_count", metric_type="SAMPLE_TEXT"))

        summary_dict["sample_count"] = callback_metrics[0].value

    else:
        LOGGER.debug("callback: name {}, namespace {}, metrics_data {}".format(
            "sample_count", "stratified", callback_metrics))

        tracker.log_metric_data("sample_count", "stratified", callback_metrics)

        tracker.set_metric_meta(
            "sample_count", "stratified",
            MetricMeta(name="sample_count", metric_type="SAMPLE_TABLE"))

        tracker.log_metric_data("original_count", "stratified", other_metrics)

        tracker.set_metric_meta(
            "original_count", "stratified",
            MetricMeta(name="original_count", metric_type="SAMPLE_TABLE"))

        summary_dict["sample_count"] = {}
        for sample_metric in callback_metrics:
            summary_dict["sample_count"][
                sample_metric.key] = sample_metric.value

        summary_dict["original_count"] = {}
        for sample_metric in other_metrics:
            summary_dict["original_count"][
                sample_metric.key] = sample_metric.value
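In the stratified branch, summary_dict ends up keyed by each Metric's key (the stratum label) with its count as the value. A short illustration with a stand-in Metric tuple; the labels and counts are made up:

# Illustration of how the stratified branch fills summary_dict; Metric is a
# stand-in namedtuple and the counts are made up.
from collections import namedtuple

Metric = namedtuple("Metric", ["key", "value"])

callback_metrics = [Metric("0", 120), Metric("1", 80)]   # sampled counts
other_metrics = [Metric("0", 150), Metric("1", 100)]     # original counts

summary_dict = {}
summary_dict["sample_count"] = {m.key: m.value for m in callback_metrics}
summary_dict["original_count"] = {m.key: m.value for m in other_metrics}
print(summary_dict)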
Example #13
    def __save_f1_score_table(self, metric, f1_scores, thresholds, metric_name,
                              metric_namespace):

        extra_metas = {
            'f1_scores': list(np.round(f1_scores, self.round_num)),
            'thresholds': list(np.round(thresholds, self.round_num))
        }

        self.tracker.set_metric_meta(
            metric_namespace, metric_name,
            MetricMeta(name=metric_name,
                       metric_type=metric.upper(),
                       extra_metas=extra_metas))
Example #14
    def callback_cache_meta(self, intersect_meta):
        """
        self.callback_metric(f"{self.metric_name}_cache_meta",
                             f"{self.metric_namespace}_CACHE",
                             metric_data=[Metric("intersect_cache_meta", 0)])
        """
        metric_name = f"{self.metric_name}_cache_meta"
        self.tracker.set_metric_meta(metric_namespace=self.metric_namespace,
                                     metric_name=metric_name,
                                     metric_meta=MetricMeta(name=f"{self.metric_name}_cache_meta",
                                                            metric_type=self.metric_type,
                                                            extra_metas=intersect_meta))
Example #15
    def __save_single_value(self, result, metric_name, metric_namespace,
                            eval_name):

        metric_type = 'EVALUATION_SUMMARY'
        if eval_name in consts.ALL_CLUSTER_METRICS:
            metric_type = 'CLUSTERING_EVALUATION_SUMMARY'

        self.tracker.log_metric_data(
            metric_namespace, metric_name,
            [Metric(eval_name, np.round(result, self.round_num))])
        self.tracker.set_metric_meta(
            metric_namespace, metric_name,
            MetricMeta(name=metric_name, metric_type=metric_type))
Example #16
    def callback_dbi(self, iter_num, dbi):
        metric_meta = MetricMeta(name='train',
                                 metric_type="DBI",
                                 extra_metas={
                                     "unit_name": "iters",
                                 })

        self.callback_meta(metric_name='DBI',
                           metric_namespace='train',
                           metric_meta=metric_meta)
        self.callback_metric(metric_name='DBI',
                             metric_namespace='train',
                             metric_data=[Metric(iter_num, dbi)])
Example #17
    def callback(self):
        meta_info = {"intersect_method": self.model_param.intersect_method,
                     "join_method": self.model_param.join_method}
        self.callback_metric(metric_name=self.metric_name,
                             metric_namespace=self.metric_namespace,
                             metric_data=[Metric("intersect_count", self.intersect_num),
                                          Metric("intersect_rate", self.intersect_rate),
                                          Metric("unmatched_count", self.unmatched_num),
                                          Metric("unmatched_rate", self.unmatched_rate)])
        self.tracker.set_metric_meta(metric_namespace=self.metric_namespace,
                                     metric_name=self.metric_name,
                                     metric_meta=MetricMeta(name=self.metric_name,
                                                            metric_type=self.metric_type,
                                                            extra_metas=meta_info))
Example #18
    def __save_pr_table(self, metric, metric_res, metric_name,
                        metric_namespace):

        p_scores, r_scores, score_threshold = metric_res

        extra_metas = {
            'p_scores': list(map(list, np.round(p_scores, self.round_num))),
            'r_scores': list(map(list, np.round(r_scores, self.round_num))),
            'thresholds': list(np.round(score_threshold, self.round_num))
        }

        self.tracker.set_metric_meta(
            metric_namespace, metric_name,
            MetricMeta(name=metric_name,
                       metric_type=metric.upper(),
                       extra_metas=extra_metas))
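p_scores and r_scores above are handled as two-dimensional arrays (one row of precision/recall values per threshold), which is why each rounded row is converted back to a plain list. A hedged illustration with made-up scores and rounding precision:

# Illustration of the rounding and list conversion used for the P-R table;
# scores and round_num are made up.
import numpy as np

round_num = 4
p_scores = [[0.91234567, 0.83456789], [0.76543210, 0.65432109]]
r_scores = [[0.51234567, 0.43219876], [0.31234567, 0.21098765]]
score_threshold = [0.9, 0.5]

extra_metas = {
    'p_scores': list(map(list, np.round(p_scores, round_num))),
    'r_scores': list(map(list, np.round(r_scores, round_num))),
    'thresholds': list(np.round(score_threshold, round_num)),
}
print(extra_metas)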
Example #19
    def __save_confusion_mat_table(self, metric, confusion_mat, thresholds,
                                   metric_name, metric_namespace):

        extra_metas = {
            'tp': list(confusion_mat['tp']),
            'tn': list(confusion_mat['tn']),
            'fp': list(confusion_mat['fp']),
            'fn': list(confusion_mat['fn']),
            'thresholds': list(np.round(thresholds, self.round_num))
        }

        self.tracker.set_metric_meta(
            metric_namespace, metric_name,
            MetricMeta(name=metric_name,
                       metric_type=metric.upper(),
                       extra_metas=extra_metas))
Example #20
    def callback_loss(self, iter_num, loss):
        # noinspection PyTypeChecker
        metric_meta = MetricMeta(
            name="train",
            metric_type="LOSS",
            extra_metas={
                "unit_name": "iters",
            },
        )

        self.callback_meta(
            metric_name="loss", metric_namespace="train", metric_meta=metric_meta
        )
        self.callback_metric(
            metric_name="loss",
            metric_namespace="train",
            metric_data=[Metric(iter_num, loss)],
        )
Example #21
    def __save_distance_measure(self, metric, metric_res: dict, metric_name,
                                metric_namespace):

        extra_metas = {}
        cluster_index = list(metric_res.keys())
        radius, nearest_idx = [], []
        for k in metric_res:
            radius.append(metric_res[k][0])
            nearest_idx.append(metric_res[k][1])

        extra_metas['cluster_index'] = cluster_index
        extra_metas['radius'] = radius
        extra_metas['nearest_idx'] = nearest_idx

        self.tracker.set_metric_meta(
            metric_namespace, metric_name,
            MetricMeta(name=metric_name,
                       metric_type=metric.upper(),
                       extra_metas=extra_metas))
Example #22
    def server_callback_loss(self, iter_num, loss):
        # noinspection PyTypeChecker
        metric_meta = MetricMeta(
            name="train",
            metric_type="LOSS",
            extra_metas={
                "unit_name": "iters",
            },
        )

        self.callback_meta(metric_name="loss",
                           metric_namespace="train",
                           metric_meta=metric_meta)
        self.callback_metric(
            metric_name="loss",
            metric_namespace="train",
            metric_data=[Metric(iter_num, loss)],
        )

        self._summary["loss_history"].append(loss)
Example #23
    def __save_contingency_matrix(self, metric, metric_res, metric_name,
                                  metric_namespace):

        result_array, unique_predicted_label, unique_true_label = metric_res
        true_labels = list(map(int, unique_true_label))
        predicted_label = list(map(int, unique_predicted_label))
        result_table = []
        for l_ in result_array:
            result_table.append(list(map(int, l_)))

        extra_metas = {
            'true_labels': true_labels,
            'predicted_labels': predicted_label,
            'result_table': result_table
        }

        self.tracker.set_metric_meta(
            metric_namespace, metric_name,
            MetricMeta(name=metric_name,
                       metric_type=metric.upper(),
                       extra_metas=extra_metas))
Example #24
    def transform(self, data, fit_config=None):
        """
        Transform input data using scale with fit results
        Parameters
        ----------
        data: data_instance, input data
        fit_config: list, the fit results information of scale

        Returns
        ----------
        transform_data: data_instance, data after transform
        """
        LOGGER.info("Start scale data transform ...")

        if self.model_param.method == consts.MINMAXSCALE:
            self.scale_obj = MinMaxScale(self.model_param)
        elif self.model_param.method == consts.STANDARDSCALE:
            self.scale_obj = StandardScale(self.model_param)
            self.scale_obj.set_param(self.mean, self.std)
        else:
            LOGGER.info("DataTransform method is {}, do nothing and return!".format(self.model_param.method))

        if self.scale_obj:
            self.scale_obj.header = self.header
            self.scale_obj.scale_column_idx = self.scale_column_idx
            self.scale_obj.set_column_range(self.column_max_value, self.column_min_value)
            transform_data = self.scale_obj.transform(data)
            transform_data.schema = data.schema

            self.callback_meta(metric_name="scale", metric_namespace="train",
                               metric_meta=MetricMeta(name="scale", metric_type="SCALE",
                                                      extra_metas={"method": self.model_param.method}))

        else:
            transform_data = data

        LOGGER.info("End transform data.")

        return transform_data
Example #25
    def fit(self, data):
        """
        Apply scale for input data
        Parameters
        ----------
        data: data_instance, input data

        Returns
        ----------
        data: data_instance, data after scale
        scale_value_results: list, the fit results information of scale
        """
        LOGGER.info("Start scale data fit ...")

        if self.model_param.method == consts.MINMAXSCALE:
            self.scale_obj = MinMaxScale(self.model_param)
        elif self.model_param.method == consts.STANDARDSCALE:
            self.scale_obj = StandardScale(self.model_param)
        else:
            LOGGER.warning("Scale method is {}, do nothing and return!".format(self.model_param.method))

        if self.scale_obj:
            fit_data = self.scale_obj.fit(data)
            fit_data.schema = data.schema

            self.callback_meta(metric_name="scale", metric_namespace="train",
                               metric_meta=MetricMeta(name="scale", metric_type="SCALE",
                                                      extra_metas={"method": self.model_param.method}))
            
            LOGGER.info("start to get model summary ...")
            self.set_summary(self.scale_obj.get_model_summary())
            LOGGER.info("Finish getting model summary.")

        else:
            fit_data = data

        LOGGER.info("End fit data ...")
        return fit_data
Example #26
    def __save_psi_table(self, metric, metric_res, metric_name,
                         metric_namespace):

        psi_scores, total_psi, expected_interval, expected_percentage, actual_interval, actual_percentage, \
            train_pos_perc, validate_pos_perc, intervals = metric_res[1]

        extra_metas = {
            'psi_scores': list(np.round(psi_scores, self.round_num)),
            'total_psi': round(total_psi, self.round_num),
            'expected_interval': list(expected_interval),
            'expected_percentage': list(expected_percentage),
            'actual_interval': list(actual_interval),
            'actual_percentage': list(actual_percentage),
            'intervals': list(intervals),
            'train_pos_perc': train_pos_perc,
            'validate_pos_perc': validate_pos_perc
        }

        self.tracker.set_metric_meta(
            metric_namespace, metric_name,
            MetricMeta(name=metric_name,
                       metric_type=metric.upper(),
                       extra_metas=extra_metas))
Example #27
    def fit(self, data_inst, validate_data=None):

        LOGGER.debug('in training, partitions is {}'.format(
            data_inst.partitions))
        LOGGER.info('start to fit a ftl model, '
                    'run mode is {},'
                    'communication efficient mode is {}'.format(
                        self.mode, self.comm_eff))

        self.check_host_number()

        data_loader, self.x_shape, self.data_num, self.overlap_num = self.prepare_data(
            self.init_intersect_obj(), data_inst, guest_side=True)
        self.input_dim = self.x_shape[0]

        # cache data_loader for faster validation
        self.cache_dataloader[self.get_dataset_key(data_inst)] = data_loader

        self.partitions = data_inst.partitions
        LOGGER.debug('self partitions is {}'.format(self.partitions))

        self.initialize_nn(input_shape=self.x_shape)
        self.feat_dim = self.nn._model.output_shape[1]
        self.constant_k = 1 / self.feat_dim
        self.callback_list.on_train_begin(train_data=data_inst,
                                          validate_data=validate_data)

        self.callback_meta(
            "loss", "train",
            MetricMeta(name="train",
                       metric_type="LOSS",
                       extra_metas={"unit_name": "iters"}))

        # compute intermediate result of first epoch
        self.phi, self.phi_product, self.overlap_ua, self.send_components = self.batch_compute_components(
            data_loader)

        for epoch_idx in range(self.epochs):

            LOGGER.debug('fitting epoch {}'.format(epoch_idx))

            self.callback_list.on_epoch_begin(epoch_idx)

            host_components = self.exchange_components(self.send_components,
                                                       epoch_idx=epoch_idx)

            loss = None

            for local_round_idx in range(self.local_round):

                if self.comm_eff:
                    LOGGER.debug(
                        'running local iter {}'.format(local_round_idx))

                grads = self.compute_backward_gradients(
                    host_components,
                    data_loader,
                    epoch_idx=epoch_idx,
                    local_round=local_round_idx)
                self.update_nn_weights(grads,
                                       data_loader,
                                       epoch_idx,
                                       decay=self.comm_eff)

                if local_round_idx == 0:
                    loss = self.compute_loss(
                        host_components, epoch_idx,
                        len(data_loader.get_overlap_indexes()))

                if local_round_idx + 1 != self.local_round:
                    self.phi, self.overlap_ua = self.compute_phi_and_overlap_ua(
                        data_loader)

            self.callback_metric("loss", "train", [Metric(epoch_idx, loss)])
            self.history_loss.append(loss)

            # updating variables for next epochs
            if epoch_idx + 1 == self.epochs:
                # only need to update phi in last epochs
                self.phi, _ = self.compute_phi_and_overlap_ua(data_loader)
            else:
                # compute phi, phi_product, overlap_ua etc. for next epoch
                self.phi, self.phi_product, self.overlap_ua, self.send_components = self.batch_compute_components(
                    data_loader)

            self.callback_list.on_epoch_end(epoch_idx)

            # check n_iter_no_change
            if self.n_iter_no_change is True:
                if self.check_convergence(loss):
                    self.sync_stop_flag(epoch_idx, stop_flag=True)
                    break
                else:
                    self.sync_stop_flag(epoch_idx, stop_flag=False)

            LOGGER.debug('fitting epoch {} done, loss is {}'.format(
                epoch_idx, loss))

        self.callback_list.on_train_end()
        self.callback_meta(
            "loss", "train",
            MetricMeta(name="train",
                       metric_type="LOSS",
                       extra_metas={"Best": min(self.history_loss)}))

        self.set_summary(self.generate_summary())
        LOGGER.debug('fitting ftl model done')
Example #28
    def fit(self, data):
        # LOGGER.debug(f"fit receives data is {data}")
        if not isinstance(data, dict) or len(data) <= 1:
            raise ValueError(
                "Union module must receive more than one table as input.")
        empty_count = 0
        combined_table = None
        combined_schema = None
        metrics = []

        for (key, local_table) in data.items():
            LOGGER.debug("table to combine name: {}".format(key))
            num_data = local_table.count()
            LOGGER.debug("table count: {}".format(num_data))
            metrics.append(Metric(key, num_data))
            self.add_summary(key, num_data)

            if num_data == 0:
                LOGGER.warning("Table {} is empty.".format(key))
                if combined_table is None:
                    combined_table = local_table
                    combined_schema = local_table.schema
                empty_count += 1
                continue

            local_is_data_instance = self.check_is_data_instance(local_table)
            if self.is_data_instance is None or combined_table is None:
                self.is_data_instance = local_is_data_instance
            LOGGER.debug(f"self.is_data_instance is {self.is_data_instance}, "
                         f"local_is_data_instance is {local_is_data_instance}")
            if self.is_data_instance != local_is_data_instance:
                raise ValueError(
                    f"Cannot combine DataInstance and non-DataInstance object. Union aborted."
                )

            if self.is_data_instance:
                self.is_empty_feature = data_overview.is_empty_feature(
                    local_table)
                if self.is_empty_feature:
                    LOGGER.warning("Table {} has empty feature.".format(key))
                else:
                    self.check_schema_content(local_table.schema)

            if combined_table is None or combined_table.count() == 0:
                # first non-empty table to combine
                combined_table = local_table
                combined_schema = local_table.schema
                if self.keep_duplicate:
                    combined_table = combined_table.map(lambda k, v:
                                                        (f"{k}_{key}", v))
                    combined_table.schema = combined_schema
            else:
                self.check_id(local_table, combined_table)
                self.check_label_name(local_table, combined_table)
                self.check_header(local_table, combined_table)
                if self.keep_duplicate:
                    local_table = local_table.map(lambda k, v:
                                                  (f"{k}_{key}", v))

                combined_table = combined_table.union(local_table,
                                                      self._keep_first)

                combined_table.schema = combined_schema

            # only check feature length if not empty
            if self.is_data_instance and not self.is_empty_feature:
                self.feature_count = len(combined_schema.get("header"))
                # LOGGER.debug(f"feature count: {self.feature_count}")
                combined_table.mapValues(self.check_feature_length)

        if combined_table is None:
            LOGGER.warning(
                "All tables provided are empty or have empty features.")
            first_table = list(data.values())[0]
            combined_table = first_table.join(first_table)
        num_data = combined_table.count()
        metrics.append(Metric("Total", num_data))
        self.add_summary("Total", num_data)
        LOGGER.info(f"Result total data entry count: {num_data}")

        self.callback_metric(metric_name=self.metric_name,
                             metric_namespace=self.metric_namespace,
                             metric_data=metrics)
        self.tracker.set_metric_meta(metric_namespace=self.metric_namespace,
                                     metric_name=self.metric_name,
                                     metric_meta=MetricMeta(
                                         name=self.metric_name,
                                         metric_type=self.metric_type))

        LOGGER.info(
            "Union operation finished. Total {} empty tables encountered.".
            format(empty_count))

        return combined_table
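When keep_duplicate is enabled, every row key is suffixed with the name of the table it came from before the union, so identical IDs from different input tables are both kept. A minimal stand-in illustration using plain dicts in place of the distributed tables:

# Stand-in illustration of the keep_duplicate renaming in the union above;
# plain dicts replace the distributed tables, table names are made up.
tables = {
    "table_a": {"id1": "x", "id2": "y"},
    "table_b": {"id1": "z"},
}

combined = {}
for name, table in tables.items():
    # suffix each key with its source table name, as the keep_duplicate branch does
    combined.update({f"{k}_{name}": v for k, v in table.items()})

print(combined)  # 'id1' survives twice: 'id1_table_a' and 'id1_table_b'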
Example #29
    def _set_loss_callback_info(self):
        self.callback_meta("loss",
                           "train",
                           MetricMeta(name="train",
                                      metric_type="LOSS",
                                      extra_metas={"unit_name": "iters"}))
Example #30
    def fit(self, data_inst, validate_data=None):

        LOGGER.info('begin to fit a hetero boosting model, model is {}'.format(
            self.model_name))

        self.start_round = 0

        self.on_training = True

        self.data_inst = data_inst

        self.data_bin, self.bin_split_points, self.bin_sparse_points = self.prepare_data(
            data_inst)

        self.y = self.get_label(self.data_bin)

        if not self.is_warm_start:
            self.feature_name_fid_mapping = self.gen_feature_fid_mapping(
                data_inst.schema)
            self.classes_, self.num_classes, self.booster_dim = self.check_label()
            self.loss = self.get_loss_function()
            self.y_hat, self.init_score = self.get_init_score(
                self.y, self.num_classes)
        else:
            classes_, num_classes, booster_dim = self.check_label()
            self.prepare_warm_start(data_inst, classes_)

        LOGGER.info('class index is {}'.format(self.classes_))

        self.sync_booster_dim()

        self.generate_encrypter()

        self.callback_list.on_train_begin(data_inst, validate_data)

        self.callback_meta(
            "loss", "train",
            MetricMeta(name="train",
                       metric_type="LOSS",
                       extra_metas={"unit_name": "iters"}))

        self.preprocess()

        for epoch_idx in range(self.start_round, self.boosting_round):

            LOGGER.info('cur epoch idx is {}'.format(epoch_idx))

            self.callback_list.on_epoch_begin(epoch_idx)

            for class_idx in range(self.booster_dim):

                # fit a booster
                model = self.fit_a_learner(epoch_idx, class_idx)

                booster_meta, booster_param = model.get_model()

                if booster_meta is not None and booster_param is not None:
                    self.booster_meta = booster_meta
                    self.boosting_model_list.append(booster_param)

                # update predict score
                cur_sample_weights = model.get_sample_weights()
                self.y_hat = self.get_new_predict_score(self.y_hat,
                                                        cur_sample_weights,
                                                        dim=class_idx)

            # compute loss
            loss = self.compute_loss(self.y_hat, self.y)
            self.history_loss.append(loss)
            LOGGER.info("round {} loss is {}".format(epoch_idx, loss))
            self.callback_metric("loss", "train", [Metric(epoch_idx, loss)])

            # check validation
            validation_strategy = self.callback_list.get_validation_strategy()
            if validation_strategy:
                validation_strategy.set_precomputed_train_scores(
                    self.score_to_predict_result(data_inst, self.y_hat))

            self.callback_list.on_epoch_end(epoch_idx)

            should_stop = False
            if self.n_iter_no_change and self.check_convergence(loss):
                should_stop = True
                self.is_converged = True
            self.sync_stop_flag(self.is_converged, epoch_idx)
            if self.stop_training or should_stop:
                break

        self.postprocess()
        self.callback_list.on_train_end()
        self.callback_meta(
            "loss", "train",
            MetricMeta(name="train",
                       metric_type="LOSS",
                       extra_metas={"Best": min(self.history_loss)}))
        # get summary
        self.set_summary(self.generate_summary())