Example #1
    def synthesis(self):
        """Create a max-minus-min aggregation feature along self.__path.

        The destination column is aggregated twice (once with "max", once
        with "min"); the element-wise difference is stored as a new
        float32 numerical column on the path's source table.
        """
        feature_name = "{}MaxDiffMin_{}_{}".format(
            feature_types.aggregate_processed_numerical.prefix, self.__path,
            self.__col)

        target_values = \
            self.dataset.tables[self.__path.dst].df[self.__col].values

        # Per-table time data and sort order, keyed by the table's position
        # on the path; only time-aware tables are included.
        table_times = {}
        table_orders = {}
        for position, name in enumerate(self.__path.table_names):
            table = self.dataset.tables[name]
            if table.has_time:
                table_times[position] = table.hour_time_data
                table_orders[position] = table.sorted_time_index

        # Join keys and uniqueness flags, one entry per relation on the path.
        src_keys = []
        dst_keys = []
        src_uniq = []
        dst_uniq = []
        for rel in self.__path.relations:
            src_keys.append(self.dataset.tables[rel.src].df[rel.src_id].values)
            dst_keys.append(self.dataset.tables[rel.dst].df[rel.dst_id].values)
            src_uniq.append(rel.type.src_is_unique)
            dst_uniq.append(rel.type.dst_is_unique)

        aggregated_max = _core.Aggregator().aggregate(
            target_values, table_times, table_orders,
            src_keys, dst_keys, src_uniq, dst_uniq, "max", "max")
        aggregated_min = _core.Aggregator().aggregate(
            target_values, table_times, table_orders,
            src_keys, dst_keys, src_uniq, dst_uniq, "min", "min")

        diff = (aggregated_max - aggregated_min).astype(np.float32)
        self.__dataset.tables[self.__path.src].set_new_data(
            diff, feature_name)
Example #2
    def synthesis(self):
        """Create an "Identify" feature: propagate the destination column
        along ``self.__path`` with a "last" aggregation and attach the
        result to the path's source table.

        The feature is registered with a categorical or numerical prefix
        depending on ``self.__is_cat``. Does nothing for an empty path.
        """
        if len(self.__path) == 0:
            return

        if self.__is_cat:
            new_data_name = "{}Identify_{}_{}".format(
                feature_types.aggregate_processed_categorical.prefix,
                self.__path, self.__col)
        else:
            new_data_name = "{}Identify_{}_{}".format(
                feature_types.aggregate_processed_numerical.prefix,
                self.__path, self.__col)

        dst_table = self.dataset.tables[self.__path.dst]
        dst_df = dst_table.df
        dst_data = dst_df[self.__col].values
        # Time data / sort order keyed by the table's position on the path;
        # only time-aware tables are included.
        time_for_each_table = {
            table_idx: self.dataset.tables[table_name].hour_time_data
            for table_idx, table_name in enumerate(self.__path.table_names)
            if self.dataset.tables[table_name].has_time
        }
        sorted_index_for_each_table = {
            table_idx: self.dataset.tables[table_name].sorted_time_index
            for table_idx, table_name in enumerate(self.__path.table_names)
            if self.dataset.tables[table_name].has_time
        }
        # Join keys and uniqueness flags, one entry per relation.
        src_id_for_each_relation = [
            self.dataset.tables[rel.src].df[rel.src_id].values
            for rel in self.__path.relations
        ]
        dst_id_for_each_relation = [
            self.dataset.tables[rel.dst].df[rel.dst_id].values
            for rel in self.__path.relations
        ]
        src_is_unique_for_each_relation = [
            rel.type.src_is_unique for rel in self.__path.relations
        ]
        dst_is_unique_for_each_relation = [
            rel.type.dst_is_unique for rel in self.__path.relations
        ]

        # "last"/"last": each source row receives the most recent (or
        # joined) destination value along the path.
        new_data = _core.Aggregator().aggregate(
            dst_data, time_for_each_table, sorted_index_for_each_table,
            src_id_for_each_relation, dst_id_for_each_relation,
            src_is_unique_for_each_relation, dst_is_unique_for_each_relation,
            "last", "last")

        self.__dataset.tables[self.__path.src].set_new_data(
            new_data, new_data_name)
Example #3
    def __recursive_synthesis(self, path):
        """Synthesize a one-hot sum feature: for each source row, count how
        often ``self.__value`` occurs in column ``self.__col`` among the
        destination rows reachable along the path.

        NOTE(review): the empty-path guard checks ``self.__path`` while the
        feature name is built from the ``path`` argument — presumably these
        refer to the same path; confirm with the caller.
        """
        if len(self.__path) == 0:
            return

        new_data_name = "{}OneHotSum_{}_{}_{}".format(
            feature_types.aggregate_processed_numerical.prefix, path,
            self.__col, self.__value)
        dst_table = self.dataset.tables[self.__path.dst]

        # Reuse a cached CategoricalManager for this column when available;
        # otherwise build one from the stringified column and cache it.
        if dst_table.has_cache(("categorical_manager", self.__col)):
            categorical_manager = dst_table.get_cache(
                ("categorical_manager", self.__col))
        else:
            processing_data = \
                dst_table.df[self.__col].fillna("").astype(str).values
            categorical_manager = \
                _core.CategoricalManager(processing_data)
            dst_table.set_cache(("categorical_manager", self.__col),
                                categorical_manager)

        # Indicator per destination row: whether the column equals
        # self.__value (see CategoricalManager.is_array).
        dst_data = categorical_manager.is_array(self.__value)
        # Time data / sort order keyed by table position on the path;
        # only time-aware tables are included.
        time_for_each_table = {
            table_idx: self.dataset.tables[table_name].hour_time_data
            for table_idx, table_name in enumerate(self.__path.table_names)
            if self.dataset.tables[table_name].has_time
        }
        sorted_index_for_each_table = {
            table_idx: self.dataset.tables[table_name].sorted_time_index
            for table_idx, table_name in enumerate(self.__path.table_names)
            if self.dataset.tables[table_name].has_time
        }
        # Join keys and uniqueness flags, one entry per relation.
        src_id_for_each_relation = [
            self.dataset.tables[rel.src].df[rel.src_id].values
            for rel in self.__path.relations
        ]
        dst_id_for_each_relation = [
            self.dataset.tables[rel.dst].df[rel.dst_id].values
            for rel in self.__path.relations
        ]
        src_is_unique_for_each_relation = [
            rel.type.src_is_unique for rel in self.__path.relations
        ]
        dst_is_unique_for_each_relation = [
            rel.type.dst_is_unique for rel in self.__path.relations
        ]

        # Sum the indicator along the relations to obtain per-source counts.
        new_data = _core.Aggregator().aggregate(
            dst_data, time_for_each_table, sorted_index_for_each_table,
            src_id_for_each_relation, dst_id_for_each_relation,
            src_is_unique_for_each_relation, dst_is_unique_for_each_relation,
            "sum", "sum")
        self.__dataset.tables[self.__path.src].set_new_data(
            new_data, new_data_name)
Example #4
    def __recursive_synthesis(self, path):
        """Attach a row-count aggregation feature for the given path.

        Every destination row contributes 1.0; summing along the relations
        yields, for each source row, the number of reachable destination
        rows. Does nothing when ``self.__path`` is empty.
        """
        if len(self.__path) == 0:
            return

        feature_name = "{}Count_{}".format(
            feature_types.aggregate_processed_numerical.prefix, path)
        destination = self.dataset.tables[self.__path.dst]
        ones = np.ones(len(destination.df)).astype(np.float32)

        # Per-table time data and sort order for time-aware tables only,
        # keyed by the table's position along the path.
        table_times = {}
        table_orders = {}
        for position, name in enumerate(self.__path.table_names):
            table = self.dataset.tables[name]
            if table.has_time:
                table_times[position] = table.hour_time_data
                table_orders[position] = table.sorted_time_index

        # Join keys and uniqueness flags, one entry per relation.
        src_keys = []
        dst_keys = []
        src_uniq = []
        dst_uniq = []
        for rel in self.__path.relations:
            src_keys.append(self.dataset.tables[rel.src].df[rel.src_id].values)
            dst_keys.append(self.dataset.tables[rel.dst].df[rel.dst_id].values)
            src_uniq.append(rel.type.src_is_unique)
            dst_uniq.append(rel.type.dst_is_unique)

        counts = _core.Aggregator().aggregate(
            ones, table_times, table_orders, src_keys, dst_keys,
            src_uniq, dst_uniq, "sum", "sum")
        self.__dataset.tables[self.__path.src].set_new_data(
            counts, feature_name)
    def __recursive_synthesis(self, path):
        """Synthesize a one-hot mean feature (the rate at which
        ``self.__col`` equals ``self.__value`` among reachable destination
        rows), keeping it only if it carries signal on the training rows.

        NOTE(review): the empty-path guard checks ``self.__path`` while the
        feature name is built from the ``path`` argument — presumably the
        same path; confirm with the caller.
        """
        if len(self.__path) == 0:
            return

        new_data_name = "{}OneHotMean_{}_{}_{}".format(
            feature_types.aggregate_processed_numerical.prefix, path,
            self.__col, self.__value)
        dst_table = self.dataset.tables[self.__path.dst]

        # Reuse a cached CategoricalManager for this column when available;
        # otherwise build one from the stringified column and cache it.
        if dst_table.has_cache(("categorical_manager", self.__col)):
            categorical_manager = dst_table.get_cache(
                ("categorical_manager", self.__col))
        else:
            processing_data = \
                dst_table.df[self.__col].fillna("").astype(str).values
            categorical_manager = \
                _core.CategoricalManager(processing_data)
            dst_table.set_cache(("categorical_manager", self.__col),
                                categorical_manager)

        # Indicator per destination row: whether the column equals
        # self.__value.
        dst_data = categorical_manager.is_array(self.__value)
        # Time data / sort order keyed by table position on the path;
        # only time-aware tables are included.
        time_for_each_table = {
            table_idx: self.dataset.tables[table_name].hour_time_data
            for table_idx, table_name in enumerate(self.__path.table_names)
            if self.dataset.tables[table_name].has_time
        }
        sorted_index_for_each_table = {
            table_idx: self.dataset.tables[table_name].sorted_time_index
            for table_idx, table_name in enumerate(self.__path.table_names)
            if self.dataset.tables[table_name].has_time
        }
        # Join keys and uniqueness flags, one entry per relation.
        src_id_for_each_relation = [
            self.dataset.tables[rel.src].df[rel.src_id].values
            for rel in self.__path.relations
        ]
        dst_id_for_each_relation = [
            self.dataset.tables[rel.dst].df[rel.dst_id].values
            for rel in self.__path.relations
        ]
        src_is_unique_for_each_relation = [
            rel.type.src_is_unique for rel in self.__path.relations
        ]
        dst_is_unique_for_each_relation = [
            rel.type.dst_is_unique for rel in self.__path.relations
        ]

        new_data = _core.Aggregator().aggregate(
            dst_data, time_for_each_table, sorted_index_for_each_table,
            src_id_for_each_relation, dst_id_for_each_relation,
            src_is_unique_for_each_relation, dst_is_unique_for_each_relation,
            "mean", "mean")

        # Training rows are identified by a finite target value; drop the
        # feature if it is constant on those rows.
        train_size = np.isfinite(self.__dataset.target).sum()
        train_isfinite = np.isfinite(new_data[:train_size])
        if (len(np.unique(new_data[:train_size][train_isfinite])) <= 1):
            return
        # Drop the feature when its AUC against the target is ~0.5
        # (i.e. no discriminative signal either way).
        auc = metrics.roc_auc_score(
            self.__dataset.target[:train_size][train_isfinite],
            new_data[:train_size][train_isfinite])
        if (auc < 0.5001 and auc > 0.4999):
            return

        self.__dataset.tables[self.__path.src].set_new_data(
            new_data, new_data_name)
Example #6
    def synthesis(self):
        """Target-encode a multi-categorical column and propagate the
        encoding along ``self.__path`` onto the source table.

        The path is split into a trailing substance-to-one part (resolved
        by aggregating destination row indices with "last") and a leading
        to-many part (aggregated with "mean" after encoding).

        NOTE(review): mixes ``self.path``/``self.col`` with the mangled
        ``self.__path``/``self.__col`` forms — presumably the same
        attributes; confirm against the class definition.
        """
        dst_table = self.dataset.tables[self.path.dst]

        # Reuse a cached MultiCategoricalManager for this column when
        # available; otherwise build it from the raw strings and cache it.
        if dst_table.has_cache(("multi_categorical_manager", self.__col)):
            multi_categorical_manager = dst_table.get_cache(
                ("multi_categorical_manager", self.__col))
        else:
            multi_categorical_string_values = \
                dst_table.df[self.col].fillna("").values
            multi_categorical_manager = \
                _core.MultiCategoricalManager(multi_categorical_string_values)
            dst_table.set_cache(("multi_categorical_manager", self.__col),
                                multi_categorical_manager)

        # Split the path; later iterations overwrite earlier ones, so the
        # split with the longest qualifying to-one suffix wins.
        # NOTE(review): if no suffix qualifies, to_one_path stays None and
        # len(to_one_path) below raises — presumably the empty suffix
        # always qualifies; confirm.
        to_one_path = None
        to_many_path = None
        for i in range(len(self.__path), -1, -1):
            if self.__path[i:].is_substance_to_one_with_col(
                    self.__dataset, self.__col):
                to_one_path = self.__path[i:]
                to_many_path = self.__path[:i]

        # to_one identify of dst_induces
        if len(to_one_path) > 0:
            # Map each source row of the to-one part to the index of its
            # destination row by "last"-aggregating the row indices.
            dst_induces = np.arange(len(dst_table.df))
            time_for_each_table = {
                table_idx: self.dataset.tables[table_name].hour_time_data
                for table_idx, table_name in enumerate(to_one_path.table_names)
                if self.dataset.tables[table_name].has_time
            }
            sorted_index_for_each_table = {
                table_idx: self.dataset.tables[table_name].sorted_time_index
                for table_idx, table_name in enumerate(to_one_path.table_names)
                if self.dataset.tables[table_name].has_time
            }
            src_id_for_each_relation = [
                self.dataset.tables[rel.src].df[rel.src_id].values
                for rel in to_one_path.relations
            ]
            dst_id_for_each_relation = [
                self.dataset.tables[rel.dst].df[rel.dst_id].values
                for rel in to_one_path.relations
            ]
            src_is_unique_for_each_relation = [
                rel.type.src_is_unique for rel in to_one_path.relations
            ]
            dst_is_unique_for_each_relation = [
                rel.type.dst_is_unique for rel in to_one_path.relations
            ]
            dst_induces = _core.Aggregator().aggregate(
                dst_induces, time_for_each_table, sorted_index_for_each_table,
                src_id_for_each_relation, dst_id_for_each_relation,
                src_is_unique_for_each_relation,
                dst_is_unique_for_each_relation, "last", "last")
            dst_induces = dst_induces.astype(np.int32)
            # Unmatched rows come back negative; normalize them to -1.
            dst_induces[dst_induces < 0] = -1
        else:
            # Empty to-one part: every row maps to itself.
            dst_table = self.dataset.tables[to_one_path.dst]
            dst_induces = np.arange(len(dst_table.df))
            dst_induces = dst_induces.astype(np.int32)
            dst_induces[dst_induces < 0] = -1

        # target encoding
        dst_table = self.dataset.tables[to_many_path.dst]
        if not dst_table.has_pseudo_target:
            return

        if dst_table.has_time:
            # Temporal variant: encode using only rows that precede each
            # row in time order (leak avoidance).
            sorted_index = dst_table.sorted_time_index
            if dst_table.has_hist_time_data:
                time_data = dst_table.hist_time_data
            else:
                time_data = dst_table.time_data
            new_data = multi_categorical_manager \
                .temporal_target_encode_with_dst_induces(
                    dst_table.pseudo_target,
                    dst_induces,
                    time_data,
                    sorted_index,
                    multi_categorical_manager.unique_word_num,
                    multi_categorical_manager.row_num
                )
        else:
            new_data = multi_categorical_manager \
                .target_encode_with_dst_induces(
                    dst_table.pseudo_target,
                    dst_induces,
                    multi_categorical_manager.unique_word_num,
                    multi_categorical_manager.row_num
                )

        # to_many_aggregate or set
        if len(to_many_path) == 0:
            # No to-many part: attach the encoding directly.
            new_data_name = "{}MultiCategoricalTargetEncoding_{}_{}".format(
                feature_types.aggregate_processed_numerical.prefix,
                self.__path, self.__col)
            self.__dataset.tables[to_many_path.src].set_new_data(
                new_data, new_data_name)
        else:
            # to_many_aggregate
            dst_data = new_data

            time_for_each_table = {
                table_idx: self.dataset.tables[table_name].hour_time_data
                for table_idx, table_name in enumerate(
                    to_many_path.table_names)
                if self.dataset.tables[table_name].has_time
            }
            sorted_index_for_each_table = {
                table_idx: self.dataset.tables[table_name].sorted_time_index
                for table_idx, table_name in enumerate(
                    to_many_path.table_names)
                if self.dataset.tables[table_name].has_time
            }
            src_id_for_each_relation = [
                self.dataset.tables[rel.src].df[rel.src_id].values
                for rel in to_many_path.relations
            ]
            dst_id_for_each_relation = [
                self.dataset.tables[rel.dst].df[rel.dst_id].values
                for rel in to_many_path.relations
            ]
            src_is_unique_for_each_relation = [
                rel.type.src_is_unique for rel in to_many_path.relations
            ]
            dst_is_unique_for_each_relation = [
                rel.type.dst_is_unique for rel in to_many_path.relations
            ]

            # Average the encoded values over the to-many relations.
            new_data = _core.Aggregator().aggregate(
                dst_data, time_for_each_table, sorted_index_for_each_table,
                src_id_for_each_relation, dst_id_for_each_relation,
                src_is_unique_for_each_relation,
                dst_is_unique_for_each_relation, "mean", "mean")

            new_data_name = "{}MultiCategoricalTargetEncoding_{}_{}".format(
                feature_types.aggregate_processed_numerical.prefix,
                self.__path, self.__col)

            self.__dataset.tables[to_many_path.src].set_new_data(
                new_data, new_data_name)
    def synthesis(self):
        """Factorized target-encode the column pair (col1, col2) and
        propagate the encoding along ``self.__path`` onto the source table.

        The path is split into a trailing substance-to-one part (both id
        columns resolved with "last" aggregations) and a leading to-many
        part (aggregated with "mean" after encoding).

        NOTE(review): mixes ``self.col1``/``self.dataset`` with the
        mangled ``self.__col1``/``self.__dataset`` forms — presumably the
        same attributes; confirm against the class definition.
        """
        # Split the path; later iterations overwrite earlier ones, so the
        # split with the longest qualifying to-one suffix wins.
        # NOTE(review): if no suffix qualifies, to_one_path stays None and
        # len(to_one_path) below raises — confirm the empty suffix always
        # qualifies.
        to_one_path = None
        to_many_path = None
        for i in range(len(self.__path), -1, -1):
            if self.__path[i:].is_substance_to_one_with_col(
               self.__dataset, self.__col1) \
               and self.__path[i:].is_substance_to_one_with_col(
                  self.__dataset, self.__col2):
                to_one_path = self.__path[i:]
                to_many_path = self.__path[:i]

        # to_one identify
        if len(to_one_path) > 0:
            # Pull both id columns through the to-one part with "last".
            dst_table = self.dataset.tables[to_one_path.dst]
            dst_data1 = dst_table.df[self.col1].values
            dst_data2 = dst_table.df[self.col2].values
            time_for_each_table = {
                table_idx: self.dataset.tables[table_name].hour_time_data
                for table_idx, table_name in enumerate(to_one_path.table_names)
                if self.dataset.tables[table_name].has_time
            }
            sorted_index_for_each_table = {
                table_idx: self.dataset.tables[table_name].sorted_time_index
                for table_idx, table_name in enumerate(to_one_path.table_names)
                if self.dataset.tables[table_name].has_time
            }
            src_id_for_each_relation = [
                self.dataset.tables[rel.src].df[rel.src_id].values
                for rel in to_one_path.relations
            ]
            dst_id_for_each_relation = [
                self.dataset.tables[rel.dst].df[rel.dst_id].values
                for rel in to_one_path.relations
            ]
            src_is_unique_for_each_relation = [
                rel.type.src_is_unique for rel in to_one_path.relations
            ]
            dst_is_unique_for_each_relation = [
                rel.type.dst_is_unique for rel in to_one_path.relations
            ]
            ids1 = _core.Aggregator().aggregate(
                dst_data1, time_for_each_table, sorted_index_for_each_table,
                src_id_for_each_relation, dst_id_for_each_relation,
                src_is_unique_for_each_relation,
                dst_is_unique_for_each_relation, "last", "last")
            ids2 = _core.Aggregator().aggregate(
                dst_data2, time_for_each_table, sorted_index_for_each_table,
                src_id_for_each_relation, dst_id_for_each_relation,
                src_is_unique_for_each_relation,
                dst_is_unique_for_each_relation, "last", "last")
            # Unmatched rows come back negative; normalize them to -1.
            ids1 = ids1.astype(np.int32)
            ids1[ids1 < 0] = -1
            ids2 = ids2.astype(np.int32)
            ids2[ids2 < 0] = -1
        else:
            # Empty to-one part: use the id columns directly.
            dst_table = self.dataset.tables[to_one_path.dst]
            ids1 = dst_table.df[self.col1].values
            ids2 = dst_table.df[self.col2].values
            ids1 = ids1.astype(np.int32)
            ids1[ids1 < 0] = -1
            ids2 = ids2.astype(np.int32)
            ids2[ids2 < 0] = -1

        # target encoding
        dst_table = self.dataset.tables[to_many_path.dst]
        if not dst_table.has_pseudo_target:
            return

        targets = dst_table.pseudo_target
        encoder = _core.FactorizedTargetEncoder()
        # Category cardinalities: k1 and k2 for the two id columns, k0 for
        # their interaction.
        k1 = len(np.unique(ids1))
        k2 = len(np.unique(ids2))
        k0 = k1 * k2
        if dst_table.has_hist_time_data:
            # Temporal variant (leak avoidance), preferring history time.
            sorted_index = dst_table.sorted_time_index
            time_data = dst_table.hist_time_data
            new_data = encoder.temporal_encode(targets, ids1, ids2, time_data,
                                               sorted_index, k0, k1, k2)
        elif dst_table.has_time:
            sorted_index = dst_table.sorted_time_index
            time_data = dst_table.time_data
            new_data = encoder.temporal_encode(targets, ids1, ids2, time_data,
                                               sorted_index, k0, k1, k2)
        else:
            new_data = encoder.encode(targets, ids1, ids2, k0, k1, k2)

        if len(to_many_path) == 0:
            # No to-many part: attach the encoding directly.
            new_data_name = "{}FactorizedTargetEncoding_{}_{}_{}".format(
                feature_types.aggregate_processed_numerical.prefix,
                self.__path, self.__col1, self.__col2)
            self.__dataset.tables[to_many_path.src].set_new_data(
                new_data, new_data_name)
        else:
            # to_many_aggregate
            dst_data = new_data

            time_for_each_table = {
                table_idx: self.dataset.tables[table_name].hour_time_data
                for table_idx, table_name in enumerate(
                    to_many_path.table_names)
                if self.dataset.tables[table_name].has_time
            }
            sorted_index_for_each_table = {
                table_idx: self.dataset.tables[table_name].sorted_time_index
                for table_idx, table_name in enumerate(
                    to_many_path.table_names)
                if self.dataset.tables[table_name].has_time
            }
            src_id_for_each_relation = [
                self.dataset.tables[rel.src].df[rel.src_id].values
                for rel in to_many_path.relations
            ]
            dst_id_for_each_relation = [
                self.dataset.tables[rel.dst].df[rel.dst_id].values
                for rel in to_many_path.relations
            ]
            src_is_unique_for_each_relation = [
                rel.type.src_is_unique for rel in to_many_path.relations
            ]
            dst_is_unique_for_each_relation = [
                rel.type.dst_is_unique for rel in to_many_path.relations
            ]

            # Average the encoded values over the to-many relations.
            new_data = _core.Aggregator().aggregate(
                dst_data, time_for_each_table, sorted_index_for_each_table,
                src_id_for_each_relation, dst_id_for_each_relation,
                src_is_unique_for_each_relation,
                dst_is_unique_for_each_relation, "mean", "mean")

            new_data_name = "{}FactorizedTargetEncoding_{}_{}_{}".format(
                feature_types.aggregate_processed_numerical.prefix,
                self.__path, self.__col1, self.__col2)

            self.__dataset.tables[to_many_path.src].set_new_data(
                new_data, new_data_name)
    def synthesis(self):
        """Aggregate the destination column along ``self.__path`` using
        the configured aggregation pair (``self.__last_agg`` for the final
        hop, ``self.__other_agg`` otherwise) and attach the result to the
        source table.

        Categorical features are attached unconditionally; numerical ones
        only when their training-set AUC deviates from 0.5 by more than
        0.001 (i.e. they carry at least minimal signal). Does nothing for
        an empty path.
        """
        if len(self.__path) == 0:
            return

        if self.__is_cat:
            new_data_name = "{}{}_{}_{}".format(
                feature_types.aggregate_processed_categorical.prefix,
                self.__name, self.__path, self.__col)
        else:
            new_data_name = "{}{}_{}_{}".format(
                feature_types.aggregate_processed_numerical.prefix,
                self.__name, self.__path, self.__col)

        dst_table = self.dataset.tables[self.__path.dst]
        dst_df = dst_table.df
        dst_data = dst_df[self.__col].values
        # Time data / sort order keyed by the table's position on the path;
        # only time-aware tables are included.
        time_for_each_table = {
            table_idx: self.dataset.tables[table_name].hour_time_data
            for table_idx, table_name in enumerate(self.__path.table_names)
            if self.dataset.tables[table_name].has_time
        }
        sorted_index_for_each_table = {
            table_idx: self.dataset.tables[table_name].sorted_time_index
            for table_idx, table_name in enumerate(self.__path.table_names)
            if self.dataset.tables[table_name].has_time
        }
        # Join keys and uniqueness flags, one entry per relation.
        src_id_for_each_relation = [
            self.dataset.tables[rel.src].df[rel.src_id].values
            for rel in self.__path.relations
        ]
        src_is_unique_for_each_relation = [
            rel.type.src_is_unique for rel in self.__path.relations
        ]
        # Destination uniqueness is determined per-column (substance
        # uniqueness) rather than from the raw relation types.
        dst_is_unique_for_each_relation = get_dst_is_substance_unique(
            self.__path, self.__dataset, self.__col)

        new_data = _core.Aggregator().aggregate(
            dst_data, time_for_each_table, sorted_index_for_each_table,
            src_id_for_each_relation, dst_id_for_each_relation,
            src_is_unique_for_each_relation, dst_is_unique_for_each_relation,
            self.__last_agg, self.__other_agg)

        if self.__is_cat:
            self.__dataset.tables[self.__path.src].set_new_data(
                new_data, new_data_name)
        else:
            # Training rows are identified by a finite target value.
            train_size = np.isfinite(self.__dataset.target).sum()
            train_isfinite = np.isfinite(new_data[:train_size])
            if not train_isfinite.any():
                return
            score = metrics.roc_auc_score(
                self.__dataset.target[:train_size][train_isfinite],
                new_data[:train_size][train_isfinite])
            # Keep the feature only if its AUC deviates from chance (0.5).
            score = np.abs(score - 0.5)
            if score > 0.001:
                self.__dataset.tables[self.__path.src].set_new_data(
                    new_data, new_data_name)
Example #9
    def synthesis(self):
        """Target-encode a categorical column and propagate the encoding
        along ``self.__path`` onto the source table.

        The path is split into a trailing substance-to-one part (resolved
        by aggregating destination row indices with "last") and a leading
        to-many part (aggregated with "mean" after encoding). The feature
        is kept only if it is non-constant on the training rows and passes
        an adversarial-AUC selection.

        NOTE(review): mixes ``self.path``/``self.dataset`` with the
        mangled ``self.__path``/``self.__dataset`` forms — presumably the
        same attributes; confirm against the class definition.
        """
        dst_table = self.dataset.tables[self.path.dst]

        # Reuse a cached CategoricalManager for this column when available;
        # otherwise build one from the stringified column and cache it.
        if dst_table.has_cache(("categorical_manager", self.__col)):
            categorical_manager = dst_table.get_cache(
                ("categorical_manager", self.__col))
        else:
            processing_data = \
                dst_table.df[self.__col].fillna("").astype(str).values
            categorical_manager = \
                _core.CategoricalManager(processing_data)
            dst_table.set_cache(("categorical_manager", self.__col),
                                categorical_manager)

        # Split the path; later iterations overwrite earlier ones, so the
        # split with the longest qualifying to-one suffix wins.
        # NOTE(review): if no suffix qualifies, to_one_path stays None and
        # len(to_one_path) below raises — confirm the empty suffix always
        # qualifies.
        to_one_path = None
        to_many_path = None
        for i in range(len(self.__path), -1, -1):
            if self.__path[i:].is_substance_to_one_with_col(
                    self.__dataset, self.__col):
                to_one_path = self.__path[i:]
                to_many_path = self.__path[:i]

        # to_one identify
        if len(to_one_path) > 0:
            # Map each source row of the to-one part to the index of its
            # destination row by "last"-aggregating the row indices.
            dst_induces = np.arange(len(dst_table.df))
            time_for_each_table = {
                table_idx: self.dataset.tables[table_name].hour_time_data
                for table_idx, table_name in enumerate(to_one_path.table_names)
                if self.dataset.tables[table_name].has_time
            }
            sorted_index_for_each_table = {
                table_idx: self.dataset.tables[table_name].sorted_time_index
                for table_idx, table_name in enumerate(to_one_path.table_names)
                if self.dataset.tables[table_name].has_time
            }
            src_id_for_each_relation = [
                self.dataset.tables[rel.src].df[rel.src_id].values
                for rel in to_one_path.relations
            ]
            dst_id_for_each_relation = [
                self.dataset.tables[rel.dst].df[rel.dst_id].values
                for rel in to_one_path.relations
            ]
            src_is_unique_for_each_relation = [
                rel.type.src_is_unique for rel in to_one_path.relations
            ]
            dst_is_unique_for_each_relation = [
                rel.type.dst_is_unique for rel in to_one_path.relations
            ]
            dst_induces = _core.Aggregator().aggregate(
                dst_induces, time_for_each_table, sorted_index_for_each_table,
                src_id_for_each_relation, dst_id_for_each_relation,
                src_is_unique_for_each_relation,
                dst_is_unique_for_each_relation, "last", "last")
            dst_induces = dst_induces.astype(np.int32)
            # Unmatched rows come back negative; normalize them to -1.
            dst_induces[dst_induces < 0] = -1
        else:
            # Empty to-one part: every row maps to itself.
            dst_table = self.dataset.tables[to_one_path.dst]
            dst_induces = np.arange(len(dst_table.df))
            dst_induces = dst_induces.astype(np.int32)
            dst_induces[dst_induces < 0] = -1

        # target encoding
        dst_table = self.dataset.tables[to_many_path.dst]
        if not dst_table.has_pseudo_target:
            return

        targets = dst_table.pseudo_target

        if dst_table.has_time:
            # Temporal variant: encode using only rows that precede each
            # row in time order (leak avoidance), preferring history time.
            sorted_index = dst_table.sorted_time_index
            if dst_table.has_hist_time_data:
                time_data = dst_table.hist_time_data
            else:
                time_data = dst_table.time_data
            new_data = categorical_manager \
                .temporal_target_encode_with_dst_induces(
                    targets, dst_induces, time_data, sorted_index,
                    categorical_manager.unique_num
                )
        else:
            new_data = \
                categorical_manager.target_encode_with_dst_induces(
                    targets, dst_induces,
                    categorical_manager.unique_num
                )

        if len(to_many_path) > 0:
            # to_many_aggregate
            dst_data = new_data

            time_for_each_table = {
                table_idx: self.dataset.tables[table_name].hour_time_data
                for table_idx, table_name in enumerate(
                    to_many_path.table_names)
                if self.dataset.tables[table_name].has_time
            }
            sorted_index_for_each_table = {
                table_idx: self.dataset.tables[table_name].sorted_time_index
                for table_idx, table_name in enumerate(
                    to_many_path.table_names)
                if self.dataset.tables[table_name].has_time
            }
            src_id_for_each_relation = [
                self.dataset.tables[rel.src].df[rel.src_id].values
                for rel in to_many_path.relations
            ]
            dst_id_for_each_relation = [
                self.dataset.tables[rel.dst].df[rel.dst_id].values
                for rel in to_many_path.relations
            ]
            src_is_unique_for_each_relation = [
                rel.type.src_is_unique for rel in to_many_path.relations
            ]
            dst_is_unique_for_each_relation = [
                rel.type.dst_is_unique for rel in to_many_path.relations
            ]

            # Average the encoded values over the to-many relations.
            new_data = _core.Aggregator().aggregate(
                dst_data, time_for_each_table, sorted_index_for_each_table,
                src_id_for_each_relation, dst_id_for_each_relation,
                src_is_unique_for_each_relation,
                dst_is_unique_for_each_relation, "mean", "mean")

        new_data_name = "{}TargetEncoding_{}_{}".format(
            feature_types.aggregate_processed_numerical.prefix, self.__path,
            self.__col)
        # Training rows are identified by a finite target value; drop the
        # feature if it is constant on those rows.
        train_size = np.isfinite(self.__dataset.target).sum()
        train_isfinite = np.isfinite(new_data[:train_size])
        if (len(np.unique(new_data[:train_size][train_isfinite])) <= 1):
            return

        # Drop the feature if it distinguishes train from test too well
        # (adversarial-validation AUC filter).
        if not auc_selection.numerical_adversarial_auc_select(
                self.__dataset, new_data, 0.2):
            return

        self.__dataset.tables[to_many_path.src].set_new_data(
            new_data, new_data_name)