    def search(self, path, dataset):
        if path.is_to_many:
            dst_table = dataset.tables[path.dst]
            ret = []
            for col in dst_table.df.columns:
                if path.is_substance_to_one_with_col(dataset, col):
                    continue
                ftype = dst_table.ftypes[col]
                if ftype == feature_types.categorical \
                   or ftype == feature_types.c_processed_categorical:
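                    # Reuse a cached CategoricalManager for this column if one
                    # exists; otherwise build it from the stringified,
                    # NaN-filled values and cache it for later use.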
                    if dst_table.has_cache(("categorical_manager", col)):
                        categorical_manager = dst_table.get_cache(
                            ("categorical_manager", col))
                    else:
                        processing_data = \
                            dst_table.df[col].fillna("").astype(str).values
                        categorical_manager = \
                            _core.CategoricalManager(processing_data)
                        dst_table.set_cache(("categorical_manager", col),
                                            categorical_manager)
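                    # Binary columns need only a single indicator (the mode);
                    # otherwise one-hot up to the five most frequent values
                    # that occur more than once.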
                    if dst_table.nunique[col] == 2:
                        mode = categorical_manager.most_common(1)[0][0]
                        ret.append(
                            OneHotMeanManipulation(path, dataset, col, mode))
                    else:
                        for value, freq in categorical_manager.most_common(5):
                            if freq > 1:
                                ret.append(
                                    OneHotMeanManipulation(
                                        path, dataset, col, value))
            return ret
        else:
            return []
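The search hook above only enumerates candidate manipulations; nothing is computed yet. A minimal driver sketch, assuming each returned manipulation exposes a synthesis() method as in Example 5 below (searcher, paths and dataset here are hypothetical placeholders, not names from the original code):

    candidates = []
    for path in paths:  # candidate relational paths to explore
        candidates.extend(searcher.search(path, dataset))
    for manipulation in candidates:
        # each synthesis() call is expected to attach its new feature column
        # to the source table of its path, as in the examples below
        manipulation.synthesis()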
Example 2
    def __recursive_synthesis(self, path):
        if len(self.__path) == 0:
            return

        new_data_name = "{}OneHotSum_{}_{}_{}".format(
            feature_types.aggregate_processed_numerical.prefix, path,
            self.__col, self.__value)
        dst_table = self.dataset.tables[self.__path.dst]

        if dst_table.has_cache(("categorical_manager", self.__col)):
            categorical_manager = dst_table.get_cache(
                ("categorical_manager", self.__col))
        else:
            processing_data = \
                dst_table.df[self.__col].fillna("").astype(str).values
            categorical_manager = \
                _core.CategoricalManager(processing_data)
            dst_table.set_cache(("categorical_manager", self.__col),
                                categorical_manager)

        dst_data = categorical_manager.is_array(self.__value)
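        # Collect the per-table time metadata and per-relation join keys that
        # the Aggregator needs to roll the indicator up along the path.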
        time_for_each_table = {
            table_idx: self.dataset.tables[table_name].hour_time_data
            for table_idx, table_name in enumerate(self.__path.table_names)
            if self.dataset.tables[table_name].has_time
        }
        sorted_index_for_each_table = {
            table_idx: self.dataset.tables[table_name].sorted_time_index
            for table_idx, table_name in enumerate(self.__path.table_names)
            if self.dataset.tables[table_name].has_time
        }
        src_id_for_each_relation = [
            self.dataset.tables[rel.src].df[rel.src_id].values
            for rel in self.__path.relations
        ]
        dst_id_for_each_relation = [
            self.dataset.tables[rel.dst].df[rel.dst_id].values
            for rel in self.__path.relations
        ]
        src_is_unique_for_each_relation = [
            rel.type.src_is_unique for rel in self.__path.relations
        ]
        dst_is_unique_for_each_relation = [
            rel.type.dst_is_unique for rel in self.__path.relations
        ]

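        # Sum the one-hot indicator over the to-many path ("sum" at every hop).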
        new_data = _core.Aggregator().aggregate(
            dst_data, time_for_each_table, sorted_index_for_each_table,
            src_id_for_each_relation, dst_id_for_each_relation,
            src_is_unique_for_each_relation, dst_is_unique_for_each_relation,
            "sum", "sum")
        self.__dataset.tables[self.__path.src].set_new_data(
            new_data, new_data_name)
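Example 3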
    def __recursive_synthesis(self, path):
        if len(self.__path) == 0:
            return

        new_data_name = "{}OneHotMean_{}_{}_{}".format(
            feature_types.aggregate_processed_numerical.prefix, path,
            self.__col, self.__value)
        dst_table = self.dataset.tables[self.__path.dst]

        if dst_table.has_cache(("categorical_manager", self.__col)):
            categorical_manager = dst_table.get_cache(
                ("categorical_manager", self.__col))
        else:
            processing_data = \
                dst_table.df[self.__col].fillna("").astype(str).values
            categorical_manager = \
                _core.CategoricalManager(processing_data)
            dst_table.set_cache(("categorical_manager", self.__col),
                                categorical_manager)

        dst_data = categorical_manager.is_array(self.__value)
        time_for_each_table = {
            table_idx: self.dataset.tables[table_name].hour_time_data
            for table_idx, table_name in enumerate(self.__path.table_names)
            if self.dataset.tables[table_name].has_time
        }
        sorted_index_for_each_table = {
            table_idx: self.dataset.tables[table_name].sorted_time_index
            for table_idx, table_name in enumerate(self.__path.table_names)
            if self.dataset.tables[table_name].has_time
        }
        src_id_for_each_relation = [
            self.dataset.tables[rel.src].df[rel.src_id].values
            for rel in self.__path.relations
        ]
        dst_id_for_each_relation = [
            self.dataset.tables[rel.dst].df[rel.dst_id].values
            for rel in self.__path.relations
        ]
        src_is_unique_for_each_relation = [
            rel.type.src_is_unique for rel in self.__path.relations
        ]
        dst_is_unique_for_each_relation = [
            rel.type.dst_is_unique for rel in self.__path.relations
        ]

        new_data = _core.Aggregator().aggregate(
            dst_data, time_for_each_table, sorted_index_for_each_table,
            src_id_for_each_relation, dst_id_for_each_relation,
            src_is_unique_for_each_relation, dst_is_unique_for_each_relation,
            "mean", "mean")

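        # Drop the feature if it is constant over its finite training values or
        # its univariate AUC against the target is essentially 0.5.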
        train_size = np.isfinite(self.__dataset.target).sum()
        train_isfinite = np.isfinite(new_data[:train_size])
        if (len(np.unique(new_data[:train_size][train_isfinite])) <= 1):
            return
        auc = metrics.roc_auc_score(
            self.__dataset.target[:train_size][train_isfinite],
            new_data[:train_size][train_isfinite])
        if 0.4999 < auc < 0.5001:
            return

        self.__dataset.tables[self.__path.src].set_new_data(
            new_data, new_data_name)
Example 4
    def __preprocess(self):
        cols_of_each_ftype = self.cols_of_each_ftype

        # Convert numerical columns with a low nunique to categorical (currently disabled)
        """
        if len(self.__df) > 1000:
            columns = self.__df.columns
            for col in columns:
                if self.__ftypes[col] == feature_types.numerical:
                    if self.__df[col].nunique() <= 10:
                        self.__df["{}{}".format(
                            feature_types.categorical.prefix, col,
                        )] = self.__df[col].astype(str)
                        self.__df.drop(col, axis=1, inplace=True)
                        print("numerical {} change to categorical".format(col))
            self.__ftypes = pd.Series(
                self.__automl_df_to_ftypes(), self.__df.dtypes.index)
        """
        import time

        new_data = {}
        columns = self.__df.columns
        for col in columns:
            start = time.time()
            if self.__ftypes[col] == feature_types.time:
                # Time preprocess
                self.__df[col] = pd.to_datetime(self.__df[col])
                """
                # time numericalize
                if self.__min_time is not None:
                    self.__df["{}numericalized_{}".format(
                        feature_types.t_processed_numerical.prefix, col,
                    )] = ((self.__df[col] - self.__min_time).astype(int)
                          / 1e9).astype(np.float32)
                else:
                    self.__df["{}numericalized_{}".format(
                        feature_types.t_processed_numerical.prefix, col,
                    )] = (self.__df[col].astype(int)
                          / 1e9).astype(np.float32)
                """

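                # Emit calendar features only when the column spans enough time
                # for them to vary: hour (> 2h), year (> 500d), day of year
                # (> 100d), day of week and weekend flag (> 2d), 6-hour time
                # zone bucket (> 8h).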
                max_min_time_diff = self.__df[col].max() - self.__df[col].min()
                # time hour
                if max_min_time_diff > pd.Timedelta('2 hours'):
                    new_data["{}hour_{}".format(
                        feature_types.t_processed_numerical.prefix,
                        col,
                    )] = self.__df[col].dt.hour.values.astype(np.float32)
                # time year
                if max_min_time_diff > pd.Timedelta('500 days'):
                    new_data["{}year_{}".format(
                        feature_types.t_processed_numerical.prefix,
                        col,
                    )] = self.__df[col].dt.year.values.astype(np.float32)
                # time doy
                if max_min_time_diff > pd.Timedelta('100 days'):
                    new_data["{}doy_{}".format(
                        feature_types.t_processed_numerical.prefix,
                        col,
                    )] = self.__df[col].dt.dayofyear.values.astype(np.float32)
                # time dow
                if max_min_time_diff > pd.Timedelta('2 days'):
                    new_data["{}dow_{}".format(
                        feature_types.t_processed_numerical.prefix,
                        col,
                    )] = self.__df[col].dt.dayofweek.values.astype(np.float32)
                # weekend
                if max_min_time_diff > pd.Timedelta('2 days'):
                    new_data["{}id_weekend_{}".format(
                        feature_types.t_processed_categorical.prefix,
                        col,
                    )] = (self.__df[col].dt.dayofweek >= 5).astype(np.int32)
                # time zone
                if max_min_time_diff > pd.Timedelta('8 hours'):
                    new_data["{}time_zone_{}".format(
                        feature_types.t_processed_categorical.prefix,
                        col,
                    )] = (self.__df[col].dt.hour.values // 6).astype(np.int32)

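                # Replace the raw datetime column with seconds since
                # self.__min_time as float32.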
                self.__df[col] = (
                    (self.__df[col] - self.__min_time).astype(int) /
                    1e9).astype(np.float32)

            elif self.__ftypes[col] == feature_types.categorical:
                # categorical preprocess
                processing_data = \
                    self.__df[col].fillna("").values
                categorical_manager = \
                    _core.CategoricalManager(processing_data)
                self.set_cache(("categorical_manager", col),
                               categorical_manager)
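                # Prefer a pre-fitted label encoder for this column when one
                # was supplied; otherwise fall back to the manager's labels.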
                if col in self.__label_encoders:
                    self.__df[col] = self.__label_encoders[col].transform(
                        processing_data).astype(np.int32)
                else:
                    self.__df[col] = categorical_manager.label()

                # frequency encoding
                new_data["{}frequency_{}".format(
                    feature_types.c_processed_numerical.prefix,
                    col)] = categorical_manager.frequency()

                if self.has_time:
                    # processing_data = self.__df[col].values
                    """
                    new_data["{}neighbor_nunique_{}".format(
                        feature_types.c_processed_numerical.prefix, col
                    )] = _core.not_temporal_to_many_aggregate(
                        np.roll(processing_data, -1),
                        processing_data, processing_data, 'nunique') \
                        / _core.not_temporal_to_many_aggregate(
                        np.ones_like(processing_data),
                        processing_data, processing_data, 'sum')

                    new_data["{}time_variance_{}".format(
                        feature_types.c_processed_numerical.prefix, col
                    )] = _core.not_temporal_to_many_aggregate(
                        np.arange(len(processing_data)),
                        processing_data, processing_data, 'variance')
                    """
                    """
                    new_data["{}neighbor_count_{}".format(
                        feature_types.c_processed_numerical.prefix, col
                    )] = categorical_manager.sequential_count_encoding(
                        self.__sorted_time_index,
                        len(self.__df) // 30)
                    """

                if categorical_manager.has_null:
                    new_data["{}_is_null_{}".format(
                        feature_types.c_processed_categorical.prefix,
                        col)] = categorical_manager.is_null()

            elif self.__ftypes[col] == feature_types.multi_categorical:
                # multi categorical preprocess
                processing_data = \
                    self.__df[col].fillna("").values
                multi_categorical_manager = \
                    _core.MultiCategoricalManager(processing_data)
                self.set_cache(("multi_categorical_manager", col),
                               multi_categorical_manager)

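                # If the raw strings mostly repeat (median frequency > 1), the
                # column also behaves like a plain categorical, so flag it and
                # cache a CategoricalManager for it as well.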
                counter = collections.Counter(processing_data)
                if np.median([value
                              for key, value in counter.most_common()]) > 1:
                    self.set_cache(("substance_categorical", col), True)
                    categorical_manager = \
                        _core.CategoricalManager(processing_data)
                    self.set_cache(("categorical_manager", col),
                                   categorical_manager)
                    # frequency encoding
                    """
                    self.__df["{}frequency_{}".format(
                        feature_types.c_processed_numerical.prefix, col
                    )] = categorical_manager.frequency()
                    """
                else:
                    self.set_cache(("substance_categorical", col), False)

                # length
                # nunique
                # duplicated
                length = multi_categorical_manager.length()
                nunique = multi_categorical_manager.nunique()
                # duplicated = length - nunique
                duplicated = multi_categorical_manager.duplicates()
                new_data["{}length_{}".format(
                    feature_types.mc_processed_numerical.prefix, col)] = length
                new_data["{}nunique_{}".format(
                    feature_types.mc_processed_numerical.prefix,
                    col)] = nunique
                new_data["{}duplicated_{}".format(
                    feature_types.mc_processed_numerical.prefix,
                    col)] = duplicated

                # max_count
                # min_count
                new_data["{}max_count_{}".format(
                    feature_types.mc_processed_numerical.prefix,
                    col)] = multi_categorical_manager.max_count()
                new_data["{}min_count_{}".format(
                    feature_types.mc_processed_numerical.prefix,
                    col)] = multi_categorical_manager.min_count()

                # mode
                new_data["{}mode_{}".format(
                    feature_types.mc_processed_categorical.prefix,
                    col)] = multi_categorical_manager.mode().astype(int)

                # max_tfidf_words
                """
                new_data["{}max_tfidf_words_{}".format(
                    feature_types.mc_processed_categorical.prefix, col
                )] = multi_categorical_manager.max_tfidf_words().astype(int)
                """

                # hashed tf-idf
                """
                multi_categorical_manager.calculate_hashed_tfidf(10)
                for vectorized_idx in range(10):
                    self.__df["{}hashed_tfidf_{}_{}".format(
                        feature_types.mc_processed_numerical.prefix, col,
                        vectorized_idx,
                    )] = multi_categorical_manager.get_hashed_tfidf(
                        vectorized_idx)
                """

                # tf-idf vectorize
                """
                for vectorized_idx in range(10):
                    new_data["{}tfidf_{}_{}".format(
                        feature_types.mc_processed_numerical.prefix, col,
                        vectorized_idx,
                    )] = multi_categorical_manager.tfidf(vectorized_idx)
                """
                for vectorized_idx in range(10):
                    new_data["{}count_{}_{}".format(
                        feature_types.mc_processed_numerical.prefix,
                        col,
                        vectorized_idx,
                    )] = multi_categorical_manager.count(vectorized_idx)

                # svd
                """
                svd_values = \
                    multi_categorical_manager.truncated_svd(10, False, False)
                """
                """
                tfidf_values = multi_categorical_manager.get_tfidf_matrix()
                from sklearn.decomposition import TruncatedSVD
                svd_values = TruncatedSVD(
                    n_components=10, random_state=10, algorithm='arpack',
                    n_iter=5).fit_transform(tfidf_values)
                """
                """
                for svd_idx in range(10):
                    new_data["{}svd_{}_{}".format(
                        feature_types.mc_processed_numerical.prefix, col,
                        svd_idx,
                    )] = svd_values[:, svd_idx]
                """
                self.__df.drop(col, axis=1, inplace=True)
                del processing_data
                self.__df[col] = ""
                gc.collect()

            elif self.__ftypes[col] == feature_types.numerical:
                # numerical preprocess
                if pd.isnull(self.__df[col]).all():
                    continue

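                # Drop columns that are constant over their finite values.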
                if (len(
                        np.unique(self.__df[col].values[np.isfinite(
                            self.__df[col].values)])) == 1):
                    self.__df.drop(col, axis=1, inplace=True)
                    continue
                """
                mode, mode_count = \
                    collections.Counter(
                        self.__df[col].values[
                            np.isfinite(self.__df[col].values)]
                        ).most_common(1)[0]
                mode_freq = mode_count / len(self.__df)
                if mode_freq >= 1:
                    self.__df.drop(col, axis=1, inplace=True)
                    continue

                if mode_freq > 0.1:
                    new_data["{}_is_mode_{}".format(
                        feature_types.n_processed_categorical.prefix, col
                    )] = (self.__df[col].values == mode).astype(np.int32)
                """
                if pd.isnull(self.__df[col]).any():
                    new_data["{}_is_null_{}".format(
                        feature_types.n_processed_categorical.prefix,
                        col)] = pd.isnull(self.__df[col]).astype(np.int32)
                self.__df[col] = self.__df[col].astype(np.float32)

            print(col, time.time() - start)

        new_data = pd.DataFrame(new_data)
        self.__df = pd.concat([self.__df, new_data], axis=1)
Example 5
    def synthesis(self):
        dst_table = self.dataset.tables[self.path.dst]

        if dst_table.has_cache(("categorical_manager", self.__col)):
            categorical_manager = dst_table.get_cache(
                ("categorical_manager", self.__col))
        else:
            processing_data = \
                dst_table.df[self.__col].fillna("").astype(str).values
            categorical_manager = \
                _core.CategoricalManager(processing_data)
            dst_table.set_cache(("categorical_manager", self.__col),
                                categorical_manager)

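        # Split the path into a suffix that is effectively to-one for this
        # column (to_one_path) and the remaining to-many prefix (to_many_path);
        # the loop keeps the last qualifying split it finds.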
        to_one_path = None
        to_many_path = None
        for i in range(len(self.__path), -1, -1):
            if self.__path[i:].is_substance_to_one_with_col(
                    self.__dataset, self.__col):
                to_one_path = self.__path[i:]
                to_many_path = self.__path[:i]

        # Resolve destination row indices through the to-one part of the path
        if len(to_one_path) > 0:
            dst_induces = np.arange(len(dst_table.df))
            time_for_each_table = {
                table_idx: self.dataset.tables[table_name].hour_time_data
                for table_idx, table_name in enumerate(to_one_path.table_names)
                if self.dataset.tables[table_name].has_time
            }
            sorted_index_for_each_table = {
                table_idx: self.dataset.tables[table_name].sorted_time_index
                for table_idx, table_name in enumerate(to_one_path.table_names)
                if self.dataset.tables[table_name].has_time
            }
            src_id_for_each_relation = [
                self.dataset.tables[rel.src].df[rel.src_id].values
                for rel in to_one_path.relations
            ]
            dst_id_for_each_relation = [
                self.dataset.tables[rel.dst].df[rel.dst_id].values
                for rel in to_one_path.relations
            ]
            src_is_unique_for_each_relation = [
                rel.type.src_is_unique for rel in to_one_path.relations
            ]
            dst_is_unique_for_each_relation = [
                rel.type.dst_is_unique for rel in to_one_path.relations
            ]
            dst_induces = _core.Aggregator().aggregate(
                dst_induces, time_for_each_table, sorted_index_for_each_table,
                src_id_for_each_relation, dst_id_for_each_relation,
                src_is_unique_for_each_relation,
                dst_is_unique_for_each_relation, "last", "last")
            dst_induces = dst_induces.astype(np.int32)
            dst_induces[dst_induces < 0] = -1
        else:
            dst_table = self.dataset.tables[to_one_path.dst]
            dst_induces = np.arange(len(dst_table.df))
            dst_induces = dst_induces.astype(np.int32)
            dst_induces[dst_induces < 0] = -1

        # target encoding
        dst_table = self.dataset.tables[to_many_path.dst]
        if not dst_table.has_pseudo_target:
            return

        targets = dst_table.pseudo_target

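        # Target-encode the column via the resolved destination indices, using
        # the temporal variant when the destination table has a time column.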
        if dst_table.has_time:
            sorted_index = dst_table.sorted_time_index
            if dst_table.has_hist_time_data:
                time_data = dst_table.hist_time_data
            else:
                time_data = dst_table.time_data
            new_data = categorical_manager \
                .temporal_target_encode_with_dst_induces(
                    targets, dst_induces, time_data, sorted_index,
                    categorical_manager.unique_num
                )
        else:
            new_data = \
                categorical_manager.target_encode_with_dst_induces(
                    targets, dst_induces,
                    categorical_manager.unique_num
                )

        if len(to_many_path) > 0:
            # to_many_aggregate
            dst_data = new_data

            time_for_each_table = {
                table_idx: self.dataset.tables[table_name].hour_time_data
                for table_idx, table_name in enumerate(
                    to_many_path.table_names)
                if self.dataset.tables[table_name].has_time
            }
            sorted_index_for_each_table = {
                table_idx: self.dataset.tables[table_name].sorted_time_index
                for table_idx, table_name in enumerate(
                    to_many_path.table_names)
                if self.dataset.tables[table_name].has_time
            }
            src_id_for_each_relation = [
                self.dataset.tables[rel.src].df[rel.src_id].values
                for rel in to_many_path.relations
            ]
            dst_id_for_each_relation = [
                self.dataset.tables[rel.dst].df[rel.dst_id].values
                for rel in to_many_path.relations
            ]
            src_is_unique_for_each_relation = [
                rel.type.src_is_unique for rel in to_many_path.relations
            ]
            dst_is_unique_for_each_relation = [
                rel.type.dst_is_unique for rel in to_many_path.relations
            ]

            new_data = _core.Aggregator().aggregate(
                dst_data, time_for_each_table, sorted_index_for_each_table,
                src_id_for_each_relation, dst_id_for_each_relation,
                src_is_unique_for_each_relation,
                dst_is_unique_for_each_relation, "mean", "mean")

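        # Keep the new feature only if it is non-constant on the training rows
        # and passes the adversarial-AUC filter.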
        new_data_name = "{}TargetEncoding_{}_{}".format(
            feature_types.aggregate_processed_numerical.prefix, self.__path,
            self.__col)
        train_size = np.isfinite(self.__dataset.target).sum()
        train_isfinite = np.isfinite(new_data[:train_size])
        if (len(np.unique(new_data[:train_size][train_isfinite])) <= 1):
            return

        if not auc_selection.numerical_adversarial_auc_select(
                self.__dataset, new_data, 0.2):
            return

        self.__dataset.tables[to_many_path.src].set_new_data(
            new_data, new_data_name)