def search(self, path, dataset):
    # Enumerate one-hot-mean manipulations for categorical columns reachable
    # through a to-many path.
    if not path.is_to_many:
        return []

    dst_table = dataset.tables[path.dst]
    ret = []
    for col in dst_table.df.columns:
        if path.is_substance_to_one_with_col(dataset, col):
            continue
        ftype = dst_table.ftypes[col]
        if ftype == feature_types.categorical \
                or ftype == feature_types.c_processed_categorical:
            if dst_table.has_cache(("categorical_manager", col)):
                categorical_manager = dst_table.get_cache(
                    ("categorical_manager", col))
            else:
                processing_data = \
                    dst_table.df[col].fillna("").astype(str).values
                categorical_manager = \
                    _core.CategoricalManager(processing_data)
                dst_table.set_cache(("categorical_manager", col),
                                    categorical_manager)
            if dst_table.nunique[col] == 2:
                # A binary column needs only one indicator: use the mode.
                mode = categorical_manager.most_common(1)[0][0]
                ret.append(
                    OneHotMeanManipulation(path, dataset, col, mode))
            else:
                # Otherwise, one indicator per frequent value.
                for value, freq in categorical_manager.most_common(5):
                    if freq > 1:
                        ret.append(
                            OneHotMeanManipulation(
                                path, dataset, col, value))
    return ret
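
# The get-or-create lookup for a column's CategoricalManager above is repeated
# in several of the synthesis methods below. A minimal helper sketch, assuming
# only the Table API already used in this file (has_cache / get_cache /
# set_cache / df) and _core.CategoricalManager; the helper name is
# illustrative and not part of the existing codebase.
def _get_or_create_categorical_manager(table, col):
    cache_key = ("categorical_manager", col)
    if table.has_cache(cache_key):
        return table.get_cache(cache_key)
    processing_data = table.df[col].fillna("").astype(str).values
    categorical_manager = _core.CategoricalManager(processing_data)
    table.set_cache(cache_key, categorical_manager)
    return categorical_manager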
def __recursive_synthesis(self, path):
    # Turn self.__col in the destination table into a 0/1 indicator for
    # self.__value and aggregate it back along the path with "sum".
    if len(self.__path) == 0:
        return

    new_data_name = "{}OneHotSum_{}_{}_{}".format(
        feature_types.aggregate_processed_numerical.prefix,
        path, self.__col, self.__value)

    dst_table = self.dataset.tables[self.__path.dst]
    if dst_table.has_cache(("categorical_manager", self.__col)):
        categorical_manager = dst_table.get_cache(
            ("categorical_manager", self.__col))
    else:
        processing_data = \
            dst_table.df[self.__col].fillna("").astype(str).values
        categorical_manager = \
            _core.CategoricalManager(processing_data)
        dst_table.set_cache(("categorical_manager", self.__col),
                            categorical_manager)

    dst_data = categorical_manager.is_array(self.__value)

    time_for_each_table = {
        table_idx: self.dataset.tables[table_name].hour_time_data
        for table_idx, table_name in enumerate(self.__path.table_names)
        if self.dataset.tables[table_name].has_time
    }
    sorted_index_for_each_table = {
        table_idx: self.dataset.tables[table_name].sorted_time_index
        for table_idx, table_name in enumerate(self.__path.table_names)
        if self.dataset.tables[table_name].has_time
    }
    src_id_for_each_relation = [
        self.dataset.tables[rel.src].df[rel.src_id].values
        for rel in self.__path.relations
    ]
    dst_id_for_each_relation = [
        self.dataset.tables[rel.dst].df[rel.dst_id].values
        for rel in self.__path.relations
    ]
    src_is_unique_for_each_relation = [
        rel.type.src_is_unique for rel in self.__path.relations
    ]
    dst_is_unique_for_each_relation = [
        rel.type.dst_is_unique for rel in self.__path.relations
    ]

    new_data = _core.Aggregator().aggregate(
        dst_data, time_for_each_table, sorted_index_for_each_table,
        src_id_for_each_relation, dst_id_for_each_relation,
        src_is_unique_for_each_relation, dst_is_unique_for_each_relation,
        "sum", "sum")

    self.__dataset.tables[self.__path.src].set_new_data(
        new_data, new_data_name)
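
# __recursive_synthesis above turns a categorical column in the destination
# table into a 0/1 indicator for one value and sums it back along the
# relational path. For a single, untimed to-many relation the result is
# roughly equivalent to this pandas sketch (illustrative only; the real
# _core.Aggregator additionally handles time ordering and multi-hop paths):
import numpy as np
import pandas as pd

def _one_hot_sum(src_ids, dst_ids, dst_categories, value):
    # 1 where the destination row matches the chosen value, else 0.
    indicator = (np.asarray(dst_categories) == value).astype(np.float32)
    # Sum the indicator per destination key, then map it onto the source rows.
    # Source keys with no matching destination rows come back as NaN here.
    sums = pd.Series(indicator).groupby(pd.Series(dst_ids)).sum()
    return pd.Series(src_ids).map(sums).values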
def __recursive_synthesis(self, path):
    # Same as the OneHotSum variant, but aggregates the indicator with "mean"
    # and keeps the feature only if its univariate AUC departs from 0.5.
    if len(self.__path) == 0:
        return

    new_data_name = "{}OneHotMean_{}_{}_{}".format(
        feature_types.aggregate_processed_numerical.prefix,
        path, self.__col, self.__value)

    dst_table = self.dataset.tables[self.__path.dst]
    if dst_table.has_cache(("categorical_manager", self.__col)):
        categorical_manager = dst_table.get_cache(
            ("categorical_manager", self.__col))
    else:
        processing_data = \
            dst_table.df[self.__col].fillna("").astype(str).values
        categorical_manager = \
            _core.CategoricalManager(processing_data)
        dst_table.set_cache(("categorical_manager", self.__col),
                            categorical_manager)

    dst_data = categorical_manager.is_array(self.__value)

    time_for_each_table = {
        table_idx: self.dataset.tables[table_name].hour_time_data
        for table_idx, table_name in enumerate(self.__path.table_names)
        if self.dataset.tables[table_name].has_time
    }
    sorted_index_for_each_table = {
        table_idx: self.dataset.tables[table_name].sorted_time_index
        for table_idx, table_name in enumerate(self.__path.table_names)
        if self.dataset.tables[table_name].has_time
    }
    src_id_for_each_relation = [
        self.dataset.tables[rel.src].df[rel.src_id].values
        for rel in self.__path.relations
    ]
    dst_id_for_each_relation = [
        self.dataset.tables[rel.dst].df[rel.dst_id].values
        for rel in self.__path.relations
    ]
    src_is_unique_for_each_relation = [
        rel.type.src_is_unique for rel in self.__path.relations
    ]
    dst_is_unique_for_each_relation = [
        rel.type.dst_is_unique for rel in self.__path.relations
    ]

    new_data = _core.Aggregator().aggregate(
        dst_data, time_for_each_table, sorted_index_for_each_table,
        src_id_for_each_relation, dst_id_for_each_relation,
        src_is_unique_for_each_relation, dst_is_unique_for_each_relation,
        "mean", "mean")

    # Discard constant features and features whose training AUC is
    # indistinguishable from random (~0.5).
    train_size = np.isfinite(self.__dataset.target).sum()
    train_isfinite = np.isfinite(new_data[:train_size])
    if len(np.unique(new_data[:train_size][train_isfinite])) <= 1:
        return
    auc = metrics.roc_auc_score(
        self.__dataset.target[:train_size][train_isfinite],
        new_data[:train_size][train_isfinite])
    if 0.4999 < auc < 0.5001:
        return

    self.__dataset.tables[self.__path.src].set_new_data(
        new_data, new_data_name)
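
# The pruning step above keeps a synthesized feature only when its univariate
# AUC against the finite training targets moves away from 0.5. The same check
# as a standalone sketch, assuming binary targets and a 1-d feature array
# (the function name is illustrative, not part of the codebase):
import numpy as np
from sklearn import metrics

def _is_informative_feature(feature, target, eps=1e-4):
    finite = np.isfinite(feature)
    if len(np.unique(feature[finite])) <= 1:
        return False  # a constant feature carries no signal
    auc = metrics.roc_auc_score(target[finite], feature[finite])
    return abs(auc - 0.5) > eps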
def __preprocess(self):
    # Per-column preprocessing: expand time columns into calendar features,
    # label/frequency-encode categoricals, vectorize multi-categoricals, and
    # drop or flag degenerate numericals.
    cols_of_each_ftype = self.cols_of_each_ftype

    # Convert numerical columns with a low nunique to categorical.
    """
    if len(self.__df) > 1000:
        columns = self.__df.columns
        for col in columns:
            if self.__ftypes[col] == feature_types.numerical:
                if self.__df[col].nunique() <= 10:
                    self.__df["{}{}".format(
                        feature_types.categorical.prefix,
                        col,
                    )] = self.__df[col].astype(str)
                    self.__df.drop(col, axis=1, inplace=True)
                    print("numerical {} changed to categorical".format(col))
        self.__ftypes = pd.Series(
            self.__automl_df_to_ftypes(), self.__df.dtypes.index)
    """

    import time
    new_data = {}
    columns = self.__df.columns
    for col in columns:
        start = time.time()
        if self.__ftypes[col] == feature_types.time:
            # Time preprocess
            self.__df[col] = pd.to_datetime(self.__df[col])
            """
            # time numericalize
            if self.__min_time is not None:
                self.__df["{}numericalized_{}".format(
                    feature_types.t_processed_numerical.prefix,
                    col,
                )] = ((self.__df[col] - self.__min_time).astype(int)
                      / 1e9).astype(np.float32)
            else:
                self.__df["{}numericalized_{}".format(
                    feature_types.t_processed_numerical.prefix,
                    col,
                )] = (self.__df[col].astype(int) / 1e9).astype(np.float32)
            """
            # Only derive calendar features whose period fits inside the
            # observed time span of the column.
            max_min_time_diff = self.__df[col].max() - self.__df[col].min()
            # time hour
            if max_min_time_diff > pd.Timedelta('2 hours'):
                new_data["{}hour_{}".format(
                    feature_types.t_processed_numerical.prefix,
                    col,
                )] = self.__df[col].dt.hour.values.astype(np.float32)
            # time year
            if max_min_time_diff > pd.Timedelta('500 days'):
                new_data["{}year_{}".format(
                    feature_types.t_processed_numerical.prefix,
                    col,
                )] = self.__df[col].dt.year.values.astype(np.float32)
            # time doy
            if max_min_time_diff > pd.Timedelta('100 days'):
                new_data["{}doy_{}".format(
                    feature_types.t_processed_numerical.prefix,
                    col,
                )] = self.__df[col].dt.dayofyear.values.astype(np.float32)
            # time dow
            if max_min_time_diff > pd.Timedelta('2 days'):
                new_data["{}dow_{}".format(
                    feature_types.t_processed_numerical.prefix,
                    col,
                )] = self.__df[col].dt.dayofweek.values.astype(np.float32)
            # weekend
            if max_min_time_diff > pd.Timedelta('2 days'):
                new_data["{}id_weekend_{}".format(
                    feature_types.t_processed_categorical.prefix,
                    col,
                )] = (self.__df[col].dt.dayofweek >= 5).astype(np.int32)
            # time zone
            if max_min_time_diff > pd.Timedelta('8 hours'):
                new_data["{}time_zone_{}".format(
                    feature_types.t_processed_categorical.prefix,
                    col,
                )] = (self.__df[col].dt.hour.values // 6).astype(np.int32)
            # Replace the datetime column with seconds since __min_time.
            self.__df[col] = (
                (self.__df[col] - self.__min_time).astype(int)
                / 1e9).astype(np.float32)
        elif self.__ftypes[col] == feature_types.categorical:
            # Categorical preprocess: label encoding plus frequency encoding.
            processing_data = \
                self.__df[col].fillna("").values
            categorical_manager = \
                _core.CategoricalManager(processing_data)
            self.set_cache(("categorical_manager", col),
                           categorical_manager)
            if col in self.__label_encoders:
                self.__df[col] = self.__label_encoders[col].transform(
                    processing_data).astype(np.int32)
            else:
                self.__df[col] = categorical_manager.label()
            # frequency encoding
            new_data["{}frequency_{}".format(
                feature_types.c_processed_numerical.prefix,
                col)] = categorical_manager.frequency()
            if self.has_time:
                # processing_data = self.__df[col].values
                """
                new_data["{}neighbor_nunique_{}".format(
                    feature_types.c_processed_numerical.prefix, col
                )] = _core.not_temporal_to_many_aggregate(
                    np.roll(processing_data, -1),
                    processing_data, processing_data, 'nunique') \
                    / _core.not_temporal_to_many_aggregate(
                        np.ones_like(processing_data),
                        processing_data, processing_data, 'sum')
                new_data["{}time_variance_{}".format(
                    feature_types.c_processed_numerical.prefix, col
                )] = _core.not_temporal_to_many_aggregate(
                    np.arange(len(processing_data)),
                    processing_data, processing_data, 'variance')
                """
                """
                new_data["{}neighbor_count_{}".format(
                    feature_types.c_processed_numerical.prefix, col
                )] = categorical_manager.sequential_count_encoding(
                    self.__sorted_time_index, len(self.__df) // 30)
                """
            if categorical_manager.has_null:
                new_data["{}_is_null_{}".format(
                    feature_types.c_processed_categorical.prefix,
                    col)] = categorical_manager.is_null()
        elif self.__ftypes[col] == feature_types.multi_categorical:
            # Multi categorical preprocess.
            processing_data = \
                self.__df[col].fillna("").values
            multi_categorical_manager = \
                _core.MultiCategoricalManager(processing_data)
            self.set_cache(("multi_categorical_manager", col),
                           multi_categorical_manager)
            # If the typical raw value repeats, the column also behaves like
            # an ordinary categorical, so cache a CategoricalManager too.
            counter = collections.Counter(processing_data)
            if np.median(
                    [value for key, value in counter.most_common()]) > 1:
                self.set_cache(("substance_categorical", col), True)
                categorical_manager = \
                    _core.CategoricalManager(processing_data)
                self.set_cache(("categorical_manager", col),
                               categorical_manager)
                # frequency encoding
                """
                self.__df["{}frequency_{}".format(
                    feature_types.c_processed_numerical.prefix, col
                )] = categorical_manager.frequency()
                """
            else:
                self.set_cache(("substance_categorical", col), False)
            # length / nunique / duplicated
            length = multi_categorical_manager.length()
            nunique = multi_categorical_manager.nunique()
            # duplicated = length - nunique
            duplicated = multi_categorical_manager.duplicates()
            new_data["{}length_{}".format(
                feature_types.mc_processed_numerical.prefix,
                col)] = length
            new_data["{}nunique_{}".format(
                feature_types.mc_processed_numerical.prefix,
                col)] = nunique
            new_data["{}duplicated_{}".format(
                feature_types.mc_processed_numerical.prefix,
                col)] = duplicated
            # max_count / min_count
            new_data["{}max_count_{}".format(
                feature_types.mc_processed_numerical.prefix,
                col)] = multi_categorical_manager.max_count()
            new_data["{}min_count_{}".format(
                feature_types.mc_processed_numerical.prefix,
                col)] = multi_categorical_manager.min_count()
            # mode
            new_data["{}mode_{}".format(
                feature_types.mc_processed_categorical.prefix,
                col)] = multi_categorical_manager.mode().astype(int)
            # max_tfidf_words
            """
            new_data["{}max_tfidf_words_{}".format(
                feature_types.mc_processed_categorical.prefix, col
            )] = multi_categorical_manager.max_tfidf_words().astype(int)
            """
            # hashed tf-idf
            """
            multi_categorical_manager.calculate_hashed_tfidf(10)
            for vectorized_idx in range(10):
                self.__df["{}hashed_tfidf_{}_{}".format(
                    feature_types.mc_processed_numerical.prefix,
                    col,
                    vectorized_idx,
                )] = multi_categorical_manager.get_hashed_tfidf(
                    vectorized_idx)
            """
            # tf-idf vectorize
            """
            for vectorized_idx in range(10):
                new_data["{}tfidf_{}_{}".format(
                    feature_types.mc_processed_numerical.prefix,
                    col,
                    vectorized_idx,
                )] = multi_categorical_manager.tfidf(vectorized_idx)
            """
            # count vectorize
            for vectorized_idx in range(10):
                new_data["{}count_{}_{}".format(
                    feature_types.mc_processed_numerical.prefix,
                    col,
                    vectorized_idx,
                )] = multi_categorical_manager.count(vectorized_idx)
            # svd
            """
            svd_values = \
                multi_categorical_manager.truncated_svd(10, False, False)
            """
            """
            tfidf_values = multi_categorical_manager.get_tfidf_matrix()
            from sklearn.decomposition import TruncatedSVD
            svd_values = TruncatedSVD(
                n_components=10, random_state=10,
                algorithm='arpack', n_iter=5).fit_transform(tfidf_values)
            """
            """
            for svd_idx in range(10):
                new_data["{}svd_{}_{}".format(
                    feature_types.mc_processed_numerical.prefix,
                    col,
                    svd_idx,
                )] = svd_values[:, svd_idx]
            """
            # Drop the raw strings to free memory; keep an empty placeholder
            # column under the original name.
            self.__df.drop(col, axis=1, inplace=True)
            del processing_data
            self.__df[col] = ""
            gc.collect()
        elif self.__ftypes[col] == feature_types.numerical:
            # Numerical preprocess: drop all-null or constant columns and
            # flag missing values.
            if pd.isnull(self.__df[col]).all():
                continue
            if (len(
                    np.unique(self.__df[col].values[np.isfinite(
                        self.__df[col].values)])) == 1):
                self.__df.drop(col, axis=1, inplace=True)
                continue
            """
            mode, mode_count = \
                collections.Counter(
                    self.__df[col].values[
                        np.isfinite(self.__df[col].values)]
                ).most_common(1)[0]
            mode_freq = mode_count / len(self.__df)
            if mode_freq >= 1:
                self.__df.drop(col, axis=1, inplace=True)
                continue
            if mode_freq > 0.1:
                new_data["{}_is_mode_{}".format(
                    feature_types.n_processed_categorical.prefix, col
                )] = (self.__df[col].values == mode).astype(np.int32)
            """
            if pd.isnull(self.__df[col]).any():
                new_data["{}_is_null_{}".format(
                    feature_types.n_processed_categorical.prefix,
                    col)] = pd.isnull(self.__df[col]).astype(np.int32)
            self.__df[col] = self.__df[col].astype(np.float32)
        print(col, time.time() - start)

    new_data = pd.DataFrame(new_data)
    self.__df = pd.concat([self.__df, new_data], axis=1)
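
# The time branch of __preprocess gates each derived calendar feature on the
# observed span of the column (e.g. day-of-week only when the range exceeds
# two days). A condensed sketch of that idea for a plain pandas Series
# (illustrative helper, not part of the class):
import numpy as np
import pandas as pd

def _datetime_features(series):
    series = pd.to_datetime(series)
    span = series.max() - series.min()
    feats = {}
    if span > pd.Timedelta("2 hours"):
        feats["hour"] = series.dt.hour.values.astype(np.float32)
    if span > pd.Timedelta("2 days"):
        feats["dow"] = series.dt.dayofweek.values.astype(np.float32)
        feats["weekend"] = (series.dt.dayofweek >= 5).values.astype(np.int32)
    if span > pd.Timedelta("100 days"):
        feats["doy"] = series.dt.dayofyear.values.astype(np.float32)
    if span > pd.Timedelta("500 days"):
        feats["year"] = series.dt.year.values.astype(np.float32)
    return pd.DataFrame(feats)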
def synthesis(self):
    # Target-encode self.__col on the destination table, then aggregate the
    # encoding back to the source table along the to-many part of the path.
    dst_table = self.dataset.tables[self.path.dst]
    if dst_table.has_cache(("categorical_manager", self.__col)):
        categorical_manager = dst_table.get_cache(
            ("categorical_manager", self.__col))
    else:
        processing_data = \
            dst_table.df[self.__col].fillna("").astype(str).values
        categorical_manager = \
            _core.CategoricalManager(processing_data)
        dst_table.set_cache(("categorical_manager", self.__col),
                            categorical_manager)

    # Split the path into a to-many prefix and the longest suffix that is
    # substantially to-one for this column.
    to_one_path = None
    to_many_path = None
    for i in range(len(self.__path), -1, -1):
        if self.__path[i:].is_substance_to_one_with_col(
                self.__dataset, self.__col):
            to_one_path = self.__path[i:]
            to_many_path = self.__path[:i]

    # to_one identify
    if len(to_one_path) > 0:
        dst_induces = np.arange(len(dst_table.df))
        time_for_each_table = {
            table_idx: self.dataset.tables[table_name].hour_time_data
            for table_idx, table_name in enumerate(to_one_path.table_names)
            if self.dataset.tables[table_name].has_time
        }
        sorted_index_for_each_table = {
            table_idx: self.dataset.tables[table_name].sorted_time_index
            for table_idx, table_name in enumerate(to_one_path.table_names)
            if self.dataset.tables[table_name].has_time
        }
        src_id_for_each_relation = [
            self.dataset.tables[rel.src].df[rel.src_id].values
            for rel in to_one_path.relations
        ]
        dst_id_for_each_relation = [
            self.dataset.tables[rel.dst].df[rel.dst_id].values
            for rel in to_one_path.relations
        ]
        src_is_unique_for_each_relation = [
            rel.type.src_is_unique for rel in to_one_path.relations
        ]
        dst_is_unique_for_each_relation = [
            rel.type.dst_is_unique for rel in to_one_path.relations
        ]
        dst_induces = _core.Aggregator().aggregate(
            dst_induces, time_for_each_table, sorted_index_for_each_table,
            src_id_for_each_relation, dst_id_for_each_relation,
            src_is_unique_for_each_relation,
            dst_is_unique_for_each_relation,
            "last", "last")
        dst_induces = dst_induces.astype(np.int32)
        dst_induces[dst_induces < 0] = -1
    else:
        dst_table = self.dataset.tables[to_one_path.dst]
        dst_induces = np.arange(len(dst_table.df))
        dst_induces = dst_induces.astype(np.int32)
        dst_induces[dst_induces < 0] = -1

    # target encoding
    dst_table = self.dataset.tables[to_many_path.dst]
    if not dst_table.has_pseudo_target:
        return
    targets = dst_table.pseudo_target
    if dst_table.has_time:
        sorted_index = dst_table.sorted_time_index
        if dst_table.has_hist_time_data:
            time_data = dst_table.hist_time_data
        else:
            time_data = dst_table.time_data
        new_data = categorical_manager \
            .temporal_target_encode_with_dst_induces(
                targets, dst_induces, time_data, sorted_index,
                categorical_manager.unique_num)
    else:
        new_data = \
            categorical_manager.target_encode_with_dst_induces(
                targets, dst_induces, categorical_manager.unique_num)

    if len(to_many_path) > 0:
        # to_many_aggregate
        dst_data = new_data
        time_for_each_table = {
            table_idx: self.dataset.tables[table_name].hour_time_data
            for table_idx, table_name in enumerate(
                to_many_path.table_names)
            if self.dataset.tables[table_name].has_time
        }
        sorted_index_for_each_table = {
            table_idx: self.dataset.tables[table_name].sorted_time_index
            for table_idx, table_name in enumerate(
                to_many_path.table_names)
            if self.dataset.tables[table_name].has_time
        }
        src_id_for_each_relation = [
            self.dataset.tables[rel.src].df[rel.src_id].values
            for rel in to_many_path.relations
        ]
        dst_id_for_each_relation = [
            self.dataset.tables[rel.dst].df[rel.dst_id].values
            for rel in to_many_path.relations
        ]
        src_is_unique_for_each_relation = [
            rel.type.src_is_unique for rel in to_many_path.relations
        ]
        dst_is_unique_for_each_relation = [
            rel.type.dst_is_unique for rel in to_many_path.relations
        ]
        new_data = _core.Aggregator().aggregate(
            dst_data, time_for_each_table, sorted_index_for_each_table,
            src_id_for_each_relation, dst_id_for_each_relation,
            src_is_unique_for_each_relation,
            dst_is_unique_for_each_relation,
            "mean", "mean")

    new_data_name = "{}TargetEncoding_{}_{}".format(
        feature_types.aggregate_processed_numerical.prefix,
        self.__path, self.__col)

    # Keep the feature only if it is non-constant on the training rows and
    # passes the adversarial-AUC selection.
    train_size = np.isfinite(self.__dataset.target).sum()
    train_isfinite = np.isfinite(new_data[:train_size])
    if len(np.unique(new_data[:train_size][train_isfinite])) <= 1:
        return
    if not auc_selection.numerical_adversarial_auc_select(
            self.__dataset, new_data, 0.2):
        return

    self.__dataset.tables[to_many_path.src].set_new_data(
        new_data, new_data_name)
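
# The _core target-encoding calls above are compiled routines; as a rough
# reference point, a plain (non-temporal, unsmoothed) mean target encoding of
# a categorical column can be sketched in pandas as below. This is
# illustrative only and is not necessarily what the _core implementation
# does (it additionally receives dst_induces and, when time is present,
# respects temporal ordering):
import pandas as pd

def _naive_mean_target_encode(categories, targets):
    df = pd.DataFrame({"cat": categories, "y": targets})
    per_category_mean = df.groupby("cat")["y"].mean()
    return df["cat"].map(per_category_mean).values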