def create_features(
    self,
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
):
    """Build groupby aggregate, diff, and ratio features over train+test.

    The two frames are concatenated, transformed as one table, and the newly
    created columns are split back by the original train length and stored on
    ``self.train`` / ``self.test`` as pandas DataFrames.
    """
    with timer("load data"):
        train = train_df.copy()
        n_train = len(train)
        base_cols = train.columns.tolist()
        test = test_df.copy()

    with timer("concat train and test"):
        # ignore_index=True gives a clean 0..N-1 positional order, so a plain
        # iloc split below recovers the train/test partition.
        total = cudf.concat([train, test], ignore_index=True)
        del train, test
        gc.collect()

    with timer("make feats"):
        total = GroupbyTransformer(groupby_dict).transform(total)
        total = DiffGroupbyTransformer(groupby_dict).transform(total)
        total = reduce_mem_usage(total)
        total = RatioGroupbyTransformer(groupby_dict).transform(total)
        total = reduce_mem_usage(total)
        # Keep only the columns the transformers added.
        feat_cols = [c for c in total.columns if c not in base_cols]
        # NOTE(review): unlike sibling variants, no sort on a saved "index"
        # column here — assumes the transformers preserve row order; confirm.
        train = total[feat_cols].iloc[:n_train].reset_index(drop=True)
        test = total[feat_cols].iloc[n_train:].reset_index(drop=True)

    with timer("end"):
        self.train = train.reset_index(drop=True).to_pandas()
        self.test = test.reset_index(drop=True).to_pandas()
def on_data_end(self, state: State):
    """Downcast every dataframe held by *state* in place to cut memory use."""
    with utils.timer("Data Compressing", state.logger):
        frames = state.dataframes
        # Reassigning existing keys while iterating is safe: no keys are
        # added or removed, only values replaced.
        for name in frames:
            frames[name] = utils.reduce_mem_usage(
                frames[name], verbose=True, logger=state.logger
            )
def create_features(
    self,
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
):
    """Log1p-transform numeric targets, then add groupby/diff/ratio features.

    Train and test are processed as one concatenated table; the original row
    order is preserved via an explicit "index" column and restored before the
    split back into ``self.train`` / ``self.test``.
    """
    with timer("load data"):
        train = train_df.copy()
        n_train = len(train)
        base_cols = train.columns.tolist()
        test = test_df.copy()

    with timer("concat train and test"):
        # reset_index() materialises the row order as an "index" column so we
        # can sort back after transformers that may shuffle rows.
        total = cudf.concat([train, test], ignore_index=True).reset_index()
        del train, test
        gc.collect()

    with timer("log transform"):
        # Round-trip through pandas for np.log1p, then back to a cudf Series.
        for col in num_var_list:
            total[col] = cudf.Series(np.log1p(total[col].to_pandas()))

    with timer("GroupbyTransformer"):
        total = GroupbyTransformer(groupby_dict).transform(total)
        total = DiffGroupbyTransformer(groupby_dict).transform(total)
        total = reduce_mem_usage(total)
        total = RatioGroupbyTransformer(groupby_dict).transform(total)
        total = reduce_mem_usage(total)

    with timer("end"):
        total = total.sort_values("index")
        feat_cols = [c for c in total.columns if c not in base_cols + ["index"]]
        self.train = total[feat_cols].iloc[:n_train].reset_index(drop=True)
        self.test = total[feat_cols].iloc[n_train:].reset_index(drop=True)
submission_generator,
set_interval_proba,
)

# Move the working directory up to the repository root (parent of the cwd).
current_dir = os.getcwd()
main_path = os.path.dirname(current_dir)
# main_path = r'C:\Users\Marco\Documents\GitHub\axa_challenge'
os.chdir(main_path)
gc.collect()

# Importing data
training = fetch_data("train_engineered")
test = fetch_data("validation_engineered")
training = reduce_mem_usage(training)
test = reduce_mem_usage(test)

# Take a subset of the training dataset
training = subset_data(training, "random", prcn=1, smote_os=0)
# NOTE(review): a second, immediately-following subset call with positional
# args — looks redundant with the call above; confirm whether intentional.
training = subset_data(training, "random", 1)
print("train shape: ", training.shape, " - test shape: ", test.shape)

# defining predictions dataframe
submission_template = sub_template_creator(test)

X_train, y_train, X_test = prepare_train_test_before_scoring(training, test)
# TODO: extract this setup into a function
def create_features(
    self,
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
):
    """Build groupby features plus per-Platform category-count pivot features.

    Train and test are concatenated and processed as one table; original row
    order is kept in an "index" column and restored before splitting the new
    columns back into ``self.train`` / ``self.test``.

    Fix: the count-pivot logic was copy-pasted four times (Publisher, Genre,
    Year_of_Release, Rating); it is now a single helper driven by a table.
    """

    def _count_pivot_by_platform(df, column, prefix):
        # One row per Platform, one column per category value of *column*,
        # holding the count of titles ("Name"). Pivot on CPU via pandas,
        # then move back to cudf; missing combinations become 0.0.
        pivot = cudf.from_pandas(
            df.to_pandas()
            .pivot_table(
                index="Platform",
                columns=column,
                values="Name",
                aggfunc="count",
            )
            .reset_index()
        ).fillna(0.0)
        pivot.columns = ["Platform"] + [
            "count_" + prefix + "_" + str(col) + "_groupby_platform"
            for col in pivot.columns
            if str(col) != "Platform"
        ]
        return pivot

    with timer("load data"):
        train = train_df.copy()
        len_train = len(train)
        org_cols = train.columns.tolist()
        test = test_df.copy()

    with timer("concat train and test"):
        # reset_index() saves the row order so it can be restored at the end.
        total = cudf.concat([train, test], ignore_index=True).reset_index()
        del train, test
        gc.collect()

    with timer("GroupbyTransformer"):
        groupby = GroupbyTransformer(groupby_dict)
        total = groupby.transform(total)
        # Release-year span covered by each platform.
        total["diff_Year_of_Release_groupby_Platform"] = (
            total["max_Year_of_Release_groupby_Platform"]
            - total["min_Year_of_Release_groupby_Platform"]
        )
        groupby = DiffGroupbyTransformer(groupby_dict)
        total = groupby.transform(total)
        total = reduce_mem_usage(total)
        groupby = RatioGroupbyTransformer(groupby_dict)
        total = groupby.transform(total)
        total = reduce_mem_usage(total)

    with timer("pivot_tables"):
        # (timer label, column to pivot on, prefix used in the new names)
        for label, column, prefix in [
            ("Publisher", "Publisher", "publisher"),
            ("Genre", "Genre", "genre"),
            ("Year_of_Release", "Year_of_Release", "year_of_release"),
            ("Rating", "Rating", "rating"),
        ]:
            with timer(label):
                counts = _count_pivot_by_platform(total, column, prefix)
                total = cudf.merge(total, counts, how="left", on="Platform")

    with timer("end"):
        # Merges may reorder rows; restore the saved order before splitting.
        total = total.sort_values("index")
        new_cols = [col for col in total.columns if col not in org_cols + ["index"]]
        self.train = total[new_cols].iloc[:len_train].reset_index(drop=True)
        self.test = total[new_cols].iloc[len_train:].reset_index(drop=True)
def create_features(
    self,
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
):
    """Join precomputed concatenated-category features, then build groupby
    aggregate/diff/ratio statistics keyed on each combined category column.

    New columns are split back by the original train length into
    ``self.train`` / ``self.test``.
    """
    with timer("load data"):
        train = train_df.copy()
        len_train = len(train)
        test = test_df.copy()
        # Precomputed combined-category columns saved by an earlier step.
        train_combi = cudf.read_feather("./features/ConcatCategory_train.ftr")
        test_combi = cudf.read_feather("./features/ConcatCategory_test.ftr")
        combi_cat_cols = test_combi.columns.tolist()

    with timer("concat combi"):
        train = cudf.concat([train, train_combi], axis="columns")
        org_cols = train.columns.tolist()
        test = cudf.concat([test, test_combi], axis="columns")

    with timer("concat train and test"):
        # reset_index() keeps the row order in an "index" column for the end.
        total = cudf.concat([train, test], ignore_index=True).reset_index()
        del train, test
        gc.collect()

    with timer("GroupbyTransformer"):
        num_var_list = [
            "Critic_Score",
            "Critic_Count",
            "User_Score",
            "User_Count",
            "log_User_Count",
        ]
        cat_var_list = [
            "Name",
            "Platform",
            "Genre",
            "Publisher",
            "Developer",
            "Rating",
        ] + combi_cat_cols
        num_stats_list = ["mean", "std", "min", "max", "sum"]
        cat_stats_list = ["count", "nunique"]

        # Two specs per combi key: numeric stats, and categorical stats over
        # every other categorical column.
        groupby_dict = []
        for key in combi_cat_cols:
            groupby_dict.append(
                {"key": [key], "var": num_var_list, "agg": num_stats_list}
            )
            groupby_dict.append(
                {
                    "key": [key],
                    "var": [cat for cat in cat_var_list if cat != key],
                    "agg": cat_stats_list,
                }
            )

        total = GroupbyTransformer(groupby_dict).transform(total)
        total = DiffGroupbyTransformer(groupby_dict).transform(total)
        total = reduce_mem_usage(total)
        total = RatioGroupbyTransformer(groupby_dict).transform(total)
        total = reduce_mem_usage(total)

    with timer("end"):
        total = total.sort_values("index")
        new_cols = [col for col in total.columns if col not in org_cols + ["index"]]
        self.train = total[new_cols].iloc[:len_train].reset_index(drop=True)
        self.test = total[new_cols].iloc[len_train:].reset_index(drop=True)
def save(self):
    """Downcast both frames and persist them as pickles."""
    for frame, path in ((self.train, self.train_path), (self.test, self.test_path)):
        save_pickle(reduce_mem_usage(frame), path)
def save(self):
    """Downcast both frames and write them to feather files."""
    for frame, path in ((self.train, self.train_path), (self.test, self.test_path)):
        reduce_mem_usage(frame).to_feather(path)