import collections
from pathlib import Path
from typing import Tuple

import cudf
import numpy as np

# `timer` is assumed to be a project-local context manager that logs elapsed time.


def load_features(config: dict) -> Tuple[cudf.DataFrame, cudf.DataFrame]:
    feature_path = config["dataset"]["feature_dir"]

    with timer("load train"):
        # Read every requested feature block that has a serialized train file.
        train_feats = [
            cudf.read_feather(f"{feature_path}/{f}_train.ftr")
            for f in config["features"]
            if Path(f"{feature_path}/{f}_train.ftr").exists()
        ]
        # Guard against column-name collisions before concatenating side by side.
        cols = []
        for feats in train_feats:
            cols = cols + feats.columns.tolist()
        print(
            f"duplicated cols: {[k for k, v in collections.Counter(cols).items() if v > 1]}"
        )
        assert len(cols) == len(np.unique(cols))
        x_train = cudf.concat(train_feats, axis=1, sort=False)

    with timer("load test"):
        x_test = cudf.concat(
            [
                cudf.read_feather(f"{feature_path}/{f}_test.ftr")
                for f in config["features"]
                if Path(f"{feature_path}/{f}_test.ftr").exists()
            ],
            axis=1,
            sort=False,
        )

    return x_train, x_test
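
# Usage sketch for load_features (the config values below are hypothetical; each
# feature name maps to "<name>_train.ftr" / "<name>_test.ftr" under feature_dir):
config = {
    "dataset": {"feature_dir": "./features"},
    "features": ["Basic", "ConcatCategory"],
}
x_train, x_test = load_features(config)
print(x_train.shape, x_test.shape)
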
import pyarrow as pa
import pyarrow.feather  # ensures pa.feather resolves regardless of pyarrow version

import cudf

# assert_eq is cudf's test-suite comparison helper; its import path varies across
# cudf versions (e.g. cudf.testing._utils).


def test_feather_reader(feather_file, columns):
    # Read the same file with pyarrow (reference) and cudf, then compare.
    expect = pa.feather.read_table(feather_file, columns=columns).to_pandas()
    got = (
        cudf.read_feather(feather_file, columns=columns)
        .to_arrow(preserve_index=False)
        .to_pandas()
    )
    assert_eq(expect, got, check_categorical=False)
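
# Sketch of a round-trip fixture for the test above (an assumption, not taken
# from the cudf test suite): write a small frame to Feather with pandas, then
# call the test directly. Runs in the same module context (pa, cudf, assert_eq).
import pandas as pd

pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}).to_feather("/tmp/roundtrip.ftr")
test_feather_reader("/tmp/roundtrip.ftr", columns=["a", "b"])
test_feather_reader("/tmp/roundtrip.ftr", columns=None)  # all columns
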
# GroupbyTransformer, DiffGroupbyTransformer, RatioGroupbyTransformer,
# reduce_mem_usage, and timer are assumed project-local utilities.
import gc

import cudf


def create_features(
    self,
    train_df: cudf.DataFrame,
    test_df: cudf.DataFrame,
):
    with timer("load data"):
        train = train_df.copy()
        len_train = len(train)
        test = test_df.copy()
        train_combi = cudf.read_feather("./features/ConcatCategory_train.ftr")
        test_combi = cudf.read_feather("./features/ConcatCategory_test.ftr")
        combi_cat_cols = test_combi.columns.tolist()

    with timer("concat combi"):
        train = cudf.concat([train, train_combi], axis="columns")
        org_cols = train.columns.tolist()
        test = cudf.concat([test, test_combi], axis="columns")

    with timer("concat train and test"):
        # Stack train and test so group statistics see both splits; the extra
        # "index" column preserves the original row order for the final split.
        total = cudf.concat([train, test], ignore_index=True).reset_index()
        del train, test
        gc.collect()

    with timer("GroupbyTransformer"):
        num_var_list = [
            "Critic_Score",
            "Critic_Count",
            "User_Score",
            "User_Count",
            "log_User_Count",
        ]
        cat_var_list = [
            "Name",
            "Platform",
            "Genre",
            "Publisher",
            "Developer",
            "Rating",
        ]
        num_stats_list = ["mean", "std", "min", "max", "sum"]
        cat_stats_list = ["count", "nunique"]

        groupby_dict = []
        cat_var_list = cat_var_list + combi_cat_cols
        for key in combi_cat_cols:
            # Numeric statistics per concatenated-category key ...
            groupby_dict.append({
                "key": [key],
                "var": num_var_list,
                "agg": num_stats_list,
            })
            # ... and count/nunique over the remaining categorical columns.
            groupby_dict.append({
                "key": [key],
                "var": [cat for cat in cat_var_list if cat != key],
                "agg": cat_stats_list,
            })

        groupby = GroupbyTransformer(groupby_dict)
        total = groupby.transform(total)
        groupby = DiffGroupbyTransformer(groupby_dict)
        total = groupby.transform(total)
        total = reduce_mem_usage(total)
        groupby = RatioGroupbyTransformer(groupby_dict)
        total = groupby.transform(total)
        total = reduce_mem_usage(total)

    with timer("end"):
        # Restore original row order and keep only the newly created columns.
        total = total.sort_values("index")
        new_cols = [col for col in total.columns if col not in org_cols + ["index"]]
        self.train = total[new_cols].iloc[:len_train].reset_index(drop=True)
        self.test = total[new_cols].iloc[len_train:].reset_index(drop=True)
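
# Rough pandas sketch of what one groupby_dict entry is assumed to produce (the
# transformer classes and exact output column names are project-specific; the
# names below are hypothetical). For
# {"key": ["Platform"], "var": ["User_Count"], "agg": ["mean"]}:
# GroupbyTransformer merges the group statistic onto each row, then
# DiffGroupbyTransformer adds var - stat and RatioGroupbyTransformer var / stat.
import pandas as pd

df = pd.DataFrame({"Platform": ["PS4", "PS4", "XOne"], "User_Count": [10, 30, 5]})
stat = df.groupby("Platform")["User_Count"].transform("mean")
df["mean_User_Count_by_Platform"] = stat
df["diff_mean_User_Count_by_Platform"] = df["User_Count"] - stat
df["ratio_mean_User_Count_by_Platform"] = df["User_Count"] / stat
print(df)
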
# Fragment of the training entry point; `output_root_dir`, `config_name`,
# `config`, `feature_dir`, and `args` are defined earlier in the script.
output_dir = output_root_dir / config_name
output_dir.mkdir(parents=True, exist_ok=True)

logging.info(f"model output dir: {str(output_dir)}")
config["model_output_dir"] = str(output_dir)

# ===============================
# === Data/Feature Loading
# ===============================
input_dir = Path(config["dataset"]["dir"])

if (not feature_existence_checker(feature_dir, config["features"])) or args.force:
    # (Re)generate features when any requested block is missing or --force is set.
    with timer(name="load data"):
        train = cudf.read_feather(feature_dir / "Basic_train.ftr")
        test = cudf.read_feather(feature_dir / "Basic_test.ftr")
        # train = cudf.read_feather(input_dir / "train.ftr")
        # test = cudf.read_feather(input_dir / "test.ftr")
    with timer(name="generate features"):
        generate_features(
            train_df=train,
            test_df=test,
            namespace=globals(),
            required=config["features"],
            overwrite=args.force,
            log=True,
        )
    del train, test
    gc.collect()
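
# feature_existence_checker is a project helper; a minimal sketch of what it
# presumably does (an assumption, not the repo's actual implementation): report
# whether every requested feature block already has train and test Feather files.
from pathlib import Path
from typing import List


def feature_existence_checker(feature_dir: Path, features: List[str]) -> bool:
    return all(
        (feature_dir / f"{name}_{split}.ftr").exists()
        for name in features
        for split in ("train", "test")
    )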