def load_features(config: dict) -> Tuple[cudf.DataFrame, cudf.DataFrame]:
    """Load the configured feature files and concatenate them column-wise.

    For every name in ``config["features"]`` this reads
    ``{feature_dir}/{name}_train.ftr`` and ``{feature_dir}/{name}_test.ftr``
    (skipping names whose file does not exist) and concatenates the frames
    along the column axis.

    Args:
        config: experiment config; reads ``config["dataset"]["feature_dir"]``
            and ``config["features"]``.

    Returns:
        ``(x_train, x_test)`` as cudf DataFrames.

    Raises:
        ValueError: if two feature files contribute a column with the same
            name (a silent duplicate would make the concatenated frame
            ambiguous).
    """
    feature_path = config["dataset"]["feature_dir"]

    with timer("load train"):
        train_feats = [
            cudf.read_feather(f"{feature_path}/{f}_train.ftr")
            for f in config["features"]
            if Path(f"{feature_path}/{f}_train.ftr").exists()
        ]
        # Count every column name once; Counter gives both the report and
        # the uniqueness check (the original built the list quadratically
        # and then re-checked with np.unique).
        col_counts = collections.Counter(
            col for feats in train_feats for col in feats.columns.tolist()
        )
        duplicated = [name for name, n in col_counts.items() if n > 1]
        print(f"duplicated cols: {duplicated}")
        if duplicated:
            # raise instead of assert: asserts are stripped under ``python -O``
            raise ValueError(f"duplicated feature columns: {duplicated}")
        x_train = cudf.concat(
            train_feats,
            axis=1,
            sort=False,
        )

    with timer("load test"):
        x_test = cudf.concat(
            [
                cudf.read_feather(f"{feature_path}/{f}_test.ftr")
                for f in config["features"]
                if Path(f"{feature_path}/{f}_test.ftr").exists()
            ],
            axis=1,
            sort=False,
        )

    return x_train, x_test
Example #2
0
def test_feather_reader(feather_file, columns):
    """cudf.read_feather must match pyarrow's feather reader for the same columns."""
    expect = pa.feather.read_table(feather_file, columns=columns).to_pandas()

    # Round-trip the cudf frame through arrow so both sides are compared
    # as pandas DataFrames; the index is dropped to match the pyarrow side.
    gdf = cudf.read_feather(feather_file, columns=columns)
    got = gdf.to_arrow(preserve_index=False).to_pandas()

    assert_eq(expect, got, check_categorical=False)
Example #3
0
    def create_features(
        self,
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
    ):
        """Build groupby aggregation features over the combined train+test data.

        Concatenates the input frames with precomputed category-combination
        columns (``ConcatCategory_*.ftr``), runs plain / diff / ratio groupby
        transformers keyed on each combination column, and stores only the
        newly created columns on ``self.train`` and ``self.test``.

        Args:
            train_df: training frame; only copied, never mutated.
            test_df: test frame; only copied, never mutated.
        """

        with timer("load data"):
            train = train_df.copy()
            # Remember the split point so train/test can be separated again
            # after the combined transform.
            len_train = len(train)
            test = test_df.copy()
            train_combi = cudf.read_feather(
                "./features/ConcatCategory_train.ftr")
            test_combi = cudf.read_feather(
                "./features/ConcatCategory_test.ftr")
            combi_cat_cols = test_combi.columns.tolist()

        with timer("concat combi"):
            train = cudf.concat([train, train_combi], axis="columns")
            # org_cols marks the pre-transform columns; everything added
            # after this point is treated as a new feature.
            org_cols = train.columns.tolist()
            test = cudf.concat([test, test_combi], axis="columns")

        with timer("concat train and test"):
            # reset_index() materialises an "index" column used later to
            # restore the original row order after transforms.
            total = cudf.concat([train, test], ignore_index=True).reset_index()
            del train, test
            gc.collect()

        with timer("GroupbyTransformer"):
            # Numeric columns to aggregate per group.
            num_var_list = [
                "Critic_Score",
                "Critic_Count",
                "User_Score",
                "User_Count",
                "log_User_Count",
            ]
            # Categorical columns to count/nunique per group.
            cat_var_list = [
                "Name",
                "Platform",
                "Genre",
                "Publisher",
                "Developer",
                "Rating",
            ]
            num_stats_list = [
                "mean",
                "std",
                "min",
                "max",
                "sum",
            ]
            cat_stats_list = ["count", "nunique"]
            # NOTE(review): despite the name this is a *list* of groupby specs.
            groupby_dict = []
            cat_var_list = cat_var_list + combi_cat_cols

            # One numeric spec and one categorical spec per combination key;
            # the key itself is excluded from its own categorical vars.
            for key in combi_cat_cols:
                groupby_dict.append({
                    "key": [key],
                    "var": num_var_list,
                    "agg": num_stats_list,
                })
                groupby_dict.append({
                    "key": [key],
                    "var": [cat for cat in cat_var_list if cat != key],
                    "agg":
                    cat_stats_list,
                })

            # Plain aggregations first, then diff- and ratio-to-group-stat
            # variants; memory is compacted after each expensive pass.
            groupby = GroupbyTransformer(groupby_dict)
            total = groupby.transform(total)

            groupby = DiffGroupbyTransformer(groupby_dict)
            total = groupby.transform(total)
            total = reduce_mem_usage(total)

            groupby = RatioGroupbyTransformer(groupby_dict)
            total = groupby.transform(total)
            total = reduce_mem_usage(total)

        with timer("end"):
            # Restore the original row order, then keep only columns created
            # by the transformers (drop originals and the helper "index").
            total = total.sort_values("index")
            new_cols = [
                col for col in total.columns
                if col not in org_cols + ["index"]
            ]

            # First len_train rows are train, the rest are test.
            self.train = total[new_cols].iloc[:len_train].reset_index(
                drop=True)
            self.test = total[new_cols].iloc[len_train:].reset_index(drop=True)
Example #4
0
    # Create a per-config output directory for model artifacts.
    output_dir = output_root_dir / config_name
    output_dir.mkdir(parents=True, exist_ok=True)

    logging.info(f"model output dir: {str(output_dir)}")

    # Record the resolved path back into the config so downstream steps see it.
    config["model_output_dir"] = str(output_dir)

    # ===============================
    # === Data/Feature Loading
    # ===============================
    # NOTE(review): input_dir is only referenced by the commented-out reads
    # below; the active code loads from feature_dir (defined earlier in this
    # function, outside the visible region).
    input_dir = Path(config["dataset"]["dir"])

    # Regenerate features when any required one is missing, or when forced.
    if (not feature_existence_checker(feature_dir,
                                      config["features"])) or args.force:
        with timer(name="load data"):
            train = cudf.read_feather(feature_dir / "Basic_train.ftr")
            test = cudf.read_feather(feature_dir / "Basic_test.ftr")
            # train = cudf.read_feather(input_dir / "train.ftr")
            # test = cudf.read_feather(input_dir / "test.ftr")
        with timer(name="generate features"):
            generate_features(
                train_df=train,
                test_df=test,
                namespace=globals(),
                required=config["features"],
                overwrite=args.force,
                log=True,
            )

        # Free the raw frames before the rest of the pipeline runs.
        del train, test
        gc.collect()