Beispiel #1
0
    def create_features(
        self,
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
    ):
        """Build groupby aggregate/diff/ratio features over the combined
        train+test frame and store only the newly created columns on
        ``self.train`` / ``self.test`` (converted back to pandas)."""

        with timer("load data"):
            train = train_df.copy()
            n_train = len(train)
            base_columns = train.columns.tolist()
            test = test_df.copy()

        with timer("concat train and test"):
            # features must be computed over train and test jointly
            total = cudf.concat([train, test], ignore_index=True)
            del train, test
            gc.collect()

        with timer("make feats"):
            total = GroupbyTransformer(groupby_dict).transform(total)
            total = reduce_mem_usage(
                DiffGroupbyTransformer(groupby_dict).transform(total)
            )
            total = reduce_mem_usage(
                RatioGroupbyTransformer(groupby_dict).transform(total)
            )

        # keep only the columns the transformers added
        added_columns = [c for c in total.columns if c not in base_columns]

        train_feats = total[added_columns].iloc[:n_train].reset_index(drop=True)
        test_feats = total[added_columns].iloc[n_train:].reset_index(drop=True)

        with timer("end"):
            self.train = train_feats.reset_index(drop=True).to_pandas()
            self.test = test_feats.reset_index(drop=True).to_pandas()
Beispiel #2
0
 def on_data_end(self, state: State):
     """Downcast every dataframe held by *state* to reduce memory usage
     once the data stage has finished."""
     with utils.timer("Data Compressing", state.logger):
         frames = state.dataframes
         # re-assigning existing keys while iterating is safe (no resize)
         for name, frame in frames.items():
             frames[name] = utils.reduce_mem_usage(
                 frame, verbose=True, logger=state.logger)
    def create_features(
        self,
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
    ):
        """Log-transform the numeric target columns, then generate groupby
        aggregate/diff/ratio features; the new columns (original row order
        restored) are stored on ``self.train`` / ``self.test``."""

        with timer("load data"):
            train = train_df.copy()
            n_train = len(train)
            base_columns = train.columns.tolist()
            test = test_df.copy()

        with timer("concat train and test"):
            # reset_index() materialises an "index" column so the original
            # row order can be restored after the transforms
            total = cudf.concat([train, test], ignore_index=True).reset_index()
            del train, test
            gc.collect()

        with timer("log transform"):
            # np.log1p runs on host, so round-trip through pandas
            for col in num_var_list:
                total[col] = cudf.Series(np.log1p(total[col].to_pandas()))

        with timer("GroupbyTransformer"):
            total = GroupbyTransformer(groupby_dict).transform(total)
            total = reduce_mem_usage(
                DiffGroupbyTransformer(groupby_dict).transform(total)
            )
            total = reduce_mem_usage(
                RatioGroupbyTransformer(groupby_dict).transform(total)
            )

        with timer("end"):
            total = total.sort_values("index")
            added_columns = [
                c for c in total.columns if c not in base_columns + ["index"]
            ]
            self.train = total[added_columns].iloc[:n_train].reset_index(drop=True)
            self.test = total[added_columns].iloc[n_train:].reset_index(drop=True)
Beispiel #4
0
    submission_generator,
    set_interval_proba,
)

# Work from the repository root (parent of the directory we started in).
current_dir = os.getcwd()
main_path = os.path.dirname(current_dir)
# main_path =  r'C:\Users\Marco\Documents\GitHub\axa_challenge'
os.chdir(main_path)

gc.collect()

# Importing data
training = fetch_data("train_engineered")
test = fetch_data("validation_engineered")

# Downcast dtypes to shrink memory footprint.
training = reduce_mem_usage(training)
test = reduce_mem_usage(test)

# Subsample the training set — presumably prcn=1 keeps all rows and
# smote_os=0 disables oversampling; confirm against subset_data's signature.
training = subset_data(training, "random", prcn=1, smote_os=0)

# NOTE(review): subset_data is applied a second time with the same
# "random"/1 arguments — looks redundant; confirm whether one call suffices.
training = subset_data(training, "random", 1)
print("train shape: ", training.shape, " - test shape: ", test.shape)

# defining predictions dataframe
submission_template = sub_template_creator(test)


X_train, y_train, X_test = prepare_train_test_before_scoring(training, test)

# TODO: wrap the steps above into a function
Beispiel #5
0
    def create_features(
        self,
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
    ):
        """Generate groupby aggregate/diff/ratio features plus per-Platform
        count pivot features (Publisher / Genre / Year_of_Release / Rating)
        over the combined train+test frame; only the new columns, with the
        original row order restored, are stored on ``self.train`` /
        ``self.test``.

        The four pivot sections of the original were byte-for-byte clones;
        they are factored into one local helper driven by a loop.
        """

        def _platform_count_pivot(frame, cat_col):
            """Count of "Name" per (Platform, *cat_col*) as a wide cudf frame;
            missing combinations become 0.0 and value columns are renamed
            ``count_<cat_col.lower()>_<value>_groupby_platform``."""
            pivot = cudf.from_pandas(
                frame.to_pandas()
                .pivot_table(
                    index="Platform",
                    columns=cat_col,
                    values="Name",
                    aggfunc="count",
                )
                .reset_index()
            ).fillna(0.0)
            prefix = "count_" + cat_col.lower() + "_"
            pivot.columns = ["Platform"] + [
                prefix + str(col) + "_groupby_platform"
                for col in pivot.columns
                if str(col) != "Platform"
            ]
            return pivot

        with timer("load data"):
            train = train_df.copy()
            len_train = len(train)
            org_cols = train.columns.tolist()
            test = test_df.copy()

        with timer("concat train and test"):
            # reset_index() keeps an "index" column to restore row order later
            total = cudf.concat([train, test], ignore_index=True).reset_index()
            del train, test
            gc.collect()

        with timer("GroupbyTransformer"):
            total = GroupbyTransformer(groupby_dict).transform(total)
            # platform lifetime: span between first and last release year
            total["diff_Year_of_Release_groupby_Platform"] = (
                total["max_Year_of_Release_groupby_Platform"]
                - total["min_Year_of_Release_groupby_Platform"]
            )
            total = reduce_mem_usage(
                DiffGroupbyTransformer(groupby_dict).transform(total)
            )
            total = reduce_mem_usage(
                RatioGroupbyTransformer(groupby_dict).transform(total)
            )

        with timer("pivot_tables"):
            # each pivot is computed on the current `total` and merged back,
            # exactly as the original sequential sections did
            for cat_col in ("Publisher", "Genre", "Year_of_Release", "Rating"):
                with timer(cat_col):
                    counts = _platform_count_pivot(total, cat_col)
                    total = cudf.merge(total, counts, how="left", on="Platform")

        with timer("end"):
            total = total.sort_values("index")
            new_cols = [col for col in total.columns if col not in org_cols + ["index"]]

            self.train = total[new_cols].iloc[:len_train].reset_index(drop=True)
            self.test = total[new_cols].iloc[len_train:].reset_index(drop=True)
Beispiel #6
0
    def create_features(
        self,
        train_df: pd.DataFrame,
        test_df: pd.DataFrame,
    ):
        """Join the precomputed category-combination features, then build
        groupby aggregate/diff/ratio features keyed on each combination
        column; the new columns (original row order restored) are stored on
        ``self.train`` / ``self.test``."""

        with timer("load data"):
            train = train_df.copy()
            n_train = len(train)
            test = test_df.copy()
            train_combi = cudf.read_feather(
                "./features/ConcatCategory_train.ftr")
            test_combi = cudf.read_feather(
                "./features/ConcatCategory_test.ftr")
            combi_cat_cols = test_combi.columns.tolist()

        with timer("concat combi"):
            train = cudf.concat([train, train_combi], axis="columns")
            base_columns = train.columns.tolist()
            test = cudf.concat([test, test_combi], axis="columns")

        with timer("concat train and test"):
            # reset_index() keeps an "index" column to restore row order later
            total = cudf.concat([train, test], ignore_index=True).reset_index()
            del train, test
            gc.collect()

        with timer("GroupbyTransformer"):
            numeric_vars = [
                "Critic_Score",
                "Critic_Count",
                "User_Score",
                "User_Count",
                "log_User_Count",
            ]
            categorical_vars = [
                "Name",
                "Platform",
                "Genre",
                "Publisher",
                "Developer",
                "Rating",
            ] + combi_cat_cols
            numeric_aggs = ["mean", "std", "min", "max", "sum"]
            categorical_aggs = ["count", "nunique"]

            # one numeric spec and one categorical spec per combination key;
            # a key never aggregates itself
            groupby_dict = []
            for key in combi_cat_cols:
                groupby_dict += [
                    {
                        "key": [key],
                        "var": numeric_vars,
                        "agg": numeric_aggs,
                    },
                    {
                        "key": [key],
                        "var": [c for c in categorical_vars if c != key],
                        "agg": categorical_aggs,
                    },
                ]

            total = GroupbyTransformer(groupby_dict).transform(total)
            total = reduce_mem_usage(
                DiffGroupbyTransformer(groupby_dict).transform(total))
            total = reduce_mem_usage(
                RatioGroupbyTransformer(groupby_dict).transform(total))

        with timer("end"):
            total = total.sort_values("index")
            added_columns = [
                c for c in total.columns
                if c not in base_columns + ["index"]
            ]

            self.train = total[added_columns].iloc[:n_train].reset_index(
                drop=True)
            self.test = total[added_columns].iloc[n_train:].reset_index(
                drop=True)
Beispiel #7
0
 def save(self):
     """Persist the memory-reduced train/test frames as pickles."""
     for frame, path in ((self.train, self.train_path),
                         (self.test, self.test_path)):
         save_pickle(reduce_mem_usage(frame), path)
 def save(self):
     """Write the memory-reduced train/test frames to feather files."""
     for frame, path in ((self.train, self.train_path),
                         (self.test, self.test_path)):
         reduce_mem_usage(frame).to_feather(path)