def create_features(self): with timer("load data"): train = read_preprocessing_data(DATA_DIR, "train", write_mode=False) test = read_preprocessing_data(DATA_DIR, "test", write_mode=False) with timer("get predicted user id"): predicted_user = pd.read_csv( '../../data/interim/20190901_user_ids_share.csv') train = pd.merge( train, predicted_user[['TransactionID', 'predicted_user_id']], how='left', on='TransactionID') test = pd.merge( test, predicted_user[['TransactionID', 'predicted_user_id']], how='left', on='TransactionID') with timer("count encoding"): categorical_cols = self.categorical_features() for col in categorical_cols: train_result, test_result = count_encoding(col, train, test) self.train_feature[col] = train_result self.test_feature[col] = test_result with timer("end"): self.train_feature.reset_index(drop=True, inplace=True) self.test_feature.reset_index(drop=True, inplace=True)
def create_features(self): with timer("load data"): train = read_preprocessing_data(DATA_DIR, "train", write_mode=False) test = read_preprocessing_data(DATA_DIR, "test", write_mode=False) with timer("create features"): train_result, test_result = matrix_factorize(categorical_cols, train, test) self.train_feature = train_result self.test_feature = test_result with timer("end"): self.train_feature.reset_index(drop=True, inplace=True) self.test_feature.reset_index(drop=True, inplace=True)
def create_features(self): with timer("load data"): train = read_preprocessing_data(DATA_DIR, "train", write_mode=False) test = read_preprocessing_data(DATA_DIR, "test", write_mode=False) with timer("label encoding"): for col1, col2 in combinations(categorical_cols, 2): new_fe_col_name = f'{col1}_{col2}' train[new_fe_col_name] = train[col1].astype("str") + "_" + train[col2].astype("str") test[new_fe_col_name] = test[col1].astype("str") + "_" + test[col2].astype("str") train_result, test_result = label_encoding(new_fe_col_name, train, test) self.train_feature[new_fe_col_name] = train_result self.test_feature[new_fe_col_name] = test_result with timer("end"): self.train_feature.reset_index(drop=True, inplace=True) self.test_feature.reset_index(drop=True, inplace=True)
def create_features(self): with timer("load data"): train = read_preprocessing_data(DATA_DIR, "train", write_mode=False) test = read_preprocessing_data(DATA_DIR, "test", write_mode=False) with timer("label encoding"): categorical_cols = self.categorical_features() for col in categorical_cols: train_result, test_result = label_encoding(col, train, test) self.train_feature[col] = train_result self.test_feature[col] = test_result with timer("end"): self.train_feature.reset_index(drop=True, inplace=True) self.test_feature.reset_index(drop=True, inplace=True)
def create_features(self): with timer("load data"): train = read_preprocessing_data(DATA_DIR, "train", write_mode=False) test = read_preprocessing_data(DATA_DIR, "test", write_mode=False) with timer("get predicted user id"): predicted_user = pd.read_csv( '../../data/interim/20190901_user_ids_share.csv') train = pd.merge( train, predicted_user[['TransactionID', 'predicted_user_id']], how='left', on='TransactionID') test = pd.merge( test, predicted_user[['TransactionID', 'predicted_user_id']], how='left', on='TransactionID') total = train.append(test).reset_index(drop=True) with timer("get original cols"): org_cols = total.columns with timer("aggregate categorical features"): total = calc_agg_category_func(total, groupby_dict) new_cols = [c for c in total.columns if c not in org_cols] total = total[new_cols] logger.info(f"n_features: {len(new_cols)}") train_result = total.iloc[:len(train)].reset_index(drop=True) test_result = total.iloc[len(train):].reset_index(drop=True) self.train_feature = train_result self.test_feature = test_result with timer("end"): self.train_feature.reset_index(drop=True, inplace=True) self.test_feature.reset_index(drop=True, inplace=True)
def create_features(self): with timer("load data"): train = read_preprocessing_data(DATA_DIR, "train", write_mode=False) test = read_preprocessing_data(DATA_DIR, "test", write_mode=False) with timer("get predicted user id"): predicted_user = pd.read_csv('../../data/interim/20190901_user_ids_share.csv') train = pd.merge(train, predicted_user[['TransactionID', 'predicted_user_id']], how='left', on='TransactionID') test = pd.merge(test, predicted_user[['TransactionID', 'predicted_user_id']], how='left', on='TransactionID') total = train.append(test).reset_index(drop=True) total['TransactionAmt_decimal'] = ((total['TransactionAmt'] - total['TransactionAmt'].astype(int)) * 1000).astype(int) with timer("make V features"): ### V75 ~~~ 94 cols_V75_94 = [f'V{no}' for no in range(75, 95, 1)] cols_other = [f'V{no}' for no in [75, 88, 89, 90, 91, 94, 100, 104, 105, 106]] cols_V75_94 = list(set(cols_V75_94) - set(cols_other)) total['V75_94_mean'] = total[cols_V75_94].mean(axis=1) ### V95 ~~~ 137 cols_V95_137 = [f'V{no}' for no in range(95, 138, 1)] cols_V95_137 = list(set(cols_V95_137) - set([f'V{no}' for no in range(130, 138, 1)])) cols_other = [f'V{no}' for no in [96, 98, 99, 100, 104, 105, 106 , 120, 121, 122, 126, 127, 128]] cols_other_2 = [f'V{no}' for no in [117, 118, 119]] cols_V95_137 = sorted(list(set(cols_V95_137) - set(cols_other) -set(cols_other_2))) total['V95_137_mean'] = total[cols_V95_137].mean(axis=1) ### V167 ~~~ 216 cols_V167_216 = [f'V{no}' for no in range(167, 217, 1)] cols_other = [f'V{no}' for no in range(186, 202, 1)] no_use_cols = [f'V{no}' for no in [169, 172, 173, 174, 175] + list(range(202, 217, 1))] cols_V167_216 = sorted(list(set(cols_V167_216) - set(cols_other) -set(no_use_cols))) total['V167_216_mean'] = total[cols_V167_216].mean(axis=1) ### V242 ~~~ 263 cols_V242_263 = [f'V{no}' for no in list(range(242, 250, 1)) + list(range(252, 255, 1)) + list(range(257, 263, 1))] total['V242_263_mean'] = total[cols_V242_263].mean(axis=1) org_cols = total.columns with timer("sin/cos transformation"): total["D9_sin"] = np.sin(2 * np.pi * total["D9"] / 24).round(4) total["D9_cos"] = np.cos(2 * np.pi * total["D9"] / 24).round(4) total["D9_LocalTime_sin"] = np.sin(2 * np.pi * total["D9_LocalTime"] / 24).round(4) total["D9_LocalTime_cos"] = np.cos(2 * np.pi * total["D9_LocalTime"] / 24).round(4) with timer("group by features"): groupby = GroupbyTransformer(param_dict=groupby_dict) total = groupby.transform(total) diff = DiffGroupbyTransformer(param_dict=diff_dict) total = diff.transform(total) ratio = RatioGroupbyTransformer(param_dict=diff_dict) total = ratio.transform(total) new_cols = [c for c in total.columns if c not in org_cols] total = total[new_cols] logger.info(f"n_features: {len(new_cols)}") train_result = total.iloc[:len(train)].reset_index(drop=True) test_result = total.iloc[len(train):].reset_index(drop=True) self.train_feature = train_result self.test_feature = test_result with timer("end"): self.train_feature.reset_index(drop=True, inplace=True) self.test_feature.reset_index(drop=True, inplace=True)
def create_features(self): with timer("load data"): train = read_preprocessing_data(DATA_DIR, "train", write_mode=False) test = read_preprocessing_data(DATA_DIR, "test", write_mode=False) with timer("get predicted user id"): predicted_user = pd.read_csv( '../../data/interim/20190901_user_ids_share.csv') train = pd.merge( train, predicted_user[['TransactionID', 'predicted_user_id']], how='left', on='TransactionID') test = pd.merge( test, predicted_user[['TransactionID', 'predicted_user_id']], how='left', on='TransactionID') total = train.append(test).reset_index(drop=True) with timer("make V features"): ### V75 ~~~ 94 cols_V75_94 = [f'V{no}' for no in range(75, 95, 1)] cols_other = [ f'V{no}' for no in [75, 88, 89, 90, 91, 94, 100, 104, 105, 106] ] cols_V75_94 = list(set(cols_V75_94) - set(cols_other)) total['V75_94_mean'] = total[cols_V75_94].mean(axis=1) ### V95 ~~~ 137 cols_V95_137 = [f'V{no}' for no in range(95, 138, 1)] cols_V95_137 = list( set(cols_V95_137) - set([f'V{no}' for no in range(130, 138, 1)])) cols_other = [ f'V{no}' for no in [96, 98, 99, 100, 104, 105, 106, 120, 121, 122, 126, 127, 128] ] cols_other_2 = [f'V{no}' for no in [117, 118, 119]] cols_V95_137 = sorted( list(set(cols_V95_137) - set(cols_other) - set(cols_other_2))) total['V95_137_mean'] = total[cols_V95_137].mean(axis=1) ### V167 ~~~ 216 cols_V167_216 = [f'V{no}' for no in range(167, 217, 1)] cols_other = [f'V{no}' for no in range(186, 202, 1)] no_use_cols = [ f'V{no}' for no in [169, 172, 173, 174, 175] + list(range(202, 217, 1)) ] cols_V167_216 = sorted( list(set(cols_V167_216) - set(cols_other) - set(no_use_cols))) total['V167_216_mean'] = total[cols_V167_216].mean(axis=1) ### V242 ~~~ 263 cols_V242_263 = [ f'V{no}' for no in list(range(242, 250, 1)) + list(range(252, 255, 1)) + list(range(257, 263, 1)) ] total['V242_263_mean'] = total[cols_V242_263].mean(axis=1) with timer("get original cols"): org_cols = total.columns with timer("Set TransactionDT"): total["TransactionDT"] = total["TransactionDT"].apply( lambda x: datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S")) total.set_index("TransactionDT", inplace=True) with timer("get rolling features"): total = calc_rolling_func(total, groupby_dict) new_cols = [c for c in total.columns if c not in org_cols] total = total[new_cols] logger.info(f"n_features: {len(new_cols)}") train_result = total.iloc[:len(train)].reset_index(drop=True) test_result = total.iloc[len(train):].reset_index(drop=True) self.train_feature = train_result self.test_feature = test_result with timer("end"): self.train_feature.reset_index(drop=True, inplace=True) self.test_feature.reset_index(drop=True, inplace=True)
def create_features(self):
    feature_name_list = []
    with timer("load data"):
        train = read_preprocessing_data(DATA_DIR, "train", write_mode=False)
        test = read_preprocessing_data(DATA_DIR, "test", write_mode=False)

    with timer("get predicted user id"):
        predicted_user = pd.read_csv(
            '../../data/interim/20190901_user_ids_share.csv')
        train = pd.merge(
            train, predicted_user[['TransactionID', 'predicted_user_id']],
            how='left', on='TransactionID')
        test = pd.merge(
            test, predicted_user[['TransactionID', 'predicted_user_id']],
            how='left', on='TransactionID')

    with timer("Set TransactionDT"):
        train["TransactionDT"] = train["TransactionDT"].apply(
            lambda x: datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S"))
        test["TransactionDT"] = test["TransactionDT"].apply(
            lambda x: datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S"))
        total = train.append(test).reset_index(drop=True)

    with timer("Get lag-time to future records"):
        # hours to the same user's next/previous transaction; NaN at a user's
        # first/last record, so the result must stay float (casting a series
        # containing NaN to int64 raises)
        total['TransactionDT_lag1_future'] = total.groupby(
            'predicted_user_id')['TransactionDT'].shift(-1)
        total["Diff_Time_To_lag1_future"] = (
            total["TransactionDT_lag1_future"] - total["TransactionDT"]
        ).apply(lambda x: x.days * 24 + x.seconds / 60 / 60 if x == x else x)
        total['TransactionDT_lag1_past'] = total.groupby(
            'predicted_user_id')['TransactionDT'].shift(1)
        total["Diff_Time_To_lag1_past"] = (
            total["TransactionDT_lag1_past"] - total["TransactionDT"]
        ).apply(lambda x: x.days * 24 + x.seconds / 60 / 60 if x == x else x)
        feature_name_list.extend(
            ["Diff_Time_To_lag1_future", "Diff_Time_To_lag1_past"])

    with timer("Get lag-value"):
        # future value
        total['TransactionAmt_lag1_future'] = total.groupby(
            'predicted_user_id')['TransactionAmt'].shift(-1)
        feature_name_list.extend(["TransactionAmt_lag1_future"])
        # past value
        total['TransactionAmt_lag1_past'] = total.groupby(
            'predicted_user_id')['TransactionAmt'].shift(1)
        feature_name_list.extend(["TransactionAmt_lag1_past"])
        # current value - future value
        total['diff_TransactionAmt_lag1_future'] = (
            total['TransactionAmt'] - total['TransactionAmt_lag1_future'])
        feature_name_list.extend(["diff_TransactionAmt_lag1_future"])
        # current value - past value
        total['diff_TransactionAmt_lag1_past'] = (
            total['TransactionAmt'] - total['TransactionAmt_lag1_past'])
        feature_name_list.extend(["diff_TransactionAmt_lag1_past"])
        # current value / future value
        total['div_TransactionAmt_lag1_future'] = (
            total['TransactionAmt'] / total['TransactionAmt_lag1_future'])
        feature_name_list.extend(["div_TransactionAmt_lag1_future"])
        # current value / past value
        total['div_TransactionAmt_lag1_past'] = (
            total['TransactionAmt'] / total['TransactionAmt_lag1_past'])
        feature_name_list.extend(["div_TransactionAmt_lag1_past"])
        # slope: amount change per hour between neighbouring transactions
        total['slope_TransactionAmt_lag1_future'] = (
            total['TransactionAmt'] - total['TransactionAmt_lag1_future']
        ) / total['Diff_Time_To_lag1_future']
        total['slope_TransactionAmt_lag1_past'] = (
            total['TransactionAmt'] - total['TransactionAmt_lag1_past']
        ) / total['Diff_Time_To_lag1_past']
        feature_name_list.extend([
            "slope_TransactionAmt_lag1_future",
            "slope_TransactionAmt_lag1_past",
        ])
        # per-user mean amount vs. the lag values
        grp_Amt = total.groupby(
            'predicted_user_id')['TransactionAmt'].mean().reset_index()
        grp_Amt.columns = ['predicted_user_id', 'groupby_TransactionAmt']
        total = total.merge(grp_Amt, how='left', on='predicted_user_id')
        total['diff_grp_TransactionAmt_lag1_future'] = (
            total['groupby_TransactionAmt'] - total['TransactionAmt_lag1_future'])
        total['diff_grp_TransactionAmt_lag1_past'] = (
            total['groupby_TransactionAmt'] - total['TransactionAmt_lag1_past'])
        feature_name_list.extend([
            "diff_grp_TransactionAmt_lag1_future",
            "diff_grp_TransactionAmt_lag1_past",
        ])

    with timer("end"):
        train_result = total.iloc[:len(train)].reset_index(drop=True)
        test_result = total.iloc[len(train):].reset_index(drop=True)
        for fe in feature_name_list:
            self.train_feature[fe] = train_result[fe]
            self.test_feature[fe] = test_result[fe]
        self.train_feature.reset_index(drop=True, inplace=True)
        self.test_feature.reset_index(drop=True, inplace=True)
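
# A tiny illustration (hypothetical data) of the shift-based lag pattern used
# above: within each user, shift(1) looks at the previous transaction and
# shift(-1) at the next one; NaN marks a user's first/last record.
import pandas as pd

df = pd.DataFrame({
    'predicted_user_id': [1, 1, 1, 2, 2],
    'TransactionAmt': [10.0, 25.0, 5.0, 80.0, 90.0],
})
df['amt_past'] = df.groupby('predicted_user_id')['TransactionAmt'].shift(1)
df['amt_future'] = df.groupby('predicted_user_id')['TransactionAmt'].shift(-1)
# user 1, row 0: amt_past = NaN, amt_future = 25.0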
def create_features(self): with timer("load data"): train = read_preprocessing_data(DATA_DIR, "train", write_mode=False) test = read_preprocessing_data(DATA_DIR, "test", write_mode=False) with timer("concat os and browser info"): os_info = load_os_release_date() os_info.sort_values(["os_type", "os_release_date"], inplace=True) os_info["os_release_date_next_ver"] = os_info.groupby( "os_type")["os_release_date"].shift(-1).fillna("2019-09-01") train = pd.merge(train, os_info, how="left", left_on="id_30", right_on="os_name").drop(columns="id_30") test = pd.merge(test, os_info, how="left", left_on="id_30", right_on="os_name").drop(columns="id_30") browser_info = load_browser_release_date() browser_info.sort_values(["browser_type", "browser_release_date"], inplace=True) browser_info[ "browser_release_date_next_ver"] = browser_info.groupby( "browser_type")["browser_release_date"].shift(-1).fillna( "2019-09-01") train = pd.merge(train, browser_info, how="left", left_on="id_31", right_on="browser_name").drop(columns="id_31") test = pd.merge(test, browser_info, how="left", left_on="id_31", right_on="browser_name").drop(columns="id_31") total = train.append(test).reset_index(drop=True) feature_name_list = [] with timer("convert object-type to datetime-type"): total["TransactionDT"] = total["TransactionDT"].apply( lambda x: datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S")) total["os_release_date"] = total["os_release_date"].apply( lambda x: datetime.datetime.strptime(x, "%Y-%m-%d") if x == x else x) total["browser_release_date"] = total[ "browser_release_date"].apply( lambda x: datetime.datetime.strptime(x, "%Y-%m-%d") if x == x else x) total = get_new_browser(total) with timer("make features: elapsed times from release date"): total["elapsed_days_from_os_release"] = ( total["TransactionDT"] - total["os_release_date"]).apply(lambda x: x.days) total["elapsed_days_from_browser_release"] = ( total["TransactionDT"] - total["browser_release_date"]).apply(lambda x: x.days) feature_name_list.extend([ "elapsed_days_from_os_release", "elapsed_days_from_browser_release" ]) with timer("make features: elapsed times from new-version"): total = pd.merge( total, browser_info[["browser_name", "browser_release_date"]].rename( columns={ "browser_name": "new_browser_name", "browser_release_date": "new_browser_release_date" }), how="left", on="new_browser_name") total["latest_browser"] = np.nan total.loc[total["browser_name"].notnull(), "latest_browser"] = ( total.loc[total["browser_name"].notnull(), "browser_name"] == total.loc[total["browser_name"].notnull(), "new_browser_name"]).astype(int) total["new_browser_release_date"] = total[ "new_browser_release_date"].apply( lambda x: datetime.datetime.strptime(x, "%Y-%m-%d") if x == x else x) total["elapsed_days_from_new_browser_release"] = ( total["TransactionDT"] - total["new_browser_release_date"]).apply(lambda x: x.days) total.loc[total["latest_browser"] == 1, "elapsed_days_from_new_browser_release"] = 0 total["elapsed_days_from_new_browser_release_v2"] = total[ "elapsed_days_from_new_browser_release"] + total[ "elapsed_days_from_browser_release"] total.loc[total["latest_browser"] == 1, "elapsed_days_from_new_browser_release_v2"] = 0 feature_name_list.extend([ "elapsed_days_from_new_browser_release", "latest_browser", "elapsed_days_from_new_browser_release_v2" ]) with timer("end"): train_result = total.iloc[:len(train)].reset_index(drop=True) test_result = total.iloc[len(train):].reset_index(drop=True) for fe in feature_name_list: self.train_feature[fe] = 
train_result[fe] self.test_feature[fe] = test_result[fe] self.train_feature.reset_index(drop=True, inplace=True) self.test_feature.reset_index(drop=True, inplace=True)
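
# load_os_release_date() / load_browser_release_date() are repo helpers; a
# hedged sketch of the table shape the merges above imply (the rows here are
# illustrative only, not the repo's actual data): *_name matches the raw
# id_30 / id_31 strings, *_type groups versions of one product so the
# shift(-1) above can find the next version's release date.
import pandas as pd


def load_browser_release_date():
    return pd.DataFrame({
        'browser_name': ['chrome 70.0', 'chrome 71.0'],        # hypothetical rows
        'browser_type': ['chrome', 'chrome'],
        'browser_release_date': ['2018-10-16', '2018-12-04'],
    })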
def create_features(self): with timer("load data"): train = read_preprocessing_data(DATA_DIR, "train", write_mode=False) test = read_preprocessing_data(DATA_DIR, "test", write_mode=False) with timer("get numeric features"): numeric_cols = get_numeric_cols() self.train_feature[numeric_cols] = train[numeric_cols] self.test_feature[numeric_cols] = test[numeric_cols] with timer( "make features: V features related to TransactionAmt + TransactionAmt" ): v_cols_related_to_amt = get_V_cols_related_to_Amt() for col in v_cols_related_to_amt: new_fe_col_name = col + "_add_Amt" self.train_feature[ new_fe_col_name] = train[col] + train["TransactionAmt"] self.test_feature[ new_fe_col_name] = test[col] + test["TransactionAmt"] with timer("make features: TransactionAmt * CXX"): c_cols = ['C1', 'C13', 'C14'] for col in c_cols: new_fe_col_name = col + "_mul_Amt" self.train_feature[ new_fe_col_name] = train[col] * train['TransactionAmt'] self.test_feature[ new_fe_col_name] = test[col] * test['TransactionAmt'] with timer("numeric feature processings"): self.train_feature["day_of_week"] = train["TransactionDT"].apply( lambda x: datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S" ).weekday()) self.test_feature["day_of_week"] = test["TransactionDT"].apply( lambda x: datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S" ).weekday()) self.train_feature['TransactionAmt_decimal'] = ( (train['TransactionAmt'] - train['TransactionAmt'].astype(int)) * 1000).astype(int) self.test_feature['TransactionAmt_decimal'] = ( (test['TransactionAmt'] - test['TransactionAmt'].astype(int)) * 1000).astype(int) with timer("agg V features"): # V1 ~ V11 vcol_names = [f'V{i}' for i in range(1, 12)] self.train_feature['sum_V1_V11'] = train[vcol_names].sum(axis=1) self.test_feature['sum_V1_V11'] = test[vcol_names].sum(axis=1) self.train_feature['null_sum_V1_V11'] = train[vcol_names].isnull( ).sum(axis=1) self.test_feature['null_sum_V1_V11'] = test[vcol_names].isnull( ).sum(axis=1) # V12 ~ V34 vcol_names = [f'V{i}' for i in range(12, 35)] self.train_feature['sum_V12_V34'] = train[vcol_names].sum(axis=1) self.test_feature['sum_V12_V34'] = test[vcol_names].sum(axis=1) self.train_feature['null_sum_V12_V34'] = train[vcol_names].isnull( ).sum(axis=1) self.test_feature['null_sum_V12_V34'] = test[vcol_names].isnull( ).sum(axis=1) # V35 ~ V52 vcol_names = [f'V{i}' for i in range(35, 53)] self.train_feature['sum_V35_V52'] = train[vcol_names].sum(axis=1) self.test_feature['sum_V35_V52'] = test[vcol_names].sum(axis=1) self.train_feature['null_sum_V35_V52'] = train[vcol_names].isnull( ).sum(axis=1) self.test_feature['null_sum_V35_V52'] = test[vcol_names].isnull( ).sum(axis=1) # V53 ~ V74 vcol_names = [f'V{i}' for i in range(53, 75)] self.train_feature['sum_V53_V74'] = train[vcol_names].sum(axis=1) self.test_feature['sum_V53_V74'] = test[vcol_names].sum(axis=1) self.train_feature['null_sum_V53_V74'] = train[vcol_names].isnull( ).sum(axis=1) self.test_feature['null_sum_V53_V74'] = test[vcol_names].isnull( ).sum(axis=1) # V75 ~ V94 vcol_names = [f'V{i}' for i in range(75, 95)] self.train_feature['sum_V75_V94'] = train[vcol_names].sum(axis=1) self.test_feature['sum_V75_V94'] = test[vcol_names].sum(axis=1) self.train_feature['null_sum_V75_V94'] = train[vcol_names].isnull( ).sum(axis=1) self.test_feature['null_sum_V75_V94'] = test[vcol_names].isnull( ).sum(axis=1) # V95 ~ V125 vcol_names = [f'V{i}' for i in range(95, 126)] self.train_feature['sum_V95_V125'] = train[vcol_names].sum(axis=1) self.test_feature['sum_V95_V125'] = test[vcol_names].sum(axis=1) 
self.train_feature['null_sum_V95_V125'] = train[vcol_names].isnull( ).sum(axis=1) self.test_feature['null_sum_V95_V125'] = test[vcol_names].isnull( ).sum(axis=1) # V138 ~ V166 vcol_names = [f'V{i}' for i in range(138, 167)] self.train_feature['null_sum_V138_V166'] = train[ vcol_names].isnull().sum(axis=1) self.test_feature['null_sum_V138_V166'] = test[vcol_names].isnull( ).sum(axis=1) # V167 ~ V216 vcol_names = [f'V{i}' for i in range(167, 217)] self.train_feature['null_sum_V167_V216'] = train[ vcol_names].isnull().sum(axis=1) self.test_feature['null_sum_V167_V216'] = test[vcol_names].isnull( ).sum(axis=1) # V217 ~ V278 vcol_names = [f'V{i}' for i in range(217, 279)] self.train_feature['null_sum_V217_V278'] = train[ vcol_names].isnull().sum(axis=1) self.test_feature['null_sum_V217_V278'] = test[vcol_names].isnull( ).sum(axis=1) # V279 ~ V321 vcol_names = [f'V{i}' for i in range(279, 322)] self.train_feature['null_sum_V279_V321'] = train[ vcol_names].isnull().sum(axis=1) self.test_feature['null_sum_V279_V321'] = test[vcol_names].isnull( ).sum(axis=1) # V322 ~ V339 vcol_names = [f'V{i}' for i in range(322, 340)] self.train_feature['null_sum_V322_V339'] = train[ vcol_names].isnull().sum(axis=1) self.test_feature['null_sum_V322_V339'] = test[vcol_names].isnull( ).sum(axis=1) with timer("end"): self.train_feature.reset_index(drop=True, inplace=True) self.test_feature.reset_index(drop=True, inplace=True)
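
# The per-block statements in "agg V features" above are identical up to the
# column range; an equivalent, more compact way to generate the same columns
# (the generated names match the originals, e.g. 'sum_V1_V11'):
for lo, hi in [(1, 11), (12, 34), (35, 52), (53, 74), (75, 94), (95, 125)]:
    vcol_names = [f'V{i}' for i in range(lo, hi + 1)]
    self.train_feature[f'sum_V{lo}_V{hi}'] = train[vcol_names].sum(axis=1)
    self.test_feature[f'sum_V{lo}_V{hi}'] = test[vcol_names].sum(axis=1)
    self.train_feature[f'null_sum_V{lo}_V{hi}'] = train[vcol_names].isnull().sum(axis=1)
    self.test_feature[f'null_sum_V{lo}_V{hi}'] = test[vcol_names].isnull().sum(axis=1)
# these blocks get only the null-count feature, not the row sum
for lo, hi in [(138, 166), (167, 216), (217, 278), (279, 321), (322, 339)]:
    vcol_names = [f'V{i}' for i in range(lo, hi + 1)]
    self.train_feature[f'null_sum_V{lo}_V{hi}'] = train[vcol_names].isnull().sum(axis=1)
    self.test_feature[f'null_sum_V{lo}_V{hi}'] = test[vcol_names].isnull().sum(axis=1)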