Beispiel #1
0
def feature_selection(df: pd.DataFrame, config: Config):
    """Prune ``number_``/``id_``/``datetime_`` columns of ``df`` in place.

    Training mode (``config.is_train()``): a frame whose deep memory usage is
    below 2048 MB is left alone and the function returns immediately.
    Otherwise up to ten samples of at most 1000 rows are run through
    ``preprocess_pipeline`` and handed to ``select_features``; columns chosen
    in an earlier round are removed from the features of later rounds.  The
    outcome is written to ``config`` under ``drop_number_columns``,
    ``date_columns`` and ``drop_datetime_columns``.

    Any mode: columns listed in ``config`` under ``drop_number_columns`` and
    ``drop_datetime_columns`` are dropped from ``df``.

    Args:
        df: Frame to prune; modified in place.
        config: Run configuration; read and, in training mode, updated.

    Returns:
        None.
    """
    if config.is_train():
        total_bytes = df.memory_usage(deep=True).sum()
        if total_bytes / 1024 / 1024 < 2 * 1024:
            return

        chosen = []
        # A single deep copy of the config is shared by every sampling round.
        sample_config = copy.deepcopy(config)
        for seed in range(10):
            n_rows = min(1000, len(df))
            sample = df.sample(n_rows, random_state=seed).copy(deep=True)
            preprocess_pipeline(sample, sample_config)
            target = sample["target"]
            features = sample.drop("target", axis=1)

            # Only look at columns that no earlier round has picked yet.
            if chosen:
                features = features.drop(chosen, axis=1)

            if len(features.columns) == 0:
                break
            chosen += select_features(features, target, config["mode"])

        log("Selected columns: {}".format(chosen))

        numeric_prefixes = ("number_", "id_")
        unused_numeric = [
            name for name in df
            if name.startswith(numeric_prefixes) and name not in chosen
        ]
        if unused_numeric:
            config["drop_number_columns"] = unused_numeric

        # Selected names are split on "_": the first two pieces identify the
        # date column, the third piece is recorded as the part to keep for it.
        config["date_columns"] = {}
        for name in chosen:
            if not name.startswith("datetime_"):
                continue
            pieces = name.split("_")
            date_col = "_".join(pieces[:2])
            date_part = pieces[2]
            config["date_columns"].setdefault(date_col, []).append(date_part)

        unused_datetime = [
            name for name in df
            if name.startswith("datetime_")
            and name not in config["date_columns"]
        ]
        if unused_datetime:
            config["drop_datetime_columns"] = unused_datetime

    # Apply whatever drop lists the config holds.
    drop_steps = (
        ("drop_number_columns", "Drop number columns: {}"),
        ("drop_datetime_columns", "Drop datetime columns: {}"),
    )
    for key, message in drop_steps:
        if key in config:
            log(message.format(config[key]))
            df.drop(config[key], axis=1, inplace=True)
Beispiel #2
0
def feature_selection(df: pd.DataFrame, config: Config):
    """Drop unselected ``number_``/``id_``/``datetime_`` columns from ``df``.

    Training mode (``config.is_train()``): a frame whose deep memory usage is
    below 2048 MB is left untouched and the function returns immediately.
    Otherwise up to three 5% samples are run through ``preprocess_pipeline``
    and ``select_features``; sampling stops early once the index plus the
    columns selected so far would occupy less than 2048 MB.  Columns of ``df``
    that start with ``number_``/``id_`` or ``datetime_`` and were not selected
    are stored in ``config`` under ``drop_number_columns`` and
    ``drop_datetime_columns``.

    Any mode: columns listed under those two config keys are dropped from
    ``df``.

    Args:
        df: Frame to prune; modified in place.
        config: Run configuration; read and, in training mode, updated.

    Returns:
        None.
    """
    if config.is_train():
        size_limit_mb = 2 * 1024

        # Measure every column once.  ``df`` is not modified before the drop
        # section at the end, so these figures stay valid for the whole loop
        # and no copy of the (large) frame is needed to size a column subset.
        column_bytes = df.memory_usage(index=False, deep=True)
        index_bytes = df.index.memory_usage(deep=True)

        df_size_mb = (index_bytes + column_bytes.sum()) / 1024 / 1024
        if df_size_mb < size_limit_mb:
            return

        selected_columns = []
        for seed in range(3):
            # Each round preprocesses its sample against a fresh deep copy of
            # the config, so the caller's config is not passed to the pipeline.
            config_sample = copy.deepcopy(config)

            df_sample = df.sample(frac=0.05, random_state=seed).copy(deep=True)
            df_sample = preprocess_pipeline(df_sample, config_sample)
            y = df_sample["target"]
            X = df_sample.drop("target", axis=1)

            if len(X.columns) == 0:
                break
            selected_columns += select_features(X, y, config["mode"])

            # Size of ``df`` if only the selected columns were kept.  A
            # boolean mask is used so duplicate column labels are counted
            # exactly once per physical column.
            kept_mask = df.columns.isin(selected_columns)
            kept_bytes = index_bytes + column_bytes[kept_mask].sum()
            if kept_bytes / 1024 / 1024 < size_limit_mb:
                break

        # De-duplicate while keeping first-seen order so the log is stable.
        selected_columns = list(dict.fromkeys(selected_columns))
        selected_lookup = set(selected_columns)

        log("Selected columns: {}".format(selected_columns))

        drop_number_columns = [
            c for c in df
            if c.startswith(("number_", "id_")) and c not in selected_lookup
        ]
        if drop_number_columns:
            config["drop_number_columns"] = drop_number_columns

        drop_datetime_columns = [
            c for c in df
            if c.startswith("datetime_") and c not in selected_lookup
        ]
        if drop_datetime_columns:
            config["drop_datetime_columns"] = drop_datetime_columns

    if "drop_number_columns" in config:
        log("Drop number columns: {}".format(config["drop_number_columns"]))
        df.drop(config["drop_number_columns"], axis=1, inplace=True)

    if "drop_datetime_columns" in config:
        log("Drop datetime columns: {}".format(
            config["drop_datetime_columns"]))
        df.drop(config["drop_datetime_columns"], axis=1, inplace=True)