import copy

import lightgbm as lgb
import pandas as pd

# Repo-local helpers used below (Config, Log, transform, select_features,
# shift_columns, ts_split) are assumed to be defined or imported elsewhere
# in this module.


def feature_selection(df: pd.DataFrame, config: Config):
    if config.is_train():
        # Only run the (expensive) selection step on large datasets.
        df_size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
        if df_size_mb < 2 * 1024:
            return

        selected_columns = []
        config_sample = copy.deepcopy(config)
        config.limit_time_fraction(0.1)

        # Select features on small random samples until the time budget for
        # this step is exhausted or no unselected columns remain.
        for i in range(20):
            if config.is_time_fraction_limit():
                break

            df_sample = df.sample(min(3000, len(df)), random_state=i).copy()
            transform(df_sample, config_sample)
            y = df_sample["target"]
            X = df_sample.drop("target", axis=1)

            if len(selected_columns) > 0:
                X = X.drop(selected_columns, axis=1)

            if len(X.columns) > 0:
                selected_columns += select_features(X, y, config["mode"])
            else:
                break

        Log.print("Selected columns: {}".format(selected_columns))

        # Record numeric columns that were never selected.
        drop_number_columns = [c for c in df if c.startswith("number_") and c not in selected_columns]
        if len(drop_number_columns) > 0:
            config["drop_number_columns"] = drop_number_columns

        # Map selected "datetime_<name>_<part>" features back to their source columns.
        config["date_columns"] = {}
        for c in [c for c in selected_columns if c.startswith("datetime_")]:
            d = c.split("_")
            date_col = d[0] + "_" + d[1]
            date_part = d[2]

            if date_col not in config["date_columns"]:
                config["date_columns"][date_col] = []

            config["date_columns"][date_col].append(date_part)

        # Record datetime columns with no selected parts.
        drop_datetime_columns = [c for c in df if c.startswith("datetime_") and c not in config["date_columns"]]
        if len(drop_datetime_columns) > 0:
            config["drop_datetime_columns"] = drop_datetime_columns

    # Applied on both train and predict, using the drops recorded in the config.
    if "drop_number_columns" in config:
        Log.print("Drop number columns: {}".format(config["drop_number_columns"]))
        df.drop(config["drop_number_columns"], axis=1, inplace=True)

    if "drop_datetime_columns" in config:
        Log.print("Drop datetime columns: {}".format(config["drop_datetime_columns"]))
        df.drop(config["drop_datetime_columns"], axis=1, inplace=True)
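# Illustrative sketch (not part of the original pipeline): how the
# "datetime_<name>_<part>" naming convention used by feature_selection maps
# selected part columns back to their source datetime columns. The helper
# name _demo_date_columns and the sample column names are hypothetical.
def _demo_date_columns(selected_columns):
    date_columns = {}
    for c in (c for c in selected_columns if c.startswith("datetime_")):
        d = c.split("_")
        date_columns.setdefault(d[0] + "_" + d[1], []).append(d[2])
    return date_columns

# Example: _demo_date_columns(["datetime_1_year", "datetime_1_month", "number_5"])
# returns {"datetime_1": ["year", "month"]}.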
def time_series_detect(df: pd.DataFrame, config: Config):
    sample_size = 10000
    model_params = {
        "objective": "regression" if config["mode"] == "regression" else "binary",
        "metric": "rmse" if config["mode"] == "regression" else "auc",
        "learning_rate": 0.01,
        "verbosity": -1,
        "seed": 1,
        "max_depth": -1,
    }

    if config.is_train():
        datetime_columns = [c for c in df if c.startswith("datetime_")]
        id_columns = [c for c in df if c.startswith("id_")]

        # Candidate sort orders: each datetime column alone and each
        # (id, datetime) pair; the for/else clause runs once the loop
        # completes (there is no break) and adds each id column alone as a
        # further candidate.
        sort_columns = []
        for dc in datetime_columns:
            sort_columns.append([dc])

            for ic in id_columns:
                sort_columns.append([ic, dc])
        else:
            for ic in id_columns:
                sort_columns.append([ic])

        scores = []
        config.limit_time_fraction(0.1)

        for sc in sort_columns:
            if config.is_time_fraction_limit():
                break

            Log.silent(True)
            df.sort_values(sc, inplace=True)

            config_sample = copy.deepcopy(config)
            df_sample = df.iloc[-sample_size:].copy() if len(df) > sample_size else df.copy()
            df_sample = df_sample[[c for c in df_sample if c.startswith("number_") or c == "target" or c in sc]]
            shift_columns(df_sample, group=sc[0] if len(sc) > 1 else None)
            transform(df_sample, config_sample)

            y = df_sample["target"]
            X = df_sample.drop("target", axis=1)
            X_train, X_test, y_train, y_test = ts_split(X, y, test_size=0.5)

            # Score the same split with and without the lagged (*_shift) features.
            model_sorted = lgb.train(model_params, lgb.Dataset(X_train, label=y_train), 3000,
                                     lgb.Dataset(X_test, label=y_test),
                                     early_stopping_rounds=100, verbose_eval=False)
            score_sorted = model_sorted.best_score["valid_0"][model_params["metric"]]

            sampled_columns = [c for c in X if "_shift" not in c]
            model_sampled = lgb.train(model_params, lgb.Dataset(X_train[sampled_columns], label=y_train), 3000,
                                      lgb.Dataset(X_test[sampled_columns], label=y_test),
                                      early_stopping_rounds=100, verbose_eval=False)
            score_sampled = model_sampled.best_score["valid_0"][model_params["metric"]]

            # For classification the metric (AUC) is maximized; negate both
            # scores so that lower is always better.
            if config.is_classification():
                score_sorted = -score_sorted
                score_sampled = -score_sampled

            Log.silent(False)
            Log.print("Sort: {}. Score sorted: {:0.4f}. Score sampled: {:0.4f}".format(sc, score_sorted, score_sampled))

            # Keep this sort order if the shift features improve the score by at least 3%.
            score_ratio = score_sampled / score_sorted if config.is_regression() else abs(score_sorted / score_sampled)
            if score_ratio >= 1.03:
                Log.print(score_ratio)
                scores.append((score_sorted, sc))

        if len(scores) > 0:
            scores = sorted(scores, key=lambda x: x[0])
            Log.print("Scores: {}".format(scores))

            # Use the best-scoring sort order and refit on a recent sample to
            # decide which shifted columns are worth keeping.
            config["sort_values"] = scores[0][1]
            df.sort_values(config["sort_values"], inplace=True)

            config_sample = copy.deepcopy(config)
            df_sample = df.iloc[-sample_size:].copy() if len(df) > sample_size else df.copy()
            shift_columns(df_sample, group=config["sort_values"][0] if len(config["sort_values"]) > 1 else None)
            transform(df_sample, config_sample)

            y = df_sample["target"]
            X = df_sample.drop("target", axis=1)

            # Keep only the most important features (top quartile by gain).
            model = lgb.train(model_params, lgb.Dataset(X, label=y), 1000)
            fi = pd.Series(model.feature_importance(importance_type="gain"), index=X.columns)
            fi = fi[fi > 0].sort_values()
            selected_columns = fi[fi >= fi.quantile(0.75)].index.tolist()

            selected_shift_columns = [c.replace("_shift", "") for c in selected_columns if "_shift" in c]
            if len(selected_shift_columns) > 0:
                Log.print("Shift columns: {}".format(selected_shift_columns))
                config["shift_columns"] = selected_shift_columns

    # Applied on both train and predict, using the columns recorded in the config.
    if "shift_columns" in config:
        shift_columns(df, group=config["sort_values"][0] if len(config["sort_values"]) > 1 else None,
                      number_columns=config["shift_columns"])
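# Minimal sketch (an assumption, not the repo's shift_columns implementation):
# the kind of lag feature time_series_detect relies on -- each "number_*"
# column shifted by one row, optionally within a group column, stored under a
# "*_shift" suffix. The helper name _demo_shift is hypothetical.
def _demo_shift(df_demo: pd.DataFrame, group=None):
    number_columns = [c for c in df_demo if c.startswith("number_")]
    shifted = (df_demo.groupby(group)[number_columns].shift(1)
               if group is not None else df_demo[number_columns].shift(1))
    for c in number_columns:
        df_demo[c + "_shift"] = shifted[c]
    return df_demo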