Example #1
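These snippets appear to come from a tabular AutoML pipeline built on pandas, LightGBM and hyperopt. A plausible set of third-party imports for them is sketched below; the helpers Config, Log, transform, select_features, data_split, ts_split and shift_columns are project-specific and assumed to be defined elsewhere in the same codebase.

import copy
from typing import Dict

import numpy as np
import pandas as pd
import lightgbm as lgb
import hyperopt
from hyperopt import STATUS_OK, Trials, hp, space_eval, tpe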
def feature_selection(df: pd.DataFrame, config: Config):
    if config.is_train():
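        # Feature selection is only worth running for large frames (at least 2 GB in memory).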
        df_size_mb = df.memory_usage(deep=True).sum() / 1024 / 1024
        if df_size_mb < 2 * 1024:
            return

        selected_columns = []
        config_sample = copy.deepcopy(config)
        config.limit_time_fraction(0.1)
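        # Sample up to 3000 rows per iteration and accumulate selected columns until the 10% time budget runs out.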
        for i in range(20):
            if config.is_time_fraction_limit():
                break

            df_sample = df.sample(min(3000, len(df)), random_state=i).copy()
            transform(df_sample, config_sample)
            y = df_sample["target"]
            X = df_sample.drop("target", axis=1)

            if len(selected_columns) > 0:
                X = X.drop(selected_columns, axis=1)

            if len(X.columns) > 0:
                selected_columns += select_features(X, y, config["mode"])
            else:
                break

        Log.print("Selected columns: {}".format(selected_columns))

        drop_number_columns = [c for c in df if c.startswith("number_") and c not in selected_columns]
        if len(drop_number_columns) > 0:
            config["drop_number_columns"] = drop_number_columns

        config["date_columns"] = {}
        for c in [c for c in selected_columns if c.startswith("datetime_")]:
            d = c.split("_")
            date_col = d[0] + "_" + d[1]
            date_part = d[2]

            if date_col not in config["date_columns"]:
                config["date_columns"][date_col] = []

            config["date_columns"][date_col].append(date_part)

        drop_datetime_columns = [c for c in df if c.startswith("datetime_") and c not in config["date_columns"]]
        if len(drop_datetime_columns) > 0:
            config["drop_datetime_columns"] = drop_datetime_columns

    if "drop_number_columns" in config:
        Log.print("Drop number columns: {}".format(config["drop_number_columns"]))
        df.drop(config["drop_number_columns"], axis=1, inplace=True)

    if "drop_datetime_columns" in config:
        Log.print("Drop datetime columns: {}".format(config["drop_datetime_columns"]))
        df.drop(config["drop_datetime_columns"], axis=1, inplace=True)
Example #2
def hyperopt_lightgbm(X: pd.DataFrame, y: pd.Series, params: Dict, config: Config):
    X_train, X_val, y_train, y_val = data_split(X, y, config, test_size=0.5)
    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_val, label=y_val)

    space = {
        "learning_rate": hp.choice("learning_rate", np.arange(0.01, 0.05, 0.01)),
        "boost_from_average": hp.choice("boost_from_average", [True, False]),
        "is_unbalance": hp.choice("is_unbalance", [True, False]),
        "zero_as_missing": hp.choice("zero_as_missing", [True, False]),
        "max_depth": hp.choice("max_depth", [-1, 2, 3, 4, 5, 6, 7]),
        "num_leaves": hp.choice("num_leaves", [11, 31, 51, 101, 151, 201]),
        "feature_fraction": hp.choice("feature_fraction", np.arange(0.5, 1.0, 0.1)),
        "bagging_fraction": hp.choice("bagging_fraction", np.arange(0.5, 1.0, 0.1)),
        "bagging_freq": hp.choice("bagging_freq", [1, 3, 5, 10, 20, 50]),
        "reg_alpha": hp.uniform("reg_alpha", 0, 10),
        "reg_lambda": hp.uniform("reg_lambda", 0, 10),
        "min_child_weight": hp.uniform("min_child_weight", 0, 10),
    }

    # Give the hyperparameter search at most 15% of the remaining time budget.
    config.limit_time_fraction(0.15)

    def objective(hyperparams):
        # Out of the allotted time: return a pessimal loss so remaining trials are effectively skipped.
        if config.is_time_fraction_limit():
            score = np.inf if config.is_regression() else 0
            return {'loss': score, 'status': STATUS_OK}

        model = lgb.train({**params, **hyperparams}, train_data, 300, valid_data,
                          early_stopping_rounds=100, verbose_eval=False)

        score = model.best_score["valid_0"][params["metric"]]
        Log.print(score)
        if config.is_classification():
            score = -score

        return {'loss': score, 'status': STATUS_OK}

    trials = Trials()
    best = hyperopt.fmin(fn=objective, space=space, trials=trials, algo=tpe.suggest, max_evals=100, verbose=1,
                         rstate=np.random.RandomState(1))

    hyperparams = space_eval(space, best)
    Log.print("{:0.4f} {}".format(trials.best_trial['result']['loss'], hyperparams))
    return hyperparams
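The dictionary returned by hyperopt_lightgbm is presumably merged back into the base LightGBM parameters before the final model is fit. A minimal, hypothetical usage sketch (train_lightgbm and the "model" entry on config are illustrative names, not part of the listing):

def train_lightgbm(X: pd.DataFrame, y: pd.Series, config: Config):
    # Hypothetical wrapper: tune hyperparameters on a split, then refit on all data.
    params = {
        "objective": "regression" if config["mode"] == "regression" else "binary",
        "metric": "rmse" if config["mode"] == "regression" else "auc",
        "verbosity": -1,
        "seed": 1,
    }
    hyperparams = hyperopt_lightgbm(X, y, params, config)
    config["model"] = lgb.train({**params, **hyperparams}, lgb.Dataset(X, label=y), 600)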
Example #3
def time_series_detect(df: pd.DataFrame, config: Config):
    sample_size = 10000
    model_params = {
        "objective": "regression" if config["mode"] == "regression" else "binary",
        "metric": "rmse" if config["mode"] == "regression" else "auc",
        "learning_rate": 0.01,
        "verbosity": -1,
        "seed": 1,
        "max_depth": -1,
    }

    if config.is_train():
        datetime_columns = [c for c in df if c.startswith("datetime_")]
        id_columns = [c for c in df if c.startswith("id_")]

        # Candidate sort orders: each datetime column alone, each (id, datetime) pair,
        # and each id column on its own.
        sort_columns = []
        for dc in datetime_columns:
            sort_columns.append([dc])
            for ic in id_columns:
                sort_columns.append([ic, dc])
        for ic in id_columns:
            sort_columns.append([ic])

        scores = []
        config.limit_time_fraction(0.1)
        for sc in sort_columns:
            if config.is_time_fraction_limit():
                break

            Log.silent(True)
            df.sort_values(sc, inplace=True)

            config_sample = copy.deepcopy(config)
            df_sample = df.iloc[-sample_size:].copy() if len(df) > sample_size else df.copy()
            df_sample = df_sample[[c for c in df_sample if c.startswith("number_") or c == "target" or c in sc]]
            shift_columns(df_sample, group=sc[0] if len(sc) > 1 else None)
            transform(df_sample, config_sample)

            y = df_sample["target"]
            X = df_sample.drop("target", axis=1)
            X_train, X_test, y_train, y_test = ts_split(X, y, test_size=0.5)

            model_sorted = lgb.train(model_params, lgb.Dataset(X_train, label=y_train), 3000,
                                     lgb.Dataset(X_test, label=y_test),
                                     early_stopping_rounds=100, verbose_eval=False)
            score_sorted = model_sorted.best_score["valid_0"][model_params["metric"]]

            sampled_columns = [c for c in X if "_shift" not in c]
            model_sampled = lgb.train(model_params, lgb.Dataset(X_train[sampled_columns], label=y_train), 3000,
                                      lgb.Dataset(X_test[sampled_columns], label=y_test),
                                      early_stopping_rounds=100, verbose_eval=False)
            score_sampled = model_sampled.best_score["valid_0"][model_params["metric"]]

            if config.is_classification():
                score_sorted = -score_sorted
                score_sampled = -score_sampled

            Log.silent(False)
            Log.print("Sort: {}. Score sorted: {:0.4f}. Score sampled: {:0.4f}".format(sc, score_sorted, score_sampled))
            # Keep this sort order only if adding the lag features improves the metric by at least 3%.
            score_ratio = score_sampled / score_sorted if config.is_regression() else abs(score_sorted / score_sampled)
            if score_ratio >= 1.03:
                Log.print(score_ratio)
                scores.append((score_sorted, sc))

        if len(scores) > 0:
            scores = sorted(scores, key=lambda x: x[0])
            Log.print("Scores: {}".format(scores))
            config["sort_values"] = scores[0][1]
            df.sort_values(config["sort_values"], inplace=True)

            config_sample = copy.deepcopy(config)
            df_sample = df.iloc[-sample_size:].copy() if len(df) > sample_size else df.copy()
            shift_columns(df_sample, group=config["sort_values"][0] if len(config["sort_values"]) > 1 else None)
            transform(df_sample, config_sample)

            y = df_sample["target"]
            X = df_sample.drop("target", axis=1)

            model = lgb.train(model_params, lgb.Dataset(X, label=y), 1000)
            fi = pd.Series(model.feature_importance(importance_type="gain"), index=X.columns)
            fi = fi[fi > 0].sort_values()
            selected_columns = fi[fi >= fi.quantile(0.75)].index.tolist()

            selected_shift_columns = [c.replace("_shift", "") for c in selected_columns if "_shift" in c]
            if len(selected_shift_columns) > 0:
                Log.print("Shift columns: {}".format(selected_shift_columns))
                config["shift_columns"] = selected_shift_columns

    if "shift_columns" in config:
        shift_columns(df, group=config["sort_values"][0] if len(config["sort_values"]) > 1 else None, number_columns=config["shift_columns"])
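Example #3 relies on a shift_columns helper that is not shown here. A plausible minimal version, adding a one-step lag copy of each "number_" column (optionally shifted within an id group), might be:

def shift_columns(df: pd.DataFrame, group=None, number_columns=None):
    # Hypothetical sketch: add "<column>_shift" lag-1 copies so the model can see the
    # previous row's value; shift within each group when a grouping column is given.
    if number_columns is None:
        number_columns = [c for c in df if c.startswith("number_")]
    for c in number_columns:
        if group is not None:
            df[c + "_shift"] = df.groupby(group)[c].shift(1)
        else:
            df[c + "_shift"] = df[c].shift(1)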