Example #1
# Requires hyperopt, lightgbm, numpy and pandas; Config, train_lightgbm_cv,
# save_model and log are project-local helpers assumed to be in scope.
import copy
import gc
import time

import hyperopt
import lightgbm as lgb
import numpy as np
import pandas as pd
from hyperopt import STATUS_FAIL, STATUS_OK, Trials, hp, tpe


def train_lightgbm(X: pd.DataFrame, y: pd.Series, stored_models_key: str,
                   save_to_disk: bool, config: Config):
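    """Tune LightGBM with hyperopt's TPE sampler under the time budget in config.

    Every configuration that finishes within the budget is persisted via
    save_model and accumulated under config[stored_models_key].
    """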

    config[stored_models_key] = []

    # Construct the full Dataset once; free_raw_data=False keeps the raw
    # matrix in memory so CV subsets can be built from it later.
    data = lgb.Dataset(X, label=y, free_raw_data=False)
    data.construct()
    gc.collect()

    params = {
        "objective": config["objective"],
        "metric": config["metric"],
        "seed": config["seed"],
        "num_threads": config["n_threads"],
        "verbosity": -1,
    }

    seed = config["seed"]

    # hyperopt search space over model and CV parameters.
    space = {
        "learning_rate": hp.uniform("learning_rate", 0.01, 0.4),
        "max_depth": hp.choice("max_depth", [-1, 2, 3, 4, 5, 6, 10]),
        "num_leaves": hp.choice("num_leaves", np.linspace(4, 200, 50, dtype=int)),
        "feature_fraction": hp.quniform("feature_fraction", 0.1, 1., 0.1),
        "bagging_fraction": hp.quniform("bagging_fraction", 0.1, 1., 0.1),
        "bagging_freq": hp.choice("bagging_freq", np.linspace(0, 20, 10, dtype=int)),
        "reg_alpha": hp.uniform("reg_alpha", 0, 30),
        "reg_lambda": hp.uniform("reg_lambda", 0, 30),
        "min_child_weight": hp.uniform("min_child_weight", 1e-10, 20),
        "max_bin": hp.choice("max_bin", [50, 100, 255]),
        "boosting_type": hp.choice("boosting_type", [
            {"boosting_type": "gbdt"},
            {
                "boosting_type": "dart",
                "drop_rate": hp.uniform("drop_rate", 0.01, 0.6),
                "max_drop": hp.choice("max_drop",
                                      np.linspace(5, config["train_num_boost_round"] * .9,
                                                  10, dtype=int)),
                "skip_drop": hp.uniform("skip_drop", 0.1, 0.7),
            },
            # {"boosting_type": "rf", "bagging_freq": 1},
            # {"boosting_type": "goss", "bagging_freq": 0},
        ]),
        # train params
        "early_stopping_rounds": hp.choice("early_stopping_rounds", [None, 50]),
        "cv_splits": hp.choice("cv_splits", np.linspace(3, 12, 10, dtype=int)),
        "shuffle": hp.choice("shuffle", [True, False]),
    }

    if config.is_classification():
        space["scale_pos_weight"] = hp.uniform("scale_pos_weight", 0.5, 10)
    else:
        space["objective"] = hp.choice("objective", [
            "regression",
            "huber",
            # "fair",
            # "regression_l1",
        ])

    def objective(space_sample):
        iteration_start = time.time()

        # Flatten the sampled boosting_type sub-dict into the hyperparameters;
        # the fixed base params are merged in with the lowest priority.
        hyperparams = copy.deepcopy(space_sample)
        boosting_type = hyperparams.pop('boosting_type', {})
        hyperparams = {**params, **hyperparams, **boosting_type}

        scores, models, y_oof = train_lightgbm_cv(data=data,
                                                  hyperparams=hyperparams,
                                                  config=config)

        # hyperopt minimizes the loss, so negate score-like classification
        # metrics (e.g. AUC) to make lower-is-better uniform.
        if config.is_classification():
            scores['oof'] = -scores['oof']

        iteration_time = time.time() - iteration_start
        log('iteration time %.1f, loss %.5f' % (iteration_time, scores['oof']))

        # Persist the models only if at least ~25s of the budget remains.
        elapsed_time = time.time() - config['start_time']
        have_time = (config["time_limit"] - elapsed_time - iteration_time) > 25
        if have_time:
            save_model(models, hyperparams, scores, y_oof, stored_models_key,
                       save_to_disk, config)
            status = STATUS_OK
        else:
            status = STATUS_FAIL

        return {
            'loss': scores['oof'],
            'runtime': iteration_time,
            'scores': scores,
            'models': models,
            'y_oof': y_oof,
            'status': status
        }

    have_time = True
    eval_n = 0
    trials = Trials()

    # Run one TPE evaluation per loop so the time budget can be re-checked
    # between iterations.
    while have_time:
        iteration_start = time.time()
        best = hyperopt.fmin(
            fn=objective,
            space=space,
            trials=trials,
            algo=tpe.suggest,
            max_evals=eval_n + 1,
            verbose=1,
            # TODO (bug): with one fixed seed, fmin can keep sampling the same
            # values forever, hence the seed varies with eval_n.
            rstate=np.random.RandomState(eval_n))
        iteration_time = time.time() - iteration_start
        elapsed_time = time.time() - config['start_time']
        have_time = (config["time_limit"] - elapsed_time - iteration_time) > 25
        eval_n += 1
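
A minimal sketch of how this function might be driven. The Config values below are hypothetical; Config is a project-local dict-like class, and only the keys train_lightgbm actually reads are filled in:

# Hypothetical usage sketch -- Config construction is assumed, not shown on
# this page; only the keys train_lightgbm reads are provided.
config = Config()
config["objective"] = "binary"
config["metric"] = "auc"
config["seed"] = 1
config["n_threads"] = 4
config["train_num_boost_round"] = 500
config["time_limit"] = 300            # seconds for the whole search
config["start_time"] = time.time()

train_lightgbm(X, y, stored_models_key="models_lgb", save_to_disk=False,
               config=config)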
Example #2
# Requires lightgbm and pandas; Config, Log, shift_columns, transform and
# ts_split are project-local helpers assumed to be in scope.
import copy

import lightgbm as lgb
import pandas as pd


def time_series_detect(df: pd.DataFrame, config: Config):
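    """Detect time-series structure by probing candidate sort orders.

    For each candidate order, a model trained with shifted (lag) features on
    the sorted data is compared against one without them; a clear win means
    the data likely has temporal structure, and the best order plus the
    useful lag columns are stored in config.
    """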
    sample_size = 10000
    model_params = {
        "objective": "regression" if config["mode"] == "regression" else "binary",
        "metric": "rmse" if config["mode"] == "regression" else "auc",
        "learning_rate": 0.01,
        "verbosity": -1,
        "seed": 1,
        "max_depth": -1,
    }

    if config.is_train():
        datetime_columns = [c for c in df if c.startswith("datetime_")]
        id_columns = [c for c in df if c.startswith("id_")]

        # Candidate sort orders: each datetime column on its own, each
        # (id, datetime) pair, and each id column on its own.
        sort_columns = []
        for dc in datetime_columns:
            sort_columns.append([dc])
            for ic in id_columns:
                sort_columns.append([ic, dc])
        for ic in id_columns:
            sort_columns.append([ic])

        scores = []
        config.limit_time_fraction(0.1)
        for sc in sort_columns:
            if config.is_time_fraction_limit():
                break

            Log.silent(True)
            df.sort_values(sc, inplace=True)

            config_sample = copy.deepcopy(config)
            df_sample = df.iloc[-sample_size:].copy() if len(df) > sample_size else df.copy()
            # Keep numeric features, the target, and the current sort columns.
            df_sample = df_sample[[c for c in df_sample if c.startswith("number_") or c == "target" or c in sc]]
            # Group lags by the id column when sorting by an (id, datetime) pair.
            shift_columns(df_sample, group=sc[0] if len(sc) > 1 else None)
            transform(df_sample, config_sample)

            y = df_sample["target"]
            X = df_sample.drop("target", axis=1)
            X_train, X_test, y_train, y_test = ts_split(X, y, test_size=0.5)

            model_sorted = lgb.train(model_params, lgb.Dataset(X_train, label=y_train),
                                     num_boost_round=3000,
                                     valid_sets=[lgb.Dataset(X_test, label=y_test)],
                                     early_stopping_rounds=100, verbose_eval=False)
            score_sorted = model_sorted.best_score["valid_0"][model_params["metric"]]

            # Baseline: the same model without the shifted (lag) features.
            sampled_columns = [c for c in X if "_shift" not in c]
            model_sampled = lgb.train(model_params, lgb.Dataset(X_train[sampled_columns], label=y_train),
                                      num_boost_round=3000,
                                      valid_sets=[lgb.Dataset(X_test[sampled_columns], label=y_test)],
                                      early_stopping_rounds=100, verbose_eval=False)
            score_sampled = model_sampled.best_score["valid_0"][model_params["metric"]]

            if config.is_classification():
                # AUC is maximized, so negate to make lower-is-better uniform.
                score_sorted = -score_sorted
                score_sampled = -score_sampled

            Log.silent(False)
            Log.print("Sort: {}. Score sorted: {:0.4f}. Score sampled: {:0.4f}".format(sc, score_sorted, score_sampled))
            # Keep this sort order only if lag features beat the baseline by >= 3%.
            score_ratio = score_sampled / score_sorted if config.is_regression() else abs(score_sorted / score_sampled)
            if score_ratio >= 1.03:
                Log.print("Score ratio: {:0.4f}".format(score_ratio))
                scores.append((score_sorted, sc))

        if len(scores) > 0:
            # Use the sort order with the best (lowest) score.
            scores = sorted(scores, key=lambda x: x[0])
            Log.print("Scores: {}".format(scores))
            config["sort_values"] = scores[0][1]
            df.sort_values(config["sort_values"], inplace=True)

            config_sample = copy.deepcopy(config)
            df_sample = df.iloc[-sample_size:].copy() if len(df) > sample_size else df.copy()
            shift_columns(df_sample, group=config["sort_values"][0] if len(config["sort_values"]) > 1 else None)
            transform(df_sample, config_sample)

            y = df_sample["target"]
            X = df_sample.drop("target", axis=1)

            # Rank features by gain and keep the top quartile.
            model = lgb.train(model_params, lgb.Dataset(X, label=y), num_boost_round=1000)
            fi = pd.Series(model.feature_importance(importance_type="gain"), index=X.columns)
            fi = fi[fi > 0].sort_values()
            selected_columns = fi[fi >= fi.quantile(0.75)].index.tolist()

            selected_shift_columns = [c.replace("_shift", "") for c in selected_columns if "_shift" in c]
            if len(selected_shift_columns) > 0:
                Log.print("Shift columns: {}".format(selected_shift_columns))
                config["shift_columns"] = selected_shift_columns

    if "shift_columns" in config:
        shift_columns(df, group=config["sort_values"][0] if len(config["sort_values"]) > 1 else None, number_columns=config["shift_columns"])
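
The ts_split helper used above isn't defined on this page. Given how it's called (chronologically sorted data, a test_size fraction), a plausible implementation would be:

# A plausible ts_split sketch, assuming a plain chronological holdout: the
# last test_size fraction of rows (the most recent ones) becomes the test
# set, with no shuffling so the future never leaks into training.
def ts_split(X: pd.DataFrame, y: pd.Series, test_size: float = 0.5):
    split = int(len(X) * (1 - test_size))
    return X.iloc[:split], X.iloc[split:], y.iloc[:split], y.iloc[split:]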