Example #1
def tune_params(
    base_param: Dict,
    X: pd.DataFrame,
    y: pd.Series,
    cv: BaseCrossValidator,
    time_budget: Optional[int] = None,
) -> Dict:
    train_index, test_index = next(cv.split(X, y))

    dtrain = lgb.Dataset(X.iloc[train_index], y.iloc[train_index])
    dvalid = lgb.Dataset(X.iloc[test_index], y.iloc[test_index])

    params = copy.deepcopy(base_param)
    if "early_stopping_rounds" not in params:
        params["early_stopping_rounds"] = 100

    best_params, tuning_history = dict(), list()
    lightgbm_tuner.train(
        params,
        dtrain,
        valid_sets=[dvalid],
        verbose_eval=0,
        best_params=best_params,
        tuning_history=tuning_history,
        time_budget=time_budget,
    )

    result_param = copy.deepcopy(base_param)
    result_param.update(best_params)
    return result_param
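
A minimal usage sketch for tune_params, assuming the module-level imports used above (copy, pandas, lightgbm, and optuna's lightgbm_tuner integration with the best_params/tuning_history interface shown); the data and parameter values below are illustrative only.

# Hypothetical usage of tune_params (not part of the original source).
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.random((500, 10)), columns=[f"f{i}" for i in range(10)])
y = pd.Series(rng.random(500))

base_param = {
    "objective": "regression",  # assumed objective; adjust to the task
    "metric": "rmse",
    "verbosity": -1,
}

cv = KFold(n_splits=5, shuffle=True, random_state=0)

# tune_params uses only the first CV split and returns base_param merged
# with the best parameters found by the LightGBM tuner.
tuned = tune_params(base_param, X, y, cv, time_budget=600)
print(tuned)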
Example #2
def split_train_valid_data(
        args: ArgumentParser,
        splitter: BaseCrossValidator,
        data: Optional[pd.DataFrame] = None,
        nr_fold: int = 1) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split Data into Train and Valid"""
    label = args.col_enum_class
    image, kind = args.shared_indices
    image_quality = args.col_image_quality

    if data is None:
        data = pd.read_parquet(args.file_path_train_images_info)

    data = data.reset_index()

    if args.debug:
        data = data.iloc[:2000]

    data[label] = data[label].astype(np.int32)
    df = data.loc[(~data[image].duplicated(keep="first"))]
    for fold, (train_ind, valid_ind) in enumerate(
            splitter.split(X=df[label], y=df[image_quality], groups=df[image]),
            1):
        if nr_fold == fold:
            print(f"using fold {fold:02d} for train valid data split",
                  end="\r")
            break

    train_df = data.loc[data[image].isin(df[image].iloc[train_ind])]
    valid_df = data.loc[data[image].isin(df[image].iloc[valid_ind])]
    print(
        f"using fold {fold:02d} for train valid data split: {train_df.shape}, {valid_df.shape}"
    )
    return train_df, valid_df
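
The function above reads its column names from a parsed argument object. A hypothetical call sketch, with attribute names taken from the function body and purely illustrative values and data, might look like this:

# Hypothetical usage of split_train_valid_data (synthetic data, illustrative attribute values).
from argparse import Namespace
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold

rng = np.random.default_rng(0)
images_info_df = pd.DataFrame({
    "image": np.repeat([f"img_{i:03d}" for i in range(200)], 2),  # two rows per image id
    "kind": ["a", "b"] * 200,
    "label": rng.integers(0, 4, size=400),
    "image_quality": rng.integers(0, 5, size=400),
})

args = Namespace(
    col_enum_class="label",                                    # target column
    shared_indices=("image", "kind"),                          # (image id column, kind column)
    col_image_quality="image_quality",                         # column passed as y to the splitter
    file_path_train_images_info="train_images_info.parquet",   # only read when data is None
    debug=False,
)

splitter = GroupKFold(n_splits=5)
train_df, valid_df = split_train_valid_data(args, splitter, data=images_info_df, nr_fold=1)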
Example #3
def ml_cross_val_score(
        classifier: ClassifierMixin,
        X: pd.DataFrame,
        y: pd.Series,
        cv_gen: BaseCrossValidator,
        sample_weight_train: np.ndarray = None,
        sample_weight_score: np.ndarray = None,
        scoring: Callable[[np.array, np.array], float] = log_loss):
    # pylint: disable=invalid-name
    # pylint: disable=comparison-with-callable
    """
    Advances in Financial Machine Learning, Snippet 7.4, page 110.

    Using the PurgedKFold Class.

    Function to run a cross-validation evaluation using sample weights and a custom CV generator.

    Note: This function differs from the book in that it requires the user to pass in a CV object. The book
    accepts a None value as a default and then resorts to using PurgedCV, which also meant that extra arguments had
    to be passed to the function. To correct this, we have removed the default and require the user to pass a CV
    object to the function.

    Example:

    .. code-block:: python

        cv_gen = PurgedKFold(n_splits=n_splits, samples_info_sets=samples_info_sets, pct_embargo=pct_embargo)
        scores_array = ml_cross_val_score(classifier, X, y, cv_gen, sample_weight_train=sample_train,
                                          sample_weight_score=sample_score, scoring=accuracy_score)

    :param classifier: (ClassifierMixin) A sk-learn Classifier object instance.
    :param X: (pd.DataFrame) The dataset of records to evaluate.
    :param y: (pd.Series) The labels corresponding to the X dataset.
    :param cv_gen: (BaseCrossValidator) Cross Validation generator object instance.
    :param sample_weight_train: (np.array) Sample weights used to train the model for each record in the dataset.
    :param sample_weight_score: (np.array) Sample weights used to evaluate the model quality.
    :param scoring: (Callable) A metric scoring, can be custom sklearn metric.
    :return: (np.array) The computed score.
    """

    # If no sample_weight then broadcast a value of 1 to all samples (full weight).
    if sample_weight_train is None:
        sample_weight_train = np.ones((X.shape[0],))

    if sample_weight_score is None:
        sample_weight_score = np.ones((X.shape[0],))

    # Score model on KFolds
    ret_scores = []
    for train, test in cv_gen.split(X=X, y=y):
        fit = classifier.fit(X=X.iloc[train, :], y=y.iloc[train], sample_weight=sample_weight_train[train])
        if scoring == log_loss:
            prob = fit.predict_proba(X.iloc[test, :])
            score = -1 * scoring(y.iloc[test], prob, sample_weight=sample_weight_score[test], labels=classifier.classes_)
        else:
            pred = fit.predict(X.iloc[test, :])
            score = scoring(y.iloc[test], pred, sample_weight=sample_weight_score[test])
        ret_scores.append(score)
    return np.array(ret_scores)
Example #4
def ml_cross_val_score(
        classifier: ClassifierMixin,
        X: pd.DataFrame,
        y: pd.Series,
        cv_gen: BaseCrossValidator,
        sample_weight: np.ndarray = None,
        scoring: str = 'neg_log_loss'):
    # pylint: disable=invalid-name
    """
    Snippet 7.4, page 110, Using the PurgedKFold Class.
    Function to run a cross-validation evaluation using sample weights and a custom CV generator.

    Note: This function differs from the book in that it requires the user to pass in a CV object. The book
    accepts a None value as a default and then resorts to using PurgedCV, which also meant that extra arguments had
    to be passed to the function. To correct this, we have removed the default and require the user to pass a CV
    object to the function.

    Example:

    .. code-block:: python

        cv_gen = PurgedKFold(n_splits=n_splits, samples_info_sets=samples_info_sets, pct_embargo=pct_embargo)
        scores_array = ml_cross_val_score(classifier, X, y, cv_gen, sample_weight=None, scoring='neg_log_loss')

    :param classifier: A sk-learn Classifier object instance.
    :param X: The dataset of records to evaluate.
    :param y: The labels corresponding to the X dataset.
    :param cv_gen: Cross Validation generator object instance.
    :param sample_weight: A numpy array of weights for each record in the dataset.
    :param scoring: A metric name to use for scoring; currently supports `neg_log_loss`, `accuracy`, `f1`, `precision`,
        `recall`, and `roc_auc`.
    :return: The computed score as a numpy array.
    """
    # Define scoring metrics
    scoring_func_dict = {'neg_log_loss': log_loss, 'accuracy': accuracy_score, 'f1': f1_score,
                         'precision': precision_score, 'recall': recall_score, 'roc_auc': roc_auc_score}
    try:
        scoring_func = scoring_func_dict[scoring]
    except KeyError:
        raise ValueError('Wrong scoring method. Select from: neg_log_loss, accuracy, f1, precision, recall, roc_auc')

    # If no sample_weight then broadcast a value of 1 to all samples (full weight).
    if sample_weight is None:
        sample_weight = np.ones((X.shape[0],))

    # Score model on KFolds
    ret_scores = []
    for train, test in cv_gen.split(X=X, y=y):
        fit = classifier.fit(X=X.iloc[train, :], y=y.iloc[train], sample_weight=sample_weight[train])
        if scoring == 'neg_log_loss':
            prob = fit.predict_proba(X.iloc[test, :])
            score = -1 * scoring_func(y.iloc[test], prob, sample_weight=sample_weight[test], labels=classifier.classes_)
        else:
            pred = fit.predict(X.iloc[test, :])
            score = scoring_func(y.iloc[test], pred, sample_weight=sample_weight[test])
        ret_scores.append(score)
    return np.array(ret_scores)
Example #5
def evaluate_classifier(classifier: Pipeline, validator: BaseCrossValidator, X,
                        y):

    scores = []

    for train_index, test_index in validator.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        classifier.fit(X_train, y_train)
        # TODO: Something with dropout and the `max_tree` parameter.
        y_pred_test = classifier.predict(X_test)
        # print(list(zip(y_pred_test, y_test)))
        scores.append(np.sum(y_test == y_pred_test) / y_test.shape[0])

    # Return mean score
    return np.mean(scores)
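
For a sense of how this helper is invoked, here is an illustrative call with a small scikit-learn pipeline and a stratified splitter; the dataset and estimator are placeholders, not part of the original code.

# Illustrative usage of evaluate_classifier (placeholder data and model).
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X, y = load_iris(return_X_y=True)
classifier = Pipeline([("scale", StandardScaler()),
                       ("clf", LogisticRegression(max_iter=1000))])
validator = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Returns the mean fold accuracy computed inside evaluate_classifier.
mean_accuracy = evaluate_classifier(classifier, validator, X, y)
print(f"mean accuracy: {mean_accuracy:.3f}")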
Example #6
def train_lgbm_kfold(df: pd.DataFrame, fold: BaseCrossValidator, params: dict,
                     output_dir: str):

    y_oof = np.zeros(len(df))

    features = [x for x in df.columns if x != "answered_correctly"]

    df_imp = pd.DataFrame()
    df_imp["feature"] = features
    for i, (train_idx,
            val_idx) in enumerate(fold.split(df, df["answered_correctly"])):
        df_train, df_val = df.iloc[train_idx], df.iloc[val_idx]
        train_data = lgb.Dataset(df_train[features],
                                 label=df_train["answered_correctly"])
        valid_data = lgb.Dataset(df_val[features],
                                 label=df_val["answered_correctly"])

        model = lgb.train(params,
                          train_data,
                          valid_sets=[train_data, valid_data],
                          verbose_eval=100)
        y_oof[val_idx] = model.predict(df_val[features])

        df_imp[f"fold{i}"] = model.feature_importance(
            "gain") / model.feature_importance("gain").sum()
        with open(f"{output_dir}/model_fold{i}.pickle", "wb") as f:
            pickle.dump(model, f)

    df_oof = pd.DataFrame()
    df_oof["predict"] = y_oof
    df_oof["target"] = df["answered_correctly"]

    df_oof.to_csv(f"{output_dir}/oof.csv", index=False)

    # feature importance
    df_imp["fold_mean"] = df_imp.drop("feature", axis=1).mean(axis=1)
    df_imp.sort_values("fold_mean",
                       ascending=False).to_csv(f"{output_dir}/imp.csv")
Example #7
def cross_validation(dataset: np.ndarray, answers: np.ndarray,
                     model: base.ClassifierMixin,
                     cross_validator: model_selection.BaseCrossValidator,
                     save_worst_data: bool) -> float:
    iteration_counter: int = 0
    f1_score_value = 0
    worst_f1_score_value = 1.0
    worst_predicted = None
    worst_actual = None

    for train_index, test_index in cross_validator.split(dataset, answers):
        train_x, test_x = dataset[train_index], dataset[test_index]
        train_y, test_y = answers[train_index], answers[test_index]
        iteration_counter += 1

        # Train
        model.fit(train_x, train_y)

        # Test
        predicted = model.predict(test_x)

        # Evaluate
        f1_iteration_score_value = metrics.f1_score(test_y,
                                                    predicted,
                                                    average='weighted')
        if f1_iteration_score_value <= worst_f1_score_value:
            worst_f1_score_value = f1_iteration_score_value
            worst_predicted = predicted
            worst_actual = test_y

        f1_score_value += f1_iteration_score_value

    if save_worst_data:
        np.savetxt(RESULT_FILENAME + 'predicted.txt', worst_predicted)
        np.savetxt(RESULT_FILENAME + 'actual.txt', worst_actual)

    return f1_score_value / iteration_counter
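
The following is an illustrative call using scikit-learn components; save_worst_data is left off so the module-level RESULT_FILENAME constant is not needed, and the dataset and model are placeholders.

# Illustrative usage of cross_validation (placeholder data and model).
from sklearn.datasets import load_wine
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

dataset, answers = load_wine(return_X_y=True)
model = RandomForestClassifier(n_estimators=200, random_state=0)
cross_validator = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Average of the per-fold weighted F1 scores computed inside cross_validation.
mean_f1 = cross_validation(dataset, answers, model, cross_validator, save_worst_data=False)
print(f"mean weighted F1 across folds: {mean_f1:.3f}")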
Example #8
def run_experiment(
    params: Dict,
    X_train: pd.DataFrame,
    y: pd.Series,
    X_test: pd.DataFrame,
    cv: BaseCrossValidator,
    eval_func: Callable,
    with_auto_hpo: bool = False,
    time_budget: Optional[int] = None,
):

    if with_auto_hpo:
        params = tune_params(params, X_train, y, cv, time_budget=time_budget)

    oof = np.zeros(len(X_train))
    test = np.zeros(len(X_test))

    scores = []
    importance = []
    models = []
    evaluator = Evaluator(load=True)

    for n, (train_idx, valid_idx) in enumerate(cv.split(X_train, y)):
        if "weight" in X_train.columns:
            weight = X_train["weight"].iloc[train_idx]
            del X_train["weight"]
        else:
            weight = None

        dtrain = lgb.Dataset(X_train.iloc[train_idx], y.iloc[train_idx], weight=weight)
        dvalid = lgb.Dataset(X_train.iloc[valid_idx], y.iloc[valid_idx])
        model = lgb.train(
            params,
            dtrain,
            valid_sets=[dtrain, dvalid],
            valid_names=["train", "test"],
            early_stopping_rounds=100,
            verbose_eval=100,
            feval=evaluator.feval,
        )

        test += (
            model.predict(X_test, num_iteration=model.best_iteration)
            / cv.get_n_splits()
        )
        oof[valid_idx] = model.predict(
            X_train.iloc[valid_idx], num_iteration=model.best_iteration
        )
        scores.append(evaluator.wrmsse(y.iloc[valid_idx].values, oof[valid_idx]))
        models.append(model)

        importance.append(_get_importance(model, X_train.columns))

    importance = pd.concat(importance)
    importance = (
        importance.groupby("feature")[["importance"]]
        .mean()
        .sort_values("importance", ascending=False)
    )
    test = pd.DataFrame({"demand": test}, index=X_test.index,)

    # only when running a single fold
    if cv.get_n_splits() == 1:
        valid = pd.DataFrame({"y_true": y.iloc[valid_idx], "preds": oof[valid_idx]})
        output_result(models, test, importance, scores, valid)
    else:
        output_result(models, test, importance, scores)
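
Because run_experiment leans on this project's own helpers (tune_params, Evaluator, _get_importance, output_result), only a call sketch is given here rather than a standalone run; the parameters and splitter are illustrative, and eval_func is passed through unused by the body shown above.

# Hypothetical call sketch for run_experiment (depends on the project's helpers; not standalone).
from sklearn.model_selection import TimeSeriesSplit

params = {
    "objective": "regression",  # illustrative LightGBM parameters
    "metric": "rmse",
    "learning_rate": 0.05,
}

cv = TimeSeriesSplit(n_splits=3)
# X_train, y, and X_test are the project's prepared feature frames and target series.
run_experiment(params, X_train, y, X_test, cv, eval_func=None, with_auto_hpo=False)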
Example #9
def cross_validate(
    cv: BaseCrossValidator,
    X: pd.DataFrame,
    y: pd.Series,
    params: Dict[str, Any],
    groups: pd.Series = None,
    tune: bool = False,
    **kwargs,
) -> (List[lgb.Booster], pd.DataFrame, pd.DataFrame):
    """
    Function to run cross-validation.

    Args:
        cv (BaseCrossValidator): Cross-validation generator.
        X (pd.DataFrame): Training data.
        y (pd.Series): Target.
        params (Dict(str, Any)): LightGBM parameters.
        groups (pd.Series, optional): Group labels for the samples. Defaults to None.
        tune (bool, optional): If run tuning or not. Defaults to False.

    Returns:
        List(lgb.Booster): List of trained lightgbm boosters.
        pd.DataFrame: Dataframe with ["true", "pred"] columns, which is used for model evaluation.
        pd.DataFrame: Dataframe with ["feature", "split", "gain", "fold"] columns, which is used for the feature importance plot.
    """
    models = []
    y_true = np.array([])
    y_pred = np.array([])
    imp_df = pd.DataFrame()

    for i, (train_idx, valid_idx) in enumerate(cv.split(X, y, groups=groups)):
        fold = i + 1
        print("--------------------------------------------------")
        print(f"Fold: {fold}/{cv.get_n_splits()}")

        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        model = train_with_lightgbm(X_train,
                                    y_train,
                                    X_valid,
                                    y_valid,
                                    params,
                                    tune=tune,
                                    **kwargs)

        models.append(model)
        y_true = np.concatenate([y_true, y_valid])
        y_pred = np.concatenate([y_pred, model.predict(X_valid)])

        _df = pd.DataFrame()
        _df["feature"] = model.feature_name()
        _df["split"] = model.feature_importance("split")
        _df["gain"] = model.feature_importance("gain")
        _df["fold"] = fold
        imp_df = pd.concat([imp_df, _df])
    print("--------------------------------------------------")

    eval_df = pd.DataFrame({"true": y_true, "pred": y_pred})
    imp_df = imp_df.reset_index(drop=True)

    return models, eval_df, imp_df
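
A hypothetical usage sketch for cross_validate; train_with_lightgbm is assumed to be defined elsewhere in the same module, and the data and parameters below are illustrative only.

# Hypothetical usage of cross_validate (synthetic data, illustrative parameters).
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.random((500, 8)), columns=[f"f{i}" for i in range(8)])
y = pd.Series(rng.random(500))
params = {"objective": "regression", "metric": "rmse", "verbosity": -1}

models, eval_df, imp_df = cross_validate(
    KFold(n_splits=5, shuffle=True, random_state=0), X, y, params)

print(eval_df.head())
print(imp_df.groupby("feature")["gain"].mean().sort_values(ascending=False).head())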