def ml_cross_val_score(
        classifier: ClassifierMixin,
        X: pd.DataFrame,
        y: pd.Series,
        cv_gen: BaseCrossValidator,
        sample_weight: np.ndarray = None,
        scoring: str = 'neg_log_loss'):
    # pylint: disable=invalid-name
    """
    Snippet 7.4, page 110, Using the PurgedKFold Class.
    Function to run a cross-validation evaluation of a classifier, using sample weights and a custom CV generator.

    Note: This function differs from the book in that it requires the user to pass in a CV object. The book
    accepts a None default and then falls back to PurgedCV, which also meant that extra arguments had to be
    passed to the function. To simplify this, we removed the default and require the user to pass a CV object
    to the function.

    Example:

    .. code-block:: python

        cv_gen = PurgedKFold(n_splits=n_splits, samples_info_sets=samples_info_sets, pct_embargo=pct_embargo)
        scores_array = ml_cross_val_score(classifier, X, y, cv_gen, sample_weight=None, scoring='neg_log_loss')

    :param classifier: A sk-learn Classifier object instance.
    :param X: The dataset of records to evaluate.
    :param y: The labels corresponding to the X dataset.
    :param cv_gen: Cross Validation generator object instance.
    :param sample_weight: A numpy array of weights for each record in the dataset.
    :param scoring: A metric name to use for scoring; currently supports `neg_log_loss`, `accuracy`, `f1`, `precision`,
        `recall`, and `roc_auc`.
    :return: The computed score as a numpy array.
    """
    # Define scoring metrics
    scoring_func_dict = {'neg_log_loss': log_loss, 'accuracy': accuracy_score, 'f1': f1_score,
                         'precision': precision_score, 'recall': recall_score, 'roc_auc': roc_auc_score}
    try:
        scoring_func = scoring_func_dict[scoring]
    except KeyError:
        raise ValueError('Wrong scoring method. Select from: neg_log_loss, accuracy, f1, precision, recall, roc_auc')

    # If no sample_weight then broadcast a value of 1 to all samples (full weight).
    if sample_weight is None:
        sample_weight = np.ones((X.shape[0],))

    # Score model on KFolds
    ret_scores = []
    for train, test in cv_gen.split(X=X, y=y):
        fit = classifier.fit(X=X.iloc[train, :], y=y.iloc[train], sample_weight=sample_weight[train])
        if scoring == 'neg_log_loss':
            prob = fit.predict_proba(X.iloc[test, :])
            score = -1 * scoring_func(y.iloc[test], prob, sample_weight=sample_weight[test], labels=classifier.classes_)
        else:
            pred = fit.predict(X.iloc[test, :])
            score = scoring_func(y.iloc[test], pred, sample_weight=sample_weight[test])
        ret_scores.append(score)
    return np.array(ret_scores)
Example #2
def evaluate_classifier(classifier: Pipeline, validator: BaseCrossValidator, X,
                        y):
    """Return the mean fold accuracy of `classifier` under the given cross-validator."""
    scores = []

    for train_index, test_index in validator.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        classifier.fit(X_train, y_train)
        # TODO: Something with dropout and the `max_tree` parameter.
        y_pred_test = classifier.predict(X_test)

        # Fold accuracy: fraction of correctly predicted test samples.
        scores.append(np.sum(y_test == y_pred_test) / y_test.shape[0])

    # Return mean score across folds
    return np.mean(scores)
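A usage sketch (not part of the original snippet): evaluate_classifier can be driven with any scikit-learn Pipeline and cross-validator. The pipeline steps and the synthetic data below are illustrative assumptions.

# Illustrative usage (assumed, not from the original project).
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(200, 5))          # toy feature matrix
y_demo = (X_demo[:, 0] > 0).astype(int)     # toy binary labels

pipeline = Pipeline([("scale", StandardScaler()),
                     ("clf", LogisticRegression())])
validator = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

mean_accuracy = evaluate_classifier(pipeline, validator, X_demo, y_demo)
print(f"Mean fold accuracy: {mean_accuracy:.3f}")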
Example #3
def train_lgbm_kfold(df: pd.DataFrame, fold: BaseCrossValidator, params: dict,
                     output_dir: str):
    """Train one LightGBM model per fold; save the models, out-of-fold predictions and feature importances to output_dir."""
    # Out-of-fold predictions, one entry per row of df
    y_oof = np.zeros(len(df))

    features = [x for x in df.columns if x != "answered_correctly"]

    # Per-fold normalised gain importances, one column per fold
    df_imp = pd.DataFrame()
    df_imp["feature"] = features
    for i, (train_idx,
            val_idx) in enumerate(fold.split(df, df["answered_correctly"])):
        df_train, df_val = df.iloc[train_idx], df.iloc[val_idx]
        train_data = lgb.Dataset(df_train[features],
                                 label=df_train["answered_correctly"])
        valid_data = lgb.Dataset(df_val[features],
                                 label=df_val["answered_correctly"])

        model = lgb.train(params,
                          train_data,
                          valid_sets=[train_data, valid_data],
                          verbose_eval=100)
        y_oof[val_idx] = model.predict(df_val[features])

        df_imp[f"fold{i}"] = model.feature_importance(
            "gain") / model.feature_importance("gain").sum()
        with open(f"{output_dir}/model_fold{i}.pickle", "wb") as f:
            pickle.dump(model, f)

    df_oof = pd.DataFrame()
    df_oof["predict"] = y_oof
    df_oof["target"] = df["answered_correctly"]

    df_oof.to_csv(f"{output_dir}/oof.csv", index=False)

    # feature importance
    df_imp["fold_mean"] = df_imp.drop("feature", axis=1).mean(axis=1)
    df_imp.sort_values("fold_mean",
                       ascending=False).to_csv(f"{output_dir}/imp.csv")
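A usage sketch (assumed, not from the original project): train_lgbm_kfold expects a frame containing the "answered_correctly" target alongside arbitrary feature columns, a cross-validator, a LightGBM parameter dict, and an existing output directory. The feature column names, parameter values, and the "lgbm_output" directory below are assumptions.

# Illustrative usage (assumed, not from the original project).
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

rng = np.random.default_rng(42)
df_demo = pd.DataFrame({
    "feature_1": rng.normal(size=1000),
    "feature_2": rng.normal(size=1000),
    "answered_correctly": rng.integers(0, 2, size=1000),  # toy binary target
})
lgb_params = {"objective": "binary", "metric": "auc", "verbosity": -1}

os.makedirs("lgbm_output", exist_ok=True)
train_lgbm_kfold(df_demo, KFold(n_splits=5, shuffle=True, random_state=42),
                 lgb_params, "lgbm_output")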
Example #4
def cross_validation(dataset: np.ndarray, answers: np.ndarray,
                     model: base.ClassifierMixin,
                     cross_validator: model_selection.BaseCrossValidator,
                     save_worst_data: bool) -> float:
    """Return the mean weighted F1 score across folds, optionally saving the worst fold's predictions."""
    iteration_counter: int = 0
    f1_score_value = 0.0
    worst_f1_score_value = 1.0
    worst_predicted = None
    worst_actual = None

    for train_index, test_index in cross_validator.split(dataset, answers):
        train_x, test_x = dataset[train_index], dataset[test_index]
        train_y, test_y = answers[train_index], answers[test_index]
        iteration_counter += 1

        # Train
        model.fit(train_x, train_y)

        # Test
        predicted = model.predict(test_x)

        # Evaluate
        f1_iteration_score_value = metrics.f1_score(test_y,
                                                    predicted,
                                                    average='weighted')
        if f1_iteration_score_value <= worst_f1_score_value:
            worst_f1_score_value = f1_iteration_score_value
            worst_predicted = predicted
            worst_actual = test_y

        f1_score_value += f1_iteration_score_value

    if save_worst_data:
        np.savetxt(RESULT_FILENAME + 'predicted.txt', worst_predicted)
        np.savetxt(RESULT_FILENAME + 'actual.txt', worst_actual)

    return f1_score_value / iteration_counter
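A minimal usage sketch (not from the original source; RESULT_FILENAME is a module-level constant assumed to be defined elsewhere). The estimator and toy data are illustrative assumptions.

# Illustrative usage (assumed, not from the original project).
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

rng = np.random.default_rng(1)
dataset_demo = rng.normal(size=(300, 4))        # toy features
answers_demo = rng.integers(0, 3, size=300)     # toy 3-class labels

mean_f1 = cross_validation(dataset_demo, answers_demo,
                           RandomForestClassifier(random_state=1),
                           StratifiedKFold(n_splits=5),
                           save_worst_data=False)
print(f"Mean weighted F1: {mean_f1:.3f}")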
Example #5
def ml_cross_val_score(classifier: ClassifierMixin,
                       X: pd.DataFrame,
                       y: pd.Series,
                       cv_gen: BaseCrossValidator,
                       sample_weight_train: np.ndarray = None,
                       sample_weight_score: np.ndarray = None,
                       scoring: Callable[[np.array, np.array],
                                         float] = log_loss):
    # pylint: disable=invalid-name
    # pylint: disable=comparison-with-callable
    """
    Advances in Financial Machine Learning, Snippet 7.4, page 110.

    Using the PurgedKFold Class.

    Function to run a cross-validation evaluation of a classifier, using sample weights and a custom CV generator.

    Note: This function differs from the book in that it requires the user to pass in a CV object. The book
    accepts a None default and then falls back to PurgedCV, which also meant that extra arguments had to be
    passed to the function. To simplify this, we removed the default and require the user to pass a CV object
    to the function.

    Example:

    .. code-block:: python

        cv_gen = PurgedKFold(n_splits=n_splits, samples_info_sets=samples_info_sets, pct_embargo=pct_embargo)
        scores_array = ml_cross_val_score(classifier, X, y, cv_gen, sample_weight_train=sample_train,
                                          sample_weight_score=sample_score, scoring=accuracy_score)

    :param classifier: (ClassifierMixin) A sk-learn Classifier object instance.
    :param X: (pd.DataFrame) The dataset of records to evaluate.
    :param y: (pd.Series) The labels corresponding to the X dataset.
    :param cv_gen: (BaseCrossValidator) Cross Validation generator object instance.
    :param sample_weight_train: (np.array) Sample weights used to train the model for each record in the dataset.
    :param sample_weight_score: (np.array) Sample weights used to evaluate the model quality.
    :param scoring: (Callable) A metric scoring, can be custom sklearn metric.
    :return: (np.array) The computed score.
    """

    # If no sample_weight then broadcast a value of 1 to all samples (full weight).
    if sample_weight_train is None:
        sample_weight_train = np.ones((X.shape[0], ))

    if sample_weight_score is None:
        sample_weight_score = np.ones((X.shape[0], ))

    # Score model on KFolds
    ret_scores = []
    for train, test in cv_gen.split(X=X, y=y):
        fit = classifier.fit(X=X.iloc[train, :],
                             y=y.iloc[train],
                             sample_weight=sample_weight_train[train])
        if scoring == log_loss:
            prob = fit.predict_proba(X.iloc[test, :])
            score = -1 * scoring(y.iloc[test],
                                 prob,
                                 sample_weight=sample_weight_score[test],
                                 labels=classifier.classes_)
        else:
            pred = fit.predict(X.iloc[test, :])
            score = scoring(y.iloc[test],
                            pred,
                            sample_weight=sample_weight_score[test])
        ret_scores.append(score)
    return np.array(ret_scores)
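Beyond the docstring example, any metric with a (y_true, y_pred, sample_weight=...) signature can be passed as scoring. A short sketch (the classifier, X, y and cv_gen objects are assumed to exist, as in the docstring example above):

# Illustrative usage (assumed): wrap an sklearn metric so it matches
# scoring(y_true, y_pred, sample_weight=...).
from functools import partial
from sklearn.metrics import f1_score

weighted_f1 = partial(f1_score, average='weighted')
scores_array = ml_cross_val_score(classifier, X, y, cv_gen, scoring=weighted_f1)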
Example #6
def run_experiment(
    params: Dict,
    X_train: pd.DataFrame,
    y: pd.Series,
    X_test: pd.DataFrame,
    cv: BaseCrossValidator,
    eval_func: Callable,
    with_auto_hpo: bool = False,
    time_budget: Optional[int] = None,
):

    if with_auto_hpo:
        params = tune_params(params, X_train, y, cv, time_budget=time_budget)

    oof = np.zeros(len(X_train))
    test = np.zeros(len(X_test))

    scores = []
    importance = []
    models = []
    evaluator = Evaluator(load=True)

    for n, (train_idx, valid_idx) in enumerate(cv.split(X_train, y)):
        if "weight" in X_train.columns:
            weight = X_train["weight"].iloc[train_idx]
            del X_train["weight"]
        else:
            weight = None

        dtrain = lgb.Dataset(X_train.iloc[train_idx], y.iloc[train_idx], weight=weight)
        dvalid = lgb.Dataset(X_train.iloc[valid_idx], y.iloc[valid_idx])
        model = lgb.train(
            params,
            dtrain,
            valid_sets=[dtrain, dvalid],
            valid_names=["train", "test"],
            early_stopping_rounds=100,
            verbose_eval=100,
            feval=evaluator.feval,
        )

        test += (
            model.predict(X_test, num_iteration=model.best_iteration)
            / cv.get_n_splits()
        )
        oof[valid_idx] = model.predict(
            X_train.iloc[valid_idx], num_iteration=model.best_iteration
        )
        scores.append(evaluator.wrmsse(y.iloc[valid_idx].values, oof[valid_idx]))
        models.append(model)

        importance.append(_get_importance(model, X_train.columns))

    importance = pd.concat(importance)
    importance = (
        importance.groupby("feature")[["importance"]]
        .mean()
        .sort_values("importance", ascending=False)
    )
    test = pd.DataFrame({"demand": test}, index=X_test.index,)

    # Only for the single-fold case
    if cv.get_n_splits() == 1:
        valid = pd.DataFrame({"y_true": y.iloc[valid_idx], "preds": oof[valid_idx]})
        output_result(models, test, importance, scores, valid)
    else:
        output_result(models, test, importance, scores)
Example #7
def cross_validate(
    cv: BaseCrossValidator,
    X: pd.DataFrame,
    y: pd.Series,
    params: Dict[str, Any],
    groups: pd.Series = None,
    tune: bool = False,
    **kwargs,
) -> Tuple[List[lgb.Booster], pd.DataFrame, pd.DataFrame]:
    """
    Function to run cross-validation.

    Args:
        cv (BaseCrossValidator): Cross-validation generator.
        X (pd.DataFrame): Training data.
        y (pd.Series): Target.
        params (Dict[str, Any]): LightGBM parameters.
        groups (pd.Series, optional): Group labels for the samples. Defaults to None.
        tune (bool, optional): Whether to run hyperparameter tuning. Defaults to False.

    Returns:
        List[lgb.Booster]: List of trained LightGBM boosters.
        pd.DataFrame: Dataframe with ["true", "pred"] columns, used for model evaluation.
        pd.DataFrame: Dataframe with ["feature", "split", "gain", "fold"] columns, used for the feature importance plot.
    """
    models = []
    y_true = np.array([])
    y_pred = np.array([])
    imp_df = pd.DataFrame()

    for i, (train_idx, valid_idx) in enumerate(cv.split(X, y, groups=groups)):
        fold = i + 1
        print("--------------------------------------------------")
        print(f"Fold: {fold}/{cv.get_n_splits()}")

        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        model = train_with_lightgbm(X_train,
                                    y_train,
                                    X_valid,
                                    y_valid,
                                    params,
                                    tune=tune,
                                    **kwargs)

        models.append(model)
        y_true = np.concatenate([y_true, y_valid])
        y_pred = np.concatenate([y_pred, model.predict(X_valid)])

        _df = pd.DataFrame()
        _df["feature"] = model.feature_name()
        _df["split"] = model.feature_importance("split")
        _df["gain"] = model.feature_importance("gain")
        _df["fold"] = fold
        imp_df = pd.concat([imp_df, _df])
    print("--------------------------------------------------")

    eval_df = pd.DataFrame({"true": y_true, "pred": y_pred})
    imp_df = imp_df.reset_index(drop=True)

    return models, eval_df, imp_df
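A usage sketch (assumed, not from the original project; it relies on the companion train_with_lightgbm helper referenced above being importable from the same module). The toy data and parameters are illustrative.

# Illustrative usage (assumed, not from the original project).
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

rng = np.random.default_rng(7)
X_demo = pd.DataFrame(rng.normal(size=(500, 3)), columns=["f1", "f2", "f3"])
y_demo = pd.Series(rng.normal(size=500), name="target")
lgb_params = {"objective": "regression", "metric": "rmse", "verbosity": -1}

models, eval_df, imp_df = cross_validate(KFold(n_splits=5), X_demo, y_demo, lgb_params)
print(eval_df.head())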