def ml_cross_val_score(
        classifier: ClassifierMixin,
        X: pd.DataFrame,
        y: pd.Series,
        cv_gen: BaseCrossValidator,
        sample_weight: np.ndarray = None,
        scoring: str = 'neg_log_loss'):
    # pylint: disable=invalid-name
    """
    Snippet 7.4, page 110, Using the PurgedKFold Class.

    Function to run a cross-validation evaluation of the classifier using sample weights and a custom CV generator.

    Note: This function is different to the book in that it requires the user to pass through a CV object. The book
    will accept a None value as a default and then resort to using PurgedCV, this also meant that extra arguments had to
    be passed to the function. To correct this we have removed the default and require the user to pass a CV object to
    the function.

    For `neg_log_loss` the classifier's `predict_proba` output is scored and the result is multiplied by -1; for
    every other metric the hard predictions returned by `predict` are scored and the metric value is returned as-is.

    Example:

    .. code-block:: python

        cv_gen = PurgedKFold(n_splits=n_splits, samples_info_sets=samples_info_sets, pct_embargo=pct_embargo)
        scores_array = ml_cross_val_score(classifier, X, y, cv_gen, sample_weight=None, scoring='neg_log_loss')

    :param classifier: A sk-learn Classifier object instance.
    :param X: The dataset of records to evaluate.
    :param y: The labels corresponding to the X dataset.
    :param cv_gen: Cross Validation generator object instance.
    :param sample_weight: A numpy array of weights for each record in the dataset. Any array-like (e.g. a
        pd.Series) is accepted and is always indexed by position, like X and y.
    :param scoring: A metric name to use for scoring; currently supports `neg_log_loss`, `accuracy`, `f1`,
        `precision`, `recall`, and `roc_auc`.
    :raises ValueError: If `scoring` is not one of the supported metric names.
    :return: The computed score as a numpy array.
    """
    # Define scoring metrics
    scoring_func_dict = {'neg_log_loss': log_loss, 'accuracy': accuracy_score, 'f1': f1_score,
                         'precision': precision_score, 'recall': recall_score, 'roc_auc': roc_auc_score}
    try:
        scoring_func = scoring_func_dict[scoring]
    except KeyError:
        # `from None` hides the internal KeyError so the caller only sees the meaningful error.
        raise ValueError('Wrong scoring method. Select from: neg_log_loss, accuracy, f1, precision, recall, roc_auc') from None

    # If no sample_weight then broadcast a value of 1 to all samples (full weight).
    if sample_weight is None:
        sample_weight = np.ones((X.shape[0],))
    else:
        # The CV generator yields positions (X and y are sliced with .iloc), so the weights must be
        # sliced by position too; a pd.Series would otherwise be looked up by label.
        sample_weight = np.asarray(sample_weight)

    # Score model on KFolds
    ret_scores = []
    for train, test in cv_gen.split(X=X, y=y):
        fit = classifier.fit(X=X.iloc[train, :], y=y.iloc[train], sample_weight=sample_weight[train])
        if scoring == 'neg_log_loss':
            prob = fit.predict_proba(X.iloc[test, :])
            score = -1 * scoring_func(y.iloc[test], prob, sample_weight=sample_weight[test],
                                      labels=classifier.classes_)
        else:
            pred = fit.predict(X.iloc[test, :])
            score = scoring_func(y.iloc[test], pred, sample_weight=sample_weight[test])
        ret_scores.append(score)
    return np.array(ret_scores)
def evaluate_classifier(classifier: Pipeline, validator: BaseCrossValidator, X, y):
    """Return the mean accuracy of ``classifier`` over the folds produced by ``validator``.

    For every ``(train, test)`` index pair yielded by ``validator.split(X, y)`` the
    classifier is re-fitted on the training rows and its predictions on the test rows
    are compared element-wise with the true labels. ``X`` and ``y`` are indexed directly
    with the index arrays (``X[idx]``).

    :param classifier: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    :param validator: Cross-validation splitter exposing ``split(X, y)``.
    :param X: Feature container indexable by the split index arrays.
    :param y: Label container indexable by the split index arrays.
    :return: Mean of the per-fold accuracies.
    """

    def _fold_accuracy(fit_rows, eval_rows):
        # Fraction of held-out rows whose prediction equals the true label.
        fit_features, eval_features = X[fit_rows], X[eval_rows]
        fit_labels, eval_labels = y[fit_rows], y[eval_rows]
        classifier.fit(fit_features, fit_labels)
        # TODO: Something with dropout and the `max_tree` parameter.
        guessed = classifier.predict(eval_features)
        hits = np.sum(eval_labels == guessed)
        return hits / eval_labels.shape[0]

    fold_accuracies = [
        _fold_accuracy(fit_rows, eval_rows)
        for fit_rows, eval_rows in validator.split(X, y)
    ]
    return np.mean(fold_accuracies)
def train_lgbm_kfold(df: pd.DataFrame, fold: BaseCrossValidator, params: dict, output_dir: str):
    """Train one LightGBM model per fold and write models, OOF predictions and importances to disk.

    Every column of ``df`` except ``"answered_correctly"`` is used as a feature;
    ``"answered_correctly"`` is the label. For each fold the model is pickled to
    ``{output_dir}/model_fold{i}.pickle``. After all folds, ``{output_dir}/oof.csv``
    (columns ``predict``/``target``) and ``{output_dir}/imp.csv`` (normalised gain
    importance per fold plus the across-fold mean, sorted descending) are written.

    :param df: Training frame containing the feature columns and ``"answered_correctly"``.
    :param fold: Cross-validation splitter; ``fold.split(df, target)`` must yield positional indices.
    :param params: Parameters forwarded to ``lgb.train``.
    :param output_dir: Existing directory that receives the output files.
    :return: None.
    """
    target_col = "answered_correctly"
    y_oof = np.zeros(len(df))
    features = [x for x in df.columns if x != target_col]

    df_imp = pd.DataFrame()
    df_imp["feature"] = features

    for i, (train_idx, val_idx) in enumerate(fold.split(df, df[target_col])):
        df_train, df_val = df.iloc[train_idx], df.iloc[val_idx]
        train_data = lgb.Dataset(df_train[features], label=df_train[target_col])
        # The validation set must be built from the held-out rows; building it from
        # df_train (as before) made the reported validation metric a second training metric.
        valid_data = lgb.Dataset(df_val[features], label=df_val[target_col], reference=train_data)

        model = lgb.train(params, train_data, valid_sets=[train_data, valid_data], verbose_eval=100)

        y_oof[val_idx] = model.predict(df_val[features])

        # Normalise gain so that each fold's importances sum to 1.
        gain = model.feature_importance("gain")
        df_imp[f"fold{i}"] = gain / gain.sum()

        with open(f"{output_dir}/model_fold{i}.pickle", "wb") as f:
            pickle.dump(model, f)

    df_oof = pd.DataFrame()
    df_oof["predict"] = y_oof
    # y_oof is positional, so attach the target positionally as well; assigning the Series
    # directly would align on df's index and misplace values for a non-default index.
    df_oof["target"] = df[target_col].to_numpy()
    df_oof.to_csv(f"{output_dir}/oof.csv", index=False)

    # feature importance
    df_imp["fold_mean"] = df_imp.drop("feature", axis=1).mean(axis=1)
    df_imp.sort_values("fold_mean", ascending=False).to_csv(f"{output_dir}/imp.csv")
def cross_validation(dataset: np.ndarray,
                     answers: np.ndarray,
                     model: base.ClassifierMixin,
                     cross_validator: model_selection.BaseCrossValidator,
                     save_worst_data: bool) -> float:
    """Cross-validate ``model`` and return its mean weighted F1 score.

    The model is re-fitted on each training split and scored on the matching test
    split with ``metrics.f1_score(..., average='weighted')``. The predictions and
    true labels of the lowest-scoring fold (the latest one on ties) are remembered
    and, when ``save_worst_data`` is true, written with ``np.savetxt`` to
    ``RESULT_FILENAME + 'predicted.txt'`` and ``RESULT_FILENAME + 'actual.txt'``.

    :param dataset: Feature array, indexed with the split index arrays.
    :param answers: Label array, indexed with the split index arrays.
    :param model: Estimator exposing ``fit`` and ``predict``.
    :param cross_validator: Splitter exposing ``split(dataset, answers)``.
    :param save_worst_data: Whether to dump the worst fold's predictions and labels.
    :return: Sum of the per-fold scores divided by the number of folds.
    """
    folds_seen: int = 0
    score_total = 0
    # (score, predictions, true labels) of the weakest fold seen so far.
    weakest = (1.0, None, None)

    for fit_rows, eval_rows in cross_validator.split(dataset, answers):
        fit_x, eval_x = dataset[fit_rows], dataset[eval_rows]
        fit_y, eval_y = answers[fit_rows], answers[eval_rows]
        folds_seen += 1

        model.fit(fit_x, fit_y)
        guesses = model.predict(eval_x)

        fold_score = metrics.f1_score(eval_y, guesses, average='weighted')
        if fold_score <= weakest[0]:
            weakest = (fold_score, guesses, eval_y)
        score_total += fold_score

    if save_worst_data:
        _, weakest_guesses, weakest_truth = weakest
        np.savetxt(RESULT_FILENAME + 'predicted.txt', weakest_guesses)
        np.savetxt(RESULT_FILENAME + 'actual.txt', weakest_truth)

    return score_total / folds_seen
def ml_cross_val_score(classifier: ClassifierMixin, X: pd.DataFrame, y: pd.Series, cv_gen: BaseCrossValidator,
                       sample_weight_train: np.ndarray = None, sample_weight_score: np.ndarray = None,
                       scoring: Callable[[np.array, np.array], float] = log_loss):
    # pylint: disable=invalid-name
    # pylint: disable=comparison-with-callable
    """
    Advances in Financial Machine Learning, Snippet 7.4, page 110.

    Using the PurgedKFold Class.

    Function to run a cross-validation evaluation of the classifier using sample weights and a custom CV generator.

    Note: This function is different to the book in that it requires the user to pass through a CV object. The book
    will accept a None value as a default and then resort to using PurgedCV, this also meant that extra arguments had to
    be passed to the function. To correct this we have removed the default and require the user to pass a CV object to
    the function.

    When `scoring` is `log_loss` the classifier's `predict_proba` output is scored and the result is multiplied
    by -1; for any other callable the hard predictions returned by `predict` are scored and the value is returned
    as-is.

    Example:

    .. code-block:: python

        cv_gen = PurgedKFold(n_splits=n_splits, samples_info_sets=samples_info_sets, pct_embargo=pct_embargo)
        scores_array = ml_cross_val_score(classifier, X, y, cv_gen, sample_weight_train=sample_train,
                                          sample_weight_score=sample_score, scoring=accuracy_score)

    :param classifier: (ClassifierMixin) A sk-learn Classifier object instance.
    :param X: (pd.DataFrame) The dataset of records to evaluate.
    :param y: (pd.Series) The labels corresponding to the X dataset.
    :param cv_gen: (BaseCrossValidator) Cross Validation generator object instance.
    :param sample_weight_train: (np.array) Sample weights used to train the model for each record in the dataset.
        Any array-like (e.g. a pd.Series) is accepted and is always indexed by position, like X and y.
    :param sample_weight_score: (np.array) Sample weights used to evaluate the model quality. Same conventions as
        `sample_weight_train`.
    :param scoring: (Callable) A metric scoring, can be custom sklearn metric. It must accept a `sample_weight`
        keyword argument.
    :return: (np.array) The computed score.
    """
    # If no sample_weight then broadcast a value of 1 to all samples (full weight).
    # Otherwise coerce to ndarray: the CV generator yields positions (X and y are sliced with .iloc),
    # and a pd.Series indexed with `[]` would be looked up by label instead.
    if sample_weight_train is None:
        sample_weight_train = np.ones((X.shape[0], ))
    else:
        sample_weight_train = np.asarray(sample_weight_train)

    if sample_weight_score is None:
        sample_weight_score = np.ones((X.shape[0], ))
    else:
        sample_weight_score = np.asarray(sample_weight_score)

    # Score model on KFolds
    ret_scores = []
    for train, test in cv_gen.split(X=X, y=y):
        fit = classifier.fit(X=X.iloc[train, :], y=y.iloc[train], sample_weight=sample_weight_train[train])
        if scoring == log_loss:
            prob = fit.predict_proba(X.iloc[test, :])
            score = -1 * scoring(y.iloc[test], prob, sample_weight=sample_weight_score[test],
                                 labels=classifier.classes_)
        else:
            pred = fit.predict(X.iloc[test, :])
            score = scoring(y.iloc[test], pred, sample_weight=sample_weight_score[test])
        ret_scores.append(score)
    return np.array(ret_scores)
def run_experiment(
    params: Dict,
    X_train: pd.DataFrame,
    y: pd.Series,
    X_test: pd.DataFrame,
    cv: BaseCrossValidator,
    eval_func: Callable,
    with_auto_hpo: bool = False,
    time_budget: Optional[int] = None,
):
    """Train one LightGBM model per CV fold and pass the results to ``output_result``.

    If ``X_train`` has a ``"weight"`` column it is used as the per-row training weight
    in *every* fold and is excluded from the features. Previously the column was deleted
    from ``X_train`` during the first fold, so later folds trained unweighted and the
    caller's frame was modified; ``X_train`` is now left untouched.

    Test predictions are averaged over ``cv.get_n_splits()`` folds, out-of-fold
    predictions are scored with ``evaluator.wrmsse``, and per-fold importances from
    ``_get_importance`` are averaged per feature.

    Args:
        params: LightGBM parameters (replaced by the output of ``tune_params`` when
            ``with_auto_hpo`` is true).
        X_train: Training features, optionally including a ``"weight"`` column.
        y: Training target, positionally aligned with ``X_train``.
        X_test: Features to predict; its index is reused for the prediction frame.
        cv: Splitter exposing ``split(X, y)`` and ``get_n_splits()``.
        eval_func: Not used by this function; kept for interface compatibility.
        with_auto_hpo: Whether to run ``tune_params`` before training.
        time_budget: Forwarded to ``tune_params``.

    Returns:
        None. Results are delivered through ``output_result``.
    """
    if with_auto_hpo:
        params = tune_params(params, X_train, y, cv, time_budget=time_budget)

    # Separate the weights from the features once, up front, without mutating the input.
    if "weight" in X_train.columns:
        sample_weight = X_train["weight"]
        features = X_train.drop(columns="weight")
    else:
        sample_weight = None
        features = X_train

    n_splits = cv.get_n_splits()
    oof = np.zeros(len(features))
    test = np.zeros(len(X_test))
    scores = []
    importance = []
    models = []

    evaluator = Evaluator(load=True)

    for train_idx, valid_idx in cv.split(X_train, y):
        weight = None if sample_weight is None else sample_weight.iloc[train_idx]

        dtrain = lgb.Dataset(features.iloc[train_idx], y.iloc[train_idx], weight=weight)
        dvalid = lgb.Dataset(features.iloc[valid_idx], y.iloc[valid_idx])

        model = lgb.train(
            params,
            dtrain,
            valid_sets=[dtrain, dvalid],
            valid_names=["train", "test"],
            early_stopping_rounds=100,
            verbose_eval=100,
            feval=evaluator.feval,
        )

        # Each fold contributes an equal share to the averaged test prediction.
        test += model.predict(X_test, num_iteration=model.best_iteration) / n_splits
        oof[valid_idx] = model.predict(
            features.iloc[valid_idx], num_iteration=model.best_iteration
        )

        scores.append(evaluator.wrmsse(y.iloc[valid_idx].values, oof[valid_idx]))
        models.append(model)
        importance.append(_get_importance(model, features.columns))

    importance = pd.concat(importance)
    importance = (
        importance.groupby("feature")[["importance"]]
        .mean()
        .sort_values("importance", ascending=False)
    )

    test = pd.DataFrame({"demand": test}, index=X_test.index)

    # Only for the single-fold case: also pass the validation truth/predictions along.
    if n_splits == 1:
        valid = pd.DataFrame({"y_true": y.iloc[valid_idx], "preds": oof[valid_idx]})
        output_result(models, test, importance, scores, valid)
    else:
        output_result(models, test, importance, scores)
def cross_validate(
    cv: BaseCrossValidator,
    X: pd.DataFrame,
    y: pd.Series,
    params: Dict[str, Any],
    groups: pd.Series = None,
    tune: bool = False,
    **kwargs,
) -> (List[lgb.Booster], pd.DataFrame, pd.DataFrame):
    """
    Function to run cross-validation.

    Args:
        cv (BaseCrossValidator): Cross-validation generator.
        X (pd.DataFrame): Training data.
        y (pd.Series): Target.
        params (Dict(str, Any)): LightGBM parameters.
        groups (pd.Series, optional): Group labels for the samples. Defaults to None.
        tune (bool, optional): If run tuning or not. Defaults to False.
        **kwargs: Extra keyword arguments forwarded to ``train_with_lightgbm``.

    Returns:
        List(lgb.Booster): List of trained lightgbm boosters, one per fold.
        pd.DataFrame: Dataframe with ["true", "pred"] columns, used for model evaluation.
            Rows are the validation samples of each fold, in fold order.
        pd.DataFrame: Dataframe with ["feature", "split", "gain", "fold"] columns, used for
            feature importance plots. ``fold`` is 1-based.
    """
    models = []
    # Per-fold pieces are collected and concatenated once after the loop. The empty float
    # seed keeps the result (and its dtype promotion) identical to starting from np.array([]).
    true_parts = [np.array([])]
    pred_parts = [np.array([])]
    imp_frames = []

    for i, (train_idx, valid_idx) in enumerate(cv.split(X, y, groups=groups)):
        fold = i + 1
        print("--------------------------------------------------")
        print(f"Fold: {fold}/{cv.get_n_splits()}")

        X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]

        model = train_with_lightgbm(X_train, y_train, X_valid, y_valid, params, tune=tune, **kwargs)
        models.append(model)

        true_parts.append(y_valid)
        pred_parts.append(model.predict(X_valid))

        imp_frames.append(
            pd.DataFrame(
                {
                    "feature": model.feature_name(),
                    "split": model.feature_importance("split"),
                    "gain": model.feature_importance("gain"),
                    "fold": fold,
                }
            )
        )

    print("--------------------------------------------------")

    y_true = np.concatenate(true_parts)
    y_pred = np.concatenate(pred_parts)
    eval_df = pd.DataFrame({"true": y_true, "pred": y_pred})

    # pd.concat rejects an empty list, so fall back to an empty frame when no fold ran.
    imp_df = pd.concat(imp_frames, ignore_index=True) if imp_frames else pd.DataFrame()

    return models, eval_df, imp_df