Example no. 1
0
class CatBoost(BaseModel):
    '''
    Wrapper class of CatBoost (the previous docstring incorrectly said
    LightGBM). self.core contains the fitted CatBoostClassifier.
    '''
    @timer
    def __init__(self, config):
        # Keep the run configuration; hyperparameters are passed to train().
        self.config = config

    @timer
    def train(self,
              X_train,
              y_train,
              X_val=None,
              y_val=None,
              params=None,
              num_boost_round=100,
              early_stopping_rounds=None,
              fold=0):
        '''
        Fit a CatBoostClassifier on (X_train, y_train) and return self.

        X_val / y_val: optional validation set used for early stopping.
        params: dict of CatBoostClassifier keyword arguments (may be None).
        num_boost_round: number of boosting iterations.
        early_stopping_rounds: stop when the eval metric stops improving.
        fold: unused in this body; kept for interface compatibility.
        '''
        # Guard against the default params=None, which crashed on **params.
        self.core = CatBoostClassifier(
            **(params or {}),
            num_boost_round=num_boost_round)
        # Only forward an eval_set when validation data was supplied;
        # unconditionally passing (None, None) breaks fit() under the
        # defaults of this method's own signature.
        eval_set = (X_val, y_val) if X_val is not None else None
        self.core.fit(
            X=X_train,
            y=y_train,
            eval_set=eval_set,
            early_stopping_rounds=early_stopping_rounds)
        return self

    @timer
    def predict(self, X_test):
        '''Return the predicted probability of the positive class (column 1).'''
        y_test = self.core.predict_proba(X_test)[:, 1]
        return y_test

    @property
    def feature_importance(self):
        '''Feature importances of the fitted booster.'''
        return self.core.get_feature_importance()

    @property
    def best_iteration(self):
        '''Best iteration index found during fitting.'''
        return self.core.get_best_iteration()

    @property
    def evals_result(self):
        '''Per-iteration evaluation metrics recorded during fit.'''
        return self.core.get_evals_result()
Example no. 2
0
# NOTE(review): `pred` here is assumed to hold test-set predictions computed
# above this chunk (it is overwritten with train predictions just below) —
# confirm against the full script.
kappa_test = cohen_kappa_score(y_test, pred)

pred = model.predict(data=train_pool, prediction_type='Class')

acc_train = accuracy_score(y_train, pred)

# BUG FIX: train kappa was computed with accuracy_score, silently
# duplicating acc_train; use cohen_kappa_score to mirror kappa_test.
kappa_train = cohen_kappa_score(y_train, pred)

# Two-row summary table: train on row 0, test on row 1.
# NOTE(review): acc_test is assumed to be defined above this chunk.
d = pd.DataFrame(data={
    'Accuracy': [acc_train, acc_test],
    'Kappa': [kappa_train, kappa_test]
})

df1 = d.rename(index={0: 'train', 1: 'test'})

# Per-iteration training curves: learn (train) and validation metrics
# side by side, one column per (split, metric) pair.
pp = model.get_evals_result()

dl = pd.DataFrame(data=pp['learn'])

dv = pd.DataFrame(data=pp['validation_0'])

result = pd.concat([dl, dv], axis=1, sort=False)

result.columns = ['Acc_learn', 'Multi_learn', 'Acc_val', 'Multi_val']

result.to_csv(path_or_buf=args.outfile, index=False)

# NOTE(review): `ooutfile` (double "o") looks like a typo for `outfile`,
# but it must match the argparse definition — confirm before renaming.
df1.to_csv(path_or_buf=args.ooutfile, index=False)
Example no. 3
0
    def train_1fold(self, fold, params, params_custom):
        """Train a CatBoost model on one CV fold and record its artifacts.

        Fits a CatBoostClassifier on the fold's train split, saves the model
        and its resolved parameters under ``self.models_path``, and appends
        this fold's eval curves, predictions, metrics and feature importances
        to the accumulator attributes on ``self``.

        Args:
            fold: 0-based fold index; also offsets the random seed.
            params: base CatBoostClassifier keyword arguments.
            params_custom: not used in this method body — presumably part of
                a shared trainer interface; confirm against callers.
        """
        X_train, X_valid, y_train, y_valid, X_test, vdx, tdx = self.get_fold_data(fold)

        # Column indices of non-numeric columns, handed to CatBoost as
        # categorical features.
        cat_feature_idx = []
        for i, c in enumerate(X_train):
            if not is_numeric_dtype(X_train[c]):
                cat_feature_idx.append(i)

        # Persist the column dtypes once (first fold only) for reference.
        if fold == 0:
            X_train.dtypes.to_csv(self.models_path + "/dtypes.csv")
            logger.info(f"X_train.shape = {X_train.shape}")

        # Derive a per-fold seed so each fold trains with different but
        # reproducible randomness; deepcopy keeps the caller's dict intact.
        params2 = copy.deepcopy(params)
        if params2["random_seed"] is not None:
            params2["random_seed"] = params2["random_seed"] + fold
            logger.info(f"Set catboost train random_seed = {params2['random_seed']}")

        model = CatBoostClassifier(**params2)

        model.fit(
            X_train, y_train,
            cat_features=cat_feature_idx,
            eval_set=(X_valid, y_valid)
        )

        # Save the fitted model and the fully-resolved parameter set.
        model.save_model(self.models_path + f'/model-catboost-f{fold:02d}.bin')
        util.dump_json(model.get_all_params(), self.models_path + "/params.json")

        # Per-iteration logloss/accuracy curves for both splits of this fold,
        # with fold-tagged column names so folds can be concatenated later.
        evals = model.get_evals_result()
        evals_df = pd.DataFrame({
            f"logloss_train_f{fold:02d}":evals["learn"]['Logloss'],
            f"accuracy_train_f{fold:02d}":evals["learn"]['Accuracy'],
            f"logloss_valid_f{fold:02d}":evals['validation']['Logloss'],
            f"accuracy_valid_f{fold:02d}":evals['validation']['Accuracy']
        })
        self.evals_df.append(evals_df)

        # Out-of-fold positive-class probabilities, stored at the validation
        # row indices of the global prediction frame.
        preds_valid = model.predict_proba(X_valid)[:,1]
        logger.info(f"len(vdx)={len(vdx)} len(preds_valid)={len(preds_valid)}")
        self.preds_valid_all.loc[vdx, "pred"] = preds_valid

        # Train-split and test-set probabilities accumulated per fold.
        preds_train = model.predict_proba(X_train)[:,1]
        self.preds_train_all.append(pd.DataFrame({fold:preds_train}, index=tdx))

        preds_test = model.predict_proba(X_test)[:,1]
        self.preds_test_all.append(preds_test)

        # Fold metrics; np.round thresholds the probabilities at 0.5.
        acc_valid = accuracy_score(y_valid, np.round(preds_valid))
        acc_train = accuracy_score(y_train, np.round(preds_train))
        logloss_valid = log_loss(y_valid, preds_valid)
        logloss_train = log_loss(y_train, preds_train)

        ms = [fold, acc_train, acc_valid, logloss_train, logloss_valid, model.get_best_iteration()]
        self.mets.append(ms)
        show_mets(*ms)

        # One pd.Series of importances per importance type, tagged with the
        # fold number for later aggregation.
        for it in ["FeatureImportance"]:
            imp = pd.Series(model.get_feature_importance(type=it), index=X_train.columns)
            imp.name = fold
            imp.index.name = "feature"
            self.importance[it].append(imp)
    use_best_model=True,
    verbose = True)
elapsed_time_training = time.time() - start_time

# --- Prediction phase ---
print('Predicting...')
start_time = time.time()
y_pred = model.predict(dataset.X_test, prediction_type='Class')
elapsed_time_testing = time.time() - start_time

# --- Analysis / reporting phase ---
print('Analyzing...')
title = "CatBoost (weights smote)"

# Fetch the validation history once and pick out the tracked metrics.
history = model.get_evals_result()['validation_0']
eval_results = {
    'MultiClass': np.abs(history['MultiClass']),
    'Accuracy': np.abs(history['Accuracy']),
    #'F1': np.absolute(model.get_evals_result()['validation_0']['TotalF1']),
    #'gmean': model.get_evals_result()['validation_0']['GeometricMean']
    }

save_path = "C:/Users/thoma/source/repos/PythonMachineLearning/PythonMachineLearning/Library/Results"
evaluator = Evaluator(title, save_path)

# Run metadata first, then parameter dumps, metrics and plots.
for info_line in (
        f'Best iteration: {model.get_best_iteration()}',
        f'Training time (seconds): {elapsed_time_training}',
        f'Testing time (seconds): {elapsed_time_testing}'):
    evaluator.append_to_file(info_line, "info.txt")
evaluator.save_dict_to_file(dataset_parameters, "dataset_parameters.csv")
evaluator.save_dict_to_file(model_parameters, "model_parameters.csv")
evaluator.save_advanced_metrics(dataset.y_test, y_pred, dataset.class_labels, dataset.class_descriptions)
evaluator.save_eval_scores_to_file(eval_results, "metric_results.csv")
evaluator.create_evaluation_metric_results(eval_results, xlabel='number of trees', ylabel='metric score')