Example #1
File: test.py Project: iamnik13/catboost
def test_wrong_feature_count():
    with pytest.raises(CatboostError):
        data = np.random.rand(100, 10)
        label = np.random.randint(2, size=100)
        model = CatBoostClassifier()
        model.fit(data, label)
        model.predict(data[:, :-1])
Example #2
File: test.py Project: iamnik13/catboost
def test_full_history():
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(od_type='Iter', od_wait=20, random_seed=42, approx_on_full_history=True)
    model.fit(train_pool, eval_set=test_pool)
    model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Example #3
File: test.py Project: iamnik13/catboost
def test_pool_after_fit():
    pool1 = Pool(TRAIN_FILE, column_description=CD_FILE)
    pool2 = Pool(TRAIN_FILE, column_description=CD_FILE)
    assert _check_data(pool1.get_features(), pool2.get_features())
    model = CatBoostClassifier(iterations=5, random_seed=0)
    model.fit(pool2)
    assert _check_data(pool1.get_features(), pool2.get_features())
Example #4
def test_raw_predict_equals_to_model_predict():
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=10, random_seed=0)
    model.fit(train_pool, eval_set=test_pool)
    pred = model.predict(test_pool, prediction_type='RawFormulaVal')
    assert all(model.get_test_eval() == pred)
Example #5
def train_preprocessor(path='.', train='train.csv'):
    print('start train trash preprocessor...')
    df = pd.read_csv(os.path.join(path, train))

    train_data = df[:-100]
    validation_data = df[-100: -50]

    vectorizer = CountVectorizer()
    x_train_counts = vectorizer.fit_transform(train_data.text)
    x_validation_counts = vectorizer.transform(validation_data.text)

    model = CatBoostClassifier(iterations=250,
                               train_dir=path,
                               logging_level='Silent',
                               allow_writing_files=False
                               )

    model.fit(X=x_train_counts.toarray(),
              y=train_data.status,
              eval_set=(x_validation_counts.toarray(), validation_data.status),
              use_best_model=True,)

    model.save_model(os.path.join(path, 'trash_model'))
    joblib.dump(vectorizer, os.path.join(path, 'trash_vectorizer'))
    print('end train sentiment preprocessor...')
Example #6
File: test.py Project: iamnik13/catboost
def test_ntree_limit():
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=100, random_seed=0)
    model.fit(train_pool)
    pred = model.predict_proba(test_pool, ntree_end=10)
    np.save(PREDS_PATH, np.array(pred))
    return local_canonical_file(PREDS_PATH)
Example #7
File: test.py Project: iamnik13/catboost
def test_non_ones_weight():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    weight = np.arange(1, pool.num_row()+1)
    pool.set_weight(weight)
    model = CatBoostClassifier(iterations=2, random_seed=0)
    model.fit(pool)
    model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Example #8
File: test.py Project: iamnik13/catboost
def test_zero_baseline():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    baseline = np.zeros(pool.num_row())
    pool.set_baseline(baseline)
    model = CatBoostClassifier(iterations=2, random_seed=0)
    model.fit(pool)
    model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Example #9
File: test.py Project: iamnik13/catboost
def test_no_cat_in_predict():
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=2, random_seed=0)
    model.fit(train_pool)
    pred1 = model.predict(map_cat_features(test_pool.get_features(), train_pool.get_cat_feature_indices()))
    pred2 = model.predict(Pool(map_cat_features(test_pool.get_features(), train_pool.get_cat_feature_indices()), cat_features=train_pool.get_cat_feature_indices()))
    assert _check_data(pred1, pred2)
Example #10
File: test.py Project: iamnik13/catboost
def test_predict_class():
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=2, random_seed=0)
    model.fit(train_pool)
    pred = model.predict(test_pool, prediction_type="Class")
    np.save(PREDS_PATH, np.array(pred))
    return local_canonical_file(PREDS_PATH)
Example #11
File: test.py Project: iamnik13/catboost
def test_staged_predict():
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=10, random_seed=0)
    model.fit(train_pool)
    preds = []
    for pred in model.staged_predict(test_pool):
        preds.append(pred)
    np.save(PREDS_PATH, np.array(preds))
    return local_canonical_file(PREDS_PATH)
Example #12
File: test.py Project: iamnik13/catboost
def test_multiclass():
    pool = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)
    classifier = CatBoostClassifier(iterations=2, random_seed=0, loss_function='MultiClass', thread_count=8)
    classifier.fit(pool)
    classifier.save_model(OUTPUT_MODEL_PATH)
    new_classifier = CatBoostClassifier()
    new_classifier.load_model(OUTPUT_MODEL_PATH)
    pred = new_classifier.predict_proba(pool)
    np.save(PREDS_PATH, np.array(pred))
    return local_canonical_file(PREDS_PATH)
Example #13
    def create_model(self, kfold_X_train, y_train, kfold_X_valid, y_test, test):

        best = CatBoostClassifier(loss_function='MultiClassOneVsAll', learning_rate=0.07940735491731761, depth=8)
        best.fit(kfold_X_train, y_train)

        # predict on the validation set
        pred = best.predict_proba(kfold_X_valid)
        results = best.predict_proba(test)

        return pred, results, best
Example #14
File: test.py Project: iamnik13/catboost
def test_ignored_features():
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    model1 = CatBoostClassifier(iterations=5, random_seed=0, ignored_features=[1, 2, 3])
    model2 = CatBoostClassifier(iterations=5, random_seed=0)
    model1.fit(train_pool)
    model2.fit(train_pool)
    predictions1 = model1.predict(test_pool)
    predictions2 = model2.predict(test_pool)
    assert not _check_data(predictions1, predictions2)
    model1.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Example #15
File: train.py Project: bamx23/ClickHouse
def train_catboost_model(df, target, cat_features, params, verbose=True):

    if not isinstance(df, DataFrame):
        raise Exception('DataFrame object expected, but got ' + repr(df))

    print('features:', df.columns.tolist())

    cat_features_index = list(df.columns.get_loc(feature) for feature in cat_features)
    print('cat features:', cat_features_index)
    model = CatBoostClassifier(**params)
    model.fit(df, target, cat_features=cat_features_index, verbose=verbose)
    return model
Example #16
File: test.py Project: iamnik13/catboost
def test_fit_data():
    pool = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)
    eval_pool = Pool(CLOUDNESS_TEST_FILE, column_description=CLOUDNESS_CD_FILE)
    base_model = CatBoostClassifier(iterations=2, random_seed=0, loss_function="MultiClass")
    base_model.fit(pool)
    baseline = np.array(base_model.predict(pool, prediction_type='RawFormulaVal'))
    eval_baseline = np.array(base_model.predict(eval_pool, prediction_type='RawFormulaVal'))
    eval_pool.set_baseline(eval_baseline)
    model = CatBoostClassifier(iterations=2, random_seed=0, loss_function="MultiClass")
    data = map_cat_features(pool.get_features(), pool.get_cat_feature_indices())
    model.fit(data, pool.get_label(), pool.get_cat_feature_indices(), sample_weight=np.arange(1, pool.num_row()+1), baseline=baseline, use_best_model=True, eval_set=eval_pool)
    model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Example #17
def model_1(X, y, test):
    '''
    This is a CatBoost model where we do not need to encode categorical variables;
    it automatically takes care of them.
    '''
    categorical_features_indices = np.where(X.dtypes != float)[0]
    X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.7, random_state=1234)
    # building the model
    cboost = CatBoostClassifier(iterations=500, learning_rate=0.01, depth=6, loss_function='MultiClass', eval_metric='Accuracy')
    cboost.fit(X_train, y_train, cat_features=categorical_features_indices, eval_set=(X_validation, y_validation), plot=True)
    # calculating the class-wise prediction probabilities of the cboost model
    pred_prob = cboost.predict_proba(test)
    return pred_prob
Example #18
class BesCatBoost:
    """
    catboost_params = {
            'iterations': 500,
            'depth': 3,
            'learning_rate': 0.1,
            'eval_metric': 'AUC',
            'random_seed': 42,
            'logging_level': 'Verbose',
            'l2_leaf_reg': 15.0,
            'bagging_temperature': 0.75,
            'allow_writing_files': False,
            'metric_period': 50
        }
        """

    def __init__(self, params, metric='AUC', maximize=True, verbose=True, model=None):
        self.params = params
        self.metric = metric
        self.maximize = maximize
        self.verbose = verbose
        self.model = model

    def fit(self, X_train, y_train):

        bst = cv(
            Pool(X_train, y_train),
            self.params
        )

        best_rounds = int(bst['test-{}-mean'.format(self.metric)].idxmax() * 1.5) + 1
        print('Best Iteration: {}'.format(best_rounds))

        self.params['iterations'] = best_rounds
        self.model = CatBoostClassifier(**self.params)

        self.model.fit(
            X_train, y_train
        )

    def predict(self, X_test):
        pred_prob = self.model.predict_proba(X_test)[:, -1]
        return pred_prob

    def feature_importance(self):
        pass

    @staticmethod
    def find_best_params(kag):
        pass
Example #19
File: test.py Project: iamnik13/catboost
def test_custom_objective():
    class LoglossObjective(object):
        def calc_ders_range(self, approxes, targets, weights):
            assert len(approxes) == len(targets)
            if weights is not None:
                assert len(weights) == len(approxes)

            exponents = []
            for index in range(len(approxes)):
                exponents.append(math.exp(approxes[index]))

            result = []
            for index in range(len(targets)):
                p = exponents[index] / (1 + exponents[index])
                der1 = (1 - p) if targets[index] > 0.0 else -p
                der2 = -p * (1 - p)

                if weights is not None:
                    der1 *= weights[index]
                    der2 *= weights[index]

                result.append((der1, der2))

            return result

    train_pool = Pool(data=TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(data=TEST_FILE, column_description=CD_FILE)

    model = CatBoostClassifier(iterations=5, random_seed=0, use_best_model=True,
                               loss_function=LoglossObjective(), eval_metric="Logloss",
                               # Leaf estimation method and gradient iteration are set to match
                               # defaults for Logloss.
                               leaf_estimation_method="Newton", leaf_estimation_iterations=10)
    model.fit(train_pool, eval_set=test_pool)
    pred1 = model.predict(test_pool, prediction_type='RawFormulaVal')

    model2 = CatBoostClassifier(iterations=5, random_seed=0, use_best_model=True, loss_function="Logloss")
    model2.fit(train_pool, eval_set=test_pool)
    pred2 = model2.predict(test_pool, prediction_type='RawFormulaVal')

    for p1, p2 in zip(pred1, pred2):
        assert abs(p1 - p2) < EPS
Example #20
File: test.py Project: iamnik13/catboost
def test_custom_eval():
    class LoglossMetric(object):
        def get_final_error(self, error, weight):
            return error / (weight + 1e-38)

        def is_max_optimal(self):
            return True

        def evaluate(self, approxes, target, weight):
            assert len(approxes) == 1
            assert len(target) == len(approxes[0])

            approx = approxes[0]

            error_sum = 0.0
            weight_sum = 0.0

            for i in range(len(approx)):
                w = 1.0 if weight is None else weight[i]
                weight_sum += w
                error_sum += w * (target[i] * approx[i] - math.log(1 + math.exp(approx[i])))

            return error_sum, weight_sum

    train_pool = Pool(data=TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(data=TEST_FILE, column_description=CD_FILE)

    model = CatBoostClassifier(iterations=5, random_seed=0, use_best_model=True, eval_metric=LoglossMetric())
    model.fit(train_pool, eval_set=test_pool)
    pred1 = model.predict(test_pool)

    model2 = CatBoostClassifier(iterations=5, random_seed=0, use_best_model=True, eval_metric="Logloss")
    model2.fit(train_pool, eval_set=test_pool)
    pred2 = model2.predict(test_pool)

    for p1, p2 in zip(pred1, pred2):
        assert abs(p1 - p2) < EPS
Example #21
def test_fit_no_label():
    with pytest.raises(CatboostError):
        pool = Pool(TRAIN_FILE, column_description=CD_FILE)
        model = CatBoostClassifier()
        model.fit(pool.get_features())
Example #22
class CatBoostClassifierCV(object):
    """cross_val_predict"""

    def __init__(self, params=None, cv=5, random_state=None, n_repeats=None):
        self.clf = CatBoostClassifier()
        if params:
            self.clf.set_params(**params)
        if n_repeats:
            self._kf = RepeatedStratifiedKFold(cv, n_repeats, random_state)
            self._num_preds = cv * n_repeats
        else:
            self._kf = StratifiedKFold(cv, shuffle=True, random_state=random_state)
            self._num_preds = cv

    def fit(self, X, y, X_test, feval=roc_auc_score, cat_features=None, sample_weight=None, verbose=100,
            early_stopping_rounds=100, plot=False, silent=None,
            logging_level=None, column_description=None, save_snapshot=None,
            snapshot_file='/fds/data' if cloudml else None, snapshot_interval=None,
            init_model=None):
        """输入数组"""

        self.oof_train = np.zeros(len(X))
        self.oof_test = np.zeros((len(X_test), self._num_preds))
        for n_fold, (train_index, valid_index) in enumerate(self._kf.split(X, y)):
            if verbose:
                print("\033[94mFold %s started at %s\033[0m" % (n_fold + 1, time.ctime()))
            X_train, y_train = X[train_index], y[train_index]
            X_valid, y_valid = X[valid_index], y[valid_index]
            eval_set = [(X_train, y_train), (X_valid, y_valid)]

            ########################################################################
            self.clf.fit(X_train, y_train,
                         cat_features=cat_features,
                         sample_weight=sample_weight,
                         use_best_model=True,
                         eval_set=eval_set,
                         verbose=verbose,
                         logging_level=logging_level,
                         plot=plot,
                         column_description=column_description,
                         silent=silent,
                         early_stopping_rounds=early_stopping_rounds,
                         save_snapshot=save_snapshot,
                         snapshot_file=snapshot_file,
                         snapshot_interval=snapshot_interval,
                         init_model=init_model)

            self.oof_train[valid_index] = self.clf.predict_proba(X_valid)[:, 1]
            self.oof_test[:, n_fold] = self.clf.predict_proba(X_test)[:, 1]
            ########################################################################

        # test-set OOF output
        self.oof_test_rank = pd.DataFrame(self.oof_test).rank().mean(1) / len(self.oof_test)
        self.oof_test = self.oof_test.mean(1)

        # compute the train-set OOF score
        if feval:
            score = feval(y, self.oof_train)
            print(f"\n\033[94mCV Score: {score} ended at {time.ctime()}\033[0m")
            return score

    def oof_save(self, file='./oof_train_and_test.csv'):
        assert isinstance(file, str)
        _ = np.append(self.oof_train, self.oof_test)
        pd.DataFrame(_, columns=['oof_train_and_test']).to_csv(file, index=False)
Example #23
def test_invalid_loss_classifier():
    with pytest.raises(CatboostError):
        pool = Pool(TRAIN_FILE, column_description=CD_FILE)
        model = CatBoostClassifier(loss_function="abcdef")
        model.fit(pool)
Example #24
#10) border_count: The number of splits for numerical features. Allowed values are integers from 1 to 255 inclusive.
#11) ctr_border_count: The number of splits for categorical features. Allowed values are integers from 1 to 255 inclusive.
#12) leaf_estimation_method: The method used to calculate the values in leaves. Possible values: i) Newton ii) Gradient
#13) gradient_iterations: The number of gradient steps when calculating the values in leaves.
#14) priors: NEED TO EXPLORE; use the specified priors during training. Format: <prior 1>:<prior 2>:...:<prior N>; for example: -2:0:0.5:10
#15) feature_priors: NEED TO EXPLORE; specify individual priors for categorical features (used at the "transforming categorical features to numerical features" stage). Given as a comma-separated list of prior descriptions, one per specified feature; each description contains a colon-separated feature index and prior values. Format: <ID of feature 1>:<prior 1.1>:...:<prior 1.N1>,...,<ID of feature M>:<prior M.1>:...:<prior M.NM>
#16) fold_permutation_block_size: Objects in the dataset are grouped in blocks before the random permutations; this parameter defines the block size. The smaller the value, the slower the training. Large values may result in quality degradation.
#17) has_time: Use the order of objects in the input data (do not perform random permutations during the "transforming categorical features to numerical features" and "choosing the tree structure" stages).
#18) fold_len_multiplier: Coefficient for changing the length of folds. The value must be greater than 1; the best validation result is achieved with minimum values. With values close to 1, each iteration takes a quadratic amount of memory and time in the number of objects in the iteration, so low values are feasible only for small datasets. A hedged configuration sketch follows below.

#For Binary Classification
#model = CatBoostClassifier(iterations=2, learning_rate=1, depth=2, loss_function='Logloss')

model.fit(X_train[features],
          X_train['noOfLanes_encoded'],
          cat_features=cat_cols,
          eval_set=(X_valid[features], X_valid['noOfLanes_encoded']),
          use_best_model=True)
pred = model.predict(X_test[features])
pred_ans = list(pred[:, 0])
#To get probability of predictions
#pred = model.predict_proba(X_test[features])[:,1]

#To get raw
#pred = model.predict(X_test[features],prediction_type='RawFormulaVal')

####OTHER NOTE: Regression using CatBoost
#from catboost import CatBoostRegressor
#model = CatBoostRegressor(iterations=2, learning_rate=1, depth=2)
# Fit model
#model.fit(train_data, train_labels, cat_features)
Example #25
def run(scheme_num=1, file_name="../data/data_v3/training_e"):
    train_set_ls = []
    if scheme_num == 1:
        for i in [16, 17, 22, 23]:
            print("begin to load the dataset")
            file_name1 = file_name + "ld1-" + str(i) + ".csv"
            train_set_temp = pd.read_csv(file_name1, header=0, index_col=None)
            print(train_set_temp.describe())
            train_set_ls.append(train_set_temp)
    elif scheme_num == 2:
        for i in [16, 23]:
            print("begin to load the dataset")
            file_name2 = file_name + "ld1-" + str(i) + ".csv"
            train_set_temp = pd.read_csv(file_name2, header=0, index_col=None)
            print(train_set_temp.describe())
            train_set_ls.append(train_set_temp)
    elif scheme_num == 3:
        for i in [17, 18, 19, 20, 21, 22, 23]:
            print("begin to load the dataset")
            file_name3 = file_name + "ld1-" + str(i) + ".csv"
            train_set_temp = pd.read_csv(file_name3, header=0, index_col=None)
            print(train_set_temp.describe())
            train_set_ls.append(train_set_temp)
    val_file_name = file_name + "ld1-23.csv"
    val_set = pd.read_csv(val_file_name, header=0, index_col=None)
    print(val_set.describe())
    train_set = pd.concat(train_set_ls, axis=0)
    ds = train_set.describe()
    print(ds)
    keep_feature = list(set(train_set.columns.values.tolist()) - set(["user_id", "label"]))

    print("begin to drop the duplicates")
    train_set.drop_duplicates(subset=keep_feature, inplace=True)
    val_set.drop_duplicates(subset=keep_feature, inplace=True)
    print(train_set.describe())
    print(val_set.describe())
    train_label = train_set["label"]
    val_label = val_set["label"]
    train_set = train_set.drop(labels=["label", "user_id"], axis=1)
    val_set = val_set.drop(labels=["label", "user_id"], axis=1)

    print("begin to standardization the data")
    for fea in keep_feature:
        if train_set[fea].var() < 0.000001 or val_set[fea].var() < 0.000001:
            train_set.drop(labels=[fea], axis=1, inplace=True)
            val_set.drop(labels=[fea], axis=1, inplace=True)
        else:
            train_set[fea] = (train_set[fea] - train_set[fea].min()) / (train_set[fea].max() - train_set[fea].min())
            # train_set[fea] = (train_set[fea]-train_set[fea].mean())/(train_set[fea].std())
            val_set[fea] = (val_set[fea] - val_set[fea].min()) / (val_set[fea].max() - val_set[fea].min())
            # val_set[fea] = (val_set[fea]-val_set[fea].mean())/(val_set[fea].std())
    keep_feature = list(set(train_set.columns.values.tolist()) - set(["user_id", "label"]))
    kpca = PCA(n_components=0.99, whiten=True)
    # # kpca = KernelPCA(n_components=None,kernel="linear",copy_X=False,n_jobs=-1)
    kpca.fit(train_set.values)
    train_set = kpca.transform(train_set.values)
    val_set = kpca.transform(val_set.values)
    # # print("eigenvalues of the centered kernel matrix {}".format(kpca.lambdas_))
    print("number of components {}".format(kpca.n_components_))
    print("noise variance {}".format(kpca.noise_variance_))
    print("the explained variance {}".format(kpca.explained_variance_))
    print("the explained variance ratio {}".format(kpca.explained_variance_ratio_))

    print("begin to make prediction with plain features and without tuning parameters")

    initial_params = {
        "colsample_bytree": 0.9956575704604527,
        "learning_rate": 0.03640520807213964,
        "max_bin": 210,
        # "max_depth":7,
        "min_child_samples": 80,
        "min_child_weight": 0.23740522733908753,
        # "min_split_gain": 0.0004147079426427973,
        "n_estimators": 266,
        "num_leaves": 12,
        "reg_alpha": 271.01549892268713,
        "reg_lambda": 0.0001118074055642654,
        # "scale_pos_weight": 0.9914246775102074,
        "subsample": 0.9090257022233618,
        "boosting_type": "dart",
    }
    # train_data = lightgbm.Dataset(train_set.values, label=train_label.values, feature_name=list(train_set.columns))

    # best_f1 =0.0
    # best_params = {"n_estimators":800,"num_leaves":6}
    # for n_estimator in [400,600,800]:
    #     for num_leave in [4,6,8]:
    #         print({"n_estimators":n_estimator,"num_leaves":num_leave,"boosting_type":"dart"})
    #         clf1 = LGBMClassifier(n_estimators=n_estimator, num_leaves=num_leave, boosting_type="dart")
    #         clf1.fit(train_set.values, train_label.values)
    #         print("load the test dataset")
    #         yhat = clf1.predict(val_set.values)
    #         print(classification_report(y_pred=yhat, y_true=val_label.values,digits=4))
    #         f1 = f1_score(y_pred=yhat, y_true=val_label.values)
    #         if best_f1<f1:
    #             best_f1 = f1
    #             best_params = {"n_estimators":n_estimator,"num_leaves":num_leave,"boosting_type":"dart"}
    scoring = {'f1': "f1"}
    # clf1 = GridSearchCV(LGBMClassifier(),
    #                   param_grid={"n_estimators":[200,400,600],"num_leaves": [4,5,6,8],"boosting_type":["dart"]},
    #                   scoring=scoring, cv=4, refit='f1',n_jobs=-1,verbose=1)
    for n_estimator in [500]:
        for depth in [6]:
            print({"n_estimators": n_estimator, "depth": depth})
            clf1 = CatBoostClassifier(iterations=n_estimator, depth=depth, verbose=2)
            # clf1.fit(train_set.values, train_label.values)
            clf1.fit(train_set, train_label.values)
            # clf1.fit(train_set.values, train_label.values,eval_set=(val_set.values,val_label.values),early_stopping_rounds=30)
            # cv_results = cv(initial_params,train_data,num_boost_round=800,nfold=4,early_stopping_rounds=30,verbose_eval=True)
            # bst = lgb.cv(initial_params, train_data, num_boost_round=1000, nfold=3, early_stopping_rounds=30)
            # bs = clf1.best_score_
            # print(bs)
            # bp = clf1.best_params_
            # print(bp)

            print("begin to make classification report for the validation dataset")
            # yhat = clf1.predict(val_set.values)
            # yhat = clf1.predict(val_set.values)
            yhat = clf1.predict(val_set)
            print(classification_report(y_pred=yhat, y_true=val_label.values, digits=4))

            print("begin to make classification report for the training dataset")
            # yhat = clf1.predict(train_set.values)
            yhat = clf1.predict(train_set)
            print(classification_report(y_pred=yhat, y_true=train_label.values, digits=4))

            print("load the test dataset")
            test_file_name = file_name.replace("training", "testing") + "ld1-30.csv"
            test_set = pd.read_csv(test_file_name, header=0, index_col=None, usecols=keep_feature + ["user_id"])
            # test_set = pd.read_csv("data/testing_rld1-30.csv",header=0,index_col=None)
            for fea in keep_feature:
                test_set[fea] = (test_set[fea] - test_set[fea].min()) / (test_set[fea].max() - test_set[fea].min())
                # test_set[fea] = (test_set[fea]-test_set[fea].mean())/(test_set[fea].std())

            print("begin to make prediction")
            param = list(file_name)[-1] + str(scheme_num) + "_" + str(n_estimator) + "_" + str(depth)
            print(param)
            # predict(clf1,test_set,param)
            predict(clf1, test_set, param, kpca)
Example #26
File: test.py Project: iamnik13/catboost
def test_wrong_ctr_for_classification():
    with pytest.raises(CatboostError):
        pool = Pool(TRAIN_FILE, column_description=CD_FILE)
        model = CatBoostClassifier(ctr_description=['Borders:TargetBorderCount=5:TargetBorderType=Uniform'])
        model.fit(pool)
Example #27
train_mod_std = feather.read_dataframe(path_train_mod_std)
test_mod_std = feather.read_dataframe(path_test_mod_std)

features = [c for c in train.columns if c not in ['ID_code', 'target']]
target = train['target']
target_std = train_std['target']
target_mod = train_mod['target']
target_mod_std = train_mod_std['target']

print("data install complete")

# feature importances -----------------------------------------------------------------
print("feature importances -------------")
model = CatBoostClassifier(random_state=0)
model.fit(train_mod[features], target)
importances = list(model.feature_importances_)
columns = list(train_mod[features].columns)

importances = pd.DataFrame(model.feature_importances_, columns=["importances"])
columns = pd.DataFrame(train_mod[features].columns, columns=["variable"])

data = pd.concat([columns, importances], axis=1)
sort_data = data.sort_values(by="importances",
                             ascending=False).reset_index(drop=True)

print(
    data.sort_values(by="importances",
                     ascending=False).reset_index(drop=True).head(15))
for i in np.arange(50, train_mod[features].shape[1], 50):
    print("sum of importances by highest {} features: {}".format(
Example #28
File: test.py Project: iamnik13/catboost
def test_feature_importance_off():
    with pytest.raises(CatboostError):
        pool = Pool(TRAIN_FILE, column_description=CD_FILE)
        model = CatBoostClassifier(iterations=5, random_seed=0, calc_feature_importance=False)
        model.fit(pool)
        model.feature_importances_
Example #29
class CatboostBaseline(BaseBaseline):
    def __init__(self):
        super(CatboostBaseline, self).__init__(name="catboost")

    def fit(self, X_train, y_train, X_val, y_val, categoricals=None):
        results = dict()

        self.all_nan = np.all(np.isnan(X_train), axis=0)
        X_train = X_train[:, ~self.all_nan]
        X_val = X_val[:, ~self.all_nan]

        X_train = np.nan_to_num(X_train)
        X_val = np.nan_to_num(X_val)

        categoricals = [
            ind for ind in range(X_train.shape[1])
            if isinstance(X_train[0, ind], str)
        ]

        early_stopping = 150 if X_train.shape[0] > 10000 else max(
            round(150 * 10000 / X_train.shape[0]), 10)

        X_train_pooled = Pool(data=X_train,
                              label=y_train,
                              cat_features=categoricals)
        X_val_pooled = Pool(data=X_val, label=y_val, cat_features=categoricals)

        self.model = CatBoostClassifier(**self.config)
        self.model.fit(X_train_pooled,
                       eval_set=X_val_pooled,
                       use_best_model=True,
                       early_stopping_rounds=early_stopping)

        pred_train = self.model.predict_proba(X_train)
        pred_val = self.model.predict_proba(X_val)

        results["val_preds"] = pred_val.tolist()
        results["labels"] = y_val.tolist()

        try:
            pred_train = np.argmax(pred_train, axis=1)
            pred_val = np.argmax(pred_val, axis=1)
        except Exception:
            print("==> No probabilities provided in predictions")

        results["train_acc"] = metrics.accuracy_score(y_train, pred_train)
        results["train_balanced_acc"] = metrics.balanced_accuracy_score(
            y_train, pred_train)
        results["val_acc"] = metrics.accuracy_score(y_val, pred_val)
        results["val_balanced_acc"] = metrics.balanced_accuracy_score(
            y_val, pred_val)

        return results

    def score(self, X_test, y_test):
        results = dict()

        y_pred = self.predict(X_test)

        results["test_acc"] = metrics.accuracy_score(y_test, y_pred)
        results["test_balanced_acc"] = metrics.balanced_accuracy_score(
            y_test, y_pred)

        return results

    def predict(self, X_test, predict_proba=False):
        X_test = X_test[:, ~self.all_nan]
        X_test = np.nan_to_num(X_test)
        if predict_proba:
            return self.model.predict_proba(X_test)
        y_pred = self.model.predict(X_test)
        return y_pred
Example #30
File: test.py Project: iamnik13/catboost
def test_one_doc_feature_importance():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=5, random_seed=0)
    model.fit(pool)
    np.save(FIMP_PATH, np.array(model.get_feature_importance(np.ones(pool.num_col(), dtype=int), 0, cat_features=pool.get_cat_feature_indices(), fstr_type='Doc')))
    return local_canonical_file(FIMP_PATH)
Example #31
File: test.py Project: iamnik13/catboost
def test_fit_no_label():
    with pytest.raises(CatboostError):
        pool = Pool(TRAIN_FILE, column_description=CD_FILE)
        model = CatBoostClassifier()
        model.fit(pool.get_features())
Example #32
File: test.py Project: iamnik13/catboost
def test_interaction_feature_importance():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=5, random_seed=0)
    model.fit(pool)
    np.save(FIMP_PATH, np.array(model.get_feature_importance(pool, fstr_type='Interaction')))
    return local_canonical_file(FIMP_PATH)
Example #33
File: test.py Project: iamnik13/catboost
def test_classification_ctr():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=5, random_seed=0, ctr_description=['Borders', 'Counter'])
    model.fit(pool)
    model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Example #34
File: test.py Project: iamnik13/catboost
def test_class_weights():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=5, random_seed=0, class_weights=[1, 2])
    model.fit(pool)
    model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Example #35
def test_feature_importance():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=5, random_seed=0)
    model.fit(pool)
    np.save(FIMP_PATH, np.array(model.feature_importances_))
    return local_canonical_file(FIMP_PATH)
Example #36
model = CatBoostClassifier(iterations=500,
                           depth=10,
                           learning_rate=0.15,
                           one_hot_max_size=31,
                           loss_function='Logloss',
                           logging_level='Verbose',
                           custom_loss='AUC',
                           eval_metric='AUC',
                           rsm=0.78,
                           od_wait=150,
                           metric_period=400,
                           l2_leaf_reg=9,
                           random_seed=967)

model.fit(X_train,
          y_train,
          plot=True,
          cat_features=categorical_features_indices)
import matplotlib.pyplot as plt
fea_ = model.feature_importances_  #feature importance plot
fea_name = model.feature_names_
plt.figure(figsize=(10, 10))
plt.barh(fea_name, fea_, height=0.5)

#AUC-ROC curve/FPR-TPR curve
from catboost.utils import get_roc_curve
import sklearn
from sklearn import metrics
from catboost import Pool
eval_pool = Pool(X_test, y_test, cat_features=categorical_features_indices)
eval_train_pool = Pool(X_train,
                       y_train,
Example #37
import sys
from lib.utils import read_training_file
from catboost import Pool, CatBoostClassifier

TRAIN_FILE_PATH = sys.argv[1]
print('Training File Path: {}'.format(TRAIN_FILE_PATH))

data, label = read_training_file(TRAIN_FILE_PATH)

train_pool = Pool(data, label)

model = CatBoostClassifier()
model.fit(train_pool)

model.save_model('model')
Example #38
File: test.py Project: iamnik13/catboost
def test_priors():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=5, random_seed=0, has_time=True, ctr_description=["Borders:Prior=0:Prior=0.6:Prior=1:Prior=5", "Counter:Prior=0:Prior=0.6:Prior=1:Prior=5"])
    model.fit(pool)
    model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Example #39
                                                    random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x_train,
                                                  y_train,
                                                  test_size=0.25,
                                                  random_state=42)

model = CatBoostClassifier(
    iterations=500,
    depth=10,
    learning_rate=0.1,
    loss_function='MultiClass',
    # bagging_temperature=2,
    # l2_leaf_reg=4)
)
model.fit(pd.DataFrame(x_train),
          pd.DataFrame(y_train),
          eval_set=(pd.DataFrame(x_val), pd.DataFrame(y_val)),
          plot=True)

features = [
    "Name_length",
    "Name_frequency",
    "Named/un-named",
    "Year",
    "Month",
    "Day",
    "Hour",
    "Day_of_week"
    "Male/Female",
    "Neutered/Intact",
    "Age",
    "Breed1",
Example #40
    print("Train Index:",train_idx,",Val Index:",valid_idx)

    clf = CatBoostClassifier(
        iterations=7500,
        learning_rate=0.02,
        depth=6,
        bootstrap_type='Bernoulli',
        l2_leaf_reg=50,
        #loss_function='auc',
        eval_metric='AUC',
        verbose=True,)

    train_pool = Pool(train_x, train_y)
    validate_pool = Pool(valid_x, valid_y)

    clf.fit(train_pool, use_best_model=True, eval_set=validate_pool)

    oof_preds[valid_idx] = clf.predict_proba(valid_x)[:, 1]
    sub_preds += clf.predict_proba(test_df[feats])[:, 1] / folds.n_splits

    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[valid_idx])))
    del clf, train_x, train_y, valid_x, valid_y
    gc.collect()

print('Full AUC score %.6f' % roc_auc_score(train_df['TARGET'], oof_preds))

oof = pd.DataFrame({"SK_ID_CURR":train_df["SK_ID_CURR"], "TARGET":oof_preds})
preds = pd.DataFrame({"SK_ID_CURR":test_df["SK_ID_CURR"], "TARGET":sub_preds})
Example #41
def train_model_classification(X,
                               y,
                               params,
                               groups,
                               folds,
                               model_type='lgb',
                               eval_metric='auc',
                               columns=None,
                               plot_feature_importance=False,
                               model=None,
                               verbose=10000,
                               early_stopping_rounds=200,
                               n_estimators=50000,
                               weight=None,
                               seed='no'):
    """
    A function to train a variety of regression models.
    Returns dictionary with oof predictions, test predictions, scores and, if necessary, feature importances.
    
    :params: X - training data, can be pd.DataFrame
    :params: X_test - test data, can be pd.DataFrame
    :params: y - target
    :params: folds - folds to split data
    :params: model_type - type of model to use
    :params: eval_metric - metric to use
    :params: columns - columns to use. If None - use all columns
    :params: plot_feature_importance - whether to plot feature importance of LGB
    :params: model - sklearn model, works only for "sklearn" model type
    
    """
    columns = X.columns if columns is None else columns
    models = []

    metrics_dict = {
        'auc': {
            'lgb_metric_name': 'auc',
            'catboost_metric_name': 'AUC',
            'sklearn_scoring_function': metrics.roc_auc_score
        },
    }

    result_dict = {}

    # out-of-fold predictions on train data
    oof = np.zeros(len(X))

    scores = []
    train_loss = []
    feature_importance = pd.DataFrame()

    if groups is None:
        splits = folds.split(X)

    elif groups == 'stra':
        splits = folds.split(X, y)

    else:
        splits = folds.split(X, groups=groups)
        print('no')

    for fold_n, (train_index, valid_index) in enumerate(splits):

        print(f'Fold {fold_n + 1} started at {time.ctime()}')

        if type(X) == np.ndarray:
            X_train, X_valid = X[columns][train_index], X[columns][valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
            weight_train = weight[train_index]
        else:
            X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[
                valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]
            weight_train = weight[train_index]

        if model_type == 'lgb':

            model = lgb.LGBMClassifier(**params,
                                       n_estimators=n_estimators,
                                       n_jobs=-1)
            model.fit(X_train,
                      y_train,
                      sample_weight=weight_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                      verbose=verbose,
                      early_stopping_rounds=early_stopping_rounds)

            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            y_pred_train = model.predict_proba(X_train)[:, 1]
            models.append(model)

        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train,
                                     label=y_train,
                                     feature_names=X.columns)
            valid_data = xgb.DMatrix(data=X_valid,
                                     label=y_valid,
                                     feature_names=X.columns)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data,
                              num_boost_round=n_estimators,
                              evals=watchlist,
                              early_stopping_rounds=early_stopping_rounds,
                              verbose_eval=verbose,
                              params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid,
                                                     feature_names=X.columns),
                                         ntree_limit=model.best_ntree_limit)
            y_pred_train = model.predict(xgb.DMatrix(X_train,
                                                     feature_names=X.columns),
                                         ntree_limit=model.best_ntree_limit)
            models.append(model)

        if model_type == 'sklearn':
            model = model
            model.fit(X_train, y_train)

            y_pred_valid = model.predict(X_valid).reshape(-1, )
            score = metrics_dict[eval_metric]['sklearn_scoring_function'](
                y_valid, y_pred_valid)
            print(f'Fold {fold_n}. {eval_metric}: {score:.4f}.')
            print('')
            models.append(model)

        if model_type == 'cat':
            model = CatBoostClassifier(
                iterations=n_estimators,
                eval_metric=metrics_dict[eval_metric]['catboost_metric_name'],
                **params,
            )
            model.fit(X_train,
                      y_train,
                      eval_set=(X_valid, y_valid),
                      cat_features=[],
                      use_best_model=True,
                      verbose=verbose,
                      early_stopping_rounds=early_stopping_rounds)
            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            y_pred_train = model.predict_proba(X_train)[:, 1]
            models.append(model)

        oof[valid_index] = y_pred_valid.reshape(-1, )
        if eval_metric != 'group_mae':
            scores.append(
                metrics_dict[eval_metric]['sklearn_scoring_function'](
                    y_valid, y_pred_valid))
            train_loss.append(
                metrics_dict[eval_metric]['sklearn_scoring_function'](
                    y_train, y_pred_train))
        else:
            scores.append(metrics_dict[eval_metric]['scoring_function'](
                y_valid, y_pred_valid, X_valid['type']))

        with open(f'./models/models_{model_type}_{seed}.pickle',
                  'wb') as handle:
            pickle.dump(models, handle, protocol=pickle.HIGHEST_PROTOCOL)

        gc.collect()

        if model_type == 'lgb' and plot_feature_importance:
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["shap_values"] = abs(
                shap.TreeExplainer(model).shap_values(X_valid)
                [:, :len(columns)]).mean(axis=0).T
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)

    print('Train loss mean: {0:.6f}, std: {1:.6f}.'.format(
        np.mean(train_loss), np.std(train_loss)))
    print('CV mean score: {0:.6f}, std: {1:.6f}.'.format(
        np.mean(scores), np.std(scores)))

    result_dict['oof'] = oof
    result_dict['scores'] = scores
    result_dict['models'] = models

    if model_type == 'lgb':
        if plot_feature_importance:
            feature_importance["importance"] /= folds.n_splits
            cols = feature_importance[[
                "feature", "importance"
            ]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[
                feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12))
            sns.barplot(x="importance",
                        y="feature",
                        data=best_features.sort_values(by="importance",
                                                       ascending=False))
            plt.title('LGB Features (avg over folds)')

            result_dict['feature_importance'] = feature_importance

    return result_dict
Example #42
        od_type="Iter",
        od_wait=200,
        task_type="GPU",
        devices="0",
        cat_features=[x for x in range(len(cb_cat_features))],
        bagging_temperature=1.288692494969795,
        grow_policy="Depthwise",
        l2_leaf_reg=9.847870133539244,
        learning_rate=0.01877982653902465,
        max_depth=8,
        min_data_in_leaf=1,
        penalties_coefficient=2.1176668909602734,
    )
    cb_model.fit(
        cb_x_train,
        y_train,
        eval_set=[(cb_x_valid, y_valid)], 
        verbose=0,
    )

    train_oof_preds = cb_model.predict_proba(cb_x_valid)[:,1]
    test_oof_preds = cb_model.predict_proba(test[cb_features])[:,1]
    cb_train_preds[test_index] = train_oof_preds
    cb_test_preds += test_oof_preds / n_folds
    print(": CB - ROC AUC Score = {}".format(roc_auc_score(y_valid, train_oof_preds, average="micro")))
    
    ridge_model = CalibratedClassifierCV(
        RidgeClassifier(random_state=random_state),
        cv=3,
    )
    ridge_model.fit(
        ridge_x_train,
Example #43
X_test = X_test.fillna(-999)
assert X_train.shape == (590540, 432)
assert X_test.shape == (506691, 432)

del train, test_identity, test_transaction, train_identity, train_transaction

#%% Create catboost data pool
# catboost automatically transforms categorical features into encoded format
CAT_FEATURES = list(X_train.select_dtypes("object").columns)

train_dataset = Pool(data=X_train, label=Y_train, cat_features=CAT_FEATURES)
test_dataset = Pool(data=X_test, cat_features=CAT_FEATURES)

#%% train model
model = CatBoostClassifier(iterations=1000, task_type="GPU")
model.fit(train_dataset, verbose=True)

#%% save model and predict
if not os.path.exists("./result/"):
    os.makedirs("./result/")
model.save_model("./result/model_catboost.json", format="json")
test_pred = model.predict_proba(test_dataset, verbose=True)[:, 1]
ss["isFraud"] = test_pred

ss.to_csv("./result/submit_catboost.csv")

#%% feature importance
import matplotlib.pylab as plt
fi = pd.DataFrame(index=model.feature_names_)
fi['importance'] = model.feature_importances_
fi.loc[fi['importance'] > 0.1].sort_values('importance').plot(
Example #44
File: test.py Project: iamnik13/catboost
def test_no_eval_set():
    with pytest.raises(CatboostError):
        pool = Pool(TRAIN_FILE, column_description=CD_FILE)
        model = CatBoostClassifier()
        model.fit(pool, use_best_model=True)
Example #45
        continue
    X.append((dot.log, dot.lat, log(dot.trans_ts - b,
                                    a), log(dot.request_ts - b, c)))
    y.append(dot.label)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.1,
                                                    random_state=42)

train_p = Pool(X, y)
test_p = Pool(X_test, y_test)
decision = CatBoostClassifier(iterations=35,
                              learning_rate=1,
                              depth=10,
                              loss_function='MultiClass',
                              custom_metric='MultiClassOneVsAll',
                              best_model_min_trees=10000)
decision.fit(train_p)

print('Accuracy: \n', decision.score(test_p))
pred = decision.predict(TEST)
print(decision.feature_importances_)
plt.bar(np.arange(len(decision.feature_importances_)),
        decision.feature_importances_,
        color='black')
plt.show()

with open("answerboost2.txt", 'w') as f:
    for item in pred:
        f.write(f"{int(item)}\n")
Example #46
                       axis=1)
y2 = forcatboost2['churn']
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2,
                                                        y2,
                                                        test_size=.3,
                                                        random_state=0)

# In[154]:

categorical_features_indices2 = np.where(X2.dtypes != float)[0]

# In[155]:

model2 = CatBoostClassifier()
model2.fit(X_train2,
           y_train2,
           cat_features=categorical_features_indices2,
           eval_set=(X_test2, y_test2))

# In[156]:

print('Accuracy of CatBoost classifier on training set: {:.2f}'.format(
    model2.score(X_train2, y_train2)))
print('Accuracy of CatBoost classifier on test set: {:.2f}'.format(
    model2.score(X_test2, y_test2)))

# In[157]:

model2.get_feature_importance()

# In[158]:
Example #47
File: main.py Project: konodyuk/minikts
    def train_fold(self, data, fold_idx):
        x_train, y_train, x_val, y_val = data
        model = CatBoostClassifier(**hparams.catboost)
        with kts.parse_stdout(kts.patterns.catboost, kts.LoggerCallback(logger=self.logger, FOLD=fold_idx)):
            model.fit(x_train, y_train, eval_set=[(x_val, y_val)])
        return model
Example #48
def model(df4, df_val, dfk):
    
    X_train, X_test, Y_train, Y_test = train_test_split(df4.drop(columns = ['set_clicked'])
                                                        , df4['set_clicked'], test_size = 0.30 ##### change to lightgbm
#                                                         , random_state = 42
                                                       )
    
    print('Ones in train:', Y_train.sum(), 'Ones in test:', Y_test.sum())

    rand = random.randint(1,2)
    
#################################### LGBM
    
#     if rand == 1:
        
#         params = {'boosting_type': 'gbdt',
#                   'max_depth' : 4,
#                   'objective': 'binary',
#                   'nthread': 4,
#                   'num_leaves': 64,
#                   'learning_rate': 0.001,
#                   'max_bin': 512,
#                   'subsample_for_bin': 200,
#                   'subsample': 1,
#                   'subsample_freq': 1,
#                   'colsample_bytree': 0.8,
#                   'reg_alpha': 1.2,
#                   'reg_lambda': 1.2,
#                   'min_split_gain': 0.5,
#                   'min_child_weight': 1,
#                   'min_child_samples': 5,
#                   'scale_pos_weight': 1,
#                   'num_class' : 1,
#                   'verbose': -1
#     #               'metric' : 'auc'
#                   }

# # #     making lgbm datasets for train and valid
#         d_train = lgbm.Dataset(X_train, Y_train)
#         d_valid = lgbm.Dataset(X_test, Y_test)

#         def lgb_f1_score(y_hat, data):
#             y_true = data.get_label()
#             y_hat = np.round(y_hat) # scikits f1 doesn't like probabilities
#             return 'f1', f1_score(y_true, y_hat), True

#         evals_result = {}

# #     training with early stop
# #   bst = lgbm.train(params, d_train, 5000, valid_sets=[d_valid], verbose_eval=50, early_stopping_rounds=100)

# #     cat_vars_index = []
# #     for i in cat_vars:
# #         if i in X_train:
# #             cat_vars_index.append(X_train.columns.get_loc(i))


#         bst = lgbm.train(params, d_train, valid_sets=[d_valid, d_train], valid_names=['val', 'train'], feval=lgb_f1_score, evals_result=evals_result)

#################################### LGBM  
    
#################################### XGBoost
    
    if rand == 1:
    
        bst = xgb.XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
           gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=4,
           min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
           objective='binary:logistic', reg_alpha=0, reg_lambda=1,
           scale_pos_weight=1, seed=0, silent=True, subsample=1) #, tree_method = 'hist'

        bst.fit(X_train,Y_train)

#     kfold = KFold(n_splits=10, random_state=42)  ##### Important Parameter
#     results = cross_val_score(bst, df_val.drop(columns = ['set_clicked']), df_val['set_clicked'], cv=kfold)

#################################### XGBoost


#################################### CatBoost
    
    if rand == 2:
        
        bst = CatBoostClassifier(eval_metric='F1', use_best_model=True, metric_period=300, depth=4)
        bst.fit(X_train, Y_train, eval_set=(X_test, Y_test))  ## cat_features = cat_vars_index

#################################### CatBoost

#     print("CV Score = ",results)
   
#     else:
        
#         bst = dt(max_depth = 4) # class_weight = {0:1,1:4}
#         bst.fit(X_train, Y_train)

    r = np.where(bst.predict(df_val.drop(columns=['set_clicked'])) > 0.7, 1, 0)
    
#     if rand in [2,4]:
#         kfold = KFold(n_splits=10, random_state=42)  ##### Important Parameter
#         results = cross_val_score(bst, df_val.drop(columns = ['set_clicked']), df_val['set_clicked'], cv=kfold)
#     else:
#         results = [0]

    results = []
    for i in range(10):
        df_val2 = shuffle(df_val)
        df_val3 = df_val2[0:int(df_val2.shape[0]*0.7)]
        rkf = bst.predict(df_val3.drop(columns = ['set_clicked']))
        results.append(accuracy_score(df_val3['set_clicked'], rkf))
        
    #Print accuracy
    acc_lgbm = accuracy_score(df_val['set_clicked'], r)

    print('Overall accuracy of model:', acc_lgbm, "   overall with only zeroes ", accuracy_score(df_val['set_clicked'], np.zeros(len(r))))
    
    check_increase = accuracy_score(df_val['set_clicked'], r) > accuracy_score(df_val['set_clicked'], np.zeros(len(r)))
#     print('Accuracy increased:',check_increase)
    #Print Area Under Curve
#     plt.figure()
    false_positive_rate, recall, thresholds = roc_curve(df_val['set_clicked'], r)
    roc_auc = auc(false_positive_rate, recall)
#     plt.title('Receiver Operating Characteristic (ROC)')
#     plt.plot(false_positive_rate, recall, 'b', label = 'AUC = %0.3f' %roc_auc)
#     plt.legend(loc='lower right')
#     plt.plot([0,1], [0,1], 'r--')
#     plt.xlim([0.0,1.0])
#     plt.ylim([0.0,1.0])
#     plt.ylabel('Recall')
#     plt.xlabel('Fall-out (1-Specificity)')
#     plt.show()

    print('AUC score:', roc_auc)

    #Print Confusion Matrix
    plt.figure()
    cm = confusion_matrix(df_val['set_clicked'], r)
    # labels = ['No Default', 'Default']
    plt.figure(figsize=(8,6))
    sns.heatmap(cm, annot = True, fmt='d', cmap="Blues", vmin = 0.2);
    plt.title('Confusion Matrix')
    plt.ylabel('True Class')
    plt.xlabel('Predicted Class')
    plt.show()

    # lgbm.plot_metric(evals_result, metric='f1')
    
    f1 = f1_score(df_val['set_clicked'], r)
#     print(np.unique(bst.predict(dfk.drop(columns = ['set_clicked']))))
    rk = np.where(bst.predict(dfk.drop(columns = ['set_clicked'])) > 0.7, 1, 0)
    dfk['set_clicked'] = rk
    
    print("Completed Modelling - ",datetime.datetime.now())
    
    return acc_lgbm, f1, check_increase, X_train.columns, bst, dfk, results, accuracy_score(df_val['set_clicked'], np.zeros(len(r))), r, rand
Example #49
# x, y, scale_list (scaler instances) and iteration_list (iteration counts)
# are defined earlier in the original script
import pickle

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    train_size=0.8,
                                                    random_state=23)

for i in scale_list:
    scaler = i
    scaler.fit(x_train)
    # keep the raw split intact so every scaler starts from unscaled data
    x_train_scaled = scaler.transform(x_train)
    x_test_scaled = scaler.transform(x_test)

    for j in iteration_list:
        model = CatBoostClassifier(iterations=j)
        model.fit(x_train_scaled, y_train)

        y_pred = model.predict(x_test_scaled)
        acc = accuracy_score(y_test, y_pred)
        # log_loss expects class probabilities rather than hard labels
        loss = log_loss(y_test, model.predict_proba(x_test_scaled))

        print(
            'scaler : \n' + str(i) + '\nmodel : CatBoostClassifier\n' +
            'acc : \n', acc)
        print('\nloss : \n', loss)
        pickle.dump(
            model,
            open(
                'c:/data/modelcheckpoint/project_catboost_lr_default_' +
                str(i) + '_' + str(j) + '.data', 'wb'))
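
# A saved model can be restored later with pickle.load; the filename is
# illustrative and must match one written by the loop above:
#     restored = pickle.load(open(
#         'c:/data/modelcheckpoint/project_catboost_lr_default_' +
#         str(i) + '_' + str(j) + '.data', 'rb'))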
Example #50
#Let us save the best stacking, voting and CatBoost models

# Top features ranked by random forest importance (10 kept, despite the name)
top_12_rf = [
    'PAY_1', 'PAY_2', 'BILL_AMT1', 'PAY_AMT1', 'AGE', 'Closeness_1',
    'PAY_AMT2', 'Closeness_4', 'BILL_AMT2', 'Closeness_2'
]

sclf1.fit(sampled_df[top_12_rf], sampled_df['Default_Status'])
sclf2.fit(sampled_df[top_12_rf], sampled_df['Default_Status'])
sclf3.fit(sampled_df[top_12_rf], sampled_df['Default_Status'])
sclf4.fit(sampled_df[top_12_rf], sampled_df['Default_Status'])

eclf1.fit(sampled_df[top_12_rf], sampled_df['Default_Status'])
eclf2.fit(sampled_df[top_12_rf], sampled_df['Default_Status'])
catboost_model.fit(sampled_df[top_12_rf], sampled_df['Default_Status'])

import pickle
#Saving the fitted models to disk

pickle.dump(sclf1, open("stacking1.pkl", "wb"))
pickle.dump(sclf2, open("stacking2.pkl", "wb"))
pickle.dump(sclf3, open("stacking3.pkl", "wb"))
pickle.dump(sclf4, open("stacking4.pkl", "wb"))
pickle.dump(catboost_model, open("catboost.pkl", "wb"))

pickle.dump(eclf1, open("voting1.pkl", "wb"))
pickle.dump(eclf2, open("voting2.pkl", "wb"))

#!pip freeze > requirements.txt
Example #51
def test_predict_sklearn_class():
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=2, random_seed=0)
    model.fit(train_pool)
    model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Example #52
y = labelencoder_y.fit_transform(y)

# Splitting the dataset into the Training set and Test set
# (sklearn.cross_validation was removed; use sklearn.model_selection)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# the first three columns are categorical features;
# rnd_state is defined earlier in the original script
cat_featuresind = [0, 1, 2]

clf = CatBoostClassifier(iterations=10, random_seed=rnd_state, custom_metric='Accuracy')

clf.fit(X_train, y_train, cat_features=cat_featuresind, plot=True)
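# plot=True renders an interactive training chart when run inside a Jupyter notebook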


clf.score(X_test, y_test)

from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = clf.predict(X_test)

cm = confusion_matrix(y_test, y_pred)

Example #53
def test_no_eval_set():
    with pytest.raises(CatboostError):
        pool = Pool(TRAIN_FILE, column_description=CD_FILE)
        model = CatBoostClassifier()
        model.fit(pool, use_best_model=True)
Example #54
class CatBoostLearner(Learner):

    algorithm_name = "CatBoost"
    algorithm_short_name = "CatBoost"

    def __init__(self, params):
        super(CatBoostLearner, self).__init__(params)
        self.library_version = catboost.__version__
        self.model_file = self.uid + ".cat.model"
        self.model_file_path = os.path.join(storage_path, self.model_file)
        self.snapshot_file_path = os.path.join(
            storage_path, "training_snapshot_" + self.model_file)
        # "additional" is a settings dict defined elsewhere in the source module
        self.rounds = additional.get("one_step", 50)
        self.max_iters = additional.get("max_steps", 10)
        self.learner_params = {
            "learning_rate": self.params.get("learning_rate", 0.025),
            "depth": self.params.get("depth", 6),
            "rsm": self.params.get("rsm", 1),
            "random_strength": self.params.get("random_strength", 1),
            "bagging_temperature": self.params.get("bagging_temperature", 1),
            "l2_leaf_reg": self.params.get("l2_leaf_reg", 3),
            "random_seed": self.params.get("seed", 1),
        }

        log.debug("CatBoostLearner __init__")

        self.model = CatBoostClassifier(
            iterations=0,
            learning_rate=self.learner_params.get("learning_rate"),
            depth=self.learner_params.get("depth"),
            rsm=self.learner_params.get("rsm"),
            random_strength=self.learner_params.get("random_strength"),
            bagging_temperature=self.learner_params.get("bagging_temperature"),
            l2_leaf_reg=self.learner_params.get("l2_leaf_reg"),
            loss_function="Logloss",
            verbose=False,
        )

    def update(self, update_params):
        # updating hyper-parameters on the fly is not implemented yet
        pass

    def fit(self, X, y):
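        # each call grows the model by `rounds` extra iterations; save_snapshot
        # lets CatBoost resume from the on-disk training state instead of
        # refitting from scratch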
        self.model._init_params["iterations"] += self.rounds
        self.model.fit(X,
                       y,
                       save_snapshot=True,
                       snapshot_file=self.snapshot_file_path)

    def predict(self, X):
        return self.model.predict_proba(X)[:, 1]

    def copy(self):
        return copy.deepcopy(self)

    def save(self):
        self.model.save_model(self.model_file_path)

        json_desc = {
            "library_version": self.library_version,
            "algorithm_name": self.algorithm_name,
            "algorithm_short_name": self.algorithm_short_name,
            "uid": self.uid,
            "model_file": self.model_file,
            "model_file_path": self.model_file_path,
            "params": self.params,
        }

        log.debug("CatBoostLearner save model to %s" % self.model_file_path)
        return json_desc

    def load(self, json_desc):

        self.library_version = json_desc.get("library_version",
                                             self.library_version)
        self.algorithm_name = json_desc.get("algorithm_name",
                                            self.algorithm_name)
        self.algorithm_short_name = json_desc.get("algorithm_short_name",
                                                  self.algorithm_short_name)
        self.uid = json_desc.get("uid", self.uid)
        self.model_file = json_desc.get("model_file", self.model_file)
        self.model_file_path = json_desc.get("model_file_path",
                                             self.model_file_path)
        self.params = json_desc.get("params", self.params)

        log.debug("CatBoostLearner load model from %s" % self.model_file_path)

        self.model = CatBoostClassifier()
        self.model.load_model(self.model_file_path)

    def importance(self, column_names, normalize=True):
        # feature importance reporting is not implemented for this learner
        return None
Example #55
def test_wrong_ctr_for_classification():
    with pytest.raises(CatboostError):
        pool = Pool(TRAIN_FILE, column_description=CD_FILE)
        model = CatBoostClassifier(ctr_description=['Borders:5:Uniform'])
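        # this ctr description is invalid for classification, so fit() is expected to raise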
        model.fit(pool)
Example #56
def catboost_classification_learner(
        df: pd.DataFrame,
        features: List[str],
        target: str,
        learning_rate: float = 0.1,
        num_estimators: int = 100,
        extra_params: LogType = None,
        prediction_column: str = "prediction",
        weight_column: str = None,
        encode_extra_cols: bool = True) -> LearnerReturnType:
    """
    Fits an CatBoost classifier to the dataset. It first generates a DMatrix
    with the specified features and labels from `df`. Then, it fits a CatBoost
    model to this DMatrix. Return the predict function for the model and the
    predictions for the input dataset.

    Parameters
    ----------

    df : pandas.DataFrame
        A Pandas' DataFrame with features and target columns.
        The model will be trained to predict the target column
        from the features.

    features : list of str
        A list of column names that are used as features for the model. All of
        these names must be present in `df`.

    target : str
        The name of the column in `df` that should be used as target for the model.
        This column should be discrete, since this is a classification model.

    learning_rate : float
        Float in the range (0, 1]
        Step size shrinkage used in updates to prevent overfitting. After each
        boosting step we can directly get the weights of new features, and eta
        shrinks the feature weights to make the boosting process more conservative.
        See the eta hyper-parameter in:
        https://catboost.ai/docs/concepts/python-reference_parameters-list.html

    num_estimators : int
        Int in the range (0, inf)
        Number of boosted trees to fit.
        See the n_estimators hyper-parameter in:
        https://catboost.ai/docs/concepts/python-reference_parameters-list.html

    extra_params : dict, optional
        Dictionary in the format {"hyperparameter_name" : hyperparameter_value}.
        Other parameters for the CatBoost model. See the list in:
        https://catboost.ai/docs/concepts/python-reference_catboostregressor.html
        If not passed, the default will be used.

    prediction_column : str
        The name of the column with the predictions from the model.
        If a multiclass problem, additional prediction_column_i columns will be added for i in range(0,n_classes).

    weight_column : str, optional
        The name of the column with scores to weight the data.

    encode_extra_cols : bool (default: True)
        If True, treats all columns in `df` with names matching the pattern
        `fklearn_feat__col==val` as feature columns.
    """
    from catboost import Pool, CatBoostClassifier
    import catboost

    weights = df[weight_column].values if weight_column else None
    params = extra_params if extra_params else {}
    params = assoc(params, "eta", learning_rate)
    params = params if "objective" in params else assoc(
        params, "objective", 'Logloss')

    features = features if not encode_extra_cols else expand_features_encoded(
        df, features)

    cat_features = params["cat_features"] if "cat_features" in params else None

    dtrain = Pool(df[features].values,
                  df[target].values,
                  weight=weights,
                  feature_names=list(map(str, features)),
                  cat_features=cat_features)

    cat_boost_classifier = CatBoostClassifier(iterations=num_estimators,
                                              **params)
    cbr = cat_boost_classifier.fit(dtrain, verbose=0)

    def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:

        dtest = Pool(new_df[features].values,
                     feature_names=list(map(str, features)),
                     cat_features=cat_features)

        if params["objective"] == "MultiClass":
            pred = cbr.predict_proba(dtest)
            col_dict = {
                prediction_column + "_" + str(key): value
                for (key, value) in enumerate(pred.T)
            }
            col_dict.update({prediction_column: pred.argmax(axis=1)})
        else:
            # binary case: keep only the positive-class probability
            pred = cbr.predict_proba(dtest)[:, 1]
            col_dict = {prediction_column: pred}

        if apply_shap:
            import shap
            explainer = shap.TreeExplainer(cbr)
            shap_values = explainer.shap_values(dtest)
            shap_expected_value = explainer.expected_value

            if params["objective"] == "MultiClass":
                shap_values_multiclass = {
                    f"shap_values_{class_index}": list(value)
                    for (class_index, value) in enumerate(shap_values)
                }
                shap_expected_value_multiclass = {
                    f"shap_expected_value_{class_index}":
                    np.repeat(expected_value, len(class_shap_values))
                    for (class_index, (expected_value, class_shap_values)
                         ) in enumerate(zip(shap_expected_value, shap_values))
                }
                shap_output = merge(shap_values_multiclass,
                                    shap_expected_value_multiclass)

            else:
                shap_values = list(shap_values)
                shap_output = {
                    "shap_values":
                    shap_values,
                    "shap_expected_value":
                    np.repeat(shap_expected_value, len(shap_values))
                }

            col_dict = merge(col_dict, shap_output)

        return new_df.assign(**col_dict)

    p.__doc__ = learner_pred_fn_docstring("catboost_classification_learner",
                                          shap=True)

    log = {
        'catboost_classification_learner': {
            'features': features,
            'target': target,
            'prediction_column': prediction_column,
            'package': "catboost",
            'package_version': catboost.__version__,
            'parameters': assoc(params, "num_estimators", num_estimators),
            'feature_importance': cbr.feature_importances_,
            'training_samples': len(df)
        },
        'object': cbr
    }

    return p, p(df), log
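
# A minimal usage sketch for the learner above; the toy DataFrame and column
# names are hypothetical, for illustration only, and assume the module's own
# imports (pandas as pd, toolz's assoc/merge) are in scope:
data = pd.DataFrame({"x1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
                     "x2": [0.9, 0.1, 0.8, 0.2, 0.7, 0.3],
                     "y": [1, 0, 1, 0, 1, 0]})
predict_fn, scored_df, train_log = catboost_classification_learner(
    data, features=["x1", "x2"], target="y", num_estimators=10,
    encode_extra_cols=False)
print(predict_fn(data)["prediction"])  # positive-class probabilities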
Example #57
File: test.py Project: iamnik13/catboost
def test_invalid_loss_classifier():
    with pytest.raises(CatboostError):
        pool = Pool(TRAIN_FILE, column_description=CD_FILE)
        model = CatBoostClassifier(loss_function="abcdef")
        model.fit(pool)
Example #58
class CatBoostKfold(object):

    def __init__(self, *, input_path_1, input_path_2, output_path):
        self.__input_path_1 = input_path_1
        self.__input_path_2 = input_path_2
        self.__output_path = output_path

        self.__sample_submission = None
        self.__train, self.__test = [None for _ in range(2)]
        self.__train_res, self.__test_res = [None for _ in range(2)]

        self.__train_feature, self.__train_label = [None for _ in range(2)]
        self.__test_feature = None
        self.__categorical_index = None
        self.__encoder = None
        self.__numeric_index = None

        self.__folds = None
        self.__oof_preds = None
        self.__sub_preds = None
        self.__cat = None

    def data_prepare(self):
        self.__sample_submission = pd.read_csv(os.path.join(self.__input_path_1, "sample_submission.csv"))
        self.__train = pd.read_csv(os.path.join(self.__input_path_1, "train_feature_df.csv"))
        self.__test = pd.read_csv(os.path.join(self.__input_path_1, "test_feature_df.csv"))
        self.__train_res = pd.read_csv(os.path.join(self.__input_path_2, "feature_train_res.csv"))
        self.__test_res = pd.read_csv(os.path.join(self.__input_path_2, "feature_test_res.csv"))

        self.__train_label = self.__train["TARGET"]
        self.__train_feature = self.__train.drop(["SK_ID_CURR", "TARGET"], axis=1)
        self.__test_feature = self.__test[self.__train_feature.columns]

        self.__train_res = self.__train_res.drop(["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"], axis=1)
        self.__test_res = self.__test_res.drop(["EXT_SOURCE_1", "EXT_SOURCE_2", "EXT_SOURCE_3"], axis=1)

        self.__train_feature = pd.concat([self.__train_feature, self.__train_res], axis=1)
        self.__test_feature = pd.concat([self.__test_feature, self.__test_res], axis=1)

        self.__categorical_index = np.where(self.__train_feature.dtypes == "object")[0]
        self.__train_feature.iloc[:, self.__categorical_index] = (
            self.__train_feature.iloc[:, self.__categorical_index].fillna("missing")
        )
        self.__test_feature.iloc[:, self.__categorical_index] = (
            self.__test_feature.iloc[:, self.__categorical_index].fillna("missing")
        )

        self.__encoder = TargetEncoder()
        self.__encoder.fit(self.__train_feature.iloc[:, self.__categorical_index], self.__train_label)
        self.__train_feature.iloc[:, self.__categorical_index] = (
            self.__encoder.transform(self.__train_feature.iloc[:, self.__categorical_index])
        )
        self.__test_feature.iloc[:, self.__categorical_index] = (
            self.__encoder.transform(self.__test_feature.iloc[:, self.__categorical_index])
        )

        # There are NaNs in test dataset (feature number 77) but there were no NaNs in learn dataset
        self.__numeric_index = np.where(self.__train_feature.dtypes != "object")[0]
        self.__train_feature.iloc[:, self.__numeric_index] = (
            self.__train_feature.iloc[:, self.__numeric_index].apply(
                lambda x: x.fillna(-999999.0) if x.median() > 0 else x.fillna(999999.0)
            )
        )
        self.__test_feature.iloc[:, self.__numeric_index] = (
            self.__test_feature.iloc[:, self.__numeric_index].apply(
                lambda x: x.fillna(-999999.0) if x.median() > 0 else x.fillna(999999.0)
            )
        )

        # Blending would normally need a shuffle beforehand; it is not strictly
        # required here because the StratifiedKFold below shuffles as well
        self.__train_feature, self.__train_label = shuffle(self.__train_feature, self.__train_label)

    def model_fit(self):
        self.__folds = StratifiedKFold(n_splits=5, shuffle=True)
        self.__oof_preds = np.zeros(shape=self.__train_feature.shape[0])
        self.__sub_preds = np.zeros(shape=self.__test_feature.shape[0])

        for n_fold, (trn_idx, val_idx) in enumerate(self.__folds.split(self.__train_feature, self.__train_label)):
            trn_x, trn_y = self.__train_feature.iloc[trn_idx], self.__train_label.iloc[trn_idx]
            val_x, val_y = self.__train_feature.iloc[val_idx], self.__train_label.iloc[val_idx]

            self.__cat = CatBoostClassifier(
                iterations=6000,
                od_wait=200,
                od_type="Iter",
                eval_metric="AUC"
            )
            self.__cat.fit(
                trn_x,
                trn_y,
                eval_set=[(val_x, val_y)],
                use_best_model=True
            )
            pred_val = self.__cat.predict_proba(val_x)[:, 1]
            pred_test = self.__cat.predict_proba(self.__test_feature)[:, 1]
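            # keep out-of-fold predictions for CV scoring; test predictions
            # are averaged over the n_splits folds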

            self.__oof_preds[val_idx] = pred_val
            self.__sub_preds += pred_test / self.__folds.n_splits
            print("Fold %2d AUC : %.6f" % (n_fold + 1, roc_auc_score(val_y, self.__oof_preds[val_idx])))
        print("Full AUC score %.6f" % roc_auc_score(self.__train_label, self.__oof_preds))

    def model_predict(self):
        self.__sample_submission["TARGET"] = self.__sub_preds
        self.__sample_submission.to_csv(os.path.join(self.__output_path, "sample_submission.csv"), index=False)
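
# A minimal driver sketch for the class above; the paths are hypothetical:
#
#     cbk = CatBoostKfold(input_path_1="data/raw", input_path_2="data/res",
#                         output_path="data/out")
#     cbk.data_prepare()
#     cbk.model_fit()
#     cbk.model_predict()
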
X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                  y_train,
                                                  test_size=0.125,
                                                  random_state=1)

# {'depth': 7, 'iterations': 200, 'learning_rate': 0.1, 'scale_pos_weight': 4}

model_cat = CatBoostClassifier(iterations=200,
                               depth=7,
                               learning_rate=0.1,
                               scale_pos_weight=4)
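
# scale_pos_weight=4 up-weights the positive class to counter class imbalance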

model_cat.fit(X_train, y_train)

# Make predictions on a held-out test set and report AUC:
# y_pred = model_cat.predict_proba(X_test)
# print(f" - roc_auc_score: {roc_auc_score(y_test, y_pred[:, 1]): .5f}")

    def train(self, feature_names):
        """
        Input:
            feature_names: dictionary of features' names
        Output:
            validity: DataFrame(["MachineIdentifier", "HasDetections", "Predict"])
        """
        # Initialize parameters
        validity = None
        model_path = Path(__file__).absolute().parents[2] / "data" / "model" / str(get_version())
        Path.mkdir(model_path, exist_ok=True, parents=True)
        feature_importance = pd.DataFrame()
        START_FOLD = 0
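        # Resume an interrupted run: skip folds whose model files already exist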
        if get_back_training():
            START_FOLD = len(list(model_path.glob('**/*.model')))
        END_FOLD = 5
        if train_one_round():
            START_FOLD = 0
            END_FOLD = 1
        if START_FOLD == END_FOLD:
            return None

        # Process for each fold
        for fold in range(START_FOLD, END_FOLD):
            log_path = Path(__file__).absolute().parents[2] / "log" / "train" / str(get_version()) / str("fold{}".format(fold))
            Path.mkdir(log_path, exist_ok=True, parents=True)

            # Measure start time of the classification of this fold
            start = time.time()
            getLogger(get_version()).info("\t >> {} folds start".format(fold))
            send_message("\t :cat: {} folds start".format(fold))

            # Generate dataset
            getLogger(get_version()).info("\t \t Generating datasets...")
            send_message("\t \t Generating datasets...")
            valid = "valid{}".format(str(fold))
            trn_x = super().get_feature_df(feature_names, valid, "train")
            val_x = super().get_feature_df(feature_names, valid, "validate")
            trn_x.set_index("MachineIdentifier", inplace=True)
            val_x.set_index("MachineIdentifier", inplace=True)
            trn_y = trn_x["HasDetections"].astype(np.int8)
            val_y = val_x["HasDetections"].astype(np.int8)
            getLogger(get_version()).info("\t \t Datasets were generated.")
            send_message("\t \t Datasets were generated.")

            # Initialize variables for scoring
            if validity is None:
                validity = pd.DataFrame()
                validity["HasDetections"] = pd.concat([trn_y, val_y])
                validity["Predict"] = 0

            # Delete needless features
            del trn_x["HasDetections"], val_x["HasDetections"]

            # Classify
            clf = CatBoostClassifier(iterations=self.params["iterations"],
                                     verbose=self.params["verbose"],
                                     early_stopping_rounds=self.params["early_stopping_rounds"],
                                     random_seed=self.params["random_seed"],
                                     max_depth=self.params["max_depth"],
                                     loss_function=self.params["loss_function"],
                                     custom_metric=self.params["custom_metric"],
                                     eval_metric=self.params["eval_metric"],
                                     rsm=self.params["rsm"],
                                     train_dir=str(log_path))
            clf.fit(trn_x.values, trn_y.values,
                    eval_set=(val_x.values, val_y.values))

            for train_or_valid, metrics in clf.best_score_.items():
                for metric, score in metrics.items():
                    getLogger(get_version()).info("\t\t >> Best {} {}: {}".format(train_or_valid, metric, score))
                    send_message("\t\t :star-struck: Best {} {}: {}".format(train_or_valid, metric, score))
            validity.loc[validity.index.isin(val_x.index), "Predict"] = clf.predict_proba(val_x.values)[:, 1]

            # Calculate feature importance per fold
            if fold == 0:
                feature_importance["feature"] = trn_x.columns
            feature_importance["fold{}".format(fold)] = clf.get_feature_importance()

            # Measure finish time of the classification of this fold
            elapsed_time = int(time.time() - start)
            minutes, sec = divmod(elapsed_time, 60)
            hour, minutes = divmod(minutes, 60)
            getLogger(get_version()).info(
                "\t >> {} folds finish: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}"
                .format(fold, hour, minutes, sec))
            send_message("\t :cat: {} folds finish: [elapsed_time] >> {:0>2}:{:0>2}:{:0>2}".format(fold, hour, minutes, sec))

            # Post-process this fold
            clf.save_model(str(model_path / "valid{}.model".format(fold)))

        # Output CV score
        validity = output_cv(validity, ":cat:")

        # Save importance
        directory_path = Path(__file__).absolute().parents[2] / "importance"
        save_feature_importance(feature_importance, directory_path)

        # Post-process the training
        del feature_importance
        gc.collect()

        return validity