Example #1
0
def test_wrong_feature_count():
    """Predicting with fewer columns than the model was fit on must raise."""
    with pytest.raises(CatboostError):
        features = np.random.rand(100, 10)
        target = np.random.randint(2, size=100)
        clf = CatBoostClassifier()
        clf.fit(features, target)
        clf.predict(features[:, :-1])
Example #2
0
def test_raw_predict_equals_to_model_predict():
    """Raw-formula predictions on the eval set must match the stored test eval."""
    learn = Pool(TRAIN_FILE, column_description=CD_FILE)
    holdout = Pool(TEST_FILE, column_description=CD_FILE)
    clf = CatBoostClassifier(iterations=10, random_seed=0)
    clf.fit(learn, eval_set=holdout)
    raw = clf.predict(holdout, prediction_type='RawFormulaVal')
    assert all(clf.get_test_eval() == raw)
Example #3
0
def test_full_history():
    """Training with approx_on_full_history must match the canonical model."""
    learn = Pool(TRAIN_FILE, column_description=CD_FILE)
    evaluation = Pool(TEST_FILE, column_description=CD_FILE)
    clf = CatBoostClassifier(od_type='Iter', od_wait=20, random_seed=42,
                             approx_on_full_history=True)
    clf.fit(learn, eval_set=evaluation)
    clf.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Example #4
0
def test_pool_after_fit():
    """Fitting must not mutate the features of the pool it was trained on."""
    reference = Pool(TRAIN_FILE, column_description=CD_FILE)
    trained = Pool(TRAIN_FILE, column_description=CD_FILE)
    assert _check_data(reference.get_features(), trained.get_features())
    clf = CatBoostClassifier(iterations=5, random_seed=0)
    clf.fit(trained)
    assert _check_data(reference.get_features(), trained.get_features())
def train_preprocessor(path='.', train='train.csv'):
    """Fit the trash-comment classifier plus its vectorizer and persist both."""
    print('start train trash preprocessor...')
    frame = pd.read_csv(os.path.join(path, train))

    # The last 100 rows are held out; the first 50 of them form the eval set.
    fit_rows = frame[:-100]
    val_rows = frame[-100:-50]

    vectorizer = CountVectorizer()
    fit_counts = vectorizer.fit_transform(fit_rows.text)
    val_counts = vectorizer.transform(val_rows.text)

    clf = CatBoostClassifier(
        iterations=250,
        train_dir=path,
        logging_level='Silent',
        allow_writing_files=False,
    )

    clf.fit(
        X=fit_counts.toarray(),
        y=fit_rows.status,
        eval_set=(val_counts.toarray(), val_rows.status),
        use_best_model=True,
    )

    clf.save_model(os.path.join(path, 'trash_model'))
    joblib.dump(vectorizer, os.path.join(path, 'trash_vectorizer'))
    print('end train sentiment preprocessor...')
def cleaning_comments(raw_comments, path='.') -> str:
    """Split raw comments into clean/trash CSVs using the saved trash model.

    Rows whose predicted probability of class 0 is below 0.6 are written to
    bad_comments.csv (status 1); the rest go to cleaned_comments.csv
    (status 0).  The input file is deleted afterwards.

    Args:
        raw_comments: path of the CSV with 'likes' and 'text' columns.
        path: directory holding the model/vectorizer and output files.

    Returns:
        The path of the cleaned-comments CSV.
    """
    print('start cleaning of comments...')

    raw = pd.read_csv(raw_comments)
    cleaned_comments = os.path.join(path, 'cleaned_comments.csv')
    bad_comments = os.path.join(path, 'bad_comments.csv')
    model = CatBoostClassifier().load_model(os.path.join(path, 'trash_model'))
    vectorizer = joblib.load(os.path.join(path, 'trash_vectorizer'))

    hyp = model.predict_proba(vectorizer.transform(raw.text).toarray())
    with open(cleaned_comments, 'w') as cleaned, open(bad_comments, 'w') as bad:
        # Collect rows in lists and join once at the end: repeated `+=` on a
        # str is quadratic in the number of comments.
        bad_rows = ['likes,status,text\n']
        cleaned_rows = ['likes,status,text\n']
        for i in range(len(hyp)):
            # NOTE(review): text containing '"' or newlines breaks this
            # hand-rolled CSV quoting; the csv module would be safer.
            if hyp[i][0] < 0.6:
                bad_rows.append(str(raw.likes[i]) + ',1,"' + raw.text[i] + '"\n')
            else:
                cleaned_rows.append(str(raw.likes[i]) + ',0,"' + raw.text[i] + '"\n')
        cleaned.write(''.join(cleaned_rows))
        bad.write(''.join(bad_rows))

    os.remove(raw_comments)

    print('end cleaning of comments...')
    return cleaned_comments
Example #7
0
def test_predict_class():
    """Class predictions on the test pool are saved and canonized."""
    learn = Pool(TRAIN_FILE, column_description=CD_FILE)
    holdout = Pool(TEST_FILE, column_description=CD_FILE)
    clf = CatBoostClassifier(iterations=2, random_seed=0)
    clf.fit(learn)
    labels = clf.predict(holdout, prediction_type="Class")
    np.save(PREDS_PATH, np.array(labels))
    return local_canonical_file(PREDS_PATH)
Example #8
0
def test_no_cat_in_predict():
    """Predictions must agree whether mapped features come as ndarray or Pool."""
    learn = Pool(TRAIN_FILE, column_description=CD_FILE)
    holdout = Pool(TEST_FILE, column_description=CD_FILE)
    clf = CatBoostClassifier(iterations=2, random_seed=0)
    clf.fit(learn)
    cat_idx = learn.get_cat_feature_indices()
    from_array = clf.predict(map_cat_features(holdout.get_features(), cat_idx))
    from_pool = clf.predict(
        Pool(map_cat_features(holdout.get_features(), cat_idx), cat_features=cat_idx))
    assert _check_data(from_array, from_pool)
Example #9
0
def test_zero_baseline():
    """An all-zero baseline must still reproduce the canonical model."""
    data = Pool(TRAIN_FILE, column_description=CD_FILE)
    data.set_baseline(np.zeros(data.num_row()))
    clf = CatBoostClassifier(iterations=2, random_seed=0)
    clf.fit(data)
    clf.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Example #10
0
def test_non_ones_weight():
    """Per-document weights 1..N must reproduce the canonical model."""
    data = Pool(TRAIN_FILE, column_description=CD_FILE)
    data.set_weight(np.arange(1, data.num_row() + 1))
    clf = CatBoostClassifier(iterations=2, random_seed=0)
    clf.fit(data)
    clf.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Example #11
0
def test_ntree_limit():
    """predict_proba must honour ntree_end by using only the first trees."""
    learn = Pool(TRAIN_FILE, column_description=CD_FILE)
    holdout = Pool(TEST_FILE, column_description=CD_FILE)
    clf = CatBoostClassifier(iterations=100, random_seed=0)
    clf.fit(learn)
    probs = clf.predict_proba(holdout, ntree_end=10)
    np.save(PREDS_PATH, np.array(probs))
    return local_canonical_file(PREDS_PATH)
Example #12
0
def test_staged_predict():
    """Staged predictions across all iterations are saved and canonized."""
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=10, random_seed=0)
    model.fit(train_pool)
    # staged_predict yields one prediction array per iteration; materialize
    # the generator directly instead of a manual append loop.
    preds = list(model.staged_predict(test_pool))
    np.save(PREDS_PATH, np.array(preds))
    return local_canonical_file(PREDS_PATH)
    def create_model(self, kfold_X_train, y_train, kfold_X_valid, y_test, test):
        """Fit a one-vs-all CatBoost model; return validation/test probabilities."""
        clf = CatBoostClassifier(loss_function='MultiClassOneVsAll',
                                 learning_rate=0.07940735491731761, depth=8)
        clf.fit(kfold_X_train, y_train)

        # Probabilities for the validation fold and for the held-out test set.
        valid_proba = clf.predict_proba(kfold_X_valid)
        test_proba = clf.predict_proba(test)

        return valid_proba, test_proba, clf
Example #14
0
def test_ignored_features():
    """Ignoring features must change predictions vs an otherwise-equal model."""
    learn = Pool(TRAIN_FILE, column_description=CD_FILE)
    holdout = Pool(TEST_FILE, column_description=CD_FILE)
    restricted = CatBoostClassifier(iterations=5, random_seed=0, ignored_features=[1, 2, 3])
    full = CatBoostClassifier(iterations=5, random_seed=0)
    restricted.fit(learn)
    full.fit(learn)
    restricted_preds = restricted.predict(holdout)
    full_preds = full.predict(holdout)
    assert not _check_data(restricted_preds, full_preds)
    restricted.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Example #15
0
def train_catboost_model(df, target, cat_features, params, verbose=True):
    """Fit a CatBoostClassifier on a pandas DataFrame.

    Args:
        df: DataFrame of features (anything else raises Exception).
        target: training labels.
        cat_features: iterable of column names to treat as categorical.
        params: keyword arguments forwarded to CatBoostClassifier.
        verbose: forwarded to fit().

    Returns:
        The fitted CatBoostClassifier.

    Raises:
        Exception: if df is not a DataFrame.
    """
    if not isinstance(df, DataFrame):
        raise Exception('DataFrame object expected, but got ' + repr(df))

    # Python 3 print function — the original used Python 2 print statements,
    # which are a syntax error on Python 3.
    print('features:', df.columns.tolist())

    cat_features_index = [df.columns.get_loc(feature) for feature in cat_features]
    print('cat features:', cat_features_index)
    model = CatBoostClassifier(**params)
    model.fit(df, target, cat_features=cat_features_index, verbose=verbose)
    return model
Example #16
0
def model_1(X, y, test):
    """CatBoost model that handles categorical variables natively.

    Args:
        X: training feature DataFrame.
        y: training labels.
        test: feature rows to score.

    Returns:
        Class-probability predictions for *test*.
    """
    # Builtin float replaces np.float, which was removed in NumPy 1.24;
    # the dtype comparison is unchanged.
    categorical_features_indices = np.where(X.dtypes != float)[0]
    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y, train_size=0.7, random_state=1234)
    # Bug fix: the original passed 'learning=0.01', which is not a valid
    # CatBoostClassifier parameter — the intended name is learning_rate.
    cboost = CatBoostClassifier(iterations=500, learning_rate=0.01, depth=6,
                                loss_function='MultiClass', eval_metric='Accuracy')
    cboost.fit(X_train, y_train, cat_features=categorical_features_indices,
               eval_set=(X_validation, y_validation), plot=True)
    # Class-wise prediction probabilities from the fitted model.
    pred_prob = cboost.predict_proba(test)
    return pred_prob
Example #17
0
def test_fit_data():
    """fit() with explicit data/label/weights/baseline matches the canonical model."""
    pool = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)
    eval_pool = Pool(CLOUDNESS_TEST_FILE, column_description=CLOUDNESS_CD_FILE)
    base_model = CatBoostClassifier(iterations=2, random_seed=0, loss_function="MultiClass")
    base_model.fit(pool)
    # Raw-score predictions of the warm-up model become baselines.
    baseline = np.array(base_model.predict(pool, prediction_type='RawFormulaVal'))
    eval_baseline = np.array(base_model.predict(eval_pool, prediction_type='RawFormulaVal'))
    eval_pool.set_baseline(eval_baseline)
    model = CatBoostClassifier(iterations=2, random_seed=0, loss_function="MultiClass")
    data = map_cat_features(pool.get_features(), pool.get_cat_feature_indices())
    weights = np.arange(1, pool.num_row() + 1)
    model.fit(data, pool.get_label(), pool.get_cat_feature_indices(),
              sample_weight=weights, baseline=baseline,
              use_best_model=True, eval_set=eval_pool)
    model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Example #18
0
def test_multiclass():
    """A MultiClass model must survive a save/load round trip."""
    data = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)
    trained = CatBoostClassifier(iterations=2, random_seed=0,
                                 loss_function='MultiClass', thread_count=8)
    trained.fit(data)
    trained.save_model(OUTPUT_MODEL_PATH)
    restored = CatBoostClassifier()
    restored.load_model(OUTPUT_MODEL_PATH)
    np.save(PREDS_PATH, np.array(restored.predict_proba(data)))
    return local_canonical_file(PREDS_PATH)
class BesCatBoost:
    """CatBoost wrapper that tunes the iteration count via CV before fitting.

    Typical parameter dictionary::

        catboost_params = {
            'iterations': 500,
            'depth': 3,
            'learning_rate': 0.1,
            'eval_metric': 'AUC',
            'random_seed': 42,
            'logging_level': 'Verbose',
            'l2_leaf_reg': 15.0,
            'bagging_temperature': 0.75,
            'allow_writing_files': False,
            'metric_period': 50
        }
    """

    def __init__(self, params, metric='AUC', maximize=True, verbose=True, model=None):
        # Stored as-is; note fit() overwrites params['iterations'].
        self.params = params
        self.metric = metric
        self.maximize = maximize
        self.verbose = verbose
        self.model = model

    def fit(self, X_train, y_train):
        """Cross-validate to pick the round count, then train the final model."""
        cv_result = cv(Pool(X_train, y_train), self.params)

        # Best CV round padded by 50%: the final fit sees all the data.
        best_rounds = int(cv_result['test-{}-mean'.format(self.metric)].idxmax() * 1.5) + 1
        print('Best Iteration: {}'.format(best_rounds))

        self.params['iterations'] = best_rounds
        self.model = CatBoostClassifier(**self.params)
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        """Return the positive-class probability for each row of X_test."""
        return self.model.predict_proba(X_test)[:, -1]

    def feature_importance(self):
        pass

    @staticmethod
    def find_best_params(kag):
        pass
Example #20
0
def test_clone():
    """get_params must be sufficient to reconstruct an equivalent estimator."""
    estimator = CatBoostClassifier(
        custom_metric="Accuracy",
        loss_function="MultiClass",
        iterations=400)

    # sklearn.base.clone relies on get_params to duplicate an estimator.
    params = estimator.get_params()
    new_params = CatBoostClassifier(**params).get_params()

    for name in params:
        assert name in new_params
        assert new_params[name] is params[name]
    def fit(self, X_train, y_train):
        """Pick the iteration count by cross-validation, then train the model."""
        cv_scores = cv(Pool(X_train, y_train), self.params)

        # Best CV round inflated by 50% because the final fit uses all data.
        best_rounds = int(cv_scores['test-{}-mean'.format(self.metric)].idxmax() * 1.5) + 1
        print('Best Iteration: {}'.format(best_rounds))

        self.params['iterations'] = best_rounds
        self.model = CatBoostClassifier(**self.params)
        self.model.fit(X_train, y_train)
Example #22
0
def test_custom_objective():
    """A hand-written Logloss objective must reproduce built-in Logloss."""
    class LoglossObjective(object):
        def calc_ders_range(self, approxes, targets, weights):
            """Return (first, second) derivative pairs for each document."""
            assert len(approxes) == len(targets)
            if weights is not None:
                assert len(weights) == len(approxes)

            # Python 3 fix: xrange no longer exists; a comprehension also
            # replaces the manual append loop.
            exponents = [math.exp(a) for a in approxes]

            result = []
            for index in range(len(targets)):
                p = exponents[index] / (1 + exponents[index])
                der1 = (1 - p) if targets[index] > 0.0 else -p
                der2 = -p * (1 - p)

                if weights is not None:
                    der1 *= weights[index]
                    der2 *= weights[index]

                result.append((der1, der2))

            return result

    train_pool = Pool(data=TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(data=TEST_FILE, column_description=CD_FILE)

    model = CatBoostClassifier(iterations=5, random_seed=0, use_best_model=True,
                               loss_function=LoglossObjective(), eval_metric="Logloss",
                               # Leaf estimation method and gradient iteration are set to match
                               # defaults for Logloss.
                               leaf_estimation_method="Newton", leaf_estimation_iterations=10)
    model.fit(train_pool, eval_set=test_pool)
    pred1 = model.predict(test_pool, prediction_type='RawFormulaVal')

    model2 = CatBoostClassifier(iterations=5, random_seed=0, use_best_model=True, loss_function="Logloss")
    model2.fit(train_pool, eval_set=test_pool)
    pred2 = model2.predict(test_pool, prediction_type='RawFormulaVal')

    for p1, p2 in zip(pred1, pred2):
        assert abs(p1 - p2) < EPS
Example #23
0
def test_custom_eval():
    """A hand-written Logloss eval metric must reproduce built-in Logloss."""
    class LoglossMetric(object):
        def get_final_error(self, error, weight):
            # Small epsilon guards against division by a zero total weight.
            return error / (weight + 1e-38)

        def is_max_optimal(self):
            return True

        def evaluate(self, approxes, target, weight):
            """Return (error_sum, weight_sum) over a single approx dimension."""
            assert len(approxes) == 1
            assert len(target) == len(approxes[0])

            approx = approxes[0]

            error_sum = 0.0
            weight_sum = 0.0

            # Python 3 fix: xrange was removed; range behaves the same here.
            for i in range(len(approx)):
                w = 1.0 if weight is None else weight[i]
                weight_sum += w
                error_sum += w * (target[i] * approx[i] - math.log(1 + math.exp(approx[i])))

            return error_sum, weight_sum

    train_pool = Pool(data=TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(data=TEST_FILE, column_description=CD_FILE)

    model = CatBoostClassifier(iterations=5, random_seed=0, use_best_model=True, eval_metric=LoglossMetric())
    model.fit(train_pool, eval_set=test_pool)
    pred1 = model.predict(test_pool)

    model2 = CatBoostClassifier(iterations=5, random_seed=0, use_best_model=True, eval_metric="Logloss")
    model2.fit(train_pool, eval_set=test_pool)
    pred2 = model2.predict(test_pool)

    for p1, p2 in zip(pred1, pred2):
        assert abs(p1 - p2) < EPS
Example #24
0
def test_one_doc_feature_importance():
    """Per-document ('Doc') feature importance is saved and canonized."""
    data = Pool(TRAIN_FILE, column_description=CD_FILE)
    clf = CatBoostClassifier(iterations=5, random_seed=0)
    clf.fit(data)
    doc = np.ones(data.num_col(), dtype=int)
    fimp = clf.get_feature_importance(doc, 0, cat_features=data.get_cat_feature_indices(),
                                      fstr_type='Doc')
    np.save(FIMP_PATH, np.array(fimp))
    return local_canonical_file(FIMP_PATH)
Example #25
0
def test_classification_ctr():
    """Explicit CTR types must reproduce the canonical model."""
    data = Pool(TRAIN_FILE, column_description=CD_FILE)
    clf = CatBoostClassifier(iterations=5, random_seed=0,
                             ctr_description=['Borders', 'Counter'])
    clf.fit(data)
    clf.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Example #26
0
def test_priors():
    """Explicit priors with has_time must reproduce the canonical model."""
    data = Pool(TRAIN_FILE, column_description=CD_FILE)
    clf = CatBoostClassifier(iterations=5, random_seed=0, has_time=True,
                             priors=[0, 0.6, 1, 5])
    clf.fit(data)
    clf.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Example #27
0
from catboost import CatBoostClassifier

# 5-fold stratified CV over (X_train, y_train); every fold's model also
# scores the external X_valid and test_df frames, accumulated below.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=22)
predictions = np.zeros((len(X_valid), 2))
oof_preds = np.zeros((len(test_df), 2))
feature_importance_df = pd.DataFrame()
final_preds = []
# random_state = [77,89,22,1007,1997,1890,2000,2020,8989,786,787,1999992,2021,7654]
for fold_, (trn_idx,
            val_idx) in enumerate(folds.split(X_train.values, y_train.values)):
    print("Fold {}".format(fold_))
    X_trn, y_trn = X_train.iloc[trn_idx], y_train.iloc[trn_idx]
    X_val, y_val = X_train.iloc[val_idx], y_train.iloc[val_idx]
    clf = CatBoostClassifier(iterations=10000,
                             depth=3,
                             learning_rate=0.2,
                             eval_metric="Logloss")
    clf.fit(X_trn, y_trn, eval_set=[(X_val, y_val)], cat_features=cat_cols)
    # Per-fold validation log-loss, averaged in the final print below.
    final_preds.append(log_loss(y_pred=clf.predict_proba(X_val), y_true=y_val))
    predictions += clf.predict_proba(X_valid)
    oof_preds += clf.predict_proba(test_df)
    # NOTE(review): 'counter' is not initialised anywhere in this snippet —
    # presumably set to 0 before the loop; verify against the full script.
    counter = counter + 1

# Average the accumulated test probabilities over the folds that ran.
oof_preds = oof_preds / counter

# Hard 0/1 labels at a 0.5 threshold on the positive-class probability.
sample['risk_flag'] = oof_preds[:, 1]
sample['risk_flag'] = sample['risk_flag'].apply(lambda x: 0 if x < 0.5 else 1)

sample.to_csv('cat_sfk_loedata.csv', index=False)

print(sum(final_preds) / 5)
Example #28
0
def test_fit_no_label():
    """fit() without labels must raise CatboostError."""
    with pytest.raises(CatboostError):
        data = Pool(TRAIN_FILE, column_description=CD_FILE)
        clf = CatBoostClassifier()
        clf.fit(data.get_features())
Example #29
0
# Per-fold training: on the catboost branch, build Pools with declared
# categorical features and fit on GPU with early stopping.
for fold_n, (train_idx, valid_idx) in enumerate(folds.split(X, y)):
    X_train = X.iloc[train_idx]
    y_train = y.iloc[train_idx]
    X_valid = X.iloc[valid_idx]
    y_valid = y.iloc[valid_idx]

    if MODEL_TYPE == "catboost":
        train_dataset = Pool(data=X_train, label=y_train, cat_features=CAT_FEATURES)
        valid_dataset = Pool(data=X_valid, label=y_valid, cat_features=CAT_FEATURES)
        test_dataset = Pool(data=X_test, cat_features=CAT_FEATURES)

        model = CatBoostClassifier(
                iterations=N_ESTIMATORS,
                learning_rate=LEARNING_RATE,
                depth=DEPTH,
                eval_metric=EVAL_METRIC,
                verbose=VERBOSE,
                random_state=RANDOM_STATE,
                thread_count=N_THREADS,
                task_type="GPU")

        model.fit(
                train_dataset,
                eval_set=valid_dataset,
                early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            )
        y_pred_valid = model.predict(valid_dataset)
        y_pred = model.predict(test_dataset)

        # Per-fold importance frame; only the feature names are filled here.
        fold_importance = pd.DataFrame()
        fold_importance["feature"] = model.feature_names_
    # print(classification_report(pred_lgbm, test_y.values))
    # print(accuracy_score(pred_lgbm, test_y.values))
    #
    # model = XGBClassifier(objective='binary:logistic', subsample=0.5, max_depth=7, gamma=0.2, colsample_bytree=0.5,
    #                       min_child_weight=5, n_estimators=400)
    #
    # model.fit(train_X, train_y)
    # pred_proba_xgb = model.predict_proba(test_X)
    # pred_proba_xgb_train = model.predict_proba(train_X)
    # pred_xgb = model.predict(test_X)
    #
    #
    # print(classification_report(pred_xgb, test_y.values))
    # print(accuracy_score(pred_xgb, test_y.values))

    # Default-parameter CatBoost, evaluated the same way as the commented-out
    # LGBM/XGB baselines above.  NOTE(review): fragment starts mid-scope —
    # train_X/train_y/test_X/test_y come from the enclosing (unseen) code.
    model = CatBoostClassifier()

    model.fit(train_X, train_y)
    pred_proba_cat = model.predict_proba(test_X)
    pred_proba_cat_train = model.predict_proba(train_X)
    pred_cat = model.predict(test_X)

    print(classification_report(pred_cat, test_y.values))
    print(accuracy_score(pred_cat, test_y.values))
    # stacked_train = np.swapaxes(np.vstack((pred_proba_lgbm_train[:, 1], pred_proba_xgb_train[:, 1], pred_proba_cat_train[:, 1],  np.swapaxes(train_X, 0, 1))), 0, 1)
    #
    # stacked_test = np.swapaxes(np.vstack((pred_proba_lgbm[:, 1], pred_proba_xgb[:, 1], pred_proba_cat[:, 1] , np.swapaxes(test_X, 0, 1))), 0, 1)
    #
    # # Stacked model
    # model = CatBoostClassifier()
    #
Example #31
0
def test_fit_data():
    """Explicit-array fit with weights/baseline must match the canonical model."""
    learn = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)
    holdout = Pool(CLOUDNESS_TEST_FILE, column_description=CLOUDNESS_CD_FILE)
    warm = CatBoostClassifier(iterations=2, random_seed=0, loss_function="MultiClass")
    warm.fit(learn)
    learn_baseline = np.array(warm.predict(learn, prediction_type='RawFormulaVal'))
    holdout_baseline = np.array(warm.predict(holdout, prediction_type='RawFormulaVal'))
    holdout.set_baseline(holdout_baseline)
    clf = CatBoostClassifier(iterations=2, random_seed=0, loss_function="MultiClass")
    features = map_cat_features(learn.get_features(), learn.get_cat_feature_indices())
    clf.fit(features, learn.get_label(), learn.get_cat_feature_indices(),
            sample_weight=np.arange(1, learn.num_row() + 1),
            baseline=learn_baseline, use_best_model=True, eval_set=holdout)
    clf.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Example #32
0
def test_interaction_feature_importance():
    """Interaction feature importances are saved and canonized."""
    data = Pool(TRAIN_FILE, column_description=CD_FILE)
    clf = CatBoostClassifier(iterations=5, random_seed=0)
    clf.fit(data)
    np.save(FIMP_PATH, np.array(clf.get_feature_importance(data, fstr_type='Interaction')))
    return local_canonical_file(FIMP_PATH)
Example #33
0
def test_one_doc_feature_importance():
    """'Doc' fstr for one synthetic all-ones document is saved and canonized."""
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=5, random_seed=0)
    model.fit(pool)
    document = np.ones(pool.num_col(), dtype=int)
    importances = model.get_feature_importance(
        document, 0, cat_features=pool.get_cat_feature_indices(), fstr_type='Doc')
    np.save(FIMP_PATH, np.array(importances))
    return local_canonical_file(FIMP_PATH)
Example #34
0
def test_classification_ctr():
    """Borders+Counter CTRs must reproduce the canonical classification model."""
    train = Pool(TRAIN_FILE, column_description=CD_FILE)
    estimator = CatBoostClassifier(iterations=5, random_seed=0,
                                   ctr_description=['Borders', 'Counter'])
    estimator.fit(train)
    estimator.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Example #35
0
def test_class_weights():
    """Per-class weights must reproduce the canonical model."""
    data = Pool(TRAIN_FILE, column_description=CD_FILE)
    clf = CatBoostClassifier(iterations=5, random_seed=0, class_weights=[1, 2])
    clf.fit(data)
    clf.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Example #36
0
def test_ignored_features():
    """A model ignoring features 1-3 must predict differently from a full model."""
    learn = Pool(TRAIN_FILE, column_description=CD_FILE)
    evaluation = Pool(TEST_FILE, column_description=CD_FILE)
    partial = CatBoostClassifier(iterations=5, random_seed=0, ignored_features=[1, 2, 3])
    complete = CatBoostClassifier(iterations=5, random_seed=0)
    partial.fit(learn)
    complete.fit(learn)
    partial_preds = partial.predict(evaluation)
    complete_preds = complete.predict(evaluation)
    assert not _check_data(partial_preds, complete_preds)
    partial.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Example #37
0
def test_feature_importance_off():
    """Accessing feature_importances_ must fail when its calculation is off."""
    with pytest.raises(CatboostError):
        data = Pool(TRAIN_FILE, column_description=CD_FILE)
        clf = CatBoostClassifier(iterations=5, random_seed=0,
                                 calc_feature_importance=False)
        clf.fit(data)
        clf.feature_importances_
Example #38
0
from transformers import T5Tokenizer


@contextmanager
def suppress_stdout_stderr():
    """Silence the wrapped block by sending stdout and stderr to os.devnull."""
    with open(devnull, 'w') as sink, redirect_stderr(sink) as err, redirect_stdout(sink) as out:
        yield (err, out)


# Module-level singletons shared by the functions below.
dmp = diff_match_patch()
nlp = spacy.load("en")
annotator = errant.load('en')

# Error-type classifier trained offline and loaded from disk.
classifier = CatBoostClassifier()
classifier.load_model("./models/err_type_classifier.cbm")

# Tokenizer and sentence-embedding model for the downstream pipeline.
tokenizer = T5Tokenizer.from_pretrained('t5-base')
emb_model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')


def load_predict_fn(model_path):
    sess = tf.compat.v1.Session()
    graph = tf.compat.v1.get_default_graph()
    dev_name = '/gpu:0'
    if os.environ.get("MODEL_PLACE") == "cpu":
        dev_name = '/cpu:0'
    with tf.device(dev_name):
        tf.compat.v1.reset_default_graph()
        sess = tf.compat.v1.Session()
from functions import clean_data_train_test_split
warnings.filterwarnings('ignore')


# Plot label/title font settings used by the figures below.
font_label={'size': 15}
font_title={'weight': 'bold', 'size': 15}

# Pre-cleaned train/validation/test split from the project helper.
x_train_train, y_train_train, x_train_val, y_train_val, x_test, y_test = clean_data_train_test_split()

# Candidate decision thresholds for the precision/profit curves below.
thresholds = np.linspace(0,1,101)

class_weight = [3, .2]

# NOTE(review): np.float was removed in NumPy 1.24 — this line needs the
# builtin float on recent NumPy versions.
categorical_features_indices = np.where(x_train_train.dtypes != np.float)[0]

modcb=CatBoostClassifier(depth=8, iterations=200, learning_rate=0.05, l2_leaf_reg=30, class_weights=class_weight,
                         use_best_model=True, one_hot_max_size=100, rsm=.5)

modcb.fit(x_train_train, y_train_train,cat_features=categorical_features_indices,eval_set=(x_train_val, y_train_val),plot=True)

# Positive-class probabilities on the held-out test set.
y_test_proba = modcb.predict_proba(x_test)[:,1]
# NOTE(review): 'shresholds' is presumably a typo for 'thresholds'; it is unused.
FPR, TPR, shresholds = roc_curve(y_test, y_test_proba)

func.plotroc(FPR, TPR)

precisions, aarates = func.get_prec_aa_prof(thresholds, y_test, y_test_proba)
func.plot_prec_aa(precisions, aarates)


# Profit curve on a 10k-row sample with a [[6,-150],[0,0]] cost-benefit matrix.
profs, prof_thresh = func.profit_curve(np.array([[6,-150],[0,0]]), y_test_proba[:10000], np.array(y_test)[:10000])

func.plot_profit_curve(prof_thresh, profs)
Example #40
0
def test_multiclass():
    """Multiclass probabilities must survive a save/load round trip."""
    pool = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)
    fitted = CatBoostClassifier(iterations=2, random_seed=0,
                                loss_function='MultiClass', thread_count=8)
    fitted.fit(pool)
    fitted.save_model(OUTPUT_MODEL_PATH)
    reloaded = CatBoostClassifier()
    reloaded.load_model(OUTPUT_MODEL_PATH)
    probabilities = reloaded.predict_proba(pool)
    np.save(PREDS_PATH, np.array(probabilities))
    return local_canonical_file(PREDS_PATH)
Example #41
0




# NOTE(review): this block drives four per-category models via globals()
# name injection; dicts keyed by i would be the cleaner structure, but a
# rewrite would change the module-level names other code may rely on.
for i in range(4):
    globals()['X_tr_cat%s' % i],globals()['X_tst_cat%s' % i],globals()['y_tr_cat%s' % i],globals()['y_tst_cat%s' % i] = train_test_split(globals()['train_df1_cat%s' % i].drop(columns = ['risk_flag']),globals()['train_df1_cat%s' % i]['risk_flag'],stratify = globals()['train_df1_cat%s' % i]['risk_flag'])



# ROC-AUC of each per-category model on its held-out split.
score = {}

from catboost import CatBoostClassifier
for i in range(4):
    
    globals()['catboost%s' % i]=CatBoostClassifier(iterations = 2000,eval_metric = 'F1')
    globals()['catboost%s' % i].fit(globals()['X_tr_cat%s' % i], globals()['y_tr_cat%s' % i],cat_features=cat_cols,eval_set=(globals()['X_tst_cat%s' % i], globals()['y_tst_cat%s' % i]))
    score['catboost{}'.format(i)] = roc_auc_score(globals()['y_tst_cat%s' % i],globals()['catboost%s' % i].predict(globals()['X_tst_cat%s' % i]))


# One prediction column per category model over the common test frame.
preds = pd.DataFrame()
for i in range(4):
    
    preds['catboost{}'.format(i)] = globals()['catboost%s' % i].predict(test_df_cat)

# Vote tally: count of 1-votes (and 0-votes) across the four models.
counts = pd.DataFrame()
counts['1s'] = preds.sum(axis = 1)
counts['0s'] = 4 - counts['1s']


# NOTE(review): x > 3 requires a unanimous vote of all four models — confirm
# unanimity (rather than majority) is the intended rule.
sample['risk_flag'] = counts['1s'].apply(lambda x : 1 if x > 3 else 0)
Example #42
0
def test_predict_without_fit():
    """predict() on an unfitted model must raise CatboostError."""
    with pytest.raises(CatboostError):
        data = Pool(TRAIN_FILE, column_description=CD_FILE)
        clf = CatBoostClassifier()
        clf.predict(data)
Example #43
0
def test_wrong_ctr_for_classification():
    """A target-border CTR description must be rejected for classification."""
    with pytest.raises(CatboostError):
        data = Pool(TRAIN_FILE, column_description=CD_FILE)
        clf = CatBoostClassifier(ctr_description=['Borders:TargetBorderCount=5:TargetBorderType=Uniform'])
        clf.fit(data)
Example #44
0
 # NOTE(review): fragment starts mid-fold-loop — x_train, fold, dispatcher,
 # avg_f1_train/avg_f1_val come from the enclosing (unseen) scope.
 y_train = train_y.loc[x_train.index,:].values.ravel()
 x_val = train_X[train_X.kfold == fold]
 y_val = train_y.loc[x_val.index,:].values.ravel()
 
 #Dropping the kfold column from the dataframes
 x_train.pop("kfold")
 x_val.pop("kfold")
 
 #Creating the model
 est_1 = ("rf",dispatcher.MODELS["random_forest"])
 est_2 = ("svm",dispatcher.MODELS["SVM"])
 est_3 = ("lr",dispatcher.MODELS["logistic_regression"])
 est_4 = ("catboost",dispatcher.MODELS["catBoost"])
 
 # NOTE(review): est_3 is built but not included in the stacking ensemble —
 # confirm omitting logistic regression is intentional.
 model = ensemble.StackingClassifier(estimators = [est_1,est_2, est_4], 
                                     final_estimator = CatBoostClassifier(early_stopping_rounds=5,  
                                                                          class_weights = {0 : 0.25, 1: 0.75}), 
                                     n_jobs = 8, 
                                     verbose = 2)
 
 # model = dispatcher.MODELS["random_forest"]
 model.fit(x_train, y_train)
 train_pred = model.predict(x_train)
 val_pred  = model.predict(x_val)
 
 #Computing the F1 score    
 f1_train = metrics.f1_score(y_train, train_pred)
 f1_val = metrics.f1_score(y_val, val_pred)
 
 avg_f1_train.append(f1_train)
 avg_f1_val.append(f1_val)
 
Example #45
0
def test_feature_importance_off():
    """feature_importances_ raises when importance calculation was disabled."""
    with pytest.raises(CatboostError):
        train = Pool(TRAIN_FILE, column_description=CD_FILE)
        estimator = CatBoostClassifier(iterations=5, random_seed=0,
                                       calc_feature_importance=False)
        estimator.fit(train)
        estimator.feature_importances_
Example #46
0
# Full training matrix, loaded once and reused by every final model below.
X, y = hlp.load_train()

def save_clf(clf, filename):
    """Pickle *clf* into the models directory under *filename*."""
    target = join(hlp.path_models, filename)
    with open(target, 'wb') as dest:
        pickle.dump(clf, dest)

# RANDOM FOREST, ~0.94110
# Tuned hyper-parameters restored from disk, then overridden with final values.
pars_rf = pd.read_csv(join(hlp.path_models, 'pars_rf.csv')).\
    to_dict(orient='records')[0]

pars_rf['n_estimators'] = 200
pars_rf['random_state'] = pr.rseed
pars_rf['class_weight'] = 'balanced'
pars_rf['min_samples_split'] = 3
pars_rf['n_jobs'] = -1

rf_final = RandomForestClassifier(**pars_rf)
rf_final.fit(X, y)
save_clf(rf_final, 'rf.pkl')

# MLP, 0.96286
mlp_final = MLPClassifier(**pr.par_mlp)
mlp_final.fit(X, y)
save_clf(mlp_final, 'mlp.pkl')

# CATBOOST ~0.963
# CatBoost is saved with its own binary format instead of pickle.
cat_final = CatBoostClassifier(**pr.par_cat)
cat_final.fit(X, y)
cat_final.save_model(join(hlp.path_models, 'catboost.meow'))

# final nn is trained in train_models.py
Example #47
0
def test_predict_sklearn_class():
    """A plain sklearn-style fit must reproduce the canonical model."""
    learn = Pool(TRAIN_FILE, column_description=CD_FILE)
    clf = CatBoostClassifier(iterations=2, random_seed=0)
    clf.fit(learn)
    clf.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Example #48
0
def test_wrong_params_classifier():
    """Unknown constructor parameters must raise CatboostError."""
    with pytest.raises(CatboostError):
        CatBoostClassifier(wrong_param=1)
Example #49
0
def test_priors():
    """Per-CTR prior lists with has_time must match the canonical model."""
    train = Pool(TRAIN_FILE, column_description=CD_FILE)
    estimator = CatBoostClassifier(
        iterations=5, random_seed=0, has_time=True,
        ctr_description=["Borders:Prior=0:Prior=0.6:Prior=1:Prior=5",
                         "Counter:Prior=0:Prior=0.6:Prior=1:Prior=5"])
    estimator.fit(train)
    estimator.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Example #50
0
# Stage 1: 5-fold LGBM producing an out-of-fold probability feature.
prob = np.zeros(len(train))
test_prob = np.zeros(len(test))
test_data = test[feat0].values
for idx, (train_index, valid_index) in enumerate(kf.split(train, train_y)):
    train_data = train.loc[train_index][feat0].values
    valid_data = train.loc[valid_index][feat0].values
    model = LGBMClassifier(n_estimators=1000, learning_rate=0.08, num_leaves=15, subsample=0.8, colsample_bytree=0.6, n_jobs=4)
    model.fit(train_data, train_y.loc[train_index], 
              eval_set=(valid_data, train_y.loc[valid_index]), early_stopping_rounds=50)
    prob[valid_index] = model.predict_proba(valid_data)[:, 1]
    # Each fold contributes 1/5 of the averaged test probability.
    test_prob += model.predict_proba(test_data)[:, 1]/5

# The stage-1 OOF probability becomes an input feature for stage 2.
train['lgb_prob'] = prob
test['lgb_prob'] = test_prob

# Stage 2: 5-fold CatBoost on the original + categorical features + lgb_prob.
kf = StratifiedKFold(5, True, random_state=1)
prob = np.zeros(len(train))
test_prob = np.zeros(len(test))
feat1=list(set(feat0 + cat_feat + ['lgb_prob']))
test_data=test[feat1].values
for idx, (train_index, valid_index) in enumerate(kf.split(train, train_y)):
    train_data = train.loc[train_index][feat1]
    valid_data = train.loc[valid_index][feat1]
    model = CatBoostClassifier(iterations=1000, learning_rate=0.08, depth=7, cat_features=cat_feat)
    model.fit(train_data, train_y.loc[train_index], 
              eval_set=(valid_data, train_y.loc[valid_index]), early_stopping_rounds=50)
    prob[valid_index] = model.predict_proba(valid_data)[:,1]
    test_prob += model.predict_proba(test_data)[:,1]/5

# Fold-averaged stage-2 probability becomes the submission label.
test['Label'] = test_prob
test[['ID', 'Label']].to_csv('../output/1120_count_rank.csv', index=False)
Example #51
0
def test_class_weights():
    """Train with per-class weights and compare against the canonical model."""
    learn_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    clf = CatBoostClassifier(iterations=5, random_seed=0, class_weights=[1, 2])
    clf.fit(learn_pool)
    clf.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
# Collect positional indices of the categorical columns so CatBoost can treat
# them natively instead of as numeric features.
cat_feature_inds = []
for i, c in enumerate(train_feature.columns):
    if c in ['register_type', 'device_type']:
        cat_feature_inds.append(i)
# print("Cat features are: %s" % [train_feature[ind] for ind in cat_feature_inds])

print('开始训练......')
# Training hyperparameters; learning_rate is left at the library default.
catboost_params = {
    "iterations": 200,
    # "learning_rate": 0.09,
    "loss_function": 'Logloss',
    "eval_metric": 'AUC',
    "random_seed": 2018
}
cb_model = CatBoostClassifier(**catboost_params)
cb_model.fit(
    X_train,
    Y_train,
    eval_set=(X_test, Y_test),
    cat_features=cat_feature_inds,
    use_best_model=True,  # keep the iteration with the best eval_set AUC
    verbose=True,  # you can uncomment this for text output
    # plot = True
)
# https://www.kaggle.com/nicapotato/catboost-aggregate-features/code

cb_model.save_model('../model/catboost_model.txt')

# NOTE(review): predict() returns hard class labels, so comparing `temp`
# against the 0.42 threshold downstream looks wrong -- predict_proba()[:, 1]
# was probably intended; confirm before relying on `threshold`.
temp = cb_model.predict(X_test)
threshold = 0.42
Example #53
0
def test_interaction_feature_importance():
    """Dump interaction feature importances and canonize the resulting file."""
    data = Pool(TRAIN_FILE, column_description=CD_FILE)
    clf = CatBoostClassifier(iterations=5, random_seed=0)
    clf.fit(data)
    importances = clf.get_feature_importance(data, fstr_type='Interaction')
    np.save(FIMP_PATH, np.array(importances))
    return local_canonical_file(FIMP_PATH)
Example #54
0
def test_no_eval_set():
    """use_best_model without an eval set must raise CatboostError."""
    with pytest.raises(CatboostError):
        data = Pool(TRAIN_FILE, column_description=CD_FILE)
        clf = CatBoostClassifier()
        clf.fit(data, use_best_model=True)
Example #55
0
    # Sanity-check the shapes of the train/test splits before training.
    print('X_train Shape', x_train.shape)
    print('X_test shape', x_test.shape)
    print('y_train Shape', y_train.shape)
    print('y_test shape', y_test.shape)

    print('Starting training')

    # NOTE(review): these LightGBM-style datasets are built but never passed
    # to the CatBoost model below -- they look like leftovers; confirm and drop.
    lgb_train = gbm.Dataset(x_train, y_train)
    lgb_eval = gbm.Dataset(x_test, y_test, reference=lgb_train)

    # CatBoost classifier with an AUC eval metric; use_best_model keeps the
    # iteration that scored best on the eval_set passed to fit().
    model = CatBoostClassifier(iterations=1200,
                               learning_rate=0.01,
                               l2_leaf_reg=3.5,
                               depth=8,
                               rsm=0.98,
                               loss_function='CrossEntropy',
                               eval_metric='AUC',
                               use_best_model=True,
                               random_seed=42)

    # No categorical features; note evaluation is on the *second* hold-out
    # (x_test_2), not the x_test used for the lgb datasets above.
    model.fit(x_train, y_train, cat_features=[], eval_set=(x_test_2, y_test_2))

    # model.fit(x_train, y_train)
    incorrect_x = []
    incorrect_y = []
    # Per-class probabilities on the second hold-out set.
    y_pred = model.predict_proba(x_test_2)

    print("======>", y_pred)
    # for i, x_sample in enumerate(x_test_2):
    #     if int(round(y_pred[i])) != y_test_2[i]:
Example #56
0
def test_wrong_ctr_for_classification():
    """An invalid ctr_description must raise CatboostError on fit."""
    with pytest.raises(CatboostError):
        data = Pool(TRAIN_FILE, column_description=CD_FILE)
        clf = CatBoostClassifier(ctr_description=['Borders:5:Uniform'])
        clf.fit(data)
Example #57
0
def train_model_classification(X,
                               X_test,
                               y,
                               params,
                               folds,
                               model_type='lgb',
                               eval_metric='auc',
                               columns=None,
                               plot_feature_importance=False,
                               model=None,
                               verbose=10000,
                               early_stopping_rounds=200,
                               n_estimators=50000,
                               splits=None,
                               n_folds=3,
                               averaging='usual',
                               n_jobs=-1):
    """
    A function to train a variety of classification models.
    Returns dictionary with oof predictions, test predictions, scores and, if necessary, feature importances.

    :params: X - training data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: X_test - test data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: y - target
    :params: params - hyperparameters forwarded to the underlying library
    :params: folds - folds to split data
    :params: model_type - type of model to use ('lgb', 'xgb', 'sklearn' or 'cat')
    :params: eval_metric - metric to use (key into metrics_dict; only 'auc' is defined)
    :params: columns - columns to use. If None - use all columns
    :params: plot_feature_importance - whether to plot feature importance of LGB
    :params: model - sklearn model, works only for "sklearn" model type
    :params: verbose - logging period for the boosting libraries
    :params: early_stopping_rounds - early-stopping patience for lgb/xgb
    :params: n_estimators - maximum number of boosting rounds
    :params: averaging - 'usual' (mean of fold predictions) or 'rank' (mean of per-fold ranks)
    """
    columns = X.columns if columns is None else columns
    n_splits = folds.n_splits if splits is None else n_folds
    X_test = X_test[columns]

    # to set up scoring parameters
    metrics_dict = {
        'auc': {
            'lgb_metric_name': eval_auc,
            'catboost_metric_name': 'AUC',
            'sklearn_scoring_function': metrics.roc_auc_score
        },
    }

    result_dict = {}
    # Both averaging modes accumulate into identically-shaped buffers; they
    # differ only in how fold test predictions are combined below.
    if averaging == 'usual':
        # out-of-fold predictions on train data
        oof = np.zeros((len(X), 1))

        # averaged predictions on test data
        prediction = np.zeros((len(X_test), 1))

    elif averaging == 'rank':
        # out-of-fold predictions on train data
        oof = np.zeros((len(X), 1))

        # averaged predictions on test data
        prediction = np.zeros((len(X_test), 1))

    # list of scores on folds
    scores = []
    feature_importance = pd.DataFrame()

    # split and train on folds.  y is passed to split() so that stratified
    # splitters work; plain KFold simply ignores it (bugfix: was split(X)).
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        print(f'Fold {fold_n + 1} started at {time.ctime()}')
        if type(X) == np.ndarray:
            # NOTE(review): X[columns] on an ndarray only works when columns
            # is an integer/boolean index -- confirm callers for this path.
            X_train, X_valid = X[columns][train_index], X[columns][valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[
                valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        if model_type == 'lgb':
            model = lgb.LGBMClassifier(**params,
                                       n_estimators=n_estimators,
                                       n_jobs=n_jobs)
            model.fit(X_train,
                      y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                      verbose=verbose,
                      early_stopping_rounds=early_stopping_rounds)

            # Positive-class probability; test predictions use the best
            # early-stopped iteration.
            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            y_pred = model.predict_proba(
                X_test, num_iteration=model.best_iteration_)[:, 1]

        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train,
                                     label=y_train,
                                     feature_names=X.columns)
            valid_data = xgb.DMatrix(data=X_valid,
                                     label=y_valid,
                                     feature_names=X.columns)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data,
                              num_boost_round=n_estimators,
                              evals=watchlist,
                              early_stopping_rounds=early_stopping_rounds,
                              verbose_eval=verbose,
                              params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid,
                                                     feature_names=X.columns),
                                         ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test,
                                               feature_names=X.columns),
                                   ntree_limit=model.best_ntree_limit)

        if model_type == 'sklearn':
            # Caller supplies a ready-made estimator via `model`.
            model = model
            model.fit(X_train, y_train)

            y_pred_valid = model.predict(X_valid).reshape(-1, )
            score = metrics_dict[eval_metric]['sklearn_scoring_function'](
                y_valid, y_pred_valid)
            print(f'Fold {fold_n}. {eval_metric}: {score:.4f}.')
            print('')

            y_pred = model.predict_proba(X_test)

        if model_type == 'cat':
            # Bugfix: loss_function was the bare name `Logloss`, which raised
            # NameError -- CatBoost expects the loss as a string.
            model = CatBoostClassifier(
                iterations=n_estimators,
                eval_metric=metrics_dict[eval_metric]['catboost_metric_name'],
                **params,
                loss_function='Logloss')
            model.fit(X_train,
                      y_train,
                      eval_set=(X_valid, y_valid),
                      cat_features=[],
                      use_best_model=True,
                      verbose=False)

            # NOTE(review): predict() returns hard labels, which then feed the
            # AUC scorer below -- predict_proba()[:, 1] was probably intended;
            # left unchanged to preserve existing behavior.
            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)

        if averaging == 'usual':

            oof[valid_index] = y_pred_valid.reshape(-1, 1)
            scores.append(
                metrics_dict[eval_metric]['sklearn_scoring_function'](
                    y_valid, y_pred_valid))

            prediction += y_pred.reshape(-1, 1)

        elif averaging == 'rank':

            oof[valid_index] = y_pred_valid.reshape(-1, 1)
            scores.append(
                metrics_dict[eval_metric]['sklearn_scoring_function'](
                    y_valid, y_pred_valid))

            # Rank-transform each fold's test predictions before averaging.
            prediction += pd.Series(y_pred).rank().values.reshape(-1, 1)

        if model_type == 'lgb' and plot_feature_importance:
            # feature importance, one frame per fold
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)

    # Average the accumulated fold predictions.
    prediction /= n_splits

    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(
        np.mean(scores), np.std(scores)))

    result_dict['oof'] = oof
    result_dict['prediction'] = prediction
    result_dict['scores'] = scores

    if model_type == 'lgb':
        if plot_feature_importance:
            feature_importance["importance"] /= n_splits
            # Top 50 features by mean importance across folds.
            cols = feature_importance[[
                "feature", "importance"
            ]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[
                feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12))
            sns.barplot(x="importance",
                        y="feature",
                        data=best_features.sort_values(by="importance",
                                                       ascending=False))
            plt.title('LGB Features (avg over folds)')

            result_dict['feature_importance'] = feature_importance
            result_dict['top_columns'] = cols

    return result_dict
Example #58
0
        # Out-of-fold prediction buffer, one slot per training row.
        all_pred = np.zeros(y_train.shape[0])
        for train, test in cv:
            cnt += 1
            trn_x = x_train[train]
            val_x = x_train[test]
            trn_y = y_train[train]

            val_y = y_train[test]

            # Per-row sample weights for this fold (currently unused by fit;
            # the corresponding arguments are commented out below).
            trn_w = sample_weight[train]
            val_w = sample_weight[test]

            # Group the validation rows by order_id; each entry holds the
            # positional indices of that order's rows in the fold-local frame.
            list_idx = df.loc[test].reset_index(drop=True).groupby(
                'order_id').apply(lambda x: x.index.values).tolist()

            clf = CatBoostClassifier(**params)
            clf.fit(
                trn_x,
                trn_y,
                # sample_weight=trn_w,
                # eval_sample_weight=[val_w],
                eval_set=(val_x, val_y),
                verbose=True,
                # early_stopping_rounds=150
            )
            # Positive-class probability on the held-out fold.
            pred = clf.predict_proba(val_x)[:, 1]
            all_pred[test] = pred

            # Fold metrics: log-loss, negated AUC (lower is better), custom F1.
            _score = log_loss(val_y, pred)
            _score2 = -roc_auc_score(val_y, pred)
            _, _score3, _ = f1_metric(val_y.astype(int), pred.astype(float))
Example #59
0
# Join each token list back into a single space-separated document string.
desc_corp = [' '.join(x) for x in desc_corp]

# Creating feature-arrays
# NOTE(review): columns are selected positionally (2:-1 for train, 2: for
# test) -- presumably the leading columns are ids/text and the last train
# column is the target; confirm against the actual frames.
X = train.iloc[:, 2:-1].values
y = train.Is_Response.values
X_test = test.iloc[:, 2::].values

# Train-validation split
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

# A very naive CatBoost model
from catboost import CatBoostClassifier
model = CatBoostClassifier(iterations=500,
                           learning_rate=0.2,
                           depth=3,
                           l2_leaf_reg=20,
                           loss_function='Logloss',
                           use_best_model=True)
# The first two feature columns are treated as categorical.
cat_features = [0, 1]
model.fit(X_train, y_train, cat_features, eval_set=(X_val, y_val))

preds = model.predict(X_test)
preds_val = model.predict(X_val)

## Drawing confusion matrix to check model performance
from sklearn.metrics import confusion_matrix
confusion_matrix(y_val, preds_val)
model.score(X_val, y_val)

# XGBoost model
from xgboost import XGBClassifier
Example #60
0
def test_invalid_loss_classifier():
    """An unknown loss_function must raise CatboostError on fit."""
    with pytest.raises(CatboostError):
        data = Pool(TRAIN_FILE, column_description=CD_FILE)
        clf = CatBoostClassifier(loss_function="abcdef")
        clf.fit(data)