def test_fit_data():
    """Fit a MultiClass model from raw features, weights and baselines.

    A first two-iteration MultiClass classifier is trained on the cloudness
    train pool; its ``RawFormulaVal`` predictions become the baseline for the
    train data and for the eval pool. A second classifier is then fitted on
    the feature data (passed through ``map_cat_features``) together with
    labels, categorical feature indices, per-row sample weights
    ``1..num_row`` and the baseline. The resulting model is written to
    ``OUTPUT_MODEL_PATH``.

    Returns:
        The result of ``compare_canonical_models(OUTPUT_MODEL_PATH)``.

    NOTE(review): another ``test_fit_data`` is defined later in this file
    and shadows this definition when the module is imported.
    """
    train_pool = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)
    test_pool = Pool(CLOUDNESS_TEST_FILE, column_description=CLOUDNESS_CD_FILE)

    # Stage one: the model whose raw scores serve as baselines.
    first_stage = CatBoostClassifier(
        iterations=2,
        random_seed=0,
        loss_function="MultiClass",
    )
    first_stage.fit(train_pool)
    train_baseline = np.array(
        first_stage.predict(train_pool, prediction_type='RawFormulaVal')
    )
    test_baseline = np.array(
        first_stage.predict(test_pool, prediction_type='RawFormulaVal')
    )
    test_pool.set_baseline(test_baseline)

    # Stage two: fit from plain feature data instead of the Pool object.
    second_stage = CatBoostClassifier(
        iterations=2,
        random_seed=0,
        loss_function="MultiClass",
    )
    features = map_cat_features(
        train_pool.get_features(),
        train_pool.get_cat_feature_indices(),
    )
    second_stage.fit(
        features,
        train_pool.get_label(),
        train_pool.get_cat_feature_indices(),
        sample_weight=np.arange(1, train_pool.num_row() + 1),
        baseline=train_baseline,
        use_best_model=True,
        eval_set=test_pool,
    )

    second_stage.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
def test_zero_baseline():
    """Train a classifier on a pool whose baseline is all zeros.

    The pool is loaded from ``TRAIN_FILE`` / ``CD_FILE``, given a baseline of
    one zero per row, and used to fit a two-iteration classifier that is
    saved to ``OUTPUT_MODEL_PATH``.

    Returns:
        The result of ``compare_canonical_models(OUTPUT_MODEL_PATH)``.
    """
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    # One zero per row of the pool.
    train_pool.set_baseline(np.zeros(train_pool.num_row()))

    classifier = CatBoostClassifier(iterations=2, random_seed=0)
    classifier.fit(train_pool)
    classifier.save_model(OUTPUT_MODEL_PATH)

    return compare_canonical_models(OUTPUT_MODEL_PATH)
def test_non_zero_bazeline():
    """Train a classifier on a pool whose baseline comes from another model.

    A two-iteration MultiClass classifier is fitted on the cloudness train
    pool; its ``RawFormulaVal`` predictions on that same pool are installed
    as the pool's baseline. A second two-iteration classifier is then fitted
    on the pool and saved to ``OUTPUT_MODEL_PATH``.

    Returns:
        The result of ``compare_canonical_models(OUTPUT_MODEL_PATH)``.

    The function name keeps its historical spelling ("bazeline") because it
    is the identifier under which the test is collected.
    """
    train_pool = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)

    baseline_source = CatBoostClassifier(
        iterations=2,
        random_seed=0,
        loss_function="MultiClass",
    )
    baseline_source.fit(train_pool)
    raw_scores = baseline_source.predict(train_pool, prediction_type='RawFormulaVal')
    train_pool.set_baseline(np.array(raw_scores))

    # NOTE(review): unlike the baseline model above, this classifier is built
    # without an explicit loss_function -- confirm that is intended.
    classifier = CatBoostClassifier(iterations=2, random_seed=0)
    classifier.fit(train_pool)
    classifier.save_model(OUTPUT_MODEL_PATH)

    return compare_canonical_models(OUTPUT_MODEL_PATH)
def test_fit_data():
    """Check fitting from raw data with weights, baseline and an eval set.

    Steps:
      1. Load the cloudness train and test pools.
      2. Fit a two-iteration MultiClass classifier on the train pool and take
         its ``RawFormulaVal`` predictions on both pools as baselines; the
         test-pool baseline is attached to the test pool.
      3. Fit a second classifier with the same settings on the train
         features (converted by ``map_cat_features``), labels and
         categorical feature indices, with sample weights ``1..num_row``,
         the train baseline, ``use_best_model=True`` and the test pool as
         eval set.
      4. Save the second model to ``OUTPUT_MODEL_PATH``.

    Returns:
        The result of ``compare_canonical_models(OUTPUT_MODEL_PATH)``.

    NOTE(review): an earlier function in this file has the same name; this
    later definition is the one that remains bound after import.
    """
    classifier_settings = dict(iterations=2, random_seed=0, loss_function="MultiClass")

    learn_pool = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)
    validation_pool = Pool(CLOUDNESS_TEST_FILE, column_description=CLOUDNESS_CD_FILE)

    # Raw scores of a preliminary model act as the starting point (baseline).
    preliminary = CatBoostClassifier(**classifier_settings)
    preliminary.fit(learn_pool)
    learn_baseline = np.array(
        preliminary.predict(learn_pool, prediction_type='RawFormulaVal'))
    validation_baseline = np.array(
        preliminary.predict(validation_pool, prediction_type='RawFormulaVal'))
    validation_pool.set_baseline(validation_baseline)

    final_model = CatBoostClassifier(**classifier_settings)
    raw_features = map_cat_features(
        learn_pool.get_features(), learn_pool.get_cat_feature_indices())
    labels = learn_pool.get_label()
    cat_indices = learn_pool.get_cat_feature_indices()
    # Row i (1-based) receives weight i.
    row_weights = np.arange(1, learn_pool.num_row() + 1)
    final_model.fit(
        raw_features,
        labels,
        cat_indices,
        sample_weight=row_weights,
        baseline=learn_baseline,
        use_best_model=True,
        eval_set=validation_pool,
    )

    final_model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
# Script fragment: `model`, `pool`, `pool_train`, `X_train`, `y_train`,
# `X_test`, `y_test`, `y_pred_X_train_rd`, `y_pred_X_test_rd`, `best_pred1`
# and `eval_metrics` are not defined here and must already exist.
object_importance = model.get_object_importance(pool, pool_train)
model.eval_metrics()

# NOTE(review): this dict is replaced by the assignment right below before
# anything reads it.
params = {
    'iterations': 500,
    'depth': 10,
    'loss_function': ['RMSE', 'MAE'],
    'logging_level': 'Silent',
}
params = {
    'iterations': 500,
    'learning_rate': 0.1,
    'loss_function': 'RMSE',
    'eval_metric': 'R2',
    'random_seed': 42,
    'logging_level': 'Silent',
    'allow_writing_files': True,
    'use_best_model': False,
}

# Baselines are the earlier predictions, each truncated with int() and
# wrapped in a one-element list.
train_pool = Pool(X_train, y_train)
train_pool.set_baseline([[int(x)] for x in y_pred_X_train_rd])
validate_pool = Pool(X_test, y_test)
validate_pool.set_baseline([[int(x)] for x in y_pred_X_test_rd])

# First run: use_best_model disabled (see `params`).
model = CatBoostRegressor(**params)
model.fit(train_pool, eval_set=validate_pool, plot=True)

# Second run: identical settings except use_best_model is switched on.
best_model_params = {**params, 'use_best_model': True}
best_model = CatBoostRegressor(**best_model_params)
best_model.fit(train_pool, eval_set=validate_pool, plot=True)

best_pred2 = best_model.predict(X_test)
# NOTE(review): `best_pred1` is never assigned in this fragment, while
# `best_pred2` is computed just above and not used -- confirm which one is
# intended here.
best_pred_total = np.round(y_pred_X_test_rd + best_pred1)
ev = eval_metrics(y_test, np.round(model.predict(X_test)))