def test_fit_data():
    """Fit a MultiClass model from raw rows with weights, baseline and eval set.

    A reference classifier is trained on the cloudness training pool and its
    ``RawFormulaVal`` predictions are used as baselines: the ones for the
    evaluation pool are attached via ``set_baseline`` and the ones for the
    training pool are passed to the second ``fit`` call. The second
    classifier is fit on mapped feature rows rather than on a ``Pool``,
    saved to ``OUTPUT_MODEL_PATH``, and the result of
    ``compare_canonical_models`` for that path is returned.
    """
    train_pool = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)
    test_pool = Pool(CLOUDNESS_TEST_FILE, column_description=CLOUDNESS_CD_FILE)

    # Both classifiers share exactly the same hyper-parameters.
    classifier_params = dict(iterations=2, random_seed=0, loss_function="MultiClass")

    # Stage 1: raw scores of a reference model serve as baselines.
    reference = CatBoostClassifier(**classifier_params)
    reference.fit(train_pool)
    train_scores = np.array(
        reference.predict(train_pool, prediction_type='RawFormulaVal'))
    test_scores = np.array(
        reference.predict(test_pool, prediction_type='RawFormulaVal'))
    test_pool.set_baseline(test_scores)

    # Stage 2: fit from plain feature rows instead of a Pool object.
    model = CatBoostClassifier(**classifier_params)
    rows = map_cat_features(
        train_pool.get_features(), train_pool.get_cat_feature_indices())
    fit_options = {
        # Weights 1..N, one per training row.
        'sample_weight': np.arange(1, train_pool.num_row() + 1),
        'baseline': train_scores,
        'use_best_model': True,
        'eval_set': test_pool,
    }
    model.fit(
        rows,
        train_pool.get_label(),
        train_pool.get_cat_feature_indices(),
        **fit_options
    )

    model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
def test_no_cat_in_predict():
    """Compare prediction from mapped rows with prediction from a Pool.

    The same mapped test features are passed to ``predict`` twice: once
    directly, and once wrapped in a ``Pool`` that declares the categorical
    feature indices of the training pool. The test asserts that
    ``_check_data`` accepts the two prediction results.
    """
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)

    classifier = CatBoostClassifier(iterations=2, random_seed=0)
    classifier.fit(train_pool)

    # Variant 1: mapped rows go straight into predict().
    plain_rows = map_cat_features(
        test_pool.get_features(), train_pool.get_cat_feature_indices())
    direct_prediction = classifier.predict(plain_rows)

    # Variant 2: freshly mapped rows are wrapped in a Pool with cat_features.
    wrapped_rows = map_cat_features(
        test_pool.get_features(), train_pool.get_cat_feature_indices())
    wrapped_pool = Pool(
        wrapped_rows, cat_features=train_pool.get_cat_feature_indices())
    pooled_prediction = classifier.predict(wrapped_pool)

    assert _check_data(direct_prediction, pooled_prediction)
def test_fit_data():
    """Train on mapped feature rows using a baseline taken from another model.

    Steps, in order:

    1. Load the cloudness train and test pools.
    2. Train a first MultiClass classifier and take its ``RawFormulaVal``
       predictions on both pools.
    3. Attach the test predictions to the evaluation pool as its baseline.
    4. Train a second classifier on mapped feature rows, with per-row
       weights ``1..num_row``, the train predictions as baseline, and the
       evaluation pool as ``eval_set``.
    5. Save the second model and return what ``compare_canonical_models``
       yields for the saved file.
    """
    def _make_classifier():
        # Identical settings for the reference model and the model under test.
        return CatBoostClassifier(
            iterations=2, random_seed=0, loss_function="MultiClass")

    learn_pool = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)
    validation_pool = Pool(CLOUDNESS_TEST_FILE, column_description=CLOUDNESS_CD_FILE)

    first_model = _make_classifier()
    first_model.fit(learn_pool)

    learn_raw = first_model.predict(learn_pool, prediction_type='RawFormulaVal')
    learn_baseline = np.array(learn_raw)
    validation_raw = first_model.predict(
        validation_pool, prediction_type='RawFormulaVal')
    validation_pool.set_baseline(np.array(validation_raw))

    second_model = _make_classifier()
    feature_rows = map_cat_features(
        learn_pool.get_features(), learn_pool.get_cat_feature_indices())
    labels = learn_pool.get_label()
    cat_indices = learn_pool.get_cat_feature_indices()
    row_weights = np.arange(1, learn_pool.num_row() + 1)
    second_model.fit(
        feature_rows,
        labels,
        cat_indices,
        sample_weight=row_weights,
        baseline=learn_baseline,
        use_best_model=True,
        eval_set=validation_pool,
    )

    second_model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
def hp_tuning(train, test, validate, column_description, algo_name, result_path,
              n_estimators=1000, max_evals=50, task_type='CPU'):
    """Run one hyper-parameter search routine and dump its results as JSON.

    Args:
        train: Training data source passed to ``Pool``.
        test: Test data source passed to ``Pool``.
        validate: Validation data source passed to ``Pool``.
        column_description: Column description passed to every ``Pool``.
        algo_name: One of ``'classic'``, ``'hyperopt'``, ``'gaussian'``,
            ``'optuna'`` or ``'hyperband'``; selects the search routine.
        result_path: Path of the JSON file the results are written to.
        n_estimators: Forwarded to the search routine (default 1000).
        max_evals: Forwarded to the search routine (default 50).
        task_type: Forwarded to the search routine (default ``'CPU'``).

    Returns:
        None. For an unknown ``algo_name`` a message is printed and nothing
        is loaded or written.

    The object returned by the search routine must support item assignment
    (a ``"time"`` entry with the elapsed seconds is added) and must be
    serializable by ``json.dump``.
    """
    # Resolve the routine first: an unknown name is rejected before the
    # three pools are loaded, so a typo does not cost a full data load.
    if algo_name == 'classic':
        func = choose_classic
    elif algo_name == 'hyperopt':
        func = choose_hyperOpt
    elif algo_name == 'gaussian':
        func = choose_gaussian
    elif algo_name == 'optuna':
        func = choose_optuna
    elif algo_name == 'hyperband':
        func = choose_hyperband2
    else:
        print("INCORRECT ALGO NAME!!!")
        return

    train_pool = Pool(train, column_description=column_description)
    val_pool = Pool(validate, column_description=column_description)
    test_pool = Pool(test, column_description=column_description)
    cat_features = train_pool.get_cat_feature_indices()

    # Only the search itself is timed, not the data loading above.
    start = time.time()
    results = func(train_pool, val_pool, test_pool, cat_features,
                   n_estimators=n_estimators, max_evals=max_evals,
                   task_type=task_type)
    results["time"] = time.time() - start

    with open(result_path, 'w') as outfile:
        json.dump(results, outfile)
def test_export_to_python_after_load():
    """Export a reloaded model to Python code and compare all predictions.

    A classifier is trained and saved, loaded into a fresh classifier, and
    the loaded one is exported with ``format="python"``. The exported
    ``model`` module is imported from the export directory and applied row
    by row to the test features. The test asserts that ``_check_data``
    accepts the Python predictions against both the original and the
    reloaded model's ``RawFormulaVal`` predictions.
    """
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)

    trained = CatBoostClassifier(iterations=40, random_seed=0)
    trained.fit(train_pool)
    pred_trained = trained.predict(test_pool, prediction_type='RawFormulaVal')
    trained.save_model(OUTPUT_MODEL_PATH)

    restored = CatBoostClassifier()
    restored.load_model(OUTPUT_MODEL_PATH)
    restored.save_model(OUTPUT_PYTHON_MODEL_PATH, format="python", pool=train_pool)
    pred_restored = restored.predict(test_pool, prediction_type='RawFormulaVal')

    # Make the directory of the exported file importable, then load the
    # generated module's entry point.
    import sys
    import os.path
    export_dir = os.path.dirname(OUTPUT_PYTHON_MODEL_PATH)
    sys.path.insert(0, export_dir)
    from model import apply_catboost_model as apply_catboost_model_from_python

    def _predict_row(row):
        # Split one feature row into float and categorical parts and feed
        # them to the exported applier.
        floats, cats = _split_features(
            row,
            train_pool.get_cat_feature_indices(),
            test_pool.get_cat_feature_hash_to_string())
        return apply_catboost_model_from_python(floats, cats)

    pred_python = [_predict_row(row) for row in test_pool.get_features()]

    assert _check_data(pred_trained, pred_python)
    assert _check_data(pred_restored, pred_python)
def test_load_df_vs_load_from_file():
    """A pool built from a parsed table must equal the pool loaded from file.

    ``TRAIN_FILE`` is read with every column as ``str``; column
    ``TARGET_IDX`` becomes the label and is removed from the features. The
    remaining columns are converted to an ndarray and wrapped in a ``Pool``
    using the categorical indices of the file-based pool. The two pools are
    compared with ``==``.
    """
    file_pool = Pool(TRAIN_FILE, column_description=CD_FILE)

    table = read_table(TRAIN_FILE, header=None, dtype=str)
    target = DataFrame(table.iloc[:, TARGET_IDX])
    # Features are every column except the target one.
    feature_table = table.drop([TARGET_IDX], axis=1)

    cat_indices = file_pool.get_cat_feature_indices()
    array_pool = Pool(np.array(feature_table), target, cat_indices)

    assert file_pool == array_pool
def test_load_df():
    """Build a pool from a DataFrame and compare it with the file-based pool.

    ``NAN_TRAIN_FILE`` is loaded twice: directly into a ``Pool`` and via
    ``read_table`` into a DataFrame whose ``TARGET_IDX`` column is split off
    as the label. The test asserts that ``_check_data`` accepts both the
    features and the labels of the two pools.
    """
    reference_pool = Pool(NAN_TRAIN_FILE, column_description=NAN_CD_FILE)

    frame = read_table(NAN_TRAIN_FILE, header=None)
    target = DataFrame(frame.iloc[:, TARGET_IDX])
    # Drop the target column so only feature columns remain.
    feature_frame = frame.drop([TARGET_IDX], axis=1)

    frame_pool = Pool(
        feature_frame, target, reference_pool.get_cat_feature_indices())

    assert _check_data(reference_pool.get_features(), frame_pool.get_features())
    assert _check_data(reference_pool.get_label(), frame_pool.get_label())
def test_load_series():
    """Build a pool from pandas Series and compare it with the file-based pool.

    The label is a ``Series`` taken from column ``TARGET_IDX`` of the parsed
    table; the features are a ``Series`` whose elements are the remaining
    row value arrays. The test asserts that ``_check_data`` accepts both
    the features and the labels of the two pools.
    """
    reference_pool = Pool(TRAIN_FILE, column_description=CD_FILE)

    table = read_table(TRAIN_FILE, header=None)
    target = Series(table.iloc[:, TARGET_IDX])
    without_target = table.drop([TARGET_IDX], axis=1)
    # One Series element per row, each holding that row's feature values.
    row_series = Series(list(without_target.values))

    series_pool = Pool(
        row_series, target, reference_pool.get_cat_feature_indices())

    assert _check_data(reference_pool.get_features(), series_pool.get_features())
    assert _check_data(reference_pool.get_label(), series_pool.get_label())
def test_one_doc_feature_importance():
    """Save the 'Doc' feature importance of one all-ones document.

    A classifier is trained on the training pool, then
    ``get_feature_importance`` is called with ``fstr_type='Doc'`` for a
    single vector of integer ones of length ``pool.num_col()``. The result
    is stored with ``np.save`` at ``FIMP_PATH`` and the value of
    ``local_canonical_file(FIMP_PATH)`` is returned.
    """
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)

    classifier = CatBoostClassifier(iterations=5, random_seed=0)
    classifier.fit(pool)

    single_document = np.ones(pool.num_col(), dtype=int)
    # NOTE(review): the positional 0 is presumably the document's label;
    # confirm against the get_feature_importance signature in use.
    importance = classifier.get_feature_importance(
        single_document,
        0,
        cat_features=pool.get_cat_feature_indices(),
        fstr_type='Doc',
    )

    np.save(FIMP_PATH, np.array(importance))
    return local_canonical_file(FIMP_PATH)
def test_export_model_with_cat_features_to_python_from_app():
    """Compare a bundled binary model with its bundled Python export.

    The model stored in the ``cb_adult_model_bin`` resource is written to
    ``OUTPUT_MODEL_PATH`` and loaded into a ``CatBoost`` instance. Its
    ``RawFormulaVal`` predictions on the test pool are compared with the
    row-by-row results of ``adult_model.apply_catboost_model``. The test
    asserts that ``_check_data`` accepts the two prediction sets.
    """
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)
    model = CatBoost()

    # The resource is a serialized model that load_model() reads back, so
    # it must be written through a binary handle: a text-mode handle
    # rejects bytes on Python 3 and may rewrite newline bytes on Windows.
    with open(OUTPUT_MODEL_PATH, "wb") as model_file:
        model_file.write(resource.find("cb_adult_model_bin"))
    model.load_model(OUTPUT_MODEL_PATH)
    pred_model = model.predict(test_pool, prediction_type='RawFormulaVal')

    from adult_model import apply_catboost_model as apply_catboost_model_from_app

    pred_python = []
    for test_line in test_pool.get_features():
        # Split each row into float and categorical parts for the applier.
        float_features, cat_features = _split_features(
            test_line,
            test_pool.get_cat_feature_indices(),
            test_pool.get_cat_feature_hash_to_string())
        pred_python.append(
            apply_catboost_model_from_app(float_features, cat_features))

    assert _check_data(pred_model, pred_python)
# Report the shape of the test labels and the label value counts of the
# train and test splits. The last message states that the split was made
# with the stratify parameter set to Y_CAT so that both splits keep the same
# positive/negative ratio.
# NOTE(review): that message spells the parameter 'strtify'; it is runtime
# output and is left untouched here.
print('测试集 Y_CAT_test 的shape是',Y_CAT_test.shape)
print('训练集中label的正负比例分布如下:\n',Y_CAT_train.value_counts())
print('测试集中label的正负比例分布如下:\n',Y_CAT_test.value_counts())
print('可以看出在划分训练集和测试集时设定strtify参数为Y_CAT;使得在训练测试集中正负例所占比例一致。')

## CatBoost modeling
### Step1: Pool Initialize
from catboost import Pool
# Wrap the full feature matrix, the labels and the categorical feature
# specification into a single Pool.
pool_data = Pool(data = X_CAT, label = Y_CAT, cat_features = CAT_features)
# Print the pool's type, shape, feature/label lengths and the indices of
# its categorical features.
# NOTE(review): the second message spells 'shape' as 'shpe'; it is runtime
# output and is left untouched here.
print('pool_data的 type 是:', type(pool_data))
print('pool_data的 shpe 是:', pool_data.shape)
print('pool_data.get_features()返回的是list类型,其长度是:',len(pool_data.get_features()))
print('pool_data.get_label()返回的是list类型,其长度是:', len(pool_data.get_label()))
print('pool_data中类别变量所在的索引位置是 pool_data.get_cat_feature_indices() :', pool_data.get_cat_feature_indices())
# Disabled diagnostics for the per-observation weight and baseline:
#print('weight of each observation in the generated pool_data:', pool_data.get_weight())
#print('baseline of each observation in the generated pool_data:', pool_data.get_baseline())

#### Step2.1 Custom metric class, used for best-model selection and overfitting detection
# **************Custom metric for overfitting detector and best model selection******
import math
from catboost import Pool, CatBoostClassifier
class Recall_1_Metric(object):
    # Custom metric object; see the Step2.1 header above for its purpose.
    def get_final_error(self, error, weight):
        """Return ``error`` divided by ``weight`` plus a tiny constant.

        The ``1e-38`` term presumably keeps the division defined when
        ``weight`` is zero.
        """
        return error / (weight + 1e-38)
def test_pool_cat_features():
    """Check the categorical feature indices of the file-based pool.

    The indices reported by the pool loaded from ``TRAIN_FILE`` with
    ``CD_FILE`` are compared to ``CAT_FEATURES`` with ``==`` and the
    comparison result is reduced with ``np.all``.
    """
    loaded_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    reported_indices = loaded_pool.get_cat_feature_indices()
    comparison = reported_indices == CAT_FEATURES
    assert np.all(comparison)
def test_load_ndarray():
    """Build a pool from ndarrays and validate it with ``_check_shape``.

    Features and labels are taken from the file-based pool, converted to
    ndarrays (features go through ``map_cat_features`` first) and passed to
    ``Pool`` together with the categorical feature indices.
    """
    source_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    cat_indices = source_pool.get_cat_feature_indices()

    mapped_rows = map_cat_features(source_pool.get_features(), cat_indices)
    feature_array = np.array(mapped_rows)
    label_array = np.array(source_pool.get_label())

    array_pool = Pool(feature_array, label_array, cat_indices)
    assert _check_shape(array_pool)
def test_one_doc_feature_importance():
    """Compute per-document feature importance for one synthetic document.

    After training a classifier on the training pool, the test requests
    ``fstr_type='Doc'`` importance for a vector of integer ones with
    ``pool.num_col()`` entries. The result is written to ``FIMP_PATH`` via
    ``np.save``, and ``local_canonical_file(FIMP_PATH)`` is returned.
    """
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)

    model = CatBoostClassifier(iterations=5, random_seed=0)
    model.fit(train_pool)

    importance_kwargs = {
        'cat_features': train_pool.get_cat_feature_indices(),
        'fstr_type': 'Doc',
    }
    ones_document = np.ones(train_pool.num_col(), dtype=int)
    # NOTE(review): the second positional argument (0) is presumably the
    # label of the document; confirm against the API version in use.
    doc_importance = model.get_feature_importance(
        ones_document, 0, **importance_kwargs)

    np.save(FIMP_PATH, np.array(doc_importance))
    return local_canonical_file(FIMP_PATH)