def get_test_result_3class(variant, clf, pool: Pool):
    """Score a 3-class classifier on ``pool`` and package the outcome.

    A prediction is counted as a "positive bet" when the class-2 probability
    reaches ``variant.min_proba`` and as a "negative bet" when the class-0
    probability reaches ``variant.get_min_neg_proba()``; each bet is scored
    against the true label (+1 / -1).  Profit is win/loss size times the
    margin over the variant's break-even ratios.
    """
    pos_threshold = variant.min_proba
    neg_threshold = variant.get_min_neg_proba()
    pos_wl = st.WinLoss()
    neg_wl = st.WinLoss()
    labels = pool.get_label()
    for proba_row, label in zip(clf.predict_proba(pool), labels):
        if pos_threshold is not None and proba_row[2] >= pos_threshold:
            pos_wl.hit(label == 1)
        elif neg_threshold is not None and proba_row[0] >= neg_threshold:
            neg_wl.hit(label == -1)
    ratios = variant.profit_ratios
    pos_profit = (
        round(pos_wl.size * (pos_wl.ratio - ratios.pos_ratio), 3) if pos_wl else 0.0
    )
    neg_profit = (
        round(neg_wl.size * (neg_wl.ratio - ratios.neg_ratio), 3) if neg_wl else 0.0
    )
    # NOTE(review): neg_profit is folded into the total only — it is not
    # passed to Result as its own field in the original either; confirm
    # against cco.Result's signature if that looks unintended.
    return cco.Result(
        name=variant.name,
        mean=cco.fmt((pos_wl + neg_wl).ratio),
        leny=len(labels),
        scr=cco.fmt(clf.score(pool)),
        poswl=pos_wl,
        negwl=neg_wl,
        profit=pos_profit + neg_profit,
        pos_profit=pos_profit,
    )
def test_load_df():
    """A Pool built from a pandas DataFrame must match the file-loaded Pool."""
    file_pool = Pool(NAN_TRAIN_FILE, column_description=NAN_CD_FILE)
    frame = read_table(NAN_TRAIN_FILE, header=None)
    target = DataFrame(frame.iloc[:, TARGET_IDX])
    frame.drop([TARGET_IDX], axis=1, inplace=True)
    df_pool = Pool(frame, target, file_pool.get_cat_feature_indices())
    assert _check_data(file_pool.get_features(), df_pool.get_features())
    assert _check_data(file_pool.get_label(), df_pool.get_label())
def test_load_series():
    """A Pool built from pandas Series inputs must match the file-loaded Pool."""
    file_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    frame = read_table(TRAIN_FILE, header=None)
    target = Series(frame.iloc[:, TARGET_IDX])
    frame.drop([TARGET_IDX], axis=1, inplace=True)
    # Features as a Series of row arrays, mirroring the original construction.
    rows_as_series = Series(list(frame.values))
    series_pool = Pool(rows_as_series, target, file_pool.get_cat_feature_indices())
    assert _check_data(file_pool.get_features(), series_pool.get_features())
    assert _check_data(file_pool.get_label(), series_pool.get_label())
def test_load_dumps():
    """A Pool loaded back from a dumped TSV file must match the in-memory Pool.

    Writes each sample as one tab-separated line: label first, then the
    feature values.
    """
    pool_size = (100, 10)
    data = np.random.randint(10, size=pool_size)
    label = np.random.randint(2, size=pool_size[0])
    pool1 = Pool(data, label)
    # zip + comprehension instead of the original index-based append loop.
    lines = [
        '\t'.join([str(lab)] + [str(x) for x in row])
        for row, lab in zip(data, label)
    ]
    with open('test_data_dumps', 'w') as f:
        f.write('\n'.join(lines))
    pool2 = Pool('test_data_dumps')
    assert _check_data(pool1.get_features(), pool2.get_features())
    assert _check_data(pool1.get_label(), pool2.get_label())
def test_load_generated():
    """A Pool created from in-memory arrays must round-trip features and labels."""
    pool_size = (100, 10)
    features = np.round(np.random.normal(size=pool_size), decimals=3)
    target = np.random.randint(2, size=pool_size[0])
    generated_pool = Pool(features, target)
    assert _check_data(generated_pool.get_features(), features)
    assert _check_data(generated_pool.get_label(), target)
def test_non_ones_weight():
    """Training with per-row weights 1..N must produce the canonical model."""
    source_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    weights = np.arange(1, source_pool.num_row() + 1)
    weighted_pool = Pool(source_pool.get_features(), source_pool.get_label(),
                         weight=weights)
    model = CatBoostClassifier(iterations=2, random_seed=0)
    model.fit(weighted_pool)
    model.save_model(OUTPUT_MODEL_PATH)
    return local_canonical_file(OUTPUT_MODEL_PATH)
def test_zero_baseline():
    """An all-zero baseline must be accepted and yield the canonical model."""
    source_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    zero_baseline = np.zeros((source_pool.num_row(), 2))
    baselined_pool = Pool(source_pool.get_features(), source_pool.get_label(),
                          baseline=zero_baseline)
    model = CatBoostClassifier(iterations=2, random_seed=0)
    model.fit(baselined_pool)
    model.save_model(OUTPUT_MODEL_PATH)
    return local_canonical_file(OUTPUT_MODEL_PATH)
def test_non_zero_bazeline():
    """Predictions of a first model used as baseline when fitting a second."""
    # NOTE(review): "bazeline" is a typo for "baseline"; the name is kept so
    # test discovery and any canonical-data lookup keyed on it stay intact.
    pool = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)
    base_model = CatBoostClassifier(iterations=2, random_seed=0,
                                    loss_function="MultiClass")
    base_model.fit(pool)
    baseline_preds = np.array(base_model.predict(pool))
    baselined_pool = Pool(pool.get_features(), pool.get_label(),
                          baseline=baseline_preds)
    model = CatBoostClassifier(iterations=2, random_seed=0)
    model.fit(baselined_pool)
    model.save_model(OUTPUT_MODEL_PATH)
    return local_canonical_file(OUTPUT_MODEL_PATH)
def test_fit_data():
    """fit() fed raw arrays, weights, baseline and an eval set must match canon."""
    train_pool = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)
    eval_pool = Pool(CLOUDNESS_TEST_FILE, column_description=CLOUDNESS_CD_FILE)
    # First model supplies raw-score baselines for both pools.
    base_model = CatBoostClassifier(iterations=2, random_seed=0,
                                    loss_function="MultiClass")
    base_model.fit(train_pool)
    baseline = np.array(
        base_model.predict(train_pool, prediction_type='RawFormulaVal'))
    eval_pool.set_baseline(np.array(
        base_model.predict(eval_pool, prediction_type='RawFormulaVal')))
    model = CatBoostClassifier(iterations=2, random_seed=0,
                               loss_function="MultiClass")
    features = map_cat_features(train_pool.get_features(),
                                train_pool.get_cat_feature_indices())
    model.fit(
        features,
        train_pool.get_label(),
        train_pool.get_cat_feature_indices(),
        sample_weight=np.arange(1, train_pool.num_row() + 1),
        baseline=baseline,
        use_best_model=True,
        eval_set=eval_pool,
    )
    model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
def test_load_ndarray():
    """A Pool constructed from ndarrays must keep the expected shape."""
    file_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    cat_idx = file_pool.get_cat_feature_indices()
    features = np.array(map_cat_features(file_pool.get_features(), cat_idx))
    target = np.array(file_pool.get_label())
    assert _check_shape(Pool(features, target, cat_idx))
bagging_temperature = 19.95 ) model.fit(train_pool, eval_set=test_pool) # %% # print('training pool') # get_precision_recall(model, train_pool) # print('testing pool') # get_precision_recall(model, test_pool) # %% df_pred = ( pd.DataFrame(model.predict(test_pool), columns=['pred']). assign(y=test_pool.get_label()) ) df_pred_summary = df_pred.groupby(['y', 'pred']).size().to_frame('occurence').reset_index() df_pred_summary df_pred[['pred', 'y']].hist() # %% [markdown] # # plot entry and exit # %% # def plot_turnpt(price, turnpts, dates_index): # index_upward = np.where(turnpts==1)[0] # index_downward = np.where(turnpts==-1)[0]
def catboost_bootstrap(dir_, learn_name, test_name, cd_file, classes,
                       learning_rate=None, border_count=32, cnt_values=20,
                       file_result_to=sys.stdout, file_info_to=sys.stdout,
                       iterations=1500):
    """Bootstrap-evaluate one CatBoost model per wrapper class.

    For each class in ``classes``: preprocess the learn/test pools via the
    class's handlers, train one CatBoostClassifier, then evaluate it on
    ``cnt_values`` bootstrap resamples of the test set, recording per-resample
    best-iteration AUC/Logloss.  Learning-curve plots are written per class,
    the AUC samples are dumped to ``boot.txt``, and pairwise Wilcoxon tests
    between classes are printed to ``file_result_to``.

    Fix vs. original: the per-iteration print's format string was broken
    across two physical lines (a raw newline inside a single-quoted string —
    a syntax error); it is rejoined into one literal here.

    NOTE(review): indentation was reconstructed from a collapsed source —
    the per-class loop is assumed to end after the plot() calls, with the
    boot.txt dump and Wilcoxon comparison at function level; confirm against
    the original file.
    """
    logloss = {}
    auc = {}
    for clazz in classes:
        print('class={}'.format(clazz.WRAPPER_NAME))
        print('class={}; step={}'.format(clazz.WRAPPER_NAME,
                                         learning_rate[clazz]),
              file=file_result_to)
        file_result_to.flush()
        auc[clazz.WRAPPER_NAME] = []
        logloss[clazz.WRAPPER_NAME] = []
        tree_counts = []
        logloss_curves = []
        auc_curves = []
        cl = clazz()
        source_learn_pool = Pool(data=os.path.join(dir_, learn_name),
                                 column_description=os.path.join(dir_, cd_file))
        beg = time.time()
        learn_pool = cl.handle_learn_pool(source_learn_pool)
        end = time.time()
        print('!!!time: {}'.format(end - beg), file=file_info_to)
        print('priors: {}'.format(cl.prior), file=file_info_to)
        print('prior scores: {}'.format(cl.score), file=file_info_to)
        file_info_to.flush()
        source_test_pool = Pool(data=os.path.join(dir_, test_name),
                                column_description=os.path.join(dir_, cd_file))
        source_test_label = np.array(source_test_pool.get_label())
        source_test_features = np.array(source_test_pool.get_features())
        cat = CatBoostClassifier(max_ctr_complexity=1, custom_metric='AUC',
                                 boosting_type='Plain', random_seed=0,
                                 border_count=border_count,
                                 iterations=iterations,
                                 learning_rate=learning_rate[clazz],
                                 thread_count=multiprocessing.cpu_count())
        beg = time.time()
        cat.fit(learn_pool, use_best_model=True)
        end = time.time()
        for seed in range(cnt_values):
            # Deterministic bootstrap resample of the test set.
            idx = list(range(source_test_features.shape[0]))
            np.random.seed(seed * 10 + 300)
            boot_idx = np.random.choice(idx, len(idx), replace=True)
            boot_test_features = source_test_features[boot_idx]
            boot_test_label = source_test_label[boot_idx]
            X, y = cl.handle_test_matrix(boot_test_features, boot_test_label,
                                         False)
            metrics = cat.eval_metrics(
                Pool(X, y), ['Logloss', 'AUC'], eval_period=1,
                thread_count=multiprocessing.cpu_count())
            for num, loss in enumerate(metrics['Logloss']):
                # Rejoined format string (was split by a stray newline).
                print('iter={:10}: loss={:.10}'.format(num + 1, loss))
            # Best iteration = minimum test Logloss.
            cnt_trees = np.argmin(metrics['Logloss'])
            print('choose cnt_trees={}'.format(cnt_trees))
            print('overfit={}; AUC={}; logloss={}'.format(
                cnt_trees, metrics['AUC'][cnt_trees],
                metrics['Logloss'][cnt_trees]), file=file_result_to)
            tree_counts.append(cnt_trees)
            file_result_to.flush()
            logloss_curves.append(metrics['Logloss'])
            auc_curves.append(metrics['AUC'])
            auc[clazz.WRAPPER_NAME].append(metrics['AUC'][cnt_trees])
            logloss[clazz.WRAPPER_NAME].append(metrics['Logloss'][cnt_trees])
        print('class={}, learn_time={}, mean_tree_count={}'.format(
            clazz.WRAPPER_NAME, end - beg,
            sum(tree_counts) / len(tree_counts)), file=file_result_to)
        print('mean_AUC={}, mean_logloss={}'.format(
            sum(auc[clazz.WRAPPER_NAME]) / len(auc[clazz.WRAPPER_NAME]),
            sum(logloss[clazz.WRAPPER_NAME]) / len(logloss[clazz.WRAPPER_NAME])),
            file=file_result_to)
        file_result_to.flush()
        logloss_fig = create_learning_curves_plot(
            logloss_curves, 'logloss {}'.format(clazz.WRAPPER_NAME))
        auc_fig = create_learning_curves_plot(
            auc_curves, 'AUC {}'.format(clazz.WRAPPER_NAME))
        logloss_file = os.path.join(
            dir_, 'fig_{}_{}'.format('Logloss', clazz.WRAPPER_NAME))
        AUC_file = os.path.join(dir_,
                                'fig_{}_{}'.format('AUC', clazz.WRAPPER_NAME))
        plot(logloss_fig, filename=logloss_file, auto_open=False)
        plot(auc_fig, filename=AUC_file, auto_open=False)
    file_name = os.path.join(dir_, 'boot.txt')
    with open(file_name, 'w') as file_to:
        json.dump(auc, file_to)
    for cl1 in classes:
        for cl2 in classes:
            stat, p_value = wilcoxon(auc[cl1.WRAPPER_NAME],
                                     auc[cl2.WRAPPER_NAME],
                                     zero_method="pratt")
            print('for {} & {}: stat: {}, p_value: {}'.format(
                cl1.WRAPPER_NAME, cl2.WRAPPER_NAME, stat, p_value),
                file=file_result_to)
# NOTE(review): notebook-style top-level script chunk.  Y_CAT_train,
# Y_CAT_test, X_CAT, Y_CAT and CAT_features are defined in an earlier
# cell/chunk not visible here — confirm before running standalone.
print('训练集 Y_CAT_train 的shape是',Y_CAT_train.shape)
print('测试集 Y_CAT_test 的shape是',Y_CAT_test.shape)
print('训练集中label的正负比例分布如下:\n',Y_CAT_train.value_counts())
print('测试集中label的正负比例分布如下:\n',Y_CAT_test.value_counts())
print('可以看出在划分训练集和测试集时设定strtify参数为Y_CAT;使得在训练测试集中正负例所占比例一致。')
## CatBoost modeling
### Step 1: Pool initialization
from catboost import Pool
pool_data = Pool(data = X_CAT, label = Y_CAT, cat_features = CAT_features)
print('pool_data的 type 是:', type(pool_data))
print('pool_data的 shpe 是:', pool_data.shape)
print('pool_data.get_features()返回的是list类型,其长度是:',len(pool_data.get_features()))
print('pool_data.get_label()返回的是list类型,其长度是:', len(pool_data.get_label()))
print('pool_data中类别变量所在的索引位置是 pool_data.get_cat_feature_indices() :', pool_data.get_cat_feature_indices())
#print('生成的pool_data的各观测的weight:', pool_data.get_weight())
#print('生成的pool_data的各观测的baseline:', pool_data.get_baseline())
#### Step 2.1: custom metric class, used for best-model selection and
#### overfitting detection.
# **************Custom metric for overfitting detector and best model selection******
import math
from catboost import Pool, CatBoostClassifier


class Recall_1_Metric(object):
    # Custom CatBoost eval-metric object.  Only get_final_error is visible in
    # this chunk; the rest of the metric interface (e.g. is_max_optimal,
    # evaluate) is presumably defined in the following chunk — TODO confirm.

    def get_final_error(self, error, weight):
        # Normalize the accumulated error by total weight; the 1e-38 term
        # guards against division by zero when the total weight is 0.
        return error / (weight + 1e-38)
def catboost_test(dir_, cur_learn_name, cur_test_name, clazz,
                  learning_rate=None, border_count=128, cnt_models=1,
                  file_result_to=sys.stdout, file_info_to=sys.stdout,
                  iterations=1500):
    """Train ``cnt_models`` CatBoost classifiers and average their metrics.

    If the preprocessed learn file does not yet exist, the source pools are
    preprocessed via ``clazz``'s handlers and dumped as tab-separated text;
    otherwise the previously dumped files are reused.  Returns a 4-tuple of
    mean (accuracy, AUC, Logloss, fit-time) over the trained models, or
    ``(0, 0, 0, 0)`` when no model was trained.

    NOTE(review): indentation reconstructed from a collapsed source — the
    preprocessing block is assumed to span exactly the ``if not
    os.path.exists`` branch, with the Pool reloads at function level;
    confirm against the original file.
    """
    full_learn_name = os.path.join(dir_, cur_learn_name)
    full_test_name = os.path.join(dir_, cur_test_name)
    if not os.path.exists(full_learn_name):
        # NOTE(review): learn_name, test_name and cd_file are NOT parameters
        # of this function — they look like module-level globals (cf. the
        # catboost_bootstrap signature); if undefined, this branch raises
        # NameError.  Verify before relying on the preprocessing path.
        source_learn_pool = Pool(data=os.path.join(dir_, learn_name),
                                 column_description=os.path.join(dir_, cd_file))
        source_test_pool = Pool(data=os.path.join(dir_, test_name),
                                column_description=os.path.join(dir_, cd_file))
        cl = clazz()
        beg = time.time()
        learn_pool = cl.handle_learn_pool(source_learn_pool)
        test_pool = cl.handle_test_pool(source_test_pool)
        end = time.time()
        print('!!!time: {}'.format(end - beg), file=file_info_to)
        print('priors: {}'.format(cl.prior), file=file_info_to)
        print('prior scores: {}'.format(cl.score), file=file_info_to)
        file_info_to.flush()
        # Dump label + features as one tab-separated matrix per pool so the
        # preprocessing can be skipped on subsequent runs.
        learn_label = learn_pool.get_label()
        learn_features = learn_pool.get_features()
        learn_data = np.zeros((len(learn_label), len(learn_features[0]) + 1))
        learn_data[:, 0] = learn_label
        learn_data[:, 1:] = learn_features
        np.savetxt(full_learn_name, learn_data, delimiter='\t', fmt='%.10f')
        test_label = test_pool.get_label()
        test_features = test_pool.get_features()
        test_data = np.zeros((len(test_label), len(test_features[0]) + 1))
        test_data[:, 0] = test_label
        test_data[:, 1:] = test_features
        np.savetxt(full_test_name, test_data, delimiter='\t', fmt='%.10f')
    learn_pool = Pool(data=full_learn_name)
    test_pool = Pool(data=full_test_name)
    scores = []
    auc = []
    logloss = []
    times =[]
    tree_counts = []
    for seed in range(cnt_models):
        print(seed)
        # print(len(learn_pool.get_features()), len(learn_pool.get_features()[0]))
        # print(len(test_pool.get_features()), len(test_pool.get_features()[0]))
        beg = time.time()
        cat = CatBoostClassifier(max_ctr_complexity=1, custom_metric='AUC',
                                 boosting_type='Plain', random_seed=seed,
                                 border_count=border_count,
                                 iterations=iterations,
                                 learning_rate=learning_rate,
                                 thread_count=multiprocessing.cpu_count())
        cat.fit(learn_pool, eval_set=(test_pool), use_best_model=True)
        end = time.time()
        X_test = test_pool.get_features()
        y_test = test_pool.get_label()
        tree_counts.append(cat.tree_count_)
        scores.append(cat.score(X_test, y_test))
        # eval_period = tree_count_ - 1 yields metric values at iteration 0
        # and at the final tree; index [1] below is the final-tree value.
        metrics = cat.eval_metrics(test_pool, ['AUC', 'Logloss'],
                                   eval_period=cat.tree_count_ - 1)
        print('overfit={}; acc={}; AUC={}; logloss={}; learn_time={}'.format(cat.tree_count_, scores[-1], metrics['AUC'][1], metrics['Logloss'][1], end - beg), file=file_result_to)
        file_result_to.flush()
        auc.append(metrics['AUC'][1])
        logloss.append(metrics['Logloss'][1])
        times.append(end - beg)
    if len(tree_counts) != 0:
        print('mean tree_count: {}'.format(sum(tree_counts)/len(tree_counts)), file=file_result_to)
        return sum(scores)/len(scores), sum(auc)/len(auc), sum(logloss)/len(logloss), sum(times)/len(times)
    else:
        # cnt_models == 0 (or loop never ran): nothing to average.
        return 0, 0, 0, 0