def get_test_result_3class(variant, clf, pool: Pool):
    """Score a 3-class classifier on ``pool`` and pack the outcome into a Result.

    For every sample the probability row from ``clf.predict_proba`` is
    checked against two optional thresholds taken from ``variant``:

    * column 2 against ``variant.min_proba``; on a match the sample is
      recorded in the positive win/loss counter as a win when its label is 1;
    * otherwise column 0 against ``variant.get_min_neg_proba()``; on a match
      the sample is recorded in the negative counter as a win when its label
      is -1.

    A threshold of ``None`` disables the corresponding side.  Samples that
    match neither threshold are not counted.
    (presumably the probability columns are ordered -1, 0, 1 -- TODO confirm)

    Each side's profit is ``size * (ratio - break_even_ratio)`` rounded to
    three decimals, or 0.0 when that side's counter is falsy.  The negative
    profit is only folded into ``profit``; it is not passed to the Result
    separately.
    """
    probabilities = clf.predict_proba(pool)
    poswl = st.WinLoss()
    negwl = st.WinLoss()
    pos_threshold = variant.min_proba
    neg_threshold = variant.get_min_neg_proba()

    for class_probs, label in zip(probabilities, pool.get_label()):
        if pos_threshold is not None and class_probs[2] >= pos_threshold:
            poswl.hit(label == 1)
            continue
        # The negative side is only considered when the positive one did not fire.
        if neg_threshold is not None and class_probs[0] >= neg_threshold:
            negwl.hit(label == -1)

    ratios = variant.profit_ratios
    pos_profit = (
        round(poswl.size * (poswl.ratio - ratios.pos_ratio), 3)
        if poswl else 0.0
    )
    neg_profit = (
        round(negwl.size * (negwl.ratio - ratios.neg_ratio), 3)
        if negwl else 0.0
    )
    profit = pos_profit + neg_profit

    return cco.Result(
        name=variant.name,
        mean=cco.fmt((poswl + negwl).ratio),
        leny=len(pool.get_label()),
        scr=cco.fmt(clf.score(pool)),
        poswl=poswl,
        negwl=negwl,
        profit=profit,
        pos_profit=pos_profit,
    )
Beispiel #2
0
def test_load_df():
    """A Pool built from a DataFrame must match the Pool read from the file.

    The same training file is loaded twice: once directly by ``Pool`` with a
    column description, and once through ``read_table`` with the target
    column split off into its own DataFrame.
    """
    file_pool = Pool(NAN_TRAIN_FILE, column_description=NAN_CD_FILE)

    table = read_table(NAN_TRAIN_FILE, header=None)
    target = DataFrame(table.iloc[:, TARGET_IDX])
    # Remove the target column so only feature columns remain in the table.
    table.drop([TARGET_IDX], axis=1, inplace=True)

    categorical = file_pool.get_cat_feature_indices()
    frame_pool = Pool(table, target, categorical)

    assert _check_data(file_pool.get_features(), frame_pool.get_features())
    assert _check_data(file_pool.get_label(), frame_pool.get_label())
Beispiel #3
0
def test_load_series():
    """A Pool built from pandas Series must match the Pool read from the file.

    Features are passed as a Series whose elements are the individual rows
    of the table; the label is passed as a Series of the target column.
    """
    file_pool = Pool(TRAIN_FILE, column_description=CD_FILE)

    table = read_table(TRAIN_FILE, header=None)
    target = Series(table.iloc[:, TARGET_IDX])
    table.drop([TARGET_IDX], axis=1, inplace=True)
    # One Series element per row of the remaining feature columns.
    rows = Series(list(table.values))

    categorical = file_pool.get_cat_feature_indices()
    series_pool = Pool(rows, target, categorical)

    assert _check_data(file_pool.get_features(), series_pool.get_features())
    assert _check_data(file_pool.get_label(), series_pool.get_label())
Beispiel #4
0
def test_load_dumps():
    """A Pool round-tripped through a tab-separated text file must be unchanged.

    Random integer features and binary labels are generated, written to the
    file ``test_data_dumps`` (label first, then the features, one sample per
    line) and read back with ``Pool``.
    """
    n_rows, n_cols = 100, 10
    data = np.random.randint(10, size=(n_rows, n_cols))
    label = np.random.randint(2, size=n_rows)
    memory_pool = Pool(data, label)

    # One text line per sample: "<label>\t<f0>\t<f1>...".
    text = '\n'.join(
        '\t'.join([str(target)] + [str(value) for value in row])
        for target, row in zip(label, data)
    )
    with open('test_data_dumps', 'w') as f:
        f.write(text)

    file_pool = Pool('test_data_dumps')
    assert _check_data(memory_pool.get_features(), file_pool.get_features())
    assert _check_data(memory_pool.get_label(), file_pool.get_label())
Beispiel #5
0
def test_load_generated():
    """A Pool built from generated arrays must hand back the same arrays.

    Features are normally distributed values rounded to three decimals;
    labels are random integers in {0, 1}.
    """
    n_rows, n_cols = 100, 10
    features = np.round(np.random.normal(size=(n_rows, n_cols)), decimals=3)
    targets = np.random.randint(2, size=n_rows)

    generated_pool = Pool(features, targets)

    assert _check_data(generated_pool.get_features(), features)
    assert _check_data(generated_pool.get_label(), targets)
Beispiel #6
0
def test_non_ones_weight():
    """Fit a classifier on a Pool whose sample weights are 1, 2, ..., num_row.

    The trained model is saved to ``OUTPUT_MODEL_PATH`` and the result of
    ``local_canonical_file`` for that path is returned.
    """
    source_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    row_weights = np.arange(1, source_pool.num_row() + 1)
    weighted_pool = Pool(
        source_pool.get_features(),
        source_pool.get_label(),
        weight=row_weights,
    )

    classifier = CatBoostClassifier(iterations=2, random_seed=0)
    classifier.fit(weighted_pool)
    classifier.save_model(OUTPUT_MODEL_PATH)
    return local_canonical_file(OUTPUT_MODEL_PATH)
Beispiel #7
0
def test_zero_baseline():
    """Fit a classifier on a Pool carrying an all-zero baseline.

    The baseline has shape ``(num_row, 2)``.  The trained model is saved to
    ``OUTPUT_MODEL_PATH`` and the result of ``local_canonical_file`` for
    that path is returned.
    """
    source_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    zero_baseline = np.zeros((source_pool.num_row(), 2))
    baseline_pool = Pool(
        source_pool.get_features(),
        source_pool.get_label(),
        baseline=zero_baseline,
    )

    classifier = CatBoostClassifier(iterations=2, random_seed=0)
    classifier.fit(baseline_pool)
    classifier.save_model(OUTPUT_MODEL_PATH)
    return local_canonical_file(OUTPUT_MODEL_PATH)
Beispiel #8
0
def test_non_zero_bazeline():
    """Fit a classifier on a Pool whose baseline comes from another model.

    A two-iteration MultiClass model is trained first; its ``predict``
    output (default prediction type) is used as the baseline of a second
    Pool, on which a second classifier is trained.  That model is saved to
    ``OUTPUT_MODEL_PATH`` and the result of ``local_canonical_file`` for
    that path is returned.
    """
    source_pool = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)

    seed_model = CatBoostClassifier(
        iterations=2, random_seed=0, loss_function="MultiClass")
    seed_model.fit(source_pool)
    model_baseline = np.array(seed_model.predict(source_pool))

    baseline_pool = Pool(
        source_pool.get_features(),
        source_pool.get_label(),
        baseline=model_baseline,
    )
    classifier = CatBoostClassifier(iterations=2, random_seed=0)
    classifier.fit(baseline_pool)
    classifier.save_model(OUTPUT_MODEL_PATH)
    return local_canonical_file(OUTPUT_MODEL_PATH)
Beispiel #9
0
def test_fit_data():
    """Fit a MultiClass model from raw data with weights, baseline and eval set.

    A base MultiClass model supplies ``RawFormulaVal`` predictions that are
    used as the baseline for both the training data and the evaluation
    Pool.  The second model is fitted on the features passed through
    ``map_cat_features`` (not on a Pool), with sample weights
    1, 2, ..., num_row.  The model is saved to ``OUTPUT_MODEL_PATH`` and the
    result of ``compare_canonical_models`` for that path is returned.
    """
    train_pool = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)
    eval_pool = Pool(CLOUDNESS_TEST_FILE, column_description=CLOUDNESS_CD_FILE)

    seed_model = CatBoostClassifier(
        iterations=2, random_seed=0, loss_function="MultiClass")
    seed_model.fit(train_pool)

    train_baseline = np.array(
        seed_model.predict(train_pool, prediction_type='RawFormulaVal'))
    eval_baseline = np.array(
        seed_model.predict(eval_pool, prediction_type='RawFormulaVal'))
    eval_pool.set_baseline(eval_baseline)

    classifier = CatBoostClassifier(
        iterations=2, random_seed=0, loss_function="MultiClass")
    features = map_cat_features(
        train_pool.get_features(), train_pool.get_cat_feature_indices())
    classifier.fit(
        features,
        train_pool.get_label(),
        train_pool.get_cat_feature_indices(),
        sample_weight=np.arange(1, train_pool.num_row() + 1),
        baseline=train_baseline,
        use_best_model=True,
        eval_set=eval_pool,
    )
    classifier.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Beispiel #10
0
def test_load_ndarray():
    """A Pool built from numpy arrays must pass the ``_check_shape`` check.

    Features are taken from a file-backed Pool, passed through
    ``map_cat_features`` and converted to an ndarray; labels are converted
    to an ndarray as well.
    """
    file_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    categorical = file_pool.get_cat_feature_indices()

    feature_array = np.array(
        map_cat_features(file_pool.get_features(), categorical))
    label_array = np.array(file_pool.get_label())

    array_pool = Pool(feature_array, label_array, categorical)
    assert _check_shape(array_pool)
Beispiel #11
0
                           bagging_temperature = 19.95
                          )

# Train the model on the training pool, using the test pool as the eval set.
model.fit(train_pool, eval_set=test_pool)

# %%
# print('training pool')
# get_precision_recall(model, train_pool)

# print('testing pool')
# get_precision_recall(model, test_pool)

# %%
# Put the predictions for the test pool next to the true labels:
# column 'pred' holds model.predict(...), column 'y' holds the pool labels.
df_pred = (
    pd.DataFrame(model.predict(test_pool), columns=['pred']).
    assign(y=test_pool.get_label())
)

# Count how often each (true label, prediction) pair occurs.
df_pred_summary = df_pred.groupby(['y', 'pred']).size().to_frame('occurence').reset_index()
# Bare expression: displays the summary when run as a notebook cell.
df_pred_summary


# Histograms of the predicted and the true values.
df_pred[['pred', 'y']].hist()

# %% [markdown]
# # plot entry and exit

# %%
# def plot_turnpt(price, turnpts, dates_index):
#     index_upward = np.where(turnpts==1)[0]
#     index_downward = np.where(turnpts==-1)[0]
def catboost_bootstrap(dir_,
                       learn_name,
                       test_name,
                       cd_file,
                       classes,
                       learning_rate=None,
                       border_count=32,
                       cnt_values=20,
                       file_result_to=sys.stdout,
                       file_info_to=sys.stdout,
                       iterations=1500):
    """Compare preprocessing wrappers by bootstrapped AUC / Logloss.

    For every wrapper class in ``classes`` the learn pool is transformed
    with ``handle_learn_pool``, one CatBoost model is trained on it, and the
    model is evaluated on ``cnt_values`` bootstrap resamples of the test
    data (each resample is transformed with ``handle_test_matrix``).  For
    each resample the iteration with the lowest Logloss is selected and the
    AUC / Logloss at that iteration are recorded.

    Side effects: progress and results are printed to stdout,
    ``file_result_to`` and ``file_info_to``; learning-curve figures are
    written to ``dir_`` via ``plot``; the collected AUC values are dumped as
    JSON to ``<dir_>/boot.txt``; finally a Wilcoxon test is printed for
    every ordered pair of wrapper classes.

    Args:
        dir_: Directory holding the data files; also receives the outputs.
        learn_name: File name of the learn set inside ``dir_``.
        test_name: File name of the test set inside ``dir_``.
        cd_file: File name of the column description inside ``dir_``.
        classes: Iterable of wrapper classes.  Each must expose
            ``WRAPPER_NAME`` and be constructible without arguments.
        learning_rate: Optional container indexed by wrapper class
            (``learning_rate[clazz]``).  When ``None`` (the default),
            ``None`` is passed to CatBoost for every class so that it
            selects the learning rate itself.
        border_count: Forwarded to ``CatBoostClassifier``.
        cnt_values: Number of bootstrap resamples per wrapper class.
        file_result_to: Writable stream for result lines.
        file_info_to: Writable stream for timing / prior information.
        iterations: Forwarded to ``CatBoostClassifier``.

    Returns:
        None.
    """
    logloss = {}
    auc = {}
    thread_count = multiprocessing.cpu_count()
    for clazz in classes:
        # The default learning_rate=None used to be indexed unconditionally,
        # which raised TypeError; fall back to CatBoost's own default.
        step = None if learning_rate is None else learning_rate[clazz]

        print('class={}'.format(clazz.WRAPPER_NAME))
        print('class={}; step={}'.format(clazz.WRAPPER_NAME, step),
              file=file_result_to)
        file_result_to.flush()
        auc[clazz.WRAPPER_NAME] = []
        logloss[clazz.WRAPPER_NAME] = []
        tree_counts = []
        logloss_curves = []
        auc_curves = []

        cl = clazz()
        source_learn_pool = Pool(data=os.path.join(dir_, learn_name),
                                 column_description=os.path.join(
                                     dir_, cd_file))
        # Time only the wrapper's preprocessing of the learn pool.
        beg = time.time()
        learn_pool = cl.handle_learn_pool(source_learn_pool)
        end = time.time()
        print('!!!time: {}'.format(end - beg), file=file_info_to)
        print('priors: {}'.format(cl.prior), file=file_info_to)
        print('prior scores: {}'.format(cl.score), file=file_info_to)
        file_info_to.flush()

        source_test_pool = Pool(data=os.path.join(dir_, test_name),
                                column_description=os.path.join(dir_, cd_file))
        source_test_label = np.array(source_test_pool.get_label())
        source_test_features = np.array(source_test_pool.get_features())

        cat = CatBoostClassifier(max_ctr_complexity=1,
                                 custom_metric='AUC',
                                 boosting_type='Plain',
                                 random_seed=0,
                                 border_count=border_count,
                                 iterations=iterations,
                                 learning_rate=step,
                                 thread_count=thread_count)
        # beg/end now hold the training time reported after the bootstrap loop.
        beg = time.time()
        cat.fit(learn_pool, use_best_model=True)
        end = time.time()

        # Row indices to resample from; identical for every bootstrap round.
        idx = list(range(source_test_features.shape[0]))
        for seed in range(cnt_values):
            # Fixed seed per round makes the resamples reproducible.
            np.random.seed(seed * 10 + 300)
            boot_idx = np.random.choice(idx, len(idx), replace=True)
            boot_test_features = source_test_features[boot_idx]
            boot_test_label = source_test_label[boot_idx]
            X, y = cl.handle_test_matrix(boot_test_features, boot_test_label,
                                         False)
            metrics = cat.eval_metrics(
                Pool(X, y), ['Logloss', 'AUC'],
                eval_period=1,
                thread_count=thread_count)
            for num, loss in enumerate(metrics['Logloss']):
                print('iter={:10}:     loss={:.10}'.format(num + 1, loss))
            # 0-based position of the lowest Logloss in the metric curve.
            cnt_trees = np.argmin(metrics['Logloss'])
            print('choose cnt_trees={}'.format(cnt_trees))
            print('overfit={}; AUC={}; logloss={}'.format(
                cnt_trees, metrics['AUC'][cnt_trees],
                metrics['Logloss'][cnt_trees]),
                  file=file_result_to)
            tree_counts.append(cnt_trees)
            file_result_to.flush()
            logloss_curves.append(metrics['Logloss'])
            auc_curves.append(metrics['AUC'])
            auc[clazz.WRAPPER_NAME].append(metrics['AUC'][cnt_trees])
            logloss[clazz.WRAPPER_NAME].append(metrics['Logloss'][cnt_trees])

        print('class={}, learn_time={}, mean_tree_count={}'.format(
            clazz.WRAPPER_NAME, end - beg,
            sum(tree_counts) / len(tree_counts)),
              file=file_result_to)
        print('mean_AUC={}, mean_logloss={}'.format(
            sum(auc[clazz.WRAPPER_NAME]) / len(auc[clazz.WRAPPER_NAME]),
            sum(logloss[clazz.WRAPPER_NAME]) /
            len(logloss[clazz.WRAPPER_NAME])),
              file=file_result_to)
        file_result_to.flush()

        logloss_fig = create_learning_curves_plot(
            logloss_curves, 'logloss {}'.format(clazz.WRAPPER_NAME))
        auc_fig = create_learning_curves_plot(
            auc_curves, 'AUC {}'.format(clazz.WRAPPER_NAME))
        logloss_file = os.path.join(
            dir_, 'fig_{}_{}'.format('Logloss', clazz.WRAPPER_NAME))
        AUC_file = os.path.join(dir_,
                                'fig_{}_{}'.format('AUC', clazz.WRAPPER_NAME))
        plot(logloss_fig, filename=logloss_file, auto_open=False)
        plot(auc_fig, filename=AUC_file, auto_open=False)

    file_name = os.path.join(dir_, 'boot.txt')
    with open(file_name, 'w') as file_to:
        json.dump(auc, file_to)

    # NOTE(review): every ordered pair is tested, including a class against
    # itself, where all paired differences are zero.  Some SciPy versions may
    # reject that input with zero_method="pratt" -- confirm against the
    # installed version before relying on this output.
    for cl1 in classes:
        for cl2 in classes:
            stat, p_value = wilcoxon(auc[cl1.WRAPPER_NAME],
                                     auc[cl2.WRAPPER_NAME],
                                     zero_method="pratt")
            print('for {} & {}: stat: {}, p_value: {}'.format(
                cl1.WRAPPER_NAME, cl2.WRAPPER_NAME, stat, p_value),
                  file=file_result_to)
Beispiel #13
0
# Report the shapes of the train / test target vectors.
print('训练集 Y_CAT_train 的shape是',Y_CAT_train.shape)
print('测试集 Y_CAT_test  的shape是',Y_CAT_test.shape)
# Report the label value counts in the train and test targets.
print('训练集中label的正负比例分布如下:\n',Y_CAT_train.value_counts())
print('测试集中label的正负比例分布如下:\n',Y_CAT_test.value_counts())
print('可以看出在划分训练集和测试集时设定strtify参数为Y_CAT;使得在训练测试集中正负例所占比例一致。')

## CatBoost modeling
### Step1: Pool Initialize
from catboost import Pool
# Build a Pool from the feature data, the labels and the categorical feature list.
pool_data = Pool(data = X_CAT,
           label = Y_CAT,
           cat_features = CAT_features)
# Print the Pool's type, shape, feature / label lengths and the indices of
# its categorical features.
print('pool_data的 type 是:', type(pool_data))
print('pool_data的 shpe 是:', pool_data.shape)
print('pool_data.get_features()返回的是list类型,其长度是:',len(pool_data.get_features()))
print('pool_data.get_label()返回的是list类型,其长度是:', len(pool_data.get_label()))
print('pool_data中类别变量所在的索引位置是 pool_data.get_cat_feature_indices() :', pool_data.get_cat_feature_indices())
#print('per-observation weight of the generated pool_data:', pool_data.get_weight())
#print('per-observation baseline of the generated pool_data:', pool_data.get_baseline())


#### Step2.1 Custom metric class, used for best-model selection and overfitting detection

# **************Custom metric for overfitting detector and best model selection******
import math
from catboost import Pool, CatBoostClassifier


class Recall_1_Metric(object):
    """Custom metric object; only the final-error step is defined here."""

    def get_final_error(self, error, weight):
        """Return ``error`` divided by ``weight`` plus a tiny constant.

        The constant ``1e-38`` is added to the denominator so that a zero
        ``weight`` does not raise a division-by-zero error.
        """
        guarded_weight = weight + 1e-38
        return error / guarded_weight
def catboost_test(dir_, cur_learn_name, cur_test_name, clazz, learning_rate=None, border_count=128, cnt_models=1,
                  file_result_to=sys.stdout, file_info_to=sys.stdout, iterations=1500):
    """Train ``cnt_models`` CatBoost models on wrapper-preprocessed data.

    The preprocessed learn / test sets are cached as tab-separated files
    ``<dir_>/<cur_learn_name>`` and ``<dir_>/<cur_test_name>`` (label in the
    first column).  When the learn file does not exist, both files are
    produced by running the source pools through ``clazz().handle_learn_pool``
    and ``clazz().handle_test_pool``.

    NOTE(review): the source pools are located through ``learn_name``,
    ``test_name`` and ``cd_file``, which are not parameters of this
    function -- presumably module-level globals; confirm.

    One model is trained per seed in ``range(cnt_models)`` with the test
    pool as eval set.  Accuracy, AUC, Logloss and wall-clock time
    (constructor plus fit) are collected per model and a result line is
    written to ``file_result_to``.

    Returns:
        A tuple ``(mean accuracy, mean AUC, mean Logloss, mean time)``, or
        ``(0, 0, 0, 0)`` when no model was trained.
    """
    full_learn_name = os.path.join(dir_, cur_learn_name)
    full_test_name = os.path.join(dir_, cur_test_name)

    if not os.path.exists(full_learn_name):
        raw_learn_pool = Pool(data=os.path.join(dir_, learn_name), column_description=os.path.join(dir_, cd_file))
        raw_test_pool = Pool(data=os.path.join(dir_, test_name), column_description=os.path.join(dir_, cd_file))
        wrapper = clazz()
        started = time.time()
        learn_pool = wrapper.handle_learn_pool(raw_learn_pool)
        test_pool = wrapper.handle_test_pool(raw_test_pool)
        elapsed = time.time() - started
        print('!!!time: {}'.format(elapsed), file=file_info_to)
        print('priors: {}'.format(wrapper.prior), file=file_info_to)
        print('prior scores: {}'.format(wrapper.score), file=file_info_to)
        file_info_to.flush()

        # Dump each processed pool as "<label>\t<features...>" rows.
        for cache_path, processed in ((full_learn_name, learn_pool), (full_test_name, test_pool)):
            labels = processed.get_label()
            features = processed.get_features()
            table = np.zeros((len(labels), len(features[0]) + 1))
            table[:, 0] = labels
            table[:, 1:] = features
            np.savetxt(cache_path, table, delimiter='\t', fmt='%.10f')

    # Always train from the cached files, whether just written or pre-existing.
    learn_pool = Pool(data=full_learn_name)
    test_pool = Pool(data=full_test_name)

    scores, auc, logloss, times, tree_counts = [], [], [], [], []
    for seed in range(cnt_models):
        print(seed)
        started = time.time()
        cat = CatBoostClassifier(
            max_ctr_complexity=1,
            custom_metric='AUC',
            boosting_type='Plain',
            random_seed=seed,
            border_count=border_count,
            iterations=iterations,
            learning_rate=learning_rate,
            thread_count=multiprocessing.cpu_count(),
        )
        cat.fit(learn_pool, eval_set=(test_pool), use_best_model=True)
        fit_seconds = time.time() - started

        X_test = test_pool.get_features()
        y_test = test_pool.get_label()

        tree_counts.append(cat.tree_count_)
        scores.append(cat.score(X_test, y_test))
        # Evaluate with a period of tree_count_ - 1 and read entry [1] of
        # each returned metric list.
        metrics = cat.eval_metrics(test_pool, ['AUC', 'Logloss'], eval_period=cat.tree_count_ - 1)
        final_auc = metrics['AUC'][1]
        final_logloss = metrics['Logloss'][1]
        print('overfit={}; acc={}; AUC={}; logloss={}; learn_time={}'.format(
            cat.tree_count_, scores[-1], final_auc, final_logloss, fit_seconds), file=file_result_to)
        file_result_to.flush()
        auc.append(final_auc)
        logloss.append(final_logloss)
        times.append(fit_seconds)

    if not tree_counts:
        return 0, 0, 0, 0

    def mean(values):
        return sum(values) / len(values)

    print('mean tree_count: {}'.format(mean(tree_counts)), file=file_result_to)
    return mean(scores), mean(auc), mean(logloss), mean(times)