Example #1
def test_non_ones_weight():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    pool2 = Pool(pool.get_features(), pool.get_label(), weight=np.arange(1, pool.num_row()+1))
    model = CatBoostClassifier(iterations=2, random_seed=0)
    model.fit(pool2)
    model.save_model(OUTPUT_MODEL_PATH)
    return local_canonical_file(OUTPUT_MODEL_PATH)
Example #2
def test_fit_data():
    pool = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)
    eval_pool = Pool(CLOUDNESS_TEST_FILE, column_description=CLOUDNESS_CD_FILE)
    base_model = CatBoostClassifier(iterations=2,
                                    random_seed=0,
                                    loss_function="MultiClass")
    base_model.fit(pool)
    baseline = np.array(
        base_model.predict(pool, prediction_type='RawFormulaVal'))
    eval_baseline = np.array(
        base_model.predict(eval_pool, prediction_type='RawFormulaVal'))
    eval_pool.set_baseline(eval_baseline)
    model = CatBoostClassifier(iterations=2,
                               random_seed=0,
                               loss_function="MultiClass")
    data = map_cat_features(pool.get_features(),
                            pool.get_cat_feature_indices())
    model.fit(data,
              pool.get_label(),
              pool.get_cat_feature_indices(),
              sample_weight=np.arange(1,
                                      pool.num_row() + 1),
              baseline=baseline,
              use_best_model=True,
              eval_set=eval_pool)
    model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Example #3
def test_non_ones_weight():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    weight = np.arange(1, pool.num_row() + 1)
    pool.set_weight(weight)
    model = CatBoostClassifier(iterations=2, random_seed=0)
    model.fit(pool)
    model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Example #4
def test_zero_baseline():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    baseline = np.zeros(pool.num_row())
    pool.set_baseline(baseline)
    model = CatBoostClassifier(iterations=2, random_seed=0)
    model.fit(pool)
    model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Example #5
def test_zero_baseline():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    baseline = np.zeros((pool.num_row(), 2))
    pool = Pool(pool.get_features(), pool.get_label(), baseline=baseline)
    model = CatBoostClassifier(iterations=2, random_seed=0)
    model.fit(pool)
    model.save_model(OUTPUT_MODEL_PATH)
    return local_canonical_file(OUTPUT_MODEL_PATH)
Example #6
def aggregate_results(name,
                      modes=["single", "ens", "virt"],
                      algorithms=['sgb-fixed', 'sglb-fixed'],
                      num_models=10):

    results = []  # metric values for all algorithms and all folds

    for mode in modes:
        for alg in algorithms:

            if alg == "rf":
                train_pool, y_train, test_pool, y_test, enc = process_classification_dataset(
                    name)

                # process ood data
                cd = read_cd("datasets/" + name + "/pool.cd",
                             data_file="datasets/" + name + "/test")
                try:
                    label_ind = cd['column_type_to_indices']['Label']
                except KeyError:
                    label_ind = cd['column_type_to_indices']['Target']

                ood_test_pool = np.loadtxt("datasets/ood/" + name,
                                           delimiter="\t",
                                           dtype="object")
                ood_test_pool = enc.transform(ood_test_pool).astype("float64")
                ood_test_pool = np.delete(ood_test_pool, label_ind, 1)
                ood_size = len(ood_test_pool)

            else:
                test_pool = Pool(data="datasets/" + name + "/test",
                                 column_description="datasets/" + name +
                                 "/pool.cd")
                ood_test_pool = Pool(data="datasets/ood/" + name,
                                     column_description="datasets/" + name +
                                     "/pool.cd")
                ood_size = ood_test_pool.num_row()

                y_test = test_pool.get_label()

            test_size = len(y_test)
            domain_labels = np.concatenate(
                [np.zeros(test_size), np.ones(ood_size)])

            y_test_norm = normalize_test_labels(y_test)

            values = defaultdict()  # metric values for all folds for the given algorithm

            if mode == "single":
                # use 0th model from ensemble as a single model
                model = load_model(name, alg, 0)
                preds = model.predict(test_pool)
                preds_proba = model.predict_proba(test_pool)

                values["error"] = (preds != y_test).astype(int)
                values["nll"] = nll_class(y_test_norm, preds_proba)
                values["TU_prr"] = prr_class(y_test_norm, preds_proba,
                                             entropy(preds_proba), False)
                values["KU_prr"] = float("nan")
                values["KU_auc"] = float("nan")

                ood_preds_proba = model.predict_proba(ood_test_pool)
                in_measure = entropy(preds_proba)
                out_measure = entropy(ood_preds_proba)
                values["TU_auc"] = ood_detect(domain_labels,
                                              in_measure,
                                              out_measure,
                                              mode="ROC")

            if mode == "ens":
                all_preds = []  # predictions of all models in ensemble
                all_preds_ood = []

                for i in range(num_models):
                    model = load_model(name, alg, i)
                    preds = model.predict_proba(test_pool)
                    all_preds.append(preds)
                    preds = model.predict_proba(ood_test_pool)
                    all_preds_ood.append(preds)

                all_preds = np.array(all_preds)
                preds_proba = np.mean(all_preds, axis=0)

                all_preds_ood = np.array(all_preds_ood)

                preds = np.argmax(preds_proba, axis=1)
                values["error"] = (preds != y_test_norm).astype(int)
                values["nll"] = nll_class(y_test_norm, preds_proba)

                TU = entropy_of_expected_class(all_preds)
                DU = expected_entropy_class(all_preds)
                KU = TU - DU

                TU_ood = entropy_of_expected_class(all_preds_ood)
                DU_ood = expected_entropy_class(all_preds_ood)
                KU_ood = TU_ood - DU_ood

                values["TU_prr"] = prr_class(y_test_norm, preds_proba, TU,
                                             False)
                values["KU_prr"] = prr_class(y_test_norm, preds_proba, KU,
                                             False)

                values["TU_auc"] = ood_detect(domain_labels,
                                              TU,
                                              TU_ood,
                                              mode="ROC")
                values["KU_auc"] = ood_detect(domain_labels,
                                              KU,
                                              KU_ood,
                                              mode="ROC")

            if mode == "virt":
                if alg in ["sgb", "sgb-fixed"
                           ]:  # we do not evaluate virtual sgb model
                    continue

                # generate virtual ensemble from 0th model
                model = load_model(name, alg, 0)

                all_preds = virtual_ensembles_predict(test_pool, model, alg)

                preds_proba = np.mean(all_preds, axis=0)

                preds = np.argmax(preds_proba, axis=1)
                values["error"] = (preds != y_test_norm).astype(int)
                values["nll"] = nll_class(y_test_norm, preds_proba)

                TU = entropy_of_expected_class(all_preds)
                DU = expected_entropy_class(all_preds)
                KU = TU - DU

                all_preds_ood = virtual_ensembles_predict(
                    ood_test_pool, model, alg)
                TU_ood = entropy_of_expected_class(all_preds_ood)
                DU_ood = expected_entropy_class(all_preds_ood)
                KU_ood = TU_ood - DU_ood

                values["TU_prr"] = prr_class(y_test_norm, preds_proba, TU,
                                             False)
                values["KU_prr"] = prr_class(y_test_norm, preds_proba, KU,
                                             False)

                values["TU_auc"] = ood_detect(domain_labels,
                                              TU,
                                              TU_ood,
                                              mode="ROC")
                values["KU_auc"] = ood_detect(domain_labels,
                                              KU,
                                              KU_ood,
                                              mode="ROC")

            results.append(values)

    return np.array(results)
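A minimal usage sketch for aggregate_results above (the dataset name "adult", the mode selection, and the summary printout are illustrative assumptions, not taken from the original code):

import numpy as np

# Hypothetical driver: collect the metrics for one dataset and report the mean
# classification error and NLL for each (mode, algorithm) combination returned
# by aggregate_results. Each entry of `results` is the dict of per-fold metric
# values built inside the function.
if __name__ == "__main__":
    results = aggregate_results("adult", modes=["single", "ens"], num_models=10)
    for values in results:
        print("error: %.4f  nll: %.4f" %
              (np.mean(values["error"]), np.mean(values["nll"])))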