def order_by_distribution_drift(pairs, labels, training_y_arr, quarter_y_arr):
    """Sort label pairs by the mean prevalence drift of their two labels
    between the training set and the test quarter."""
    pos_training_prev, neg_training_prev = compute_prevalence(
        labels, training_y_arr)
    pos_test_prev, neg_test_prev = compute_prevalence(labels, quarter_y_arr)
    prevalence_drift = dict()
    for label in labels:
        prevalence_drift[label] = absolute_error(pos_training_prev[label],
                                                 neg_training_prev[label],
                                                 pos_test_prev[label],
                                                 neg_test_prev[label])

    sorted_pairs = sorted(
        pairs,
        key=lambda t: (prevalence_drift[t[0]] + prevalence_drift[t[1]]) / 2)
    return sorted_pairs, prevalence_drift
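The helpers compute_prevalence and absolute_error are not defined in these snippets. A minimal sketch of what they plausibly do, inferred only from their call sites above (hypothetical reconstructions, not the originals):

import numpy as np

def compute_prevalence(labels, y_arr):
    # Per-label positive/negative prevalence from binary indicator arrays.
    pos_prev = {label: float(np.mean(y_arr[label])) for label in labels}
    neg_prev = {label: 1.0 - pos_prev[label] for label in labels}
    return pos_prev, neg_prev

def absolute_error(pos_train, neg_train, pos_test, neg_test):
    # Mean absolute prevalence difference between training and test sets.
    return (abs(pos_train - pos_test) + abs(neg_train - neg_test)) / 2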
Example #2
def test_alpha_values_single_pair(alpha_cr: float, alpha_cp: float, label_cr, label_cp):
    run_costs = dict()
    labels = [label_cr, label_cp]
    pairs = [(label_cr, label_cp)]
    alpha_labels = {pair: [alpha_cr, alpha_cp] for pair in pairs}
    for train_index, test_index in k_fold.split(train_x):
        train = train_x[train_index]
        test = train_x[test_index]
        classifiers = learn_classifiers(dataset, train, labels, 10, train_index=train_index)
        posterior_probabilities = compute_posterior_probabilities(dataset, test, labels, classifiers)

        train_y = dict()
        test_y = dict()

        for label in labels:
            train_y[label] = np.asarray(dataset.target[train_index, dataset.target_names.searchsorted(label)].todense()).squeeze()
            test_y[label] = np.asarray(dataset.target[test_index, dataset.target_names.searchsorted(label)].todense()).squeeze()

        prior_probabilities, _ = compute_prevalence(labels, train_y)
        costs = Costs(cost_structure_1, pairs, posterior_probabilities, test_y)
        minecore = MineCore(pairs, prior_probabilities, posterior_probabilities, test_y, alpha_labels, 1.0, 1.0)
        tau_rs, tau_ps, _, cm_3 = minecore.run_plusplus(costs)
        for key, value in costs.get_third_phase_costs(cm_3, tau_rs, tau_ps)[0].items():
            # Accumulate this fold's cost into the running total for this alpha pair.
            prev = run_costs.setdefault(key, {(alpha_cr, alpha_cp): 0})
            run_costs[key][(alpha_cr, alpha_cp)] = prev[(alpha_cr, alpha_cp)] + value
    logging.info(f"\nRun costs for alpha {(alpha_cr, alpha_cp)}:\n{run_costs}\n")
    return run_costs
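A standalone toy version of the accumulation pattern above, to make the nesting explicit (the fold outputs and the 'total' cost key are made up for illustration):

run_costs = {}
alpha_key = (0.5, 0.9)                           # hypothetical alpha pair
fold_costs = [{'total': 10.0}, {'total': 12.5}]  # made-up per-fold cost dicts
for fold in fold_costs:
    for key, value in fold.items():
        bucket = run_costs.setdefault(key, {alpha_key: 0})
        run_costs[key][alpha_key] = bucket[alpha_key] + value
print(run_costs)  # {'total': {(0.5, 0.9): 22.5}}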
Example #3
def show_graph_deltaacc_priors(bef_emq_priors,
                               after_emq_priors,
                               bef_costs,
                               after_costs,
                               x_label="Delta accuracy priors",
                               y_label="Cost difference"):
    # `labels` and `quarter_y_arr` are assumed to be module-level globals,
    # as in the surrounding snippets. These are the true prevalences on the
    # test quarter, against which the EMQ prior estimates are compared.
    pos_true_prev, neg_true_prev = compute_prevalence(labels, quarter_y_arr)
    delta_accuracy = dict()
    for label in labels:
        true_priors = np.array([neg_true_prev[label], pos_true_prev[label]])
        before_diff = np.abs(np.array(bef_emq_priors[label]) -
                             true_priors).mean()
        after_diff = np.abs(np.array(after_emq_priors[label]) -
                            true_priors).mean()
        # Negative values mean EMQ moved the priors closer to the truth.
        delta_accuracy[label] = after_diff - before_diff

    for key in after_costs.keys():
        cost_diff = after_costs[key] - bef_costs[key]
        acc_mean = (delta_accuracy[key[0]] + delta_accuracy[key[1]]) / 2
        plt.plot(acc_mean, cost_diff, 'go')

    # Red cross-hairs through the origin (ranges chosen to frame the scatter).
    plt.plot(np.linspace(-0.04, 0.02), np.zeros(50), 'r-')
    plt.plot(np.zeros(50), np.linspace(-5000, 16000), 'r-')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()
Example #4
    pairs = [
        ('C18', 'GCAT'), ('C18', 'C152'), ('C18', 'C15'), ('C18', 'C17'),
        ('GPOL', 'MCAT'), ('GPOL', 'CCAT'), ('GPOL', 'GCRIM'), ('GPOL', 'E21'),
        ('GPOL', 'GVIO'), ('C152', 'M11'), ('C152', 'C17'), ('C152', 'C31'),
        ('C152', 'C181'), ('C152', 'C18'), ('M14', 'M132'), ('M14', 'M13'),
        ('M14', 'GCAT'), ('M14', 'C24'), ('M14', 'C31'), ('C151', 'C181'),
        ('C151', 'C18'), ('C151', 'C17'), ('C151', 'C31'), ('C151', 'C152'),
        ('ECAT', 'GVIO'), ('ECAT', 'C17'), ('ECAT', 'M13'), ('ECAT', 'GPOL'),
        ('ECAT', 'MCAT')
    ]

    labels = set()
    for cr, cp in pairs:
        labels.add(cr)
        labels.add(cp)

    pos_prevalences, neg_prevalences = compute_prevalence(
        labels, training_y_arr)

    def save(mc, costs, name):
        tau_rs, tau_ps, cm_2, cm_3 = mc.run(costs)
        with open(name, 'wb') as f:
            pickle.dump([tau_rs, tau_ps, cm_2, cm_3], f)
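    # A matching loader for the pickle written by save() above; this is a
    # hypothetical convenience helper (the comparison snippet further below
    # unpickles the same four objects).
    def load(name):
        with open(name, 'rb') as f:
            tau_rs, tau_ps, cm_2, cm_3 = pickle.load(f)
        return tau_rs, tau_ps, cm_2, cm_3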

    # Before EMQ
    # costs = Costs(cost_structure_1, pairs, posterior_probs, quarter_y_arr)
    # mc = MineCore(pairs, posterior_probs, quarter_y_arr)
    # t1 = threading.Thread(target=save, args=(mc, costs, "before_emq.pkl"))
    # t1.start()

    # After EMQ
    new_posteriors, pos_priors = emq_new_attempt(posterior_probs,
                                                 pos_prevalences, labels)
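emq_attempt and emq_new_attempt are presumably variants of the EM-based prior/posterior adjustment of Saerens et al. (2002), commonly called EMQ in the quantification literature. A minimal single-label sketch under that assumption (emq_sketch is a hypothetical name, not the original code):

import numpy as np

def emq_sketch(posteriors, pos_prior, neg_prior, eps=1e-6, max_iter=1000):
    # posteriors: 1-D array of P(y=1|x) for the test items.
    p_train = np.array([neg_prior, pos_prior])              # training priors
    post = np.column_stack([1.0 - posteriors, posteriors])
    p = p_train.copy()
    for _ in range(max_iter):
        # E-step: reweight posteriors by the ratio of current to training priors.
        scaled = post * (p / p_train)
        scaled /= scaled.sum(axis=1, keepdims=True)
        # M-step: new prior estimate is the mean of the adjusted posteriors.
        p_new = scaled.mean(axis=0)
        converged = np.abs(p_new - p).sum() < eps
        p = p_new
        if converged:
            break
    return scaled[:, 1], p[1], p[0]  # (new posteriors, pos prior, neg prior)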
Example #5
for i, c in enumerate(dataset.target_names):
    train_y[c] = np.asarray(dataset.target[0:TRAINING_SET_END,
                                           i].todense()).squeeze()

with open('./pickles/post_prob.pkl', 'rb') as f:
    posterior_probabilities = pickle.load(f)

with open('./pickles/alpha_dict_labels_3108.pkl', 'rb') as f:
    alphas = pickle.load(f)

# For each key, keep the alpha setting whose stored value (presumably an
# aggregate cost) is lowest.
alphas = dict(
    map(lambda kv: (kv[0], min(kv[1], key=kv[1].get)), alphas.items()))
# classifiers = learn_classifiers(dataset, train_x, labels, 10, training_set_end=TRAINING_SET_END)
# posterior_probabilities = compute_posterior_probabilities(dataset, dataset.data[TEST_SET_START:TEST_SET_END, :], labels, classifiers)
prior_probabilities, neg_priors = compute_prevalence(labels, train_y)
# new_priors = dict()
# for label in labels:
#     print(f"Updating probabilities for label: {label}")
#     _, em_pos, em_neg = emq_attempt(posterior_probabilities[label], prior_probabilities[label], neg_priors[label])
#     new_priors[label] = em_pos

# alphas = dict(map(lambda kv: (kv[0], (1.0, 1.0)), alphas.items()))
ro_r = 0.50
ro_p = 0.99
costs = Costs(cost_structure_1,
              pairs,
              posterior_probabilities,
              quarter_y_arr,
              alphas=alphas,
              prior_probabilities=prior_probabilities)
Example #6
    pairs = [
        ('C18', 'GCAT'), ('C18', 'C152'), ('C18', 'C15'), ('C18', 'C17'),
        ('GPOL', 'MCAT'), ('GPOL', 'CCAT'), ('GPOL', 'GCRIM'), ('GPOL', 'E21'),
        ('GPOL', 'GVIO'), ('C152', 'M11'), ('C152', 'C17'), ('C152', 'C31'),
        ('C152', 'C181'), ('C152', 'C18'), ('M14', 'M132'), ('M14', 'M13'),
        ('M14', 'GCAT'), ('M14', 'C24'), ('M14', 'C31'), ('C151', 'C181'),
        ('C151', 'C18'), ('C151', 'C17'), ('C151', 'C31'), ('C151', 'C152'),
        ('ECAT', 'GVIO'), ('ECAT', 'C17'), ('ECAT', 'M13'), ('ECAT', 'GPOL'),
        ('ECAT', 'MCAT')
    ]

    labels = set()
    for cr, cp in pairs:
        labels.add(cr)
        labels.add(cp)

    train_pos_prevalences, train_neg_prevalences = compute_prevalence(
        labels, quarter_x_arr)
    true_pos_prevalences, true_neg_prevalences = compute_prevalence(
        labels, quarter_y_arr)
    new_posteriors = dict()
    pos_priors = dict()
    neg_priors = dict()
    for label in labels:
        print(f"Updating probabilities for label: {label}")
        new_posteriors[label], pos_priors[label], neg_priors[
            label], s = emq_attempt(posterior_probs[label],
                                    train_pos_prevalences[label],
                                    train_neg_prevalences[label])

    train_errs, emq_errs = check_similarity(train_pos_prevalences,
                                            train_neg_prevalences, pos_priors,
                                            neg_priors, true_pos_prevalences,
                                            true_neg_prevalences)
Example #7
    cr_set = set()
    cp_set = set()
    for cr, cp in pairs:
        cr_set.add(cr)
        cp_set.add(cp)

    for label in labels:
        train_idxs = list(training_sets[label])
        # Boolean mask selecting everything outside this label's training set.
        mask = np.ones(dataset.data.shape[0], dtype=bool)
        mask[train_idxs] = False
        y_arr[label] = np.asarray(dataset.target[
            mask,
            dataset.target_names.searchsorted(label)].todense()).squeeze()
        training_y[label] = np.asarray(dataset.target[
            train_idxs,
            dataset.target_names.searchsorted(label)].todense()).squeeze()
        prob[label] = prob[label][mask]

    pos_prev, neg_prev = compute_prevalence(labels, training_y)
    true_pos_prev, true_neg_prev = compute_prevalence(labels, y_arr)

    show_distribution_drift_graph(pos_prev, true_pos_prev, labels)

    for label in labels:
        print(f"Updating probabilities for label: {label}")
        emq_post[label], emq_pos_priors[label], emq_neg_priors[
            label] = emq_attempt(prob[label], pos_prev[label], neg_prev[label])

    classifier_matrix, emq_matrix = get_contingency_matrix(
        labels, prob, emq_post, y_arr)
    labels_len = len(labels)
    for key, val in classifier_matrix.items():
        classifier_matrix[key] = val / labels_len
        emq_matrix[key] = emq_matrix[key] / labels_len
Example #8
    posteriors_2 = None  # stays None unless a comparison run is provided
    if args.compare_run_path:
        with open(args.compare_run_path, 'rb') as f2:
            tau_rs_2, tau_ps_2, cm_2_2, cm_3_2 = pickle.load(f2)

        with open(args.compare_run_posteriors, 'rb') as pf2:
            posteriors_2 = pickle.load(pf2)

    rcv1 = fetch_rcv1()
    full_y_arr, quarter_y_arr, pairs, labels = get_setup_data(rcv1)
    tsv_file = open(args.output_file, 'w')
    writer = csv.writer(tsv_file, delimiter='\t', quotechar='|')
    writer.writerow(('Pair', 'PP', 'PL', 'PW', 'LP', 'LL', 'LW', 'WP', 'WL', 'WW', 'tau_r', 'tau_p', 'missclass_cost',
                    'annot_cost', 'missclass_cost_other', 'annot_cost_other', 'prior_cr', 'prior_cp'))


    priors, _ = compute_prevalence(labels, quarter_y_arr)
    missclass_cost_1, annot_cost_1 = get_misclass_and_annot_costs(pairs, posteriors_1, quarter_y_arr, alpha_values, priors, cm_3_1, tau_rs_1, tau_ps_1)
    if posteriors_2 is not None:
        missclass_cost_2, annot_cost_2 = get_misclass_and_annot_costs(pairs, posteriors_2, quarter_y_arr, None, None, cm_3_2, tau_rs_2, tau_ps_2)

    for pair, cont_vals in sorted(cm_3_1.items()):
        contingency_table = ContingencyTable._make(flatten(cont_vals))
        tau_rs_pair = tau_rs_1[pair]
        tau_ps_pair = tau_ps_1[pair]

        # Columns after PW follow the TSV header row above; the expressions
        # for the cost and prior columns are assumptions.
        writer.writerow((
            f"{pair[0]}-{pair[1]}",
            *contingency_table,  # PP, PL, PW, LP, LL, LW, WP, WL, WW
            tau_rs_pair,
            tau_ps_pair,
            missclass_cost_1[pair],
            annot_cost_1[pair],
            missclass_cost_2[pair] if posteriors_2 is not None else '',
            annot_cost_2[pair] if posteriors_2 is not None else '',
            priors[pair[0]],
            priors[pair[1]],
        ))
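ContingencyTable is instantiated via _make, so it is presumably a namedtuple whose fields match the PP..WW header columns, and flatten simply linearizes the nested cell values. A sketch consistent with that usage (hypothetical, not the original definitions):

from collections import namedtuple

# Field order taken from the TSV header row written above.
ContingencyTable = namedtuple(
    'ContingencyTable',
    ['PP', 'PL', 'PW', 'LP', 'LL', 'LW', 'WP', 'WL', 'WW'])

def flatten(nested):
    # Turn a 3x3 nested structure of cell counts into a flat 9-item list.
    return [cell for row in nested for cell in row]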
Example #9
y_arr = dict()
training_y = dict()
emq_posteriors = dict()

for label in labels:
    train_idxs = list(training_sets[label])
    mask = np.ones(dataset.data.shape[0], dtype=bool)
    mask[train_idxs] = False
    y_arr[label] = np.asarray(dataset.target[
        mask, dataset.target_names.searchsorted(label)].todense()).squeeze()
    training_y[label] = np.asarray(dataset.target[
        train_idxs,
        dataset.target_names.searchsorted(label)].todense()).squeeze()
    prob[label] = prob[label][mask]

pos_prev, neg_prev = compute_prevalence(labels, training_y)

print("Computing EMQ")
for label in labels:
    post, pos_prior, neg_prior = emq_attempt(prob[label], pos_prev[label],
                                             neg_prev[label])
    emq_posteriors[label] = post

costs = Costs(cost_structure_1, pairs, emq_posteriors, y_arr)
minecore = MineCore(pairs, None, emq_posteriors, y_arr, None, 1.0, 1.0)

print("Running standard Minecore")
# standard_results = minecore.run(costs)

# minecore.posterior_probabilities = emq_posteriors