def order_by_distribution_drift(labels, training_y_arr, quarter_y_arr, label_pairs=None):
    """Order label pairs by the prevalence drift between training and test sets.

    Parameters
    ----------
    labels : iterable
        Label names to compute prevalence for.
    training_y_arr : dict-like
        Per-label binary target arrays for the training set.
    quarter_y_arr : dict-like
        Per-label binary target arrays for the test quarter.
    label_pairs : list of (cr, cp) tuples, optional
        Pairs to sort. When omitted, falls back to the module-level ``pairs``
        global — the original implementation read that free variable
        implicitly, so the default preserves backward compatibility.

    Returns
    -------
    tuple
        ``(sorted_pairs, prevalence_drift)`` where ``sorted_pairs`` is ordered
        by the mean drift of the pair's two labels, and ``prevalence_drift``
        maps each label to its absolute prevalence error.
    """
    pos_training_prev, neg_training_prev = compute_prevalence(
        labels, training_y_arr)
    pos_test_prev, neg_test_prev = compute_prevalence(labels, quarter_y_arr)
    prevalence_drift = dict()
    for label in labels:
        prevalence_drift[label] = absolute_error(
            pos_training_prev[label], neg_training_prev[label],
            pos_test_prev[label], neg_test_prev[label])
    if label_pairs is None:
        # Fix: the original sorted an undeclared free variable `pairs`;
        # keep the module global as fallback so existing callers still work.
        label_pairs = pairs
    # Sort ascending by the average drift of the two labels in the pair.
    return sorted(
        label_pairs,
        key=lambda t: (prevalence_drift[t[0]] + prevalence_drift[t[1]]) / 2
    ), prevalence_drift
def test_alpha_values_single_pair(alpha_cr: float, alpha_cp: float, label_cr, label_cp):
    """Evaluate one (alpha_cr, alpha_cp) setting on a single label pair via k-fold CV.

    For each fold of the module-level ``k_fold`` split over ``train_x``, this
    trains classifiers, computes posteriors, runs MineCore's ``run_plusplus``,
    and accumulates the third-phase costs per cost key into ``run_costs``.

    NOTE(review): relies on module globals `k_fold`, `train_x`, `dataset`,
    `cost_structure_1` and project helpers (`learn_classifiers`,
    `compute_posterior_probabilities`, `compute_prevalence`, `Costs`,
    `MineCore`) — confirm they are in scope at call time.

    Returns a dict mapping each cost key to
    ``{(alpha_cr, alpha_cp): summed cost across folds}``.
    """
    run_costs = dict()
    labels = [label_cr, label_cp]
    pairs = [(label_cr, label_cp)]
    # Same alpha pair applied to every (single) pair under test.
    alpha_labels = {pair: [alpha_cr, alpha_cp] for pair in pairs}
    for train_index, test_index in k_fold.split(train_x):
        train = train_x[train_index]
        test = train_x[test_index]
        classifiers = learn_classifiers(dataset, train, labels, 10, train_index=train_index)
        posterior_probabilities = compute_posterior_probabilities(dataset, test, labels, classifiers)
        train_y = dict()
        test_y = dict()
        # Extract the binary target column for each label, for both fold halves.
        for label in labels:
            train_y[label] = np.asarray(dataset.target[train_index, dataset.target_names.searchsorted(label)].todense()).squeeze()
            test_y[label] = np.asarray(dataset.target[test_index, dataset.target_names.searchsorted(label)].todense()).squeeze()
        prior_probabilities, _ = compute_prevalence(labels, train_y)
        costs = Costs(cost_structure_1, pairs, posterior_probabilities, test_y)
        minecore = MineCore(pairs, prior_probabilities, posterior_probabilities, test_y, alpha_labels, 1.0, 1.0)
        tau_rs, tau_ps, _, cm_3 = minecore.run_plusplus(costs)
        # Accumulate third-phase costs across folds, keyed by the alpha pair.
        for key, value in costs.get_third_phase_costs(cm_3, tau_rs, tau_ps)[0].items():
            prec_val = run_costs.setdefault(key, {(alpha_cr, alpha_cp): 0})
            run_costs[key][(alpha_cr, alpha_cp)] = prec_val[(alpha_cr, alpha_cp)] + value
    logging.info(f"\nRun costs for alpha {(alpha_cr, alpha_cp)}:\n{run_costs}\n")
    return run_costs
def show_graph_deltaacc_priors(bef_emq_priors, after_emq_priors, bef_costs, after_costs, x_label="Delta accuracy priors", y_label="Cost difference"):
    """Scatter-plot cost difference against change in prior-estimation error.

    For every label, measures how far the before/after-EMQ priors are from
    the true priors (mean absolute difference) and takes after-minus-before
    as the per-label delta. Each pair is then plotted as a green dot whose
    x is the sum of its two labels' deltas and whose y is the cost change.

    NOTE(review): reads module-level ``labels`` and ``quarter_y_arr`` —
    confirm they are defined before this is called.
    """
    pos_true_prev, neg_true_prev = compute_prevalence(
        labels, quarter_y_arr)
    delta_accuracy = dict()
    for label in labels:
        # True [negative, positive] prior vector for this label.
        reference = np.array(
            [neg_true_prev[label], pos_true_prev[label]])
        err_before = np.abs(np.array(bef_emq_priors[label]) - reference).mean()
        err_after = np.abs(np.array(after_emq_priors[label]) - reference).mean()
        # Positive value -> EMQ made the prior estimate worse.
        delta_accuracy[label] = err_after - err_before
    for pair in after_costs.keys():
        cost_delta = after_costs[pair] - bef_costs[pair]
        drift_sum = delta_accuracy[pair[0]] + delta_accuracy[pair[1]]
        plt.plot(drift_sum, cost_delta, 'go')
    # Red reference lines through the origin (horizontal and vertical axes).
    zeros = [0 for _ in range(50)]
    plt.plot(np.linspace(-0.04, 0.02), zeros, 'r-')
    plt.plot(zeros, np.linspace(-5000, 16000), 'r-')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()
# NOTE(review): this chunk begins mid-expression — the opening `pairs = [`
# of this list literal lies above the visible region; verify against the
# full file before editing.
('C18', 'GCAT'), ('C18', 'C152'), ('C18', 'C15'), ('C18', 'C17'),
('GPOL', 'MCAT'), ('GPOL', 'CCAT'), ('GPOL', 'GCRIM'), ('GPOL', 'E21'), ('GPOL', 'GVIO'),
('C152', 'M11'), ('C152', 'C17'), ('C152', 'C31'), ('C152', 'C181'), ('C152', 'C18'),
('M14', 'M132'), ('M14', 'M13'), ('M14', 'GCAT'), ('M14', 'C24'), ('M14', 'C31'),
('C151', 'C181'), ('C151', 'C18'), ('C151', 'C17'), ('C151', 'C31'), ('C151', 'C152'),
('ECAT', 'GVIO'), ('ECAT', 'C17'), ('ECAT', 'M13'), ('ECAT', 'GPOL'), ('ECAT', 'MCAT')
]

# Collect the distinct labels that appear in any pair.
labels = set()
for cr, cp in pairs:
    labels.add(cr)
    labels.add(cp)

pos_prevalences, neg_prevalences = compute_prevalence(
    labels, training_y_arr)


def save(mc, costs, name):
    """Run MineCore with the given costs and pickle (tau_rs, tau_ps, cm_2, cm_3) to `name`."""
    tau_rs, tau_ps, cm_2, cm_3 = mc.run(costs)
    with open(name, 'wb') as f:
        pickle.dump([tau_rs, tau_ps, cm_2, cm_3], f)


# Before EMQ
# costs = Costs(cost_structure_1, pairs, posterior_probs, quarter_y_arr)
# mc = MineCore(pairs, posterior_probs, quarter_y_arr)
# t1 = threading.Thread(target=save, args=(mc, costs, "before_emq.pkl"))
# t1.start()

# After EMQ
new_posteriors, pos_priors = emq_new_attempt(posterior_probs, pos_prevalences, labels)
# Build the per-label training targets from the sparse RCV1 target matrix.
for i, c in enumerate(dataset.target_names):
    train_y[c] = np.asarray(dataset.target[0:TRAINING_SET_END, i].todense()).squeeze()

# Load precomputed posteriors and the per-label alpha search results.
with open('./pickles/post_prob.pkl', 'rb') as f:
    posterior_probabilities = pickle.load(f)
with open('./pickles/alpha_dict_labels_3108.pkl', 'rb') as f:
    alphas = pickle.load(f)
# For each key keep only the alpha setting with the lowest recorded score.
alphas = dict(
    map(lambda kv: (kv[0], min(kv[1], key=kv[1].get)), alphas.items()))

# classifiers = learn_classifiers(dataset, train_x, labels, 10, training_set_end=TRAINING_SET_END)
# posterior_probabilities = compute_posterior_probabilities(dataset, dataset.data[TEST_SET_START:TEST_SET_END, :], labels, classifiers)

prior_probabilities, neg_priors = compute_prevalence(labels, train_y)

# new_priors = dict()
# for label in labels:
#     print(f"Updating probabilities for label: {label}")
#     _, em_pos, em_neg = emq_attempt(posterior_probabilities[label], prior_probabilities[label], neg_priors[label])
#     new_priors[label] = em_pos
# alphas = dict(map(lambda kv: (kv[0], (1.0, 1.0)), alphas.items()))

ro_r = 0.50
ro_p = 0.99
# NOTE(review): this call continues past the visible region — the remaining
# keyword arguments and closing parenthesis lie below this chunk.
costs = Costs(cost_structure_1, pairs, posterior_probabilities,
              quarter_y_arr, alphas=alphas,
              prior_probabilities=prior_probabilities,
# NOTE(review): this chunk begins mid-expression — the opening `pairs = [`
# of this list literal lies above the visible region.
('C18', 'GCAT'), ('C18', 'C152'), ('C18', 'C15'), ('C18', 'C17'),
('GPOL', 'MCAT'), ('GPOL', 'CCAT'), ('GPOL', 'GCRIM'), ('GPOL', 'E21'), ('GPOL', 'GVIO'),
('C152', 'M11'), ('C152', 'C17'), ('C152', 'C31'), ('C152', 'C181'), ('C152', 'C18'),
('M14', 'M132'), ('M14', 'M13'), ('M14', 'GCAT'), ('M14', 'C24'), ('M14', 'C31'),
('C151', 'C181'), ('C151', 'C18'), ('C151', 'C17'), ('C151', 'C31'), ('C151', 'C152'),
('ECAT', 'GVIO'), ('ECAT', 'C17'), ('ECAT', 'M13'), ('ECAT', 'GPOL'), ('ECAT', 'MCAT')
]

# Collect the distinct labels that appear in any pair.
labels = set()
for cr, cp in pairs:
    labels.add(cr)
    labels.add(cp)

# NOTE(review): computed from `quarter_x_arr` while the "true" prevalences
# below use `quarter_y_arr` — confirm the x/y asymmetry is intentional.
train_pos_prevalences, train_neg_prevalences = compute_prevalence(
    labels, quarter_x_arr)
true_pos_prevalences, true_neg_prevalences = compute_prevalence(
    labels, quarter_y_arr)

# Run EMQ per label to update posteriors and estimate new priors.
new_posteriors = dict()
pos_priors = dict()
neg_priors = dict()
for label in labels:
    print(f"Updating probabilities for label: {label}")
    new_posteriors[label], pos_priors[label], neg_priors[
        label], s = emq_attempt(posterior_probs[label],
                                train_pos_prevalences[label],
                                train_neg_prevalences[label])

# NOTE(review): this call continues past the visible region — remaining
# arguments and closing parenthesis lie below this chunk.
train_errs, emq_errs = check_similarity(train_pos_prevalences,
                                        train_neg_prevalences, pos_priors,
                                        neg_priors, true_pos_prevalences,
cr_set.add(cr)
cp_set.add(cp)
# NOTE(review): the two calls above reference loop variables `cr`/`cp` —
# they appear to be the tail of a `for cr, cp in ...:` loop whose header
# lies above the visible region; confirm against the full file.

# For each label: mask out its training documents, keep the held-out
# targets in y_arr, the training targets in training_y, and restrict the
# posterior probabilities to the held-out rows.
for label in labels:
    train_idxs = list(training_sets[label])
    mask = np.ones(dataset.data.shape[0], dtype=bool)
    mask[train_idxs] = False
    y_arr[label] = np.asarray(dataset.target[
        mask, dataset.target_names.searchsorted(label)].todense()).squeeze()
    training_y[label] = np.asarray(dataset.target[
        train_idxs, dataset.target_names.searchsorted(label)].todense()).squeeze()
    prob[label] = prob[label][mask]

pos_prev, neg_prev = compute_prevalence(labels, training_y)
true_pos_prev, true_neg_prev = compute_prevalence(labels, y_arr)
show_distribution_drift_graph(pos_prev, true_pos_prev, labels)

# Update each label's posteriors/priors with EMQ.
for label in labels:
    print(f"Updating probabilities for label: {label}")
    emq_post[label], emq_pos_priors[label], emq_neg_priors[
        label] = emq_attempt(prob[label], pos_prev[label], neg_prev[label])

classifier_matrix, emq_matrix = get_contingency_matrix(
    labels, prob, emq_post, y_arr)

# Average both contingency matrices over the number of labels.
labels_len = len(labels)
for key, val in classifier_matrix.items():
    classifier_matrix[key] = val / labels_len
    emq_matrix[key] = emq_matrix[key] / labels_len
# Optionally load a second run (thresholds, contingency matrices, posteriors)
# to compare against.
# NOTE(review): `posteriors_2` is only assigned inside this branch but is
# tested below — confirm it is initialised (e.g. to None) above this chunk
# when --compare_run_path is not given.
if args.compare_run_path:
    with open(args.compare_run_path, 'rb') as f2:
        tau_rs_2, tau_ps_2, cm_2_2, cm_3_2 = pickle.load(f2)
    with open(args.compare_run_posteriors, 'rb') as pf2:
        posteriors_2 = pickle.load(pf2)

rcv1 = fetch_rcv1()
full_y_arr, quarter_y_arr, pairs, labels = get_setup_data(rcv1)

# NOTE(review): file handle is never explicitly closed in the visible
# region — consider a `with` block if the close is not below this chunk.
tsv_file = open(args.output_file, 'w')
writer = csv.writer(tsv_file, delimiter='\t', quotechar='|')
writer.writerow(('Pair', 'PP', 'PL', 'PW', 'LP', 'LL', 'LW', 'WP', 'WL',
                 'WW', 'tau_r', 'tau_p', 'missclass_cost', 'annot_cost',
                 'missclass_cost_other', 'annot_cost_other', 'prior_cr',
                 'prior_cp'))

priors, _ = compute_prevalence(labels, quarter_y_arr)
missclass_cost_1, annot_cost_1 = get_misclass_and_annot_costs(
    pairs, posteriors_1, quarter_y_arr, alpha_values, priors, cm_3_1,
    tau_rs_1, tau_ps_1)
if posteriors_2 is not None:
    missclass_cost_2, annot_cost_2 = get_misclass_and_annot_costs(
        pairs, posteriors_2, quarter_y_arr, None, None, cm_3_2, tau_rs_2,
        tau_ps_2)

# One TSV row per pair, in sorted pair order.
# NOTE(review): the writerow tuple continues past the visible region.
for pair, cont_vals in sorted(cm_3_1.items()):
    contingency_table = ContingencyTable._make(flatten(cont_vals))
    tau_rs_pair = tau_rs_1[pair]
    tau_ps_pair = tau_ps_1[pair]
    writer.writerow(
        (
            f"{pair[0]}-{pair[1]}",
            contingency_table.PP,
            contingency_table.PL,
            contingency_table.PW,
y_arr = dict()
training_y = dict()
emq_posteriors = dict()
# For each label: mask out its training documents, split targets into
# held-out (y_arr) and training (training_y) arrays, and restrict the
# posterior probabilities to the held-out rows.
for label in labels:
    train_idxs = list(training_sets[label])
    mask = np.ones(dataset.data.shape[0], dtype=bool)
    mask[train_idxs] = False
    y_arr[label] = np.asarray(dataset.target[
        mask, dataset.target_names.searchsorted(label)].todense()).squeeze()
    training_y[label] = np.asarray(dataset.target[
        train_idxs, dataset.target_names.searchsorted(label)].todense()).squeeze()
    prob[label] = prob[label][mask]

pos_prev, neg_prev = compute_prevalence(labels, training_y)

print("Computing EMQ")
# Only the updated posteriors are kept; the EMQ prior estimates are unused here.
for label in labels:
    post, pos_prior, neg_prior = emq_attempt(prob[label], pos_prev[label],
                                             neg_prev[label])
    emq_posteriors[label] = post

costs = Costs(cost_structure_1, pairs, emq_posteriors, y_arr)
minecore = MineCore(pairs, None, emq_posteriors, y_arr, None, 1.0, 1.0)
print("Running standard Minecore")
# standard_results = minecore.run(costs)
# minecore.posterior_probabilities = emq_posteriors