def verify_images_and_plots(x_matrix, mu_vec, z_vec, c_vec, v_eig_vector, p_pca_vector, all_labels, image_index):
    """Show one sample next to eigenvectors and its PCA reconstructions.

    Only the debug line is emitted when ENABLE_IMAGE_SHOW is falsy.
    Otherwise seven panels of a 2x4 subplot grid are filled, in order: the
    sample ``x_matrix[image_index]``, rows 0 and 1 of ``v_eig_vector``, and
    the sample rebuilt from the first 1, 10, 100 and all components as
    ``P[:, :k] . V[:k, :] + mu``.

    ``z_vec``, ``c_vec`` and ``all_labels`` are accepted but not used.
    """
    log_debug("Verify Images!")
    if not ENABLE_IMAGE_SHOW:
        return

    fig = plt.figure()
    draw_image_subplot(x_matrix[image_index], "X img {}".format(image_index), fig.add_subplot(241))
    draw_image_subplot(v_eig_vector[0], "EigVec 0", fig.add_subplot(242))
    draw_image_subplot(v_eig_vector[1], "EigVec 1", fig.add_subplot(243))

    # (number of leading components, panel title, subplot position)
    partial_panels = (
        (1, "Rec with Pc1", 244),
        (10, "Rec with Pc10", 245),
        (100, "Rec with Pc100", 246),
    )
    for pc_count, caption, position in partial_panels:
        rebuilt = (np.dot(p_pca_vector[:, 0:pc_count], v_eig_vector[0:pc_count, :])) + mu_vec
        draw_image_subplot(rebuilt[image_index], caption, fig.add_subplot(position))

    rebuilt_all = (np.dot(p_pca_vector, v_eig_vector)) + mu_vec
    draw_image_subplot(rebuilt_all[image_index], "Rec with all PC", fig.add_subplot(247))

    fig.tight_layout(pad=0)
    show()
def get_xi_prob_by_histo(pos_pc_1, pos_pc_2, neg_pc_1, neg_pc_2, binc, xp, xn):
    """Build the per-class 2-D histograms and classify the two query samples.

    For each class a histogram is built with ``get_histo_matrix``, drawn,
    logged and written to the output file. ``xp`` and ``xn`` are then scored
    with ``get_prob_by_histo`` using the module-level ``MU`` and ``V``, and
    the results are logged and written out.

    Returns:
        ``[pos_class_histo, neg_class_histo]``.
    """
    for banner in ("\n\n\n", "HISTO", "-----"):
        log(banner)

    # Positive class histogram.
    log_debug("\tClass +ve Histo:")
    histo_pos = get_histo_matrix(pos_pc_1, pos_pc_2, binc)
    draw_image(histo_pos, "+ve Histo")
    log("\tClass +ve Histo: ", sum(histo_pos))
    write_file_ndarray("(20) Hp [{}]".format(POSITIVE_CLASS), histo_pos)

    # Negative class histogram.
    log_debug("\tClass -ve Histo:")
    histo_neg = get_histo_matrix(neg_pc_1, neg_pc_2, binc)
    draw_image(histo_neg, "-ve Histo")
    log("\tClass -ve Histo: ", sum(histo_neg))
    write_file_ndarray("(46) Hn [{}]".format(NEGATIVE_CLASS), histo_neg)

    # Score both query samples against the two histograms.
    xp_scores = get_prob_by_histo(histo_pos, histo_neg, xp, MU, V,
                                  "Histo XP{} +ve".format(XP_INDEX))
    xn_scores = get_prob_by_histo(histo_pos, histo_neg, xn, MU, V,
                                  "Histo XN{} -ve".format(XN_INDEX))

    # Index 0 is the positive-class share, index 1 the negative-class share.
    log("\tXP : ", XP_INDEX, ", truth: ", labels[XP_INDEX],
        ", Prob Histo {}: {}".format(POSITIVE_CLASS, xp_scores[0]))
    write_file_array("(89) Result of classifying xp using histograms: ",
                     [labels[XP_INDEX], xp_scores[0]])

    log("\tXn : ", XN_INDEX, ", truth: ", labels[XN_INDEX],
        ", Prob Histo {}: {}".format(NEGATIVE_CLASS, xn_scores[1]))
    write_file_array("(93) Result of classifying xn using histograms: ",
                     [labels[XN_INDEX], xn_scores[1]])

    return [histo_pos, histo_neg]
def get_training_accuracy_by_histo(hp, hn, mu_vec, v_vec, x_all, x_labels):
    """Classify every sample with the histograms and report the accuracy.

    Each sample is scored with ``get_prob_by_histo``. A tie between the two
    class scores is resolved against the true label, so ties always count as
    wrong predictions. The accuracy (in percent) is logged and written to the
    output file; nothing is returned.
    """
    correct = 0
    log("Xi\tPred?\tTruth?")

    for xi_index, xi in enumerate(x_all):
        xi_truth = x_labels[xi_index]
        scores = get_prob_by_histo(hp, hn, xi, mu_vec, v_vec,
                                   "Histo Xi{}".format(xi_index))
        # Keep plain indexing: the scorer does not always return a 2-item list.
        if xi_truth == POSITIVE_CLASS:
            xi_pred = [POSITIVE_CLASS] if scores[0] > scores[1] else [NEGATIVE_CLASS]
        else:
            xi_pred = [NEGATIVE_CLASS] if scores[1] > scores[0] else [POSITIVE_CLASS]

        if xi_pred == xi_truth:
            correct += 1
        log_debug(xi_index, "\t", xi_pred, "\t", x_labels[xi_index])

    histo_accuracy = (correct / (1.0 * len(x_labels))) * 100
    log("Histo Accuracy: ", histo_accuracy, ", TP+TN: ", correct,
        ", Total: ", len(x_labels))
    write_file_array("(97) Training accuracy attained using histograms: ",
                     [histo_accuracy])
def get_x_feature_vectors(images_3d_array):
    """Flatten every image with ``ravel()`` and stack the results into one array.

    Returns:
        ``np.asarray`` of the flattened images, one image per row.
    """
    x_feature_vectors = np.asarray([image.ravel() for image in images_3d_array])
    log_debug("X shape: ", x_feature_vectors.shape)
    log_debug("X min/max: ", np.amin(x_feature_vectors), np.amax(x_feature_vectors))
    return x_feature_vectors
def get_linear_classifier_weights(x_all, t_all):
    """Fit linear-classifier weights for a single feature plus a bias term.

    Builds a two-column design matrix whose rows are ``[1, x]`` (bias first)
    and passes it with the targets to ``compute_weight_vector``.

    Args:
        x_all: sequence of feature values, one per sample.
        t_all: targets, forwarded unchanged to ``compute_weight_vector``.

    Returns:
        Whatever ``compute_weight_vector`` returns. The original computed and
        logged this value but dropped it; it is now returned.
    """
    num_rows = len(x_all)
    all_features = np.zeros(shape=(num_rows, 2))
    for idx, x_value in enumerate(x_all):
        # Column 0 is the constant bias input, column 1 the feature itself.
        all_features[idx] = [1, x_value]

    w = compute_weight_vector(all_features, t_all)
    log_debug(w)
    return w
def verify_c_covariance_vector(c_covariance_vector):
    """Assert that the matrix is square, symmetric, with a non-negative diagonal.

    Raises:
        AssertionError: if any of the three properties does not hold.
    """
    size, width = c_covariance_vector.shape[0], c_covariance_vector.shape[1]
    assert size == width

    for i in range(size):
        # Variances on the diagonal must not be negative.
        assert c_covariance_vector[i][i] >= 0
        # Equality is symmetric, so each off-diagonal pair is compared once.
        for j in range(i + 1, size):
            assert c_covariance_vector[i][j] == c_covariance_vector[j][i]

    log_debug("C Verification: Good!")
    return
def is_eigen_values_row_aligned(c_covariance_matrix, eig_val, eig_vec, row, col):
    """Check whether row 0 of ``eig_vec`` is an eigenvector for ``eig_val[0]``.

    Computes the residual ``C . v0 - lambda0 * v0``, sums its entries, rounds
    the sum to 8 decimals and reports whether it is zero. ``row`` and ``col``
    are accepted but not used.

    Returns:
        True when the rounded residual sum equals 0.0, otherwise False.
    """
    first_vec = eig_vec[0]
    residual = np.dot(c_covariance_matrix, first_vec) - np.dot(eig_val[0], first_vec)
    # Builtin sum is kept on purpose so the accumulation order is unchanged.
    sum_x = round(sum(residual), 8)
    log_debug("EigVec Verifi: ", sum_x)
    return bool(sum_x == 0.0)
def write_file_array(l_msg, *args):
    """Write an optional title row followed by one output row per argument.

    Does nothing when OUT_SKIP is truthy. Rows go to the module-level
    OUT_STREAM through ``writerow`` (presumably a ``csv.writer`` - confirm
    at its definition).

    Args:
        l_msg: title written as its own single-cell row; skipped when empty.
        *args: each argument becomes one row. Numeric scalars are wrapped in
            a one-item list first, because ``writerow`` is given a sequence
            in every other case.
    """
    if OUT_SKIP:
        return

    row_title = [l_msg]
    data = list(args)
    log_debug(row_title, ": ", data)

    if l_msg:
        OUT_STREAM.writerow(row_title)

    for row in data:
        # The original wrapped only int and np.float64; plain floats and
        # other NumPy numeric scalars were passed to writerow un-wrapped.
        if isinstance(row, (int, float, np.number)):
            row = [row]
        OUT_STREAM.writerow(row)
def get_xi_prob_from_sk(x_all, xp_index, xn_index):
    """Repeat the 2-component PCA with ``sk_pca`` and classify via Bayes.

    Fits ``sk_pca(n_components=2)`` on ``x_all``, splits the projected
    samples by the module-level ``labels``, checks the class sizes against
    the module-level ``pos_class_pcs`` / ``neg_class_pcs``, draws the scatter
    plot and runs ``get_xi_prob_by_bayes`` on the two query samples.

    Args:
        x_all: feature matrix passed to ``fit_transform``.
        xp_index: row index of the positive query sample.
        xn_index: row index of the negative query sample.
    """
    log("\n\n\n")
    log("SK")
    log("-----")

    skpca = sk_pca(n_components=2)
    pcs = skpca.fit_transform(x_all)
    log_debug("\tSK PCA: ", pcs.shape)

    pos_pcs = [pcs[idx] for idx in range(len(labels)) if labels[idx] == POSITIVE_CLASS]
    neg_pcs = [pcs[idx] for idx in range(len(labels)) if labels[idx] == NEGATIVE_CLASS]
    assert len(pos_pcs) == len(pos_class_pcs)
    assert len(neg_pcs) == len(neg_class_pcs)

    # Both axes must come from the same projection. The original passed the
    # module-level P[:, 1] as the second cloud, mixing two different PCAs.
    draw_scatter_plot(pcs[:, 0], pcs[:, 1], pcs[xp_index], pcs[xn_index], labels)
    get_xi_prob_by_bayes(pos_pcs, neg_pcs, pcs[xp_index], pcs[xn_index])
def get_delta_and_tow_impl(x_t_all):
    """Compute the impurity drop and threshold for every feature column.

    ``x_t_all`` must be an ndarray with ``NUM_FEATURES + 1`` columns; the
    last column (index ``NUM_FEATURES``) is used as the target.

    Returns:
        ``(delta_array, tau_array)``, each of length ``NUM_FEATURES``.

    Raises:
        AssertionError: on a wrong input type or column count, or when the
            smallest delta is not strictly positive.
    """
    assert isinstance(x_t_all, np.ndarray)
    assert x_t_all.shape[1] == NUM_FEATURES + 1

    target_column = x_t_all[:, NUM_FEATURES]
    delta_array = np.zeros(NUM_FEATURES)
    tau_array = np.zeros(NUM_FEATURES)

    for column in range(NUM_FEATURES):
        delta_array[column], tau_array[column] = get_feature_impurity_and_tau(
            x_t_all[:, column], target_column)
        log_debug("\n")

    assert np.min(delta_array) > 0
    return delta_array, tau_array
def get_xi_prob_by_bayes(pos_pcs, neg_pcs, xp_pc, xn_pc):
    """Classify the two query points with per-class 2-D Gaussian models.

    Estimates mean and covariance for each class, writes them to the output
    file, evaluates the count-weighted densities of both query points under
    both classes and turns them into posteriors
    ``pdf_class / (pdf_pos + pdf_neg)``. Results are logged and written out;
    nothing is returned.

    Args:
        pos_pcs: principal-component rows of the positive class.
        neg_pcs: principal-component rows of the negative class.
        xp_pc: principal components of the positive query sample.
        xn_pc: principal components of the negative query sample.
    """
    log("\n\n\n")
    log("BAYES")
    log("-----")
    log("\tBayes Query: \n", xp_pc, "\n", xn_pc)

    mu_p = get_mu_mean_vectors(pos_pcs)
    mu_n = get_mu_mean_vectors(neg_pcs)
    log_debug("\tClass +ve Bayesian Mu: ", mu_p, ", Class -ve Bayesian Mu: ", mu_n)
    write_file_array("(9) mup [{}]".format(POSITIVE_CLASS), mu_p)
    write_file_array("(10) mun [{}]".format(NEGATIVE_CLASS), mu_n)

    c_p = get_c_covariance_vector(pos_pcs)
    verify_c_covariance_vector(c_p)
    c_n = get_c_covariance_vector(neg_pcs)
    verify_c_covariance_vector(c_n)
    log_debug("\tClass +ve Bayesian Cov: ", c_p, ", Class -ve Bayesian Cov: ", c_n)
    write_file_ndarray("(12) cp [{}]".format(POSITIVE_CLASS), c_p)
    write_file_ndarray("(14) cn [{}]".format(NEGATIVE_CLASS), c_n)

    xp_p_pdf = get_bayes_2d_pdf(mu_p, c_p, pos_pcs, xp_pc, "XP{}".format(XP_INDEX))
    xp_n_pdf = get_bayes_2d_pdf(mu_n, c_n, neg_pcs, xp_pc, "XP{}".format(XP_INDEX))
    xn_p_pdf = get_bayes_2d_pdf(mu_p, c_p, pos_pcs, xn_pc, "XN{}".format(XN_INDEX))
    xn_n_pdf = get_bayes_2d_pdf(mu_n, c_n, neg_pcs, xn_pc, "XN{}".format(XN_INDEX))

    xp_prob_bayes = xp_p_pdf / (xp_p_pdf + xp_n_pdf)
    xn_prob_bayes = xn_n_pdf / (xn_p_pdf + xn_n_pdf)

    # The log text said "Prob Histo" here; this is the Bayes classifier.
    log("\tXP : ", XP_INDEX, ", truth: ", labels[XP_INDEX],
        ", Prob Bayes {}: {}".format(POSITIVE_CLASS, xp_prob_bayes))
    write_file_array("(90) Result of classifying xp using Bayesian: ",
                     [labels[XP_INDEX], xp_prob_bayes])

    log("\tXN : ", XN_INDEX, ", truth: ", labels[XN_INDEX],
        ", Prob Bayes {}: {}".format(NEGATIVE_CLASS, xn_prob_bayes))
    # The xn row must carry the xn truth label; the original wrote
    # labels[XP_INDEX] here.
    write_file_array("(94) Result of classifying xn using Bayesian: ",
                     [labels[XN_INDEX], xn_prob_bayes])
def get_pca(data_type):
    """Load the two-digit MNIST subset and run the hand-rolled PCA pipeline.

    Steps: flatten images, subtract the mean, build the covariance matrix,
    eigendecompose it, make sure eigenvectors are stored as rows, then
    project the centred data as ``P = Z . V^T``. Each stage is checked by
    its ``verify_*`` helper.

    Args:
        data_type: forwarded to ``load_mnist``.

    Returns:
        ``[x, mu, z, c, v, p, l_labels]``.

    Raises:
        AssertionError: when the eigenvectors are still not row aligned
            after flipping, or when one of the verify helpers fails.
    """
    images, l_labels = load_mnist(data_type, digits=[NEGATIVE_CLASS, POSITIVE_CLASS])
    x = get_x_feature_vectors(images)
    mu = get_mu_mean_vectors(x)
    z = get_z_variance_vector(x, mu)
    c = get_c_covariance_vector(z)
    verify_c_covariance_vector(c)

    eig_val, v = get_eigen_value_n_vector(c)
    verify_v_eigen_value_n_vector(eig_val, v)

    eig_row = v[0, :]
    eig_col = v[:, 0]
    if is_eigen_values_row_aligned(c, eig_val, v, eig_row, eig_col) and not FORCE_EIGEN_FLIP:
        log_debug("EigVec is already ROW aligned")
    else:
        log_debug("EigVec is COL aligned")
        # Transpose so eigenvectors become rows, and reverse both arrays so
        # values and vectors stay paired.
        eig_val = np.flipud(eig_val)
        v = np.flipud(v.T)
        if is_eigen_values_row_aligned(c, eig_val, v, eig_row, eig_col):
            log_debug("EigVec is already ROW aligned")
        else:
            # The original wrote `assert "<message>"`, which is always true
            # and therefore never reported this failure.
            raise AssertionError("EigVec failed to be ROW aligned!")

    p = np.dot(z, v.T)
    verify_pca_vector(p)
    return [x, mu, z, c, v, p, l_labels]
def get_feature_impurity_and_tau(x, t):
    """Find the best single-threshold split of one feature.

    Samples are stable-sorted by feature value. Every split position ``idx``
    (first ``idx`` sorted samples left, the rest right) is scored with
    ``(1/n) * (nL*pL/(nL+pL) + nR*pR/(nR+pR))``, where ``n*``/``p*`` are the
    negative/positive counts on each side. The unsplit impurity is
    ``t_negative * t_positive / n**2``.

    Args:
        x: feature values, one per sample.
        t: labels aligned with ``x``; values equal to NEGATIVE_CLASS_MAPPED
            count as negative, everything else as positive while scanning.

    Returns:
        ``(delta, tau)``: the impurity reduction of the best split and the
        sorted feature value at that split position.

    Raises:
        ValueError: if ``x`` is empty.
    """
    # Pass a real sequence: column_stack rejects iterators such as zip().
    pairs = np.column_stack((x, t))
    pairs = pairs[pairs[:, 0].argsort(kind='mergesort')]
    x_sorted = pairs[:, 0]
    t_sorted = pairs[:, 1]

    x_count = len(pairs)
    if x_count == 0:
        raise ValueError("cannot compute impurity of an empty feature column")

    t_negative = sum(1 for label in t_sorted if label == NEGATIVE_CLASS_MAPPED)
    t_positive = sum(1 for label in t_sorted if label == POSITIVE_CLASS_MAPPED)
    log_debug(x_count, t_negative, t_positive)

    a_negative = 0
    a_positive = 0
    impurity_initial = impurity_optimal = (t_negative * t_positive) / (x_count * x_count)
    tow_idx = 0

    for idx in range(1, x_count):
        # Move sorted sample idx-1 from the right side to the left side.
        if t_sorted[idx - 1] == NEGATIVE_CLASS_MAPPED:
            a_negative += 1
        else:
            a_positive += 1

        impurity_left = (a_negative * a_positive) / (a_negative + a_positive)
        impurity_right = ((t_negative - a_negative) * (t_positive - a_positive)) / (
            t_negative + t_positive - a_negative - a_positive)
        impurity_tmp = (1.0 / x_count) * (impurity_left + impurity_right)

        if impurity_tmp < impurity_optimal:
            impurity_optimal = impurity_tmp
            tow_idx = idx

    delta = impurity_initial - impurity_optimal
    # tow_idx indexes the SORTED samples, so the threshold must be read from
    # the sorted values; the original read it from the unsorted input.
    tow = tau = x_sorted[tow_idx]

    log_debug("Io: ", impurity_initial)
    log_debug("Iopt: ", impurity_optimal)
    log_debug("I delta: ", delta)
    log_debug("Tow: ", tow, ", (i={})".format(tow_idx), tau)
    return delta, tau
def get_training_accuracy_by_bayes(pos_pcs, neg_pcs, p_all, x_labels):
    """Classify every sample with the 2-D Gaussian models and report accuracy.

    Class means and covariances are estimated from ``pos_pcs`` / ``neg_pcs``.
    Each row of ``p_all`` is scored on its first two columns. A tie between
    the two posteriors is resolved against the true label, so ties always
    count as wrong. The accuracy (in percent) is logged and written to the
    output file; nothing is returned.
    """
    mu_p, mu_n = get_mu_mean_vectors(pos_pcs), get_mu_mean_vectors(neg_pcs)
    c_p = get_c_covariance_vector(pos_pcs)
    c_n = get_c_covariance_vector(neg_pcs)

    correct = 0
    log("Xi\tPred?\tTruth?")

    for xi_index, xi_pc in enumerate(p_all[:, 0:2]):
        xi_truth = x_labels[xi_index]
        pdf_pos = get_bayes_2d_pdf(mu_p, c_p, pos_pcs, xi_pc, "XP{}".format(XP_INDEX))
        pdf_neg = get_bayes_2d_pdf(mu_n, c_n, neg_pcs, xi_pc, "XP{}".format(XP_INDEX))
        post_pos = pdf_pos / (pdf_pos + pdf_neg)
        post_neg = pdf_neg / (pdf_pos + pdf_neg)

        if xi_truth == POSITIVE_CLASS:
            xi_pred = [POSITIVE_CLASS] if post_pos > post_neg else [NEGATIVE_CLASS]
        else:
            xi_pred = [NEGATIVE_CLASS] if post_neg > post_pos else [POSITIVE_CLASS]

        if xi_pred == xi_truth:
            correct += 1
        log_debug(xi_index, "\t", xi_pred, "\t", x_labels[xi_index])

    bayesian_accuracy = (correct / (1.0 * len(x_labels))) * 100
    log("Bayes Accuracy: ", bayesian_accuracy)
    write_file_array("(98) Training accuracy attained using Bayesian: ",
                     [bayesian_accuracy])
def draw_scatter_plot(cloud_a, cloud_b, xp, xn, all_labels):
    """Scatter-plot two coordinate clouds, coloured by class, with both queries.

    Does nothing when ENABLE_PLOT is falsy. ``cloud_b`` is drawn on the x
    axis and ``cloud_a`` on the y axis; the query points are drawn the same
    way, as ``(xp[1], xp[0])`` and ``(xn[1], xn[0])``. Positive samples are
    red and negative samples blue; any other label keeps the all-zero RGBA
    row.

    Args:
        cloud_a: y coordinates, one per sample.
        cloud_b: x coordinates, one per sample.
        xp: coordinates of the positive query point.
        xn: coordinates of the negative query point.
        all_labels: class label per sample, aligned with the clouds.
    """
    if not ENABLE_PLOT:
        return

    log_debug("\nPC Scatter Plot: {}, {}!".format(xp, xn))
    fig = plt.figure()

    # len() replaces alen / np.alen, which NumPy has removed.
    cols = np.zeros((len(all_labels), 4))
    for idx, ll in enumerate(all_labels):
        if ll == POSITIVE_CLASS:
            cols[idx] = [1, 0, 0, SCATTER_PLOT_ALPHA]
        if ll == NEGATIVE_CLASS:
            cols[idx] = [0, 0.2, 1, SCATTER_PLOT_ALPHA]

    ax = fig.add_subplot(111, facecolor='white')
    # The original indexed cols with np.arange(n), an identity reordering.
    ax.scatter(cloud_b, cloud_a, s=8, linewidths=0, facecolors=cols, marker='o')
    ax.plot(xp[1], xp[0], marker='x', color='black',
            label='XP[{}]'.format(POSITIVE_CLASS))
    ax.plot(xn[1], xn[0], marker='*', color='black',
            label='XN[{}]'.format(NEGATIVE_CLASS))
    ax.legend(loc='lower left', numpoints=1, ncol=3, fontsize=10,
              bbox_to_anchor=(0, 0))
    ax.set_aspect('equal')

    # Set after this figure's axes exist, exactly as in the original.
    plt.rcParams['axes.facecolor'] = 'b'
    plt.title(
        'Principal Components PC1 and PC2 scatter plot\nManoj Govindassamy')
    plt.show()
def build_tree_recursive(x_t_all, level=0):
    """Recursively build a decision tree over the rows of ``x_t_all``.

    The target is read from column ``NUM_FEATURES``. A subset becomes a leaf
    when the product of its class prevalences drops below
    LIMIT_LEAF_NODE_PREVALENCE or it has fewer than
    LIMIT_LEAF_NODE_SUBSET_SIZE rows. Otherwise the feature with the largest
    impurity reduction is chosen, rows are stable-sorted by it and split at
    the first row whose value equals that feature's tau.

    Side effect: ``globals.tree_height`` is raised to the deepest level seen.

    Args:
        x_t_all: 2-D array of feature columns followed by the target column.
        level: depth of this call; 0 for the root.

    Returns:
        A ``DNode`` rule node with ``left`` / ``right`` children, or whatever
        the leaf helpers return.

    Raises:
        AssertionError: on an empty subset or an out-of-range split index.
    """
    x_t_len = len(x_t_all)
    # The original `assert np.alen(x_t_all > 0)` had a misplaced parenthesis
    # and only worked because a comparison keeps the row count.
    assert x_t_len > 0

    if globals.tree_height < level:
        globals.tree_height = level

    index_target = NUM_FEATURES
    prevalence_negative, prevalence_positive = get_prevalence(x_t_all[:, index_target])
    prevalence = prevalence_negative * prevalence_positive
    log_debug("Tree max height so far: ", globals.tree_height)
    # The placeholders used to be passed to log() unformatted.
    log("X very pure? : subset len: {}, prevalence: {}".format(x_t_len, prevalence))

    if prevalence < LIMIT_LEAF_NODE_PREVALENCE or x_t_len < LIMIT_LEAF_NODE_SUBSET_SIZE:
        log_debug("X very pure. Bailing out: subset len: {}, prevalence: {}".format(
            x_t_len, prevalence))
        return get_leaf_node_by_prevalence(prevalence_negative, prevalence_positive)

    delta_array, tau_array = get_delta_and_tow_impl(x_t_all)
    delta_max_idx = np.argmax(delta_array)
    tau = tau_array[delta_max_idx]
    log_debug("delta_array: ", delta_array, ", delta_max_idx: ", delta_max_idx,
              ", tau: ", tau_array)

    x_t_all_sorted_delta_max = x_t_all[x_t_all[:, delta_max_idx].argsort(kind='mergesort')]
    x_delta_max = x_t_all_sorted_delta_max[:, delta_max_idx]
    log_debug("Tau, idx: ", tau, np.where(x_delta_max == tau), ", x_sorted: ", x_delta_max)

    # First sorted row whose feature value equals tau starts the right side.
    tau_idx = np.where(x_delta_max == tau)[0][0]
    assert 0 <= tau_idx <= len(x_t_all_sorted_delta_max)

    x_t_all_left = x_t_all_sorted_delta_max[0:tau_idx, :]
    x_t_all_right = x_t_all_sorted_delta_max[tau_idx:, :]

    if len(x_t_all_left) > 0 and len(x_t_all_right) > 0:
        node = DNode("RULE", feature_idx=delta_max_idx, tau=tau)
        node.left = build_tree_recursive(x_t_all_left, level + 1)
        node.right = build_tree_recursive(x_t_all_right, level + 1)
    else:
        # One side is empty, so no useful split exists. The leaf is built
        # from the target column; the original hard-coded column 2, which
        # matches only when NUM_FEATURES == 2.
        node = get_leaf_node(x_t_all_sorted_delta_max[:, index_target])
    return node
def get_prob_by_histo(h_p, h_n, xi, mu_vec, v_vec, msg):
    """Score one sample against the positive and negative class histograms.

    The sample is centred with ``mu_vec`` and projected onto the first two
    rows of ``v_vec``. ``get_bin`` maps the two projections to a histogram
    cell using the module-level ``min_pc_*`` / ``max_pc_*`` bounds, and the
    two cell counts are turned into shares.

    Args:
        h_p: positive-class histogram, indexed ``[row][col]``.
        h_n: negative-class histogram, indexed ``[row][col]``.
        xi: the sample's feature vector.
        mu_vec: mean vector subtracted from ``xi``.
        v_vec: eigenvector matrix; only rows 0 and 1 are used.
        msg: label used for the debug log and the drawn image.

    Returns:
        ``[share_positive, share_negative]``, or the string
        ``"Undecidable!"`` when both histograms are empty at that cell.
    """
    log_debug("\n\n", msg)
    log_debug("Xi.shape: ", xi.shape)
    draw_image(xi, msg)

    zi_vec = get_z_variance_vector(xi, mu_vec)
    log_debug("Zi.shape: ", zi_vec.shape)
    vi_2d_vec = v_vec[0:2, :]
    log_debug("Vi.shape: ", vi_2d_vec.shape)
    pi_vec = np.dot(zi_vec, vi_2d_vec.T)
    log_debug("\t", pi_vec)

    r = get_bin(pi_vec[0], min_pc_1, max_pc_1)
    c = get_bin(pi_vec[1], min_pc_2, max_pc_2)
    xi_count = h_p[r][c]
    xj_count = h_n[r][c]
    log_debug("\tR:", r, ", C:", c, "i_Histo: ", xi_count, "j_Histo: ", xj_count)

    # Check for an empty cell BEFORE dividing; the original divided first.
    total = xi_count + xj_count
    if total == 0:
        return "Undecidable!"
    return [xi_count / total, xj_count / total]
def get_z_variance_vector(x_feature_vector, mu_mean_vector):
    """Return ``x_feature_vector - mu_mean_vector`` (the centred data)."""
    centred = np.subtract(x_feature_vector, mu_mean_vector)
    log_debug("Z shape: ", centred.shape)
    return centred
def get_c_covariance_vector(z_variance_vector):
    """Return the covariance matrix of the input.

    Columns are treated as variables (``rowvar=False``) and the estimate is
    normalised with ``ddof=1``.
    """
    covariance = np.cov(z_variance_vector, rowvar=False, ddof=1)
    log_debug("C shape: ", covariance.shape)
    return covariance
def get_eigen_value_n_vector(c_covariance_vector):
    """Eigendecompose the matrix with ``la.eigh``.

    Returns:
        ``[eig_val, eig_vec]`` exactly as produced by ``la.eigh``.
    """
    decomposition = la.eigh(c_covariance_vector)
    values, vectors = decomposition[0], decomposition[1]
    log_debug("EigVal len: ", len(values))
    log_debug("EigVec shape: ", vectors.shape)
    return [values, vectors]
def verify_v_eigen_value_n_vector(eig_val, v_vec):
    """Sanity-check the eigenvector matrix.

    Rows whose norm does not round to 1 are only logged, not rejected. Rows
    10 and 100 must be orthogonal to 5 decimals, so ``v_vec`` needs at least
    101 rows. ``eig_val`` is accepted but not used.

    Raises:
        AssertionError: if rows 10 and 100 are not orthogonal.
    """
    for vector in v_vec:
        length = np.linalg.norm(vector)
        if round(length) != 1.0:
            log_debug(length)

    inner_product = np.dot(v_vec[10], v_vec[100])
    assert round(inner_product, 5) == 0
def verify_pca_vector(p_pca_vector):
    """Assert that every column of the projection has zero mean (5 decimals).

    Raises:
        AssertionError: if the largest absolute column mean does not round
            to zero.
    """
    column_means = np.mean(p_pca_vector, axis=0)
    worst_mean = max(abs(column_means))
    assert round(worst_mean, 5) == 0
    log_debug("P shape: ", p_pca_vector.shape)
def get_2d_pdf(xi, mu_x, cov_x, ni_samples):
    """Evaluate a sample-count-weighted Gaussian density at ``xi``.

    Computes ``ni_samples / (2*pi*sqrt(det(C))) * exp(-0.5 * d . C^-1 . d^T)``
    with ``d = xi - mu_x``.

    Args:
        xi: query point.
        mu_x: class mean.
        cov_x: class covariance matrix.
        ni_samples: number of samples in the class, used as the weight.

    Returns:
        The weighted density as a float.
    """
    log_debug("\tN: ", ni_samples)
    xi_mu = np.array([np.subtract(xi, mu_x)])
    cov_inverse = inv(cov_x)
    log_debug("\tCov Inv: ", cov_inverse)
    xi_mu_transpose = np.transpose(xi_mu)
    log_debug("\tMu Trans: ", xi_mu_transpose)

    # Row vector . matrix . column vector gives a (1, 1) array.
    xi_scalar = (np.dot(xi_mu, cov_inverse).dot(xi_mu_transpose)) / 2
    # Extract the single element explicitly: passing an ndim > 0 array to
    # math.exp relies on an implicit conversion that NumPy has deprecated.
    part_2 = math.exp(-1 * xi_scalar.item())

    cov_determinant = det(cov_x)
    part_1 = ni_samples / (2 * math.pi * math.sqrt(cov_determinant))

    log_debug("\tC Det: ", cov_determinant)
    log_debug("\traised: ", xi_scalar)
    log_debug("\tbottom: ", part_1 / ni_samples)
    log_debug("\tpdf: ", part_1 * part_2)
    return part_1 * part_2
def print_histo_samples():
    """Write the histogram bounds and the bin count to the output file.

    Reads the module-level ``min_pc_*`` / ``max_pc_*`` and ``bin_count``.
    """
    log_debug("\nSample Size:")
    report_rows = (
        ("(17) Min_pc1 Max_pc1: ", [min_pc_1, max_pc_1]),
        ("(18) Min_pc2 Max_pc2: ", [min_pc_2, max_pc_2]),
        ("(19) Optimal bin count: ", bin_count),
    )
    for title, payload in report_rows:
        write_file_array(title, payload)
def get_bayes_2d_pdf(mu_i_vec, cov_i_vec, pc_i_vec, pi_vec, msg_str):
    """Evaluate the class density at ``pi_vec``, weighted by the class size.

    Logs ``msg_str`` and delegates to ``get_2d_pdf``, using
    ``len(pc_i_vec)`` as the sample count.
    """
    log_debug(msg_str)
    return get_2d_pdf(pi_vec, mu_i_vec, cov_i_vec, len(pc_i_vec))
def get_mu_mean_vectors(x_feature_vector):
    """Return the mean along axis 0, accumulated in float64."""
    mean_vector = np.mean(x_feature_vector, axis=0, dtype=np.float64)
    log_debug("MU shape:", mean_vector.shape)
    return mean_vector