def process(data, labels, base_path, cluster_count, heirch):
    patch_size = 16
    stride = 1
    patches = extract_patches(data, stride, stride, patch_size, patch_size)
    patches, labels = get_labels_for_patches(patches, labels)

    print('Fitting clustering (%s)' % str(patches.shape))
    clustering = get_clustering(heirch=heirch, n_clusters=cluster_count)
    preds = clustering.fit_predict(patches)
    print('Done fitting clustering')

    # Here a bin corresponds to a cluster
    binned_labels = ch.bin_labels(labels, preds)

    index_to_pred = {}
    for i, b in enumerate(binned_labels):
        index_to_pred[i] = b

    bin_probs, totals = ch.convert_bins_to_probs(binned_labels)
    binned_samples = ch.bin_samples(patches, preds)
    bin_entropies = ch.bin_entropies(binned_samples)
    binned_samples_labels = ch.bin_samples_labels(patches, preds, labels)

    sorted_indices = np.argsort(bin_entropies)

    add_str = '_heirch' if heirch else ''

    visualize_bin_entropies(sorted_indices, bin_entropies, 60, 60, base_path,
                            'entropy_vis%i%s.png' % (cluster_count, add_str),
                            'Max and Min Entropies (%i)' % cluster_count)
    visualize_bin_entropies(sorted_indices, totals, 60, 60, base_path,
                            'count_vis%i%s.png' % (cluster_count, add_str),
                            'Counts Per Cluster (%i)' % cluster_count)

    max_probs = []
    for cluster_i in bin_probs:
        label_probs = bin_probs[cluster_i]
        max_probs.append(label_probs[0][1])

    visualize_bin_entropies(sorted_indices, max_probs, 60, 60, base_path,
                            'prob_dist%i%s.png' % (cluster_count, add_str),
                            'Max Probability Per Cluster (%i)' % cluster_count)

    # For the top N bins apply the Saak transform
    top_indices = sorted_indices[:10]
    top_bins = []
    for i in top_indices:
        real_i = index_to_pred[i]
        top_bins.append([real_i, *binned_samples_labels[real_i]])
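# get_clustering is referenced above but not defined in this file. The sketch
# below is an assumption about its behavior (not the project's actual
# implementation): switch between a hierarchical (agglomerative) model and
# MiniBatchKMeans based on the heirch flag, matching the '_heirch' suffix the
# caller adds to its output filenames.
def get_clustering(heirch, n_clusters):
    from sklearn.cluster import AgglomerativeClustering, MiniBatchKMeans
    if heirch:
        # Bottom-up hierarchical clustering; supports fit_predict as used above.
        return AgglomerativeClustering(n_clusters=n_clusters)
    return MiniBatchKMeans(n_clusters=n_clusters, init='k-means++')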
def recur_fit(self, samples, parent, cur_level):
    # Split the samples into two clusters and attach a child node for each
    # centroid, recursing until num_levels is reached.
    mbk = MiniBatchKMeans(n_clusters=2, init='k-means++')
    mbk = mbk.fit(samples)
    centroids = mbk.cluster_centers_
    labels = mbk.predict(samples)

    binned_samples = ch.bin_samples(samples, labels)

    parent.left = CentroidNode(parent, centroids[0], len(binned_samples[0]))
    parent.right = CentroidNode(parent, centroids[1], len(binned_samples[1]))

    cur_level += 1
    if cur_level < self.num_levels - 1:
        self.recur_fit(binned_samples[0], parent.left, cur_level)
        self.recur_fit(binned_samples[1], parent.right, cur_level)
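# CentroidNode is not defined in this file. A minimal sketch consistent with
# how recur_fit uses it (parent/left/right links, a centroid, and a sample
# count) -- an assumption, not the project's actual class:
class CentroidNode:
    def __init__(self, parent, centroid, sample_count):
        self.parent = parent
        self.centroid = centroid
        self.sample_count = sample_count
        # Children are filled in by recur_fit as the tree is built.
        self.left = None
        self.right = None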
def gcm_test(feat, labels):
    print(feat.shape)

    # Get average for each class.
    binned = ch.bin_samples(feat, labels)
    binned_items = [(c, compute_energy(samples.T))
                    for c, samples in binned.items()]
    class_mean = [(val[0], np.mean(val[1], axis=1), np.var(val[1], axis=1),
                   val[1]) for val in binned_items]

    # Get the means of each class for each component.
    comps = []
    first_pass = True
    for c, mean, var, val in class_mean:
        for i, (m, v) in enumerate(zip(mean, var)):
            if first_pass:
                comps.append([(c, m, v, val[i])])
            else:
                comps[i].append((c, m, v, val[i]))
        first_pass = False

    use_index = 1
    comps = list(
        map(lambda means: sorted(means, key=lambda x: x[use_index]), comps))

    disp_comps = [[[comp[1], comp[2]] for comp in c] for c in comps]
    disp_comps = np.array(disp_comps)

    val_comps = [[comp[3] for comp in c] for c in comps]
    val_comps = np.array(val_comps)

    # Select only the mean
    comp_means = [[classes[use_index] for classes in comp] for comp in comps]
    comp_dists = [np.diff(comp) for comp in comp_means]

    # Get the max from each.
    max_dists = np.array([np.amax(comp_dist) for comp_dist in comp_dists])
    print(stats.describe(max_dists))

    TAKE_COUNT = 2000
    arg_sorted = np.argsort(max_dists)
    arg_sorted = np.flipud(arg_sorted)
    idx = arg_sorted[:TAKE_COUNT]

    selected_class_data = disp_comps[arg_sorted]
    use_count = 25

    plot_dist_hist(val_comps[arg_sorted][:use_count], 'tops')
    plot_dist_hist(val_comps[arg_sorted][-use_count:], 'bottoms')
    plot_comps(selected_class_data[:use_count], 'tops',
               max_dists[arg_sorted][:100])
    plot_comps(selected_class_data[-use_count:], 'bottoms',
               max_dists[arg_sorted][-100:])

    # Take the same elements across each sample
    #all_idx = []
    #for i in range(feat.shape[0]):
    #    all_idx.append(idx)
    #all_idx = np.array(all_idx)

    return feat[:, idx], idx
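# compute_energy is shared by gcm_test, entropy_test, and kl_test but not
# defined here. A hedged sketch, assuming "energy" means the element-wise
# squared coefficient values; it preserves the (# channels, # samples)
# layout the callers rely on. The real definition may differ.
def compute_energy(samples):
    return np.square(np.asarray(samples))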
def entropy_test(feat, labels, should_plot=False):
    PrintHelper.print('Using entropy test to select coeffs')
    PrintHelper.print(np.array(labels).shape)

    binned = ch.bin_samples(feat, labels)
    binned_items = [(c, compute_energy(samples.T))
                    for c, samples in binned.items()]
    # binned is a dictionary where the key is the class and the value is the
    # data belonging to that class; binned_items pairs each class with its
    # energies in the form (# channels, # samples).

    comps = []
    is_first = True
    for c, energies in binned_items:
        for i in range(len(energies)):
            if is_first:
                comps.append({})
            comps[i][c] = energies[i]
        is_first = False

    def normalize_hist(dist, eps=1e-4):
        total = np.sum(dist)
        dist = np.array(dist)
        norm = dist / total
        # Remove any very small values
        norm[norm < eps] = eps
        return norm

    # Get the maximum number of samples across each class
    # (If using the full dataset the max should just be equal to the number of
    # samples per class)
    max_sample_count = np.amax([len(samples) for samples in comps[0].values()])
    # Compute the number of bins to use when constructing the histograms
    bin_count = int(np.sqrt(max_sample_count * 9))
    PrintHelper.print('There are %i bins' % bin_count)

    entropy_comps = [0.0] * len(comps)

    if should_plot:
        base_path = 'data/results/compare_entropies/'
        if os.path.exists(base_path):
            print('Removing existing results')
            shutil.rmtree(base_path)

    comp_dists = []
    for i, comp in tqdm(enumerate(comps)):
        all_entropy = []
        dists = []
        for c, samples in comp.items():
            # (numpy's 'auto' setting picks the bin count on its own, so
            # bin_count above is only reported, not used here)
            this_dist, bin_edges = np.histogram(samples, bins='auto')
            norm_this_dist = normalize_hist(this_dist)
            entropy = ch.entropy(norm_this_dist)
            dists.append((this_dist, bin_edges, i, c, entropy))
            all_entropy.append(entropy)
        comp_dists.append(dists)
        entropy_comps[i] = np.amin(all_entropy)

    TAKE_COUNT = 1000
    PrintHelper.print('Selecting %i top coeffs' % TAKE_COUNT)
    # We want to go from smallest to largest
    arg_sorted = np.argsort(entropy_comps)
    idx = arg_sorted[:TAKE_COUNT]
    PrintHelper.print('Selected')

    plot_count = 10
    comp_dists = np.array(comp_dists)
    top_plot_dists = comp_dists[arg_sorted][:plot_count]
    bottom_plot_dists = comp_dists[arg_sorted][-plot_count:]

    def plot_all(plot_dists, add_path):
        for dist in plot_dists:
            entropies = [
                entropy for this_dist, bin_edges, i, c, entropy in dist
            ]
            labels = [c for this_dist, bin_edges, i, c, entropy in dist]
            comp_i = dist[0][2]
            plot_entropy_hist(entropies, labels, base_path + add_path + '/',
                              comp_i)

    if should_plot:
        print('Plotting')
        plot_all(top_plot_dists, 'top')
        plot_all(bottom_plot_dists, 'bottom')

    return feat[:, idx], idx
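# ch.entropy is assumed to be the Shannon entropy of a normalized histogram.
# A reference sketch consistent with how entropy_test calls it
# (normalize_hist clamps zero bins to eps, so the log below is safe):
def shannon_entropy(norm_dist):
    norm_dist = np.asarray(norm_dist, dtype=float)
    # H = -sum(p * log(p)), in nats.
    return -np.sum(norm_dist * np.log(norm_dist))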
def kl_test(feat, labels, should_plot=False):
    print('Using KL test to select coeffs')

    binned = ch.bin_samples(feat, labels)
    binned_items = [(c, compute_energy(samples.T))
                    for c, samples in binned.items()]
    # binned is a dictionary where the key is the class and the value is the
    # data belonging to that class; binned_items pairs each class with its
    # energies in the form (# channels, # samples).

    comps = []
    is_first = True
    for c, energies in binned_items:
        for i in range(len(energies)):
            if is_first:
                comps.append({})
            comps[i][c] = energies[i]
        is_first = False

    def agg_other_samples(comp, cur_c):
        aggr = []
        for c, samples in comp.items():
            if c != cur_c:
                aggr.extend(samples)
        return np.array(aggr)

    def normalize_hist(dist, eps=1e-4):
        total = np.sum(dist)
        dist = np.array(dist)
        norm = dist / total
        # Remove any very small values
        norm[norm < eps] = eps
        return norm

    # Get the maximum number of samples across each class
    # (If using the full dataset the max should just be equal to the number of
    # samples per class)
    max_sample_count = np.amax([len(samples) for samples in comps[0].values()])
    # Compute the number of bins to use when constructing the histograms
    bin_count = int(np.sqrt(max_sample_count * 9))

    all_data = [
        sample for comp in comps for samples in comp.values()
        for sample in samples
    ]
    #min_data = np.amin(all_data)
    #max_data = np.amax(all_data)
    #step_size = (max_data - min_data) / bin_count
    #bin_edges = np.arange(min_data, max_data, step_size)

    print('There are %i bins' % bin_count)

    # Initialize the per-component KL scores to zero
    kl_comps = [0.0] * len(comps)

    base_path = 'data/results/compare_entropies/'
    if os.path.exists(base_path):
        shutil.rmtree(base_path)

    comp_dists = []
    for i, comp in tqdm(enumerate(comps)):
        all_kl = []
        dists = []
        for c, samples in comp.items():
            # Aggregate samples for every other class
            others = agg_other_samples(comp, c)

            # Get data distributions for both datasets, reusing the bin
            # edges of the first histogram for the second.
            this_dist, bin_edges = np.histogram(samples, bins=bin_count)
            other_dist, _ = np.histogram(others, bins=bin_edges)

            norm_this_dist = normalize_hist(this_dist)
            norm_other_dist = normalize_hist(other_dist)

            # Compute the KL divergence between the two distributions.
            kl_div_1 = ch.kl_div(norm_this_dist, norm_other_dist)
            #kl_div_2 = ch.kl_div(norm_other_dist, norm_this_dist)
            kl_div = kl_div_1
            #kl_div = np.abs((kl_div_1 + kl_div_2) / 2.0)

            dists.append((this_dist, other_dist, bin_edges, i, c, kl_div))
            all_kl.append(kl_div)
        comp_dists.append(dists)
        kl_comps[i] = np.amax(all_kl)

    TAKE_COUNT = 2000
    arg_sorted = np.argsort(kl_comps)
    arg_sorted = np.flipud(arg_sorted)
    idx = arg_sorted[:TAKE_COUNT]

    plot_count = 5
    comp_dists = np.array(comp_dists)
    print('Comp dists len ', len(comp_dists))
    plot_dists = comp_dists[arg_sorted][:plot_count]
    #TODO:
    # I messed up the plotting code and accidentally deleted it,
    # but it's in a previous version of the git history.

    return feat[:, idx], idx
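# ch.kl_div is assumed to compute the standard KL divergence D(P || Q)
# between two normalized histograms. A sketch under that assumption (both
# inputs come from normalize_hist above, so no bin is exactly zero):
def kl_div(p, q):
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    # D(P || Q) = sum(p * log(p / q)), in nats.
    return np.sum(p * np.log(p / q))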
def patch_method(feat, labels):
    base_path = 'data/results/patches/'
    if os.path.exists(base_path):
        shutil.rmtree(base_path)
    os.makedirs(base_path)

    # Important Notes
    # - Flattening the number of samples, color channel, width, height
    stride_h = 4
    stride_w = 4
    patch_width = 8
    patch_height = 8
    patches = extract_patches(feat, stride_h, stride_w, patch_width,
                              patch_height)
    patch_data, patch_labels = get_labels_for_patches(patches, labels)

    base_path = 'data/results/patch_clusters/'
    print('Deleting everything in ' + base_path)
    if os.path.exists(base_path):
        shutil.rmtree(base_path)
    os.makedirs(base_path)

    for cluster_count in [512, 1024]:
        mbk = MiniBatchKMeans(n_clusters=cluster_count, init='k-means++')
        print('Fitting MBK')
        preds = mbk.fit_predict(patch_data)
        print('Done fitting MBK')

        binned_labels = ch.bin_labels(patch_labels, preds)
        bin_probs, totals = ch.convert_bins_to_probs(binned_labels)
        binned_samples = ch.bin_samples(patch_data, preds)
        entropies = ch.bin_entropies(binned_samples)

        max_probs = []
        colors = []
        color_map = {
            0: 'b',
            1: 'g',
            2: 'r',
            3: 'c',
            4: 'm',
            5: 'y',
            6: 'k',
            7: 'w',
            8: 'pink',
            9: 'saddlebrown',
        }
        for b in bin_probs:
            max_prob_ele = bin_probs[b][0]
            max_probs.append(max_prob_ele[1])
            colors.append(color_map[max_prob_ele[0]])

        plt.title('Max Class Probability per Cluster (%i)' % cluster_count)
        plt.bar(np.arange(len(max_probs)), max_probs, color=colors)
        plt.savefig(base_path + 'patch_cluster_hist%i.png' % cluster_count)
        plt.clf()

        plt.title('Entropies per Cluster (%i)' % cluster_count)
        plt.bar(np.arange(len(entropies)), entropies)
        plt.savefig(base_path + 'patch_entropies%i.png' % cluster_count)
        plt.clf()

        plt.title('Count per Cluster (%i)' % cluster_count)
        plt.bar(np.arange(len(totals)), totals)
        plt.savefig(base_path + 'patch_counts%i.png' % cluster_count)
        plt.clf()

    print('Saved all!')
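# extract_patches is called in process() and patch_method() but not defined
# here. A minimal sketch matching the call signature
# (data, stride_h, stride_w, patch_width, patch_height): slide a window over
# each image and flatten every window into a row vector. This assumes input
# shaped (# samples, height, width[, channels]); the real helper may handle
# channels differently.
def extract_patches(data, stride_h, stride_w, patch_w, patch_h):
    patches = []
    for img in data:
        h, w = img.shape[0], img.shape[1]
        for y in range(0, h - patch_h + 1, stride_h):
            for x in range(0, w - patch_w + 1, stride_w):
                patches.append(img[y:y + patch_h, x:x + patch_w].ravel())
    return np.array(patches)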