Example #1
def process(data, labels, base_path, cluster_count, heirch):
    patch_size = 16
    stride = 1

    patches = extract_patches(data, stride, stride, patch_size, patch_size)

    patches, labels = get_labels_for_patches(patches, labels)

    print('Fitting clustering (%s)' % str(patches.shape))
    clustering = get_clustering(heirch=heirch, n_clusters=cluster_count)
    preds = clustering.fit_predict(patches)
    print('Done fitting clustering')

    # Here a bin corresponds to a cluster
    binned_labels = ch.bin_labels(labels, preds)

    index_to_pred = {i: b for i, b in enumerate(binned_labels)}

    bin_probs, totals = ch.convert_bins_to_probs(binned_labels)
    binned_samples = ch.bin_samples(patches, preds)
    bin_entropies = ch.bin_entropies(binned_samples)

    binned_samples_labels = ch.bin_samples_labels(patches, preds, labels)

    sorted_indices = np.argsort(bin_entropies)

    if heirch:
        add_str = '_heirch'
    else:
        add_str = ''

    visualize_bin_entropies(sorted_indices, bin_entropies, 60, 60, base_path,
                            'entropy_vis%i%s.png' % (cluster_count, add_str),
                            'Max and Min Entropies (%i)' % cluster_count)

    visualize_bin_entropies(sorted_indices, totals, 60, 60, base_path,
                            'count_vis%i%s.png' % (cluster_count, add_str),
                            'Counts Per Cluster (%i)' % cluster_count)

    max_probs = []
    for cluster_i in bin_probs:
        label_probs = bin_probs[cluster_i]
        max_probs.append(label_probs[0][1])

    visualize_bin_entropies(sorted_indices, max_probs, 60, 60, base_path,
                            'prob_dist%i%s.png' % (cluster_count, add_str),
                            'Max Probability Per Cluster (%i)' % cluster_count)

    # For the top N bins apply the Saak transform
    top_indices = sorted_indices[:10]
    top_bins = []
    for i in top_indices:
        real_i = index_to_pred[i]
        top_bins.append([real_i, *binned_samples_labels[real_i]])
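
The ch.* helpers are not shown in this example. Below is a minimal sketch of what ch.bin_labels and ch.convert_bins_to_probs are assumed to do here (group ground-truth labels by predicted cluster, then turn each group into (label, probability) pairs sorted by probability). The names, signatures, and return shapes are assumptions inferred from how the results are indexed above, not the library's documented API.

import numpy as np
from collections import Counter, defaultdict

def bin_labels_sketch(labels, preds):
    # Group the true label of each sample by the cluster it was assigned to.
    bins = defaultdict(list)
    for label, cluster in zip(labels, preds):
        bins[cluster].append(label)
    return bins

def convert_bins_to_probs_sketch(binned_labels):
    # For every cluster, estimate P(class | cluster) and sort descending, so
    # entry [0] is the (label, probability) pair of the dominant class.
    bin_probs, totals = {}, []
    for cluster, cluster_labels in binned_labels.items():
        counts = Counter(cluster_labels)
        total = sum(counts.values())
        probs = sorted(((lbl, cnt / total) for lbl, cnt in counts.items()),
                       key=lambda x: x[1], reverse=True)
        bin_probs[cluster] = probs
        totals.append(total)
    return bin_probs, np.array(totals)
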
Example #2
    def recur_fit(self, samples, parent, cur_level):
        mbk = MiniBatchKMeans(n_clusters=2, init='k-means++')

        mbk = mbk.fit(samples)
        centroids = mbk.cluster_centers_
        labels = mbk.predict(samples)

        binned_samples = ch.bin_samples(samples, labels)

        parent.left = CentroidNode(parent, centroids[0],
                len(binned_samples[0]))
        parent.right = CentroidNode(parent, centroids[1],
                len(binned_samples[1]))

        cur_level += 1

        if cur_level < self.num_levels - 1:
            self.recur_fit(binned_samples[0], parent.left, cur_level)
            self.recur_fit(binned_samples[1], parent.right, cur_level)
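
recur_fit relies on a CentroidNode tree node and on self.num_levels from an enclosing class, neither of which appears in the snippet. The sketch below fills in those pieces with the constructor signature inferred from the calls above; the HierKMeans wrapper name and its fit entry point are hypothetical.

class CentroidNode:
    # One node of a binary centroid tree: the parent node, the centroid this
    # node represents, and how many samples fell into its cluster.
    def __init__(self, parent, centroid, sample_count):
        self.parent = parent
        self.centroid = centroid
        self.sample_count = sample_count
        self.left = None
        self.right = None

class HierKMeans:
    # Hypothetical wrapper that owns num_levels and seeds the recursion from a
    # root node; recur_fit as defined above would be a method of this class.
    def __init__(self, num_levels):
        self.num_levels = num_levels
        self.root = CentroidNode(None, None, 0)

    def fit(self, samples):
        self.recur_fit(samples, self.root, cur_level=0)
        return self
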
Example #3
def gcm_test(feat, labels):
    print(feat.shape)
    # Get average for each class.
    binned = ch.bin_samples(feat, labels)

    binned_items = [(c, compute_energy(samples.T))
                    for c, samples in binned.items()]

    class_mean = [(c, np.mean(samples, axis=1), np.var(samples, axis=1), samples)
                  for c, samples in binned_items]

    # Get the means of each class for each component.
    comps = []
    first_pass = True
    for c, mean, var, val in class_mean:
        for i, (m, v) in enumerate(zip(mean, var)):
            if first_pass:
                comps.append([(c, m, v, val[i])])
            else:
                comps[i].append((c, m, v, val[i]))

        first_pass = False

    use_index = 1

    comps = [sorted(means, key=lambda x: x[use_index]) for means in comps]

    disp_comps = [[[comp[1], comp[2]] for comp in c] for c in comps]
    disp_comps = np.array(disp_comps)

    val_comps = [[comp[3] for comp in c] for c in comps]
    val_comps = np.array(val_comps)

    # Select only the mean
    comp_means = [[classes[use_index] for classes in comp] for comp in comps]
    comp_dists = [np.diff(comp) for comp in comp_means]

    # Get the max from each.
    max_dists = np.array([np.amax(comp_dist) for comp_dist in comp_dists])
    print(stats.describe(max_dists))

    TAKE_COUNT = 2000
    arg_sorted = np.argsort(max_dists)
    arg_sorted = np.flipud(arg_sorted)
    idx = arg_sorted[:TAKE_COUNT]

    selected_class_data = disp_comps[arg_sorted]

    use_count = 25

    plot_dist_hist(val_comps[arg_sorted][:use_count], 'tops')
    plot_dist_hist(val_comps[arg_sorted][-use_count:], 'bottoms')
    plot_comps(selected_class_data[:use_count], 'tops',
               max_dists[arg_sorted][:100])
    plot_comps(selected_class_data[-use_count:], 'bottoms',
               max_dists[arg_sorted][-100:])

    # Take the same elements across each sample

    #all_idx = []
    #for i in range(feat.shape[0]):
    #    all_idx.append(idx)

    #all_idx = np.array(all_idx)

    return feat[:, idx], idx
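
The selection criterion above amounts to: for each component, sort the per-class means and score the component by the largest gap between adjacent class means, then keep the highest-scoring components. A toy numpy illustration of that criterion follows; the shapes and values are made up purely for illustration.

import numpy as np

# Toy data: 3 components, 4 classes, per-class mean of each component.
class_means = np.array([[0.1, 0.2, 0.3, 0.4],    # evenly spaced -> small max gap
                        [0.1, 0.15, 0.8, 0.85],  # one big gap -> large max gap
                        [0.0, 0.5, 0.55, 0.6]])  # moderate gap

sorted_means = np.sort(class_means, axis=1)
max_gaps = np.amax(np.diff(sorted_means, axis=1), axis=1)

# Rank components by how well at least one class separates from the rest.
ranking = np.flipud(np.argsort(max_gaps))
print(max_gaps)   # [0.1, 0.65, 0.5]
print(ranking)    # component 1 first, then 2, then 0
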
Example #4
def entropy_test(feat, labels, should_plot=False):
    PrintHelper.print('Using entropy test to select coeffs')
    PrintHelper.print(np.array(labels).shape)
    binned = ch.bin_samples(feat, labels)

    binned_items = [(c, compute_energy(samples.T))
                    for c, samples in binned.items()]
    # binned_items is a list of (class, energies) pairs, where energies holds
    # the data belonging to that class in the form (# channels, # samples).

    comps = []
    is_first = True
    for c, energies in binned_items:
        for i in range(len(energies)):
            if is_first:
                comps.append({})
            comps[i][c] = energies[i]

        is_first = False

    def normalize_hist(dist, eps=1e-4):
        total = np.sum(dist)
        dist = np.array(dist)
        norm = dist / total
        # Remove any very small values
        norm[norm < eps] = eps
        return norm

    # Get the maximum number of samples across each class
    # (If using the full dataset the max should just be equal to the number of
    # samples per class)
    max_sample_count = np.amax([len(samples) for samples in comps[0].values()])
    # Compute the number of bins to use when constructing the histograms

    bin_count = int(np.sqrt(max_sample_count * 9))

    all_data = [
        sample for comp in comps for samples in comp.values()
        for sample in samples
    ]

    PrintHelper.print('There are %i bins' % bin_count)

    entropy_comps = [0.0] * len(comps)

    if should_plot:
        base_path = 'data/results/compare_entropies/'
        if os.path.exists(base_path):
            print('Removing existing results')
            shutil.rmtree(base_path)

    comp_dists = []
    for i, comp in tqdm(enumerate(comps)):
        all_entropy = []
        use_bins = bin_count
        dists = []

        for c, samples in comp.items():
            this_dist, bin_edges = np.histogram(samples, bins='auto')

            norm_this_dist = normalize_hist(this_dist)

            entropy = ch.entropy(norm_this_dist)

            dists.append((this_dist, bin_edges, i, c, entropy))
            all_entropy.append(entropy)

        comp_dists.append(dists)

        entropy_comps[i] = np.amin(all_entropy)

    TAKE_COUNT = 1000
    PrintHelper.print('Selecting %i top coeffs' % (TAKE_COUNT))
    # We want to go from smallest to largest
    arg_sorted = np.argsort(entropy_comps)
    idx = arg_sorted[:TAKE_COUNT]
    PrintHelper.print('Selected')

    dists = np.array(dists)
    plot_count = 10

    comp_dists = np.array(comp_dists)

    top_plot_dists = comp_dists[arg_sorted][:plot_count]
    bottom_plot_dists = comp_dists[arg_sorted][-plot_count:]

    def plot_all(plot_dists, add_path):
        for dist in plot_dists:
            entropies = [
                entropy for this_dist, bin_edges, i, c, entropy in dist
            ]
            labels = [c for this_dist, bin_edges, i, c, entropy in dist]

            comp_i = dist[0][2]

            plot_entropy_hist(entropies, labels, base_path + add_path + '/',
                              comp_i)

    if should_plot:
        print('Plotting')

        plot_all(top_plot_dists, 'top')
        plot_all(bottom_plot_dists, 'bottom')

    return feat[:, idx], idx
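
ch.entropy is assumed here to be the Shannon entropy of an already-normalized histogram. Below is a minimal stand-in that mirrors the normalize-then-score flow used in entropy_test; the helper name and the clamping threshold are assumptions.

import numpy as np

def entropy_sketch(probs):
    # Shannon entropy (in nats) of a normalized histogram with no zero entries.
    probs = np.asarray(probs, dtype=float)
    return -np.sum(probs * np.log(probs))

# Mirrors normalize_hist above: counts -> probabilities, with tiny values clamped.
counts, _ = np.histogram(np.random.randn(1000), bins='auto')
probs = counts / counts.sum()
probs[probs < 1e-4] = 1e-4
print(entropy_sketch(probs))
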
Example #5
def kl_test(feat, labels, should_plot=False):
    print('Using KL test to select coeffs')
    binned = ch.bin_samples(feat, labels)

    binned_items = [(c, compute_energy(samples.T))
                    for c, samples in binned.items()]
    # binned_items is a list of (class, energies) pairs, where energies holds
    # the data belonging to that class in the form (# channels, # samples).

    comps = []
    is_first = True
    for c, energies in binned_items:
        for i in range(len(energies)):
            if is_first:
                comps.append({})
            comps[i][c] = energies[i]

        is_first = False

    def agg_other_samples(comp, cur_c):
        aggr = []
        for c, samples in comp.items():
            if c != cur_c:
                aggr.extend(samples)

        return np.array(aggr)

    def normalize_hist(dist, eps=1e-4):
        total = np.sum(dist)
        dist = np.array(dist)
        norm = dist / total
        # Remove any very small values
        norm[norm < eps] = eps
        return norm

    # Get the maximum number of samples across each class
    # (If using the full dataset the max should just be equal to the number of
    # samples per class)
    max_sample_count = np.amax([len(samples) for samples in comps[0].values()])
    # Compute the number of bins to use when constructing the histograms

    bin_count = int(np.sqrt(max_sample_count * 9))

    all_data = [
        sample for comp in comps for samples in comp.values()
        for sample in samples
    ]

    #min_data = np.amin(all_data)
    #max_data = np.amax(all_data)
    #step_size = (max_data - min_data) / bin_count

    #bin_edges = np.arange(min_data, max_data, step_size)

    print('There are %i bins' % bin_count)

    # Initialize the KL score for each component to zero
    kl_comps = [0.0] * len(comps)

    base_path = 'data/results/compare_entropies/'
    if os.path.exists(base_path):
        shutil.rmtree(base_path)

    comp_dists = []
    for i, comp in tqdm(enumerate(comps)):
        all_kl = []
        use_bins = bin_count
        dists = []
        for c, samples in comp.items():
            # Aggregate samples for every other class
            others = agg_other_samples(comp, c)

            # Get data distributions for both datasets, using the same bin
            # edges for both histograms.
            this_dist, bin_edges = np.histogram(samples, bins=bin_count)
            other_dist, _ = np.histogram(others, bins=bin_edges)

            norm_this_dist = normalize_hist(this_dist)
            norm_other_dist = normalize_hist(other_dist)

            # Compute the KL divergence between the two distributions.
            kl_div_1 = ch.kl_div(norm_this_dist, norm_other_dist)
            #kl_div_2 = ch.kl_div(norm_other_dist, norm_this_dist)

            kl_div = kl_div_1

            #kl_div = np.abs((kl_div_1 + kl_div_2) / 2.0)

            dists.append((this_dist, other_dist, bin_edges, i, c, kl_div))
            all_kl.append(kl_div)

        comp_dists.append(dists)

        kl_comps[i] = np.amax(all_kl)

    TAKE_COUNT = 2000
    arg_sorted = np.argsort(kl_comps)
    arg_sorted = np.flipud(arg_sorted)
    idx = arg_sorted[:TAKE_COUNT]

    dists = np.array(dists)
    plot_count = 5

    comp_dists = np.array(comp_dists)
    print('Comp dists len ', len(comp_dists))
    plot_dists = comp_dists[arg_sorted][:plot_count]

    #TODO:
    # I messed up the plotting code and accidentally deleted it.
    # But I know it's in a previous version of the git history.

    return feat[:, idx], idx
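
ch.kl_div is assumed to compute the Kullback-Leibler divergence KL(p || q) between two normalized histograms. A minimal stand-in under that assumption is sketched below, fed with the kind of per-class vs. everything-else histograms kl_test builds.

import numpy as np

def kl_div_sketch(p, q):
    # KL(p || q) for two normalized histograms; assumes both went through
    # normalize_hist above, so no entry is exactly zero.
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    return np.sum(p * np.log(p / q))

# One class's histogram vs. the aggregate of the other classes, as in kl_test.
p = np.array([0.70, 0.20, 0.05, 0.05])
q = np.array([0.25, 0.25, 0.25, 0.25])
print(kl_div_sketch(p, q))  # larger value -> this class separates more easily
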
def patch_method(feat, labels):
    base_path = 'data/results/patches/'
    if os.path.exists(base_path):
        shutil.rmtree(base_path)
    os.makedirs(base_path)

    # Important Notes
    # - The patch tensor is flattened over the sample, color channel, width,
    #   and height dimensions before clustering.

    stride_h = 4
    stride_w = 4
    patch_width = 8
    patch_height = 8

    patches = extract_patches(feat, stride_h, stride_w, patch_width, patch_height)

    patch_data, patch_labels = get_labels_for_patches(patches, labels)

    base_path = 'data/results/patch_clusters/'
    print('Deleting everything in ' + base_path)
    if os.path.exists(base_path):
        shutil.rmtree(base_path)
    os.makedirs(base_path)

    for cluster_count in [512, 1024]:
        print('Fitting MBK')
        mbk = MiniBatchKMeans(n_clusters=cluster_count, init='k-means++')
        preds = mbk.fit_predict(patch_data)
        print('Done fitting MBK')
        binned_labels = ch.bin_labels(patch_labels, preds)
        bin_probs, totals = ch.convert_bins_to_probs(binned_labels)

        binned_samples = ch.bin_samples(patch_data, preds)
        entropies = ch.bin_entropies(binned_samples)

        max_probs = []
        colors = []
        color_map = {
            0: 'b',
            1: 'g',
            2: 'r',
            3: 'c',
            4: 'm',
            5: 'y',
            6: 'k',
            7: 'w',
            8: 'pink',
            9: 'saddlebrown',
        }
        for b in bin_probs:
            max_prob_ele = bin_probs[b][0]
            max_probs.append(max_prob_ele[1])
            colors.append(color_map[max_prob_ele[0]])

        plt.title('Max Class Probability per Cluster (%i)' % cluster_count)
        plt.bar(np.arange(len(max_probs)), max_probs, color=colors)
        plt.savefig(base_path + 'patch_cluster_hist%i.png' % cluster_count)
        plt.clf()

        plt.title('Entropies per Cluster (%i)' % cluster_count)
        plt.bar(np.arange(len(entropies)), entropies)
        plt.savefig(base_path + 'patch_entropies%i.png' % cluster_count)
        plt.clf()

        plt.title('Counts per Cluster (%i)' % cluster_count)
        plt.bar(np.arange(len(totals)), totals)
        plt.savefig(base_path + 'patch_counts%i.png' % cluster_count)
        plt.clf()
        print('Saved all!')
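
extract_patches and get_labels_for_patches are project helpers that are not shown. Below is a rough sketch of the sliding-window extraction extract_patches is assumed to perform on a batch of (N, C, H, W) images, with the argument order taken from the calls above; the output layout is an assumption.

import numpy as np

def extract_patches_sketch(images, stride_h, stride_w, patch_w, patch_h):
    # images: (N, C, H, W). Slide a patch_h x patch_w window with the given
    # strides and flatten each window into one row of the output matrix.
    n, c, h, w = images.shape
    patches = []
    for img in images:
        for top in range(0, h - patch_h + 1, stride_h):
            for left in range(0, w - patch_w + 1, stride_w):
                window = img[:, top:top + patch_h, left:left + patch_w]
                patches.append(window.reshape(-1))
    return np.array(patches)

# e.g. 10 RGB 32x32 images with stride 4 -> (10 * 7 * 7, 3 * 8 * 8) patch matrix.
print(extract_patches_sketch(np.zeros((10, 3, 32, 32)), 4, 4, 8, 8).shape)
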