Ejemplo n.º 1
0
    def analysis_for_train_epoch(self, out_dir, **kwargs):
        """Plot and cluster the segment embeddings gathered over a training epoch.

        Reads the accumulated ``embeddings``, ``name``, ``n_segments`` and
        ``segment_mean_F0`` metrics, writes one scatter plot per projection
        (PCA and tSNE) coloured by mean F0 into ``out_dir``, and finally runs
        k-means on the embeddings.

        Args:
            out_dir: Directory receiving the plots and the clustering output.
                A ``feats`` sub-directory is created inside it if missing.
            **kwargs: May contain ``n_clusters`` (defaults to 20), the number
                of clusters passed to ``k_means.cluster``.
        """
        os.makedirs(os.path.join(out_dir, 'feats'), exist_ok=True)

        collected = self.metrics.metrics

        def as_numpy(key):
            # Metric results are tensors; bring them to host memory as arrays.
            return collected[key].result().detach().cpu().numpy()

        embeddings = as_numpy('embeddings')
        names = collected['name'].result()

        # Names are stored per sentence; expand them to one name per segment
        # ("<sentence name>_<segment index>") so they line up with the
        # segment-level embeddings used in the scatter plot.
        segment_counts = as_numpy('n_segments').squeeze(1)
        segment_names = []
        for sentence_idx, count in enumerate(segment_counts):
            segment_names.extend(
                f'{names[sentence_idx]}_{segment_idx}' for segment_idx in range(count))

        segment_mean_F0 = as_numpy('segment_mean_F0').squeeze(1)

        # Use the part of the path after the last 'experiments/' as plot title.
        title = out_dir.rsplit('experiments/', 1)[-1]
        for proj in ('PCA', 'tSNE'):
            plot_path = os.path.join(out_dir, f'scatter_{proj}_mean_F0.pdf')
            viz.scatter_plot(
                embeddings, segment_names,
                gradients=segment_mean_F0, gradient_title='Mean phrase F0 (Hz)', projection=proj,
                title=title, out_file=plot_path)

        k_means.cluster(
            embeddings, kwargs.get('n_clusters', 20), names=segment_names, out_dir=out_dir)
Ejemplo n.º 2
0
def load_k_means(k, document_path=DOCUMENT_PATH, seed=None):
    """Return the k-means clusters of a document set, cached on disk.

    The cache file is ``<document_path>_<k>.pickle``, or
    ``<document_path>_<k>_<seed>.pickle`` when ``seed`` is given. When the
    file exists its unpickled content is returned; otherwise the documents
    are loaded and clustered, and the result is pickled to that path before
    being returned.
    """
    seed_suffix = '' if seed is None else '_{}'.format(seed)
    path = '{}_{}{}.pickle'.format(document_path, k, seed_suffix)

    try:
        with open(path, 'rb') as cache_file:
            print('loading cached clusters from {}'.format(path))
            # NOTE(review): unpickling runs arbitrary code; only point this
            # at cache files this program wrote itself.
            cached = pickle.load(cache_file)
    except FileNotFoundError:
        print('no cached clusters found')
    else:
        return cached

    # Cache miss: compute the clusters and store them for the next call.
    clusters = k_means.cluster(k, load_documents(document_path), seed=seed)
    with open(path, 'wb') as cache_file:
        pickle.dump(clusters, cache_file)
    return clusters
Ejemplo n.º 3
0
def main():
    """Plot heterogeneity (best SSD) against k for k-means and spectral clustering.

    Loads ``cluster.pkl``, projects it with ``PCA.reduce`` (second argument
    50) and, for every k from 1 to 50, runs both clustering methods five
    times, keeping the lowest sum of squared distances (``SSD``) per method.
    k-means runs on the PCA-reduced data; spectral clustering runs on the
    eigen-decomposition of the Laplacian built from the raw data. The two
    resulting curves are saved to ``heterogeneity_k_means_cluster.png`` and
    ``heterogeneity_spectral_cluster.png``.

    The body only uses constructs valid on both Python 2 and Python 3.
    """
    n_restarts = 5

    train = pd.read_pickle("cluster.pkl")
    reduced_data = PCA.reduce(train.values, 50,
                              PCA.getU("PCA_eigen_cluster.pkl").values,
                              PCA.calc_mean(train.values))
    heterogeneity_k_means = []
    heterogeneity_spectral = []
    ks = list(range(1, 51))
    spectral_laplacian = spectral.setup(train.values)
    for k in ks:
        print("k: " + str(k))
        # Infinity is a version-independent "worse than any real SSD" start.
        best_ssd_k_means = float("inf")
        best_ssd_spectral = float("inf")
        spectral_eigen = spectral.computeEigen(spectral_laplacian, k)
        # Repeat the clustering n_restarts times for each k and keep the best
        # run. The two methods stay interleaved inside one loop so that any
        # shared random state is consumed in the same order as before.
        for i in range(n_restarts):
            print("i: " + str(i))
            print("k_means")
            cluster_center_k_means, cluster_idx_k_means = k_means.cluster(
                reduced_data, k)
            ssd_k_means = SSD(reduced_data, cluster_center_k_means,
                              cluster_idx_k_means)
            if ssd_k_means < best_ssd_k_means:
                best_ssd_k_means = ssd_k_means
            print("Spectral")
            cluster_center_spectral, cluster_idx_spectral = spectral.cluster(
                spectral_eigen, k)
            ssd_spectral = SSD(spectral_eigen, cluster_center_spectral,
                               cluster_idx_spectral)
            if ssd_spectral < best_ssd_spectral:
                best_ssd_spectral = ssd_spectral
        # Record the best SSD found for this k.
        heterogeneity_k_means.append(best_ssd_k_means)
        heterogeneity_spectral.append(best_ssd_spectral)

    # One figure per method; both share labels, ticks and layout.
    plots = (
        (heterogeneity_k_means,
         "k vs Heterogeneity for k means",
         "heterogeneity_k_means_cluster.png"),
        (heterogeneity_spectral,
         "k vs Heterogeneity for spectral",
         "heterogeneity_spectral_cluster.png"),
    )
    for figure_number, (values, title, out_file) in enumerate(plots, 1):
        plt.figure(figure_number)
        plt.plot(ks, values, marker=".")
        plt.ylabel("Heterogeneity")
        plt.xlabel("k")
        plt.title(title)
        plt.xticks(np.arange(0, max(ks), 2.0))
        plt.savefig(out_file)
Ejemplo n.º 4
0
def _render_figure(fig):
    """Draw a matplotlib figure, return it as an RGB PIL image and close it."""
    import matplotlib.pyplot as pyplot
    from PIL import Image

    fig.canvas.draw()
    size = fig.canvas.get_width_height()
    buff = fig.canvas.tostring_rgb()
    # Pillow renamed Image.fromstring to Image.frombytes (same arguments) and
    # later removed the old name; fall back only for legacy PIL installs.
    from_bytes = getattr(Image, 'frombytes', None) or Image.fromstring
    rendered = from_bytes('RGB', size, buff, 'raw')
    # The pixel data has been copied out, so release the figure; otherwise
    # every call would leave another open figure behind.
    pyplot.close(fig)
    return rendered


def board(image, intersections, show_all, do_something, logger, size=19):
    """Find stone colors and return board situation.

    Every intersection is sampled with ``stone_color_raw``; the first two
    components of each sample (named luma and saturation in the debug plot)
    are clustered into three groups with ``k_means.cluster``. The groups are
    then labelled black (``'B'``), empty (``'.'``) and white (``'W'``).

    Args:
        image: Image the stone colors are sampled from. Color enhancement
            (white balance) is currently disabled, so it is used as given.
        intersections: Iterable of lines, each an iterable of intersections,
            in board order.
        show_all: When true, two debug scatter plots are rendered and handed
            to ``do_something``.
        do_something: Callback ``(image, caption)`` receiving the debug plots.
        logger: Currently unused.
        size: Board side length; at most ``size * size`` points are placed on
            the returned board. Defaults to 19.

    Returns:
        ``output.Board(size, labels)`` where ``labels`` lists the stone label
        of each intersection in the order the intersections were given.
    """
    # Flatten the per-line samples into one list in board order.
    board_raw = [stone_color_raw(image, intersection)
                 for line in intersections
                 for intersection in line]

    # Show color distribution.
    if show_all:
        import matplotlib.pyplot as pyplot
        fig = pyplot.figure(figsize=(8, 6))
        pyplot.scatter([s[0] for s in board_raw],
                       [s[1] for s in board_raw],
                       color=[s[2] for s in board_raw])
        pyplot.xlim(0, 1)
        pyplot.ylim(0, 1)
        do_something(_render_figure(fig), "color distribution")

    color_data = [(s[0], s[1]) for s in board_raw]

    # The middle initial centroid starts at the mean of the first component.
    init_x = sum(c[0] for c in color_data) / float(len(color_data))

    # Pair every sample with its position so board order can be restored
    # after clustering. A real list (not a lazy zip) is passed so the
    # clustering code can index and re-iterate it on Python 3 as well.
    indexed_colors = [(color, index) for index, color in enumerate(color_data)]
    clusters, _ = k_means.cluster(3, 2, indexed_colors,
                                  [[0., 0.5], [init_x, 0.5], [1., 0.5]])

    if show_all:
        fig = pyplot.figure(figsize=(8, 6))
        cluster_colors = ((1, 0, 0, 1), (0, 1, 0, 1), (0, 0, 1, 1))
        for members, rgba in zip(clusters, cluster_colors):
            pyplot.scatter([d[0][0] for d in members],
                           [d[0][1] for d in members],
                           color=rgba)
        pyplot.xlim(0, 1)
        pyplot.ylim(0, 1)
        do_something(_render_figure(fig), "color clustering")

    # Clusters are labelled by position: the one seeded at 0 is black, the
    # one seeded at the mean is empty, the one seeded at 1 is white.
    # NOTE(review): relies on k_means.cluster returning clusters in the same
    # order as the initial centroids -- confirm.
    labelled = sorted(
        (point[1], label)
        for members, label in zip(clusters, ('B', '.', 'W'))
        for point in members)

    # Sorting by the stored index restores board order; keep at most
    # size * size points.
    board_r = [label for _, label in labelled[:size * size]]

    return output.Board(size, board_r)
Ejemplo n.º 5
0
import convert_to_csv
import k_means
import cluster_means
import render_image

# Pipeline configuration: the source photo and the file the rendered result
# is saved to.
ip_path = "WIN_20170402_17_26_41_Pro.jpg"
op_path = "output_img.jpg"
# Run the stages in order. k_means.cluster() and cluster_means.calc_mean()
# take no arguments, so they presumably pick up the previous stage's output
# from disk or module state -- TODO confirm before reordering anything here.
convert_to_csv.img_to_csv(ip_path)
k_means.cluster()
cluster_means.calc_mean()
render_image.im_save(op_path)
Ejemplo n.º 6
0
    return (json_object["label"], digit_vector)


# Read the first 60000 lines of the dataset (one record per line) and decode
# each with read_in_data. 60000 is the size of the dataset being loaded; it
# can be as high as 60000 to load all data.
with open("digits.base64.json","r") as f:
    data = [f.readline() for _ in range(60000)]
    digits = [read_in_data(record) for record in data]

# The records from index 59000 onwards are the set used for k-means clustering.
k_means_training = digits[59000:]

k = 20
print("starting clustering for k =", k)
clusters, hidden_layer = cluster(k_means_training, k)
print("ending clustering")

weights = init_weights(k, 10)
beta_values = init_betas(clusters, hidden_layer)

dropout_chance = 0.5
accuracy = []
for i in range(60):
    print("Epoch ", i, "/60")
    # Each epoch trains on the next batch of 500 records (0-499, 500-999, ...)
    # and is then scored on the fixed slice 30000-30999.
    batch_start = 500 * i
    train_dropout(digits[batch_start:batch_start + 500],
                  hidden_layer, weights, beta_values, dropout_chance)
    epoch_accuracy = test_dropout(digits[30000:31000],
                                  hidden_layer, weights, beta_values, dropout_chance)
    accuracy.append(epoch_accuracy)

# Accuracy curve, one point per epoch.
plt.plot(accuracy)
plt.ylabel("accuracy")
plt.xlabel("epochs")
Ejemplo n.º 7
0
kmean_reduced_50_euc_mistakes = []
kmean_reduced_10_euc_mistakes = []
# get mistakes for spectral, k means, k means reduced for k = 2,...,50
for k in range(2, 51):
    print k
    best = sys.maxint
    for i in range(5):
        spectral_centers, spectral_indx = spectral.cluster(
            spectral.computeEigen(affinity, k), k)
        s_mistakes = computeMistakes(spectral_indx, y, k)
        if s_mistakes < best:
            best = s_mistakes
    spectral_mistakes.append(best)
    best = sys.maxint
    for i in range(5):
        kmeans_centers, kmeans_indx = k_means.cluster(data.values, k)
        k_mistakes = computeMistakes(kmeans_indx, y, k)
        if k_mistakes < best:
            best = k_mistakes
    kmean_mistakes.append(best)
    best = sys.maxint
    for i in range(5):
        kmeans_reduced_50_centers, kmeans_reduced_50_indx = k_means.cluster(
            reduced_data_50, k)
        kr_50_mistakes = computeMistakes(kmeans_reduced_50_indx, y, k)
        if kr_50_mistakes < best:
            best = kr_50_mistakes
    kmean_reduced_50_mistakes.append(best)
    best = sys.maxint
    for i in range(5):
        kmeans_reduced_10_centers, kmeans_reduced_10_indx = k_means.cluster(
Ejemplo n.º 8
0
def distortion_for_seed(seed):
    """Return the distortion of a 16-cluster k-means run seeded with ``seed``.

    Initial centroids come from ``k_means_plus_plus``; the module-level
    ``documents`` are then clustered from those centroids and the distortion
    of the resulting clusters is returned.
    """
    # Alternative initialisation kept for comparison:
    # initial_centroids = random_documents(16, documents, seed=seed)
    initial_centroids = k_means_plus_plus(16, documents, seed=seed)
    return distortion(cluster(16, documents, centroids=initial_centroids))
Ejemplo n.º 9
0
from load import load_documents
import k_means


# Compare two k-means initialisations on the same documents: the explicit
# random-document initialiser and the library default.
k = 16
documents = load_documents('train100')
clusters_rand = k_means.cluster(k, documents, init=k_means.random_documents)
clusters_pp = k_means.cluster(k, documents)

# For each run print the total distortion followed by every cluster's size.
for clusters in (clusters_pp, clusters_rand):
    total_distortion = 0
    for cluster in clusters:
        total_distortion += cluster.distortion()
    print(total_distortion)
    for cluster in clusters:
        print(len(cluster))
Ejemplo n.º 10
0
def cluster(U, k, Euc=False):
    """Cluster ``U`` into ``k`` groups by delegating to ``k_means.cluster``.

    All three arguments are forwarded positionally and unchanged, and the
    k-means result is returned as-is.
    """
    return k_means.cluster(U, k, Euc)
import pandas as pd
from k_means import cluster, update, assign
from scipy.spatial.distance import cdist
import numpy as np
import matplotlib.pyplot as plt

# Cluster counts to evaluate for the elbow plot.
ks = [4, 6, 8, 10, 12, 14, 16]

distortions = []
for k in ks:
    print('Number of Clusters:', k)
    print('Clustering...')
    centroids = cluster(k)
    # NOTE(review): result_data<k>.csv is presumably written by cluster(k) --
    # confirm against k_means.
    data = pd.read_csv('result_data' + str(k) + '.csv')
    print('Clustering complete! Calculating distortion...')

    # Squared Euclidean distance from every row to its assigned centroid,
    # taken from the last entry of `centroids`.
    distances = []
    for i in range(len(data)):
        closest = int(data.iloc[i].closest)
        centroid = centroids[-1][closest]
        datapt = data.loc[i, 't:0':'t:160'].to_list()
        distances.append(sum((a - b) ** 2 for a, b in zip(centroid, datapt)))

    # Distortion is the mean squared distance over all rows.
    distortion = sum(distances) / len(data)
    distortions.append(distortion)
    print('Distortion', distortion)

# Distortion against cluster count.
plt.figure()
plt.plot(ks, distortions, label='Distortion')
plt.legend(loc='best')
plt.title('Elbow Plot')
plt.show()
Ejemplo n.º 12
0
                                           factor=0.5,
                                           noise=0.05)
# Remaining toy data sets: moons and blobs come with labels, while the
# "no structure" set is uniform random 2-D points with no labels.
noisy_moons, moons_y = datasets.make_moons(n_samples=n_samples, noise=0.05)
blobs, blobs_y = datasets.make_blobs(n_samples=n_samples, random_state=8)
no_structure, no_y = np.random.rand(n_samples, 2), None

# Lookup table of single-letter colour codes, indexed by cluster id below;
# the 28-letter cycle is repeated 20 times.
colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
colors = np.hstack([colors] * 20)

# Figure 1: spectral clustering of `circles` into two clusters.
# NOTE(review): `circles` is defined above this excerpt; the meaning of the
# 0.125 and False arguments is set by the project's spectral / k_means
# modules -- confirm there.
circ_lap = spectral.setup(circles, 0.125)
circ_eig = spectral.computeEigen(circ_lap, 2)
circle_center, circle_indx = spectral.cluster(circ_eig, 2, False)
plt.figure(1)
plt.scatter(circles[:, 0], circles[:, 1], color=colors[circle_indx].tolist())

# Figure 2: plain k-means on the same data, for comparison.
circle_center_means, circle_indx_means = k_means.cluster(circles, 2, False)
plt.figure(2)
plt.scatter(circles[:, 0],
            circles[:, 1],
            color=colors[circle_indx_means].tolist())

# Figure 3: spectral clustering of the moons data. The circ_* / circle_*
# names are reused here even though they now hold the moons results.
circ_lap = spectral.setup(noisy_moons, 0.125)
circ_eig = spectral.computeEigen(circ_lap, 2)
circle_center, circle_indx = spectral.cluster(circ_eig, 2, False)
plt.figure(3)
plt.scatter(noisy_moons[:, 0],
            noisy_moons[:, 1],
            color=colors[circle_indx].tolist())

# Figure 4: k-means on the moons data. Only the figure is opened here; no
# scatter call for it follows in the visible source.
circle_center_means, circle_indx_means = k_means.cluster(noisy_moons, 2, False)
plt.figure(4)
import pandas as pd 
from k_means import cluster
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.animation import FuncAnimation

# Load the cleaned data set and cluster it into 15 groups.
# NOTE(review): neither result is used by the animation code visible below.
normalized_data = pd.read_csv('cleaned_data.csv')
centroid_data = cluster(15)

# Figure whose axes have fixed limits (x in [0, 4], y in [-2, 2]).
fig = plt.figure()
ax = plt.axes(xlim=(0, 4), ylim=(-2, 2))

# Two initially empty artists on those axes: a line of width 3 and a set of
# blue circle markers of size 6.
(line,) = ax.plot([], [], lw=3)
(centroids,) = ax.plot([], [], 'bo', ms=6)

def init():
    """Clear the centroid markers and return them as a one-element tuple."""
    no_x, no_y = [], []
    centroids.set_data(no_x, no_y)
    return (centroids,)
def animate(i):
    """Set ``line`` to a sine wave shifted right by ``0.01 * i`` for frame ``i``.

    The wave is sampled at 1000 points on [0, 4]. Returns the updated line as
    a one-element tuple.
    """
    xs = np.linspace(0, 4, 1000)
    line.set_data(xs, np.sin(2 * np.pi * (xs - 0.01 * i)))
    return (line,)

# Build the animation: 200 frames with a 20 ms interval, blitting enabled.
# The result is bound to a name so the animation object stays referenced.
anim = FuncAnimation(
    fig,
    animate,
    init_func=init,
    frames=200,
    interval=20,
    blit=True,
)