def analysis_for_train_epoch(self, out_dir, **kwargs):
    """Plot and cluster the segment embeddings gathered during a training epoch.

    Reads the accumulated ``embeddings``, ``name``, ``n_segments`` and
    ``segment_mean_F0`` metrics, writes one scatter plot per projection
    (PCA and tSNE) coloured by mean F0 into ``out_dir``, and then runs
    k-means on the embeddings.

    Args:
        out_dir: Directory that receives the plots and the clustering output.
            A ``feats`` sub-directory is created inside it if missing.
        **kwargs: Only ``n_clusters`` is read (default 20).
    """
    os.makedirs(os.path.join(out_dir, 'feats'), exist_ok=True)

    collected = self.metrics.metrics
    embeddings = collected['embeddings'].result().detach().cpu().numpy()
    names = collected['name'].result()

    # Names are stored per sentence, but the scatter plot needs one label per
    # segment, so every sentence name is expanded to "<name>_<segment index>".
    n_segments = collected['n_segments'].result().detach().cpu().numpy().squeeze(1)
    segment_names = []
    for sentence_idx, segment_count in enumerate(n_segments):
        segment_names.extend(
            f'{names[sentence_idx]}_{segment_idx}'
            for segment_idx in range(segment_count))

    segment_mean_F0 = collected['segment_mean_F0'].result().detach().cpu().numpy().squeeze(1)

    # Use whatever follows 'experiments/' in the output path as the plot title.
    title = out_dir.split('experiments/')[-1]

    for projection in ('PCA', 'tSNE'):
        plot_path = os.path.join(out_dir, f'scatter_{projection}_mean_F0.pdf')
        viz.scatter_plot(
            embeddings, segment_names,
            gradients=segment_mean_F0, gradient_title='Mean phrase F0 (Hz)',
            projection=projection, title=title, out_file=plot_path)

    k_means.cluster(embeddings, kwargs.get('n_clusters', 20),
                    names=segment_names, out_dir=out_dir)
def load_k_means(k, document_path=DOCUMENT_PATH, seed=None):
    """Return k-means clusters for a document set, using a pickle cache.

    The cache file is ``<document_path>_<k>.pickle``, or
    ``<document_path>_<k>_<seed>.pickle`` when a seed is given. On a cache
    miss the documents are loaded, clustered, written to the cache file and
    returned.

    Args:
        k: Number of clusters, forwarded to ``k_means.cluster``.
        document_path: Passed to ``load_documents`` and used as the prefix of
            the cache file name.
        seed: Optional seed forwarded to ``k_means.cluster``.

    Returns:
        Whatever ``k_means.cluster`` produced (possibly unpickled from cache).
    """
    seed_suffix = '' if seed is None else '_{}'.format(seed)
    cache_path = '{}_{}{}.pickle'.format(document_path, k, seed_suffix)

    try:
        with open(cache_path, 'rb') as cache_file:
            print('loading cached clusters from {}'.format(cache_path))
            return pickle.load(cache_file)
    except FileNotFoundError:
        # No cache yet: compute the clusters and store them for next time.
        print('no cached clusters found')
        fresh_clusters = k_means.cluster(
            k, load_documents(document_path), seed=seed)
        with open(cache_path, 'wb') as cache_file:
            pickle.dump(fresh_clusters, cache_file)
        return fresh_clusters
def _save_heterogeneity_plot(figure_id, ks, heterogeneity, title, out_file):
    """Plot heterogeneity against k on figure ``figure_id`` and save it."""
    plt.figure(figure_id)
    plt.plot(ks, heterogeneity, marker=".")
    plt.ylabel("Heterogeneity")
    plt.xlabel("k")
    plt.title(title)
    plt.xticks(np.arange(0, max(ks), 2.0))
    plt.savefig(out_file)


def main():
    """Compare k-means and spectral clustering heterogeneity for k = 1..50.

    Loads ``cluster.pkl``, reduces it to 50 dimensions with the stored PCA
    basis for k-means, and builds the spectral Laplacian from the raw values.
    For each k both algorithms are run ``n_restarts`` times and the lowest
    value returned by ``SSD`` is kept. The two resulting curves are saved to
    ``heterogeneity_k_means_cluster.png`` and
    ``heterogeneity_spectral_cluster.png``.
    """
    n_restarts = 5

    train = pd.read_pickle("cluster.pkl")
    reduced_data = PCA.reduce(train.values, 50,
                              PCA.getU("PCA_eigen_cluster.pkl").values,
                              PCA.calc_mean(train.values))
    heterogeneity_k_means = []
    heterogeneity_spectral = []
    ks = range(1, 51)
    spectral_laplacian = spectral.setup(train.values)

    for k in ks:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("k: " + str(k))
        # float("inf") replaces sys.maxint, which does not exist on Python 3.
        best_ssd_k_means = float("inf")
        best_ssd_spectral = float("inf")
        spectral_eigen = spectral.computeEigen(spectral_laplacian, k)

        # Run both clusterings n_restarts times for each k and keep the best.
        # The k-means / spectral calls stay interleaved per restart so any
        # shared random state is consumed in the original order.
        for i in range(n_restarts):
            print("i: " + str(i))

            print("k_means")
            cluster_center_k_means, cluster_idx_k_means = k_means.cluster(
                reduced_data, k)
            ssd_k_means = SSD(reduced_data, cluster_center_k_means,
                              cluster_idx_k_means)
            if ssd_k_means < best_ssd_k_means:
                best_ssd_k_means = ssd_k_means

            print("Spectral")
            cluster_center_spectral, cluster_idx_spectral = spectral.cluster(
                spectral_eigen, k)
            ssd_spectral = SSD(spectral_eigen, cluster_center_spectral,
                               cluster_idx_spectral)
            if ssd_spectral < best_ssd_spectral:
                best_ssd_spectral = ssd_spectral

        # Record the best SSD found for this k.
        heterogeneity_k_means.append(best_ssd_k_means)
        heterogeneity_spectral.append(best_ssd_spectral)

    _save_heterogeneity_plot(1, ks, heterogeneity_k_means,
                             "k vs Heterogeneity for k means",
                             "heterogeneity_k_means_cluster.png")
    _save_heterogeneity_plot(2, ks, heterogeneity_spectral,
                             "k vs Heterogeneity for spectral",
                             "heterogeneity_spectral_cluster.png")
def _figure_to_image(fig):
    """Render a matplotlib figure and return its RGB contents as a PIL image."""
    from PIL import Image

    fig.canvas.draw()
    size = fig.canvas.get_width_height()
    buff = fig.canvas.tostring_rgb()
    # Pillow replaced Image.fromstring with Image.frombytes; only fall back to
    # the old name on installations that do not provide frombytes at all.
    from_bytes = getattr(Image, 'frombytes', None) or Image.fromstring
    return from_bytes('RGB', size, buff, 'raw')


def board(image, intersections, show_all, do_something, logger, size=19):
    """Find stone colors and return board situation.

    Each intersection is sampled with ``stone_color_raw``; its first two
    components (plotted as luma and saturation) are clustered into three
    groups, which are labelled black ('B'), empty ('.') and white ('W') in
    the order returned by ``k_means.cluster``.

    Args:
        image: Image the stone colors are sampled from.
        intersections: Iterable of lines, each an iterable of intersections.
        show_all: When true, the color distribution and the clustering result
            are plotted and handed to ``do_something``.
        do_something: Callback taking ``(image, caption)``.
        logger: Unused by this function.
        size: Board size; at most ``size * size`` intersections are used.
            Defaults to 19.

    Returns:
        ``output.Board(size, labels)`` with one label per intersection, in the
        order the intersections were given.

    Raises:
        ZeroDivisionError: If ``intersections`` contains no intersection.
    """
    # White balancing is currently disabled:
    # image_c = filters.color_enhance(image)
    # if show_all:
    #     do_something(image_c, "white balance")
    image_c = image

    # Flatten directly instead of sum(list_of_lists, []), which is quadratic.
    board_raw = [stone_color_raw(image_c, intersection)
                 for line in intersections
                 for intersection in line]

    ### Show color distribution
    if show_all:
        import matplotlib.pyplot as pyplot
        fig = pyplot.figure(figsize=(8, 6))
        luma = [s[0] for s in board_raw]
        saturation = [s[1] for s in board_raw]
        pyplot.scatter(luma, saturation,
                       color=[s[2] for s in board_raw])
        pyplot.xlim(0, 1)
        pyplot.ylim(0, 1)
        do_something(_figure_to_image(fig), "color distribution")

    color_data = [(s[0], s[1]) for s in board_raw]

    # The middle centroid starts at the mean first component of all samples.
    init_x = sum(c[0] for c in color_data) / float(len(color_data))

    # Pair every sample with its position so the board order can be restored
    # after clustering. A real list is built because zip() is lazy on
    # Python 3.
    indexed_colors = [(color, index) for index, color in enumerate(color_data)]
    clusters, score = k_means.cluster(3, 2, indexed_colors,
                                      [[0., 0.5], [init_x, 0.5], [1., 0.5]])

    if show_all:
        fig = pyplot.figure(figsize=(8, 6))
        cluster_colors = ((1, 0, 0, 1), (0, 1, 0, 1), (0, 0, 1, 1))
        for members, rgba in zip(clusters, cluster_colors):
            pyplot.scatter([d[0][0] for d in members],
                           [d[0][1] for d in members],
                           color=rgba)
        pyplot.xlim(0, 1)
        pyplot.ylim(0, 1)
        do_something(_figure_to_image(fig), "color clustering")

    # Label the clusters, then sort by the original index to restore order.
    labelled = sorted((p[1], label)
                      for members, label in zip(clusters, ('B', '.', 'W'))
                      for p in members)

    # Take at most size * size labels; fewer are accepted silently, as before.
    board_r = [label for _, label in labelled[:size * size]]

    return output.Board(size, board_r)
# Driver script: runs the four project modules one after another on a single
# input photo and saves the rendered result.
import convert_to_csv
import k_means
import cluster_means
import render_image

# Input photo and destination file for the rendered image.
ip_path = "WIN_20170402_17_26_41_Pro.jpg"
op_path = "output_img.jpg"

# The steps run strictly in this order. cluster() and calc_mean() take no
# arguments, so they presumably pick up the output of the previous step from
# disk -- TODO confirm against those modules.
convert_to_csv.img_to_csv(ip_path)
k_means.cluster()
cluster_means.calc_mean()
render_image.im_save(op_path)
return (json_object["label"], digit_vector) with open("digits.base64.json","r") as f: data = [] for i in range(60000): # This is the size of the dataset we are loading in. Value can be as high as 60000 to load all data. data.append(f.readline()) digits = list(map(read_in_data, data)) k_means_training = digits[59000:] # the set of data used to perform k-means clustering k = 20 print("starting clustering for k =", k) clusters, hidden_layer = cluster(k_means_training, k) print("ending clustering") weights = init_weights(k, 10) beta_values = init_betas(clusters, hidden_layer) accuracy = [] dropout_chance = 0.5 for i in range(60): print("Epoch ", i, "/60") train_dropout(digits[(0+(500*i)):(500+(500*i))], hidden_layer, weights, beta_values, dropout_chance) accuracy.append(test_dropout(digits[30000:31000], hidden_layer, weights, beta_values, dropout_chance)) plt.plot(accuracy) plt.ylabel("accuracy") plt.xlabel("epochs")
kmean_reduced_50_euc_mistakes = [] kmean_reduced_10_euc_mistakes = [] # get mistakes for spectral, k means, k means reduced for k = 2,...,50 for k in range(2, 51): print k best = sys.maxint for i in range(5): spectral_centers, spectral_indx = spectral.cluster( spectral.computeEigen(affinity, k), k) s_mistakes = computeMistakes(spectral_indx, y, k) if s_mistakes < best: best = s_mistakes spectral_mistakes.append(best) best = sys.maxint for i in range(5): kmeans_centers, kmeans_indx = k_means.cluster(data.values, k) k_mistakes = computeMistakes(kmeans_indx, y, k) if k_mistakes < best: best = k_mistakes kmean_mistakes.append(best) best = sys.maxint for i in range(5): kmeans_reduced_50_centers, kmeans_reduced_50_indx = k_means.cluster( reduced_data_50, k) kr_50_mistakes = computeMistakes(kmeans_reduced_50_indx, y, k) if kr_50_mistakes < best: best = kr_50_mistakes kmean_reduced_50_mistakes.append(best) best = sys.maxint for i in range(5): kmeans_reduced_10_centers, kmeans_reduced_10_indx = k_means.cluster(
def distortion_for_seed(seed, k=16):
    """Cluster the module-level ``documents`` and return the distortion.

    Initial centroids are chosen with ``k_means_plus_plus`` using ``seed``,
    then ``cluster`` is run from those centroids.

    Args:
        seed: Seed forwarded to the centroid initialisation.
        k: Number of clusters, used for both the initialisation and the
            clustering so the two can never disagree. Defaults to 16.

    Returns:
        The value of ``distortion`` for the resulting clusters.
    """
    centroids = k_means_plus_plus(k, documents, seed=seed)
    # Alternative initialisation, kept for comparison runs:
    # centroids = random_documents(k, documents, seed=seed)
    clusters = cluster(k, documents, centroids=centroids)
    return distortion(clusters)
from load import load_documents
import k_means

# Compare two k-means initialisations on the 'train100' documents: random
# documents versus the library default.
k = 16
documents = load_documents('train100')

clusters_rand = k_means.cluster(k, documents, init=k_means.random_documents)
clusters_pp = k_means.cluster(k, documents)

# Default initialisation is reported first, then the random one. For each
# run print the total distortion followed by the size of every cluster.
for clusters in (clusters_pp, clusters_rand):
    total_distortion = sum(member.distortion() for member in clusters)
    print(total_distortion)
    for cluster in clusters:
        print(len(cluster))
def cluster(U, k, Euc=False):
    """Cluster ``U`` into ``k`` groups by delegating to ``k_means.cluster``.

    All three arguments are forwarded positionally and unchanged, and the
    result of ``k_means.cluster`` is returned as-is.
    """
    return k_means.cluster(U, k, Euc)
import pandas as pd
from k_means import cluster, update, assign
from scipy.spatial.distance import cdist
import numpy as np
import matplotlib.pyplot as plt

# Elbow plot: run the clustering for several k and plot the mean squared
# distance between every data point and its assigned final centroid.
ks = [4, 6, 8, 10, 12, 14, 16]
distortions = []

for k in ks:
    print('Number of Clusters:', k)
    print('Clustering...')
    centroids = cluster(k)
    # The per-point assignments are read back from the CSV named after k.
    data = pd.read_csv('result_data' + str(k) + '.csv')
    print('Clustering complete! Calculating distortion...')

    # Only the last entry of the value returned by cluster() is used.
    final_centroids = centroids[-1]
    distances = []
    for row in range(len(data)):
        nearest = final_centroids[int(data.iloc[row].closest)]
        point = data.loc[row, 't:0':'t:160'].to_list()
        squared_diffs = [(a - b)**2 for a, b in zip(nearest, point)]
        distances.append(sum(squared_diffs))

    distortion = sum(distances) / len(data)
    distortions.append(distortion)
    print('Distortion', distortion)

plt.figure()
plt.plot(ks, distortions, label='Distortion')
plt.legend(loc='best')
plt.title('Elbow Plot')
plt.show()
factor=0.5, noise=0.05) noisy_moons, moons_y = datasets.make_moons(n_samples=n_samples, noise=0.05) blobs, blobs_y = datasets.make_blobs(n_samples=n_samples, random_state=8) no_structure, no_y = np.random.rand(n_samples, 2), None colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk']) colors = np.hstack([colors] * 20) circ_lap = spectral.setup(circles, 0.125) circ_eig = spectral.computeEigen(circ_lap, 2) circle_center, circle_indx = spectral.cluster(circ_eig, 2, False) plt.figure(1) plt.scatter(circles[:, 0], circles[:, 1], color=colors[circle_indx].tolist()) circle_center_means, circle_indx_means = k_means.cluster(circles, 2, False) plt.figure(2) plt.scatter(circles[:, 0], circles[:, 1], color=colors[circle_indx_means].tolist()) circ_lap = spectral.setup(noisy_moons, 0.125) circ_eig = spectral.computeEigen(circ_lap, 2) circle_center, circle_indx = spectral.cluster(circ_eig, 2, False) plt.figure(3) plt.scatter(noisy_moons[:, 0], noisy_moons[:, 1], color=colors[circle_indx].tolist()) circle_center_means, circle_indx_means = k_means.cluster(noisy_moons, 2, False) plt.figure(4)
import pandas as pd
from k_means import cluster
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.animation import FuncAnimation

# Both results are loaded here but not yet used by the animation below.
normalized_data = pd.read_csv('cleaned_data.csv')
centroid_data = cluster(15)

# One figure with two artists: a line and a set of blue circle markers.
fig = plt.figure()
ax = plt.axes(xlim=(0, 4), ylim=(-2, 2))
(line,) = ax.plot([], [], lw=3)
(centroids,) = ax.plot([], [], 'bo', ms=6)


def init():
    """Clear the centroid markers and return them as the artists to redraw."""
    centroids.set_data([], [])
    return (centroids,)


def animate(i):
    """Draw frame ``i``: a sine wave shifted by 0.01 per frame."""
    xs = np.linspace(0, 4, 1000)
    ys = np.sin(2 * np.pi * (xs - 0.01 * i))
    line.set_data(xs, ys)
    return (line,)


anim = FuncAnimation(fig, animate, init_func=init,
                     frames=200, interval=20, blit=True)