def main(k, m="means", init_type="random"):
    """Cluster the training images, score the clusters and save the results.

    The function reads the module-level ``train_images``,
    ``train_images_flat``, ``train_labels``, ``test_images_flat`` and
    ``test_labels``; they must exist before it is called.

    Args:
        k: Number of clusters; forwarded to the initialiser, to
            ``Kmeans.kmeans`` and to ``File.save_images``.
        m: Forwarded to ``Kmeans.kmeans`` as ``method`` and used as the
            first component of the output title.
        init_type: ``"random"`` selects ``Initialize.random_centers``.
            Any other value selects ``Initialize.kmeans_plusplus`` and is
            recorded as ``"kplusplus"`` in the output title.

    Returns:
        None. The clustering and testing times are printed, and a JSON
        summary is written to ``./results/<title>/<title>_results.json``
        where ``<title>`` is ``m + "_" + init_type + "_cluster" + str(k)``.
    """
    clock = timeit.default_timer

    # The clustering timer covers centre initialisation as well as the
    # clustering run itself.
    cluster_began = clock()

    if init_type != "random":
        # Every non-"random" request is served by k-means++ and is filed
        # under that single name in the output title.
        init_type = "kplusplus"
        initial_clusters = Initialize.kmeans_plusplus(
            k, train_images_flat, dist_fn=Distance.sumsq)
    else:
        initial_clusters = Initialize.random_centers(k)

    final_responsibilities, final_clusters = Kmeans.kmeans(
        k, train_images_flat, initial_clusters,
        distfn=Distance.sumsq, method=m)

    clustering_time = clock() - cluster_began
    # A single parenthesised expression prints the same text under the
    # Python 2 print statement and the print function.
    print("%s %s" % ("Time spent clustering : ", clustering_time))

    # Save representative images to file.
    title = m + "_" + init_type + "_cluster" + str(k)
    File.save_images(k, train_images, final_responsibilities,
                     final_clusters, title)

    ###########################################################################
    #                          Calculate Accuracy                             #
    ###########################################################################

    # Score the final clusters against the training labels.
    cluster_stats, cluster_means = Accuracy.final_accuracy(
        final_responsibilities, train_labels, train_images_flat,
        final_clusters)

    # Time how long it takes to classify the test set with those clusters.
    test_began = clock()
    predictions = ClassifyClusters.classify(
        cluster_means, test_images_flat, test_labels,
        distfn=Distance.sumsq)
    testing_time = clock() - test_began
    print("%s %s" % ("Time spent testing : ", testing_time))

    ###########################################################################
    #                                Outputs                                  #
    ###########################################################################

    results = {
        "k": k,
        "prediction_accuracy": predictions[1],
        "cluster_means": cluster_means,
        "cluster_stats": cluster_stats,
        "clustering_time": clustering_time,
        "testing_time": testing_time,
    }

    # NOTE(review): this function does not create the results directory
    # itself; presumably File.save_images does -- confirm.
    results_dir = './results/' + title
    results_path = results_dir + '/' + title + '_results.json'
    with open(results_path, 'w') as outfile:
        json.dump(results, outfile, cls=File.NumpyEncoder)
test_images, test_labels = File.load_mnist("testing", path=os.getcwd())

# Flatten every image into one row so each dataset becomes a 2-D array with
# one image per row (the original note gives 60,000 x 784 for the training
# set).
train_images_flat = np.array([np.ravel(img) for img in train_images])
test_images_flat = np.array([np.ravel(img) for img in test_images])

###############################################################################
#                              Run Scikit_learn                               #
###############################################################################

k = int(sys.argv[1])  # number of clusters (system argument)

# Train k means model
kmeans = KMeans(init='k-means++', n_clusters=k, n_init=10)
kmeans_fit = kmeans.fit(train_images_flat)

# Cluster assignment of each training image, and the fitted cluster centres.
kmeans_labels = kmeans_fit.labels_
kmeans_centers = kmeans_fit.cluster_centers_

# Build the responsibilities in one-hot form: row i holds a single 1 in the
# column of the cluster that training image i was assigned to.  One indexed
# assignment sets every (row, label) pair at once, replacing a Python-level
# loop over all training images.
final_responsibilities = np.zeros((len(train_images_flat), k))
final_responsibilities[np.arange(len(train_images_flat)), kmeans_labels] = 1

# Obtain predictions for each point of the test set.
Z = kmeans.predict(test_images_flat)

# Determine accuracies.
Accuracy.final_accuracy(final_responsibilities, train_labels,
                        train_images_flat, kmeans_centers)