def main(argv):
    """Run the clustering variant selected by ``init`` and write ``output.csv``.

    ``argv`` is handed unchanged to ``DataProcessing.process_input``, which
    yields ``(X, K, init, movie_ids)``.  Depending on ``init``:

    * ``"random"``    -- ``KMeans.execute(X, K)``
    * ``"k-means++"`` -- ``KMeanspp.execute(X, K)``
    * ``"1d"``        -- ``PCAHelper.pca_helper(X, 1)`` followed by
      ``OneDKmeans(X, K).run()``

    In every case a one-line summary is printed and the resulting cluster
    assignment is passed to ``Utils.write_output_csv`` together with
    ``movie_ids``.

    Args:
        argv: Argument list forwarded to ``DataProcessing.process_input``
            (presumably ``sys.argv`` -- confirm against the caller).

    Raises:
        ValueError: If ``init`` is none of the three supported values.
            (The previous ``assert Error(...)`` could never fail, so an
            unsupported value used to be ignored silently.)
    """
    X, K, init, movie_ids = DataProcessing.process_input(argv)

    if init in ("random", "k-means++"):
        # Both variants share the same reporting and output path; only the
        # implementation class and the label in the summary line differ.
        if init == "random":
            label, algorithm = "k-means", KMeans
        else:
            label, algorithm = "k-means++", KMeanspp
        clusters, _centroids, distances = algorithm.execute(X, K)
        print(
            "Ran {}. Start Distance={:.0f}, End Distance={:.0f}. Clusters = {}."
            .format(label, distances[0], distances[-1], clusters))
        Utils.write_output_csv(clusters, "output.csv", movie_ids)
    elif init == "1d":
        # Presumably projects X onto a single component -- confirm against
        # PCAHelper.pca_helper.
        X = PCAHelper.pca_helper(X, 1)
        # NOTE(review): the original called ``X.astype(np.float16)`` here and
        # discarded the result. ``astype`` returns a new array, so X was never
        # converted. The no-op is dropped to keep results unchanged; assign
        # the result explicitly if half precision is really wanted.
        distances_by_k, cluster, _centroids = OneDKmeans(X, K).run()
        print("Ran 1d K-means. Distance={}".format(distances_by_k[-1]))
        Utils.write_output_csv(cluster, "output.csv", movie_ids)
    else:
        # ``assert <exception instance>`` is always true (and removed under
        # ``python -O``), so raise explicitly instead.
        raise ValueError("init parameter was not inputted correctly!")
def main():
    """Compare k-means and k-means++ on a CSV file and plot both results.

    Expects ``sys.argv`` to be ``[<script>, <csv_path>, <K>]``.  The CSV is
    loaded with pandas, passed through ``PCAHelper.parse_data``, and then
    clustered with ``KMeans.execute`` and ``KMeanspp.execute``.  The last
    entry of each returned distance sequence is printed.  Afterwards the
    data is reduced with ``PCAHelper.pca_helper(X, 2)`` and the two cluster
    assignments are drawn with ``Utils.plot_data2``.

    Note that clustering runs on the data as returned by ``parse_data``;
    the 2-component reduction is applied afterwards and only feeds the plots.

    Raises:
        ValueError: If ``sys.argv`` does not hold exactly two arguments
            after the script name, or if ``K`` is not an integer literal.
    """
    # ``assert Error(...)`` never fails because the instance is truthy, so
    # the old check was a no-op. Validate the exact count up front so that
    # both too few and too many arguments produce a clear message instead
    # of an opaque unpacking error.
    if len(sys.argv) != 3:
        raise ValueError(
            "need input argument. usage: <script> <csv_path> <K>")
    _, csv_path, K = sys.argv
    K = int(K)

    X = pd.read_csv(csv_path).values
    X = PCAHelper.parse_data(X)  # Steps 1-5

    # k-means and k-means++ execution
    km_clusters, _, km_distances = KMeans.execute(X, K)
    kmpp_clusters, _, kmpp_distances = KMeanspp.execute(X, K)
    print("km dist={}, kmpp dist={}".format(km_distances[-1],
                                            kmpp_distances[-1]))

    # pca
    X = PCAHelper.pca_helper(X, 2)

    # plot
    Utils.plot_data2(X, K, km_clusters,
                     title="K-means clustering with PCA",
                     xaxis="First Principal Component",
                     yaxis="Second Principal Component")
    Utils.plot_data2(X, K, kmpp_clusters,
                     title="K-means++ clustering with PCA",
                     xaxis="First Principal Component",
                     yaxis="Second Principal Component")
def find_optimum_k():
    """Sweep K and plot the average final distance of k-means vs. k-means++.

    For every K in ``range(1, 40, 2)`` both ``KMeans.execute`` and
    ``KMeanspp.execute`` are run ``iter_per_K`` (5) times on the data
    returned by ``process_input(sys.argv)``.  The last entry of each run's
    distance sequence is collected, averaged per K, and printed.  The two
    per-K averages are then passed to ``plot_data_opt_k`` and finally
    printed alongside the K values tried.

    Returns:
        None.  Results are reported via ``print`` and ``plot_data_opt_k``.
    """
    print("Start find_optimum_k")
    iter_per_K = 5
    start_range, end_range, step = 1, 40, 2
    km_distance_by_K, kmpp_distance_by_K = np.array([]), np.array([])

    # Process Input.
    # NOTE(review): ``DataProcessing.process_input`` is unpacked as a 4-tuple
    # by ``main(argv)``. If this bare ``process_input`` is the same helper,
    # X would be the whole tuple rather than the data -- confirm.
    X = process_input(sys.argv)

    # Try K = 1, 3, 5, ..., 39.
    K_range = range(start_range, end_range, step)
    print("Will try K = ", list(K_range))

    for K in K_range:
        print("Trying K = ", K)
        # Run k-means and k-means++ iter_per_K times each and keep the final
        # distance of every run; only the distances are needed here, so the
        # clusters and centroids are discarded.
        km_dist, kmpp_dist = np.array([]), np.array([])
        for _ in range(iter_per_K):
            _, _, km_distances = KMeans.execute(X, K)
            _, _, kmpp_distances = KMeanspp.execute(X, K)
            km_dist = np.append(km_dist, km_distances[-1])
            kmpp_dist = np.append(kmpp_dist, kmpp_distances[-1])

        # Average over the repeated runs and record one value per K.
        km_ave_dist, kmpp_ave_dist = np.mean(km_dist), np.mean(kmpp_dist)
        print("Average Distance | K={} | km={}, kmpp = {}".format(
            K, km_ave_dist, kmpp_ave_dist))
        km_distance_by_K = np.append(km_distance_by_K, km_ave_dist)
        kmpp_distance_by_K = np.append(kmpp_distance_by_K, kmpp_ave_dist)

    print("Plotting Graph")
    # Plot the per-K averages of both algorithms to help pick the best K.
    plot_data_opt_k(km_distance_by_K, kmpp_distance_by_K, list(K_range))

    print("Results:")
    print("K range")
    print(list(K_range))
    print("K-Means Distances:")
    print(km_distance_by_K)
    print("K-Means++ Distances:")
    print(kmpp_distance_by_K)