def test_kmeans_when_k_is_3(datasetPath, expected1, expected2, expected3):
    """Check that the best of several k-means runs with k=3 yields the expected clusters.

    The dataset and the three expected clusters are loaded from CSV files.
    k-means is run once and then restarted ten more times, keeping the
    clustering with the lowest cost as reported by ``kmeans.cost_function``.

    Args:
        datasetPath: Path of the CSV file holding the points to cluster.
        expected1: Path of the CSV file holding the first expected cluster.
        expected2: Path of the CSV file holding the second expected cluster.
        expected3: Path of the CSV file holding the third expected cluster.
    """
    # Fixed seed so the sequence of restarts is reproducible.
    random.seed(1)

    dataset = read.read_csv(datasetPath)
    expected_clusterings = [
        read.read_csv(expected1),
        read.read_csv(expected2),
        read.read_csv(expected3),
    ]

    clustering = kmeans.k_means(dataset=dataset, k=3)
    cost = kmeans.cost_function(clustering)
    for _ in range(10):
        new_clustering = kmeans.k_means(dataset=dataset, k=3)
        # Score the candidate itself. Scoring the current best (as the code
        # previously did) made the comparison below always false, so the
        # restarts could never replace the first result.
        new_cost = kmeans.cost_function(new_clustering)
        if new_cost < cost:
            clustering = new_clustering
            cost = new_cost

    assert len(clustering) == 3
    assert clustered_all_points(clustering, dataset) is True

    # Clusters are compared in the iteration order of the clustering's keys.
    clustered = [clustering[assignment] for assignment in clustering]
    assert clustered == expected_clusterings
def test_kmeans_when_k_is_1(datasetPath):
    """Check that k-means with k=1 puts the whole dataset into a single cluster.

    Args:
        datasetPath: Path of the CSV file holding the points to cluster.
    """
    points = read.read_csv(datasetPath)
    result = kmeans.k_means(dataset=points, k=1)

    cluster_keys = result.keys()
    assert len(cluster_keys) == 1
    assert clustered_all_points(result, points) is True

    # With one cluster, its members must be exactly the input dataset.
    members_per_cluster = [result[key] for key in result]
    assert members_per_cluster == [points]
def kmeans(self, args):
    """Run the kmeans command.

    Reads the dataset from ``args.dataset_file``, runs k-means once and then
    restarts it 100 more times, keeping the clustering with the lowest cost
    as reported by ``kmeans.cost_function``. Each cluster of the retained
    clustering is printed and written to its own CSV file named
    ``<base>_k_is_<k>_<assignment>.csv``.

    Args:
        args: Parsed command-line arguments. ``dataset_file`` and ``k`` are
            read; ``dataset`` is used for the output file name when present.
    """
    import csv

    k = int(args.k)
    dataset = read.read_csv(args.dataset_file)

    clustering = kmeans.k_means(dataset=dataset, k=k)
    cost = kmeans.cost_function(clustering)
    for _ in range(100):
        new_clustering = kmeans.k_means(dataset=dataset, k=k)
        # Score the candidate itself. Scoring the current best (as the code
        # previously did) made the comparison below always false, so the
        # restarts could never replace the first result.
        new_cost = kmeans.cost_function(new_clustering)
        if new_cost < cost:
            clustering = new_clustering
            cost = new_cost

    # The input is read from ``args.dataset_file``; fall back to it when the
    # namespace has no separate ``dataset`` attribute.
    dataset_name = str(getattr(args, "dataset", args.dataset_file))
    # NOTE(review): everything after the first "." is dropped, so a path such
    # as "./data.csv" yields an empty base name - confirm this is intended.
    base_name = dataset_name.split(".")[0]

    for assignment in clustering:
        # str() keeps the concatenation valid whether k arrived as str or int.
        file_name = base_name + "_k_is_" + str(args.k) + "_" + str(assignment) + ".csv"
        # newline="" lets the csv module control line endings itself.
        with open(file_name, "w", newline="") as f:
            writer = csv.writer(f)
            print("assignement ", assignment, " is: ", clustering[assignment])
            writer.writerows(clustering[assignment])