def test_kmeans_when_k_is_3(datasetPath, expected1, expected2, expected3):
    """Check that the best of several k-means runs (k=3) yields the expected clusters.

    The dataset and the three expected clusters are each loaded with
    ``read.read_csv``. ``kmeans.k_means`` is run once and then restarted
    10 more times; the clustering with the lowest ``kmeans.cost_function``
    value is kept and compared against the expected clusters.

    Args:
        datasetPath: Value passed to ``read.read_csv`` to load the dataset.
        expected1: Value passed to ``read.read_csv`` for one expected cluster.
        expected2: Value passed to ``read.read_csv`` for one expected cluster.
        expected3: Value passed to ``read.read_csv`` for one expected cluster.
    """
    random.seed(1)
    dataset = read.read_csv(datasetPath)
    expected_clusters = [
        read.read_csv(path) for path in (expected1, expected2, expected3)
    ]

    clustering = kmeans.k_means(dataset=dataset, k=3)
    cost = kmeans.cost_function(clustering)

    for _ in range(10):
        new_clustering = kmeans.k_means(dataset=dataset, k=3)
        # Score the candidate itself; scoring the current best would make
        # every restart a no-op because the cost could never decrease.
        new_cost = kmeans.cost_function(new_clustering)
        if new_cost < cost:
            clustering = new_clustering
            cost = new_cost

    assert len(clustering) == 3
    assert clustered_all_points(clustering, dataset) is True

    clustered = [clustering[assignment] for assignment in clustering]
    # Which label each cluster receives depends on the run that won, so the
    # clusters are compared as an unordered collection.
    # NOTE(review): assumes the point order inside each cluster is stable
    # across runs -- confirm against kmeans.k_means.
    assert sorted(clustered) == sorted(expected_clusters)
Esempio n. 2
0
def test_kmeans_when_k_is_2(datasetPath, expected1, expected2):
    """Check that the best of several k-means++ runs (k=2) yields the expected clusters.

    The dataset and the two expected clusters are each loaded with
    ``read.read_csv``. ``kmeans.k_means_pp`` is run once and then restarted
    10 more times; the clustering with the lowest ``kmeans.cost_function``
    value is kept and compared, ignoring cluster order, against the
    expected clusters.

    Args:
        datasetPath: Value passed to ``read.read_csv`` to load the dataset.
        expected1: Value passed to ``read.read_csv`` for one expected cluster.
        expected2: Value passed to ``read.read_csv`` for one expected cluster.
    """
    dataset = read.read_csv(datasetPath)
    expected_clusters = [read.read_csv(path) for path in (expected1, expected2)]

    clustering = kmeans.k_means_pp(dataset=dataset, k=2)
    cost = kmeans.cost_function(clustering)

    for _ in range(10):
        new_clustering = kmeans.k_means_pp(dataset=dataset, k=2)
        # Score the candidate itself; scoring the current best would make
        # every restart a no-op because the cost could never decrease.
        new_cost = kmeans.cost_function(new_clustering)
        if new_cost < cost:
            clustering = new_clustering
            cost = new_cost

    assert len(clustering) == 2
    assert clustered_all_points(clustering, dataset) is True

    clustered = [clustering[assignment] for assignment in clustering]
    # list.sort() sorts in place and returns None, so comparing its return
    # values would always pass; sorted() returns the ordered lists to compare.
    assert sorted(clustered) == sorted(expected_clusters)
Esempio n. 3
0
    def kmeans(self, args):
        """Run the kmeans command.

        Loads the dataset from ``args.dataset_file`` with ``read.read_csv``,
        runs ``kmeans.k_means`` once and then 100 more times, and keeps the
        clustering with the lowest ``kmeans.cost_function`` value. Each
        cluster of the winning clustering is printed and written to its own
        CSV file named ``<prefix>_k_is_<k>_<assignment>.csv``.

        Args:
            args: Parsed command arguments. Reads ``args.dataset_file``,
                ``args.k`` (converted with ``int``) and, for the output
                file prefix, ``args.dataset`` when it is present.
        """
        import csv

        k = int(args.k)
        dataset = read.read_csv(args.dataset_file)
        clustering = kmeans.k_means(dataset=dataset, k=k)
        cost = kmeans.cost_function(clustering)

        for _ in range(100):
            new_clustering = kmeans.k_means(dataset=dataset, k=k)
            # Score the candidate itself; scoring the current best would make
            # every restart a no-op because the cost could never decrease.
            new_cost = kmeans.cost_function(new_clustering)
            if new_cost < cost:
                clustering = new_clustering
                cost = new_cost

        # NOTE(review): the input is read from args.dataset_file but the
        # output name was built from args.dataset; fall back to the input
        # name when args.dataset is absent -- confirm which one is intended.
        dataset_name = getattr(args, "dataset", args.dataset_file)
        prefix = str(dataset_name).split(".")[0]

        for assignment in clustering:
            # str.format accepts args.k whether it is a str or an int.
            file_name = "{}_k_is_{}_{}.csv".format(prefix, args.k, assignment)
            # newline="" lets csv.writer control line endings itself.
            with open(file_name, "w", newline="") as f:
                writer = csv.writer(f)
                print("assignement ", assignment, " is: ", clustering[assignment])
                writer.writerows(clustering[assignment])