def test_kmeans_when_k_is_3(datasetPath, expected1, expected2, expected3):
    """Check that the best of several k_means runs (k=3) matches the fixtures.

    Args:
        datasetPath: path of the CSV file holding the points to cluster.
        expected1, expected2, expected3: paths of the CSV files holding the
            expected clusters, in the order the clustering yields them.
    """
    random.seed(1)
    dataset = read.read_csv(datasetPath)
    expected_clusterings = [
        read.read_csv(expected1),
        read.read_csv(expected2),
        read.read_csv(expected3),
    ]

    # One initial run plus ten restarts; keep the lowest-cost clustering.
    clustering = kmeans.k_means(dataset=dataset, k=3)
    cost = kmeans.cost_function(clustering)
    for _ in range(10):
        new_clustering = kmeans.k_means(dataset=dataset, k=3)
        # Score the candidate itself. Scoring `clustering` again would make
        # new_cost == cost on every pass, so no restart could ever win.
        new_cost = kmeans.cost_function(new_clustering)
        if new_cost < cost:
            clustering = new_clustering
            cost = new_cost

    assert len(clustering) == 3
    assert clustered_all_points(clustering, dataset) is True

    # Clusters are compared in the clustering's own iteration order.
    clustered = list(clustering.values())
    assert clustered == expected_clusterings
def test_read(dataset, expected):
    """Check that read_csv(dataset) yields exactly the rows in `expected`."""
    rows = read.read_csv(dataset)

    # Compare sizes first so a missing or extra row is reported directly.
    assert len(rows) == len(expected)
    for index, row in enumerate(rows):
        assert row == expected[index]
Beispiel #3
0
def test_kmeans_when_k_is_2(datasetPath, expected1, expected2):
    """Check that the best of several k_means_pp runs (k=2) matches the fixtures.

    Args:
        datasetPath: path of the CSV file holding the points to cluster.
        expected1, expected2: paths of the CSV files holding the expected
            clusters; their order does not matter.
    """
    dataset = read.read_csv(datasetPath)
    expected_clustering1 = read.read_csv(expected1)
    expected_clustering2 = read.read_csv(expected2)

    # One initial run plus ten restarts; keep the lowest-cost clustering.
    clustering = kmeans.k_means_pp(dataset=dataset, k=2)
    cost = kmeans.cost_function(clustering)
    for _ in range(10):
        new_clustering = kmeans.k_means_pp(dataset=dataset, k=2)
        # Score the candidate itself. Scoring `clustering` again would make
        # new_cost == cost on every pass, so no restart could ever win.
        new_cost = kmeans.cost_function(new_clustering)
        if new_cost < cost:
            clustering = new_clustering
            cost = new_cost

    assert len(clustering) == 2
    assert clustered_all_points(clustering, dataset) is True

    clustered = list(clustering.values())
    # list.sort() sorts in place and returns None, so comparing its return
    # values is always `None == None`. sorted() returns the ordered lists,
    # giving a real order-insensitive comparison of the clusters.
    assert sorted(clustered) == sorted(
        [expected_clustering1, expected_clustering2]
    )
def test_kmeans_when_k_is_1(datasetPath):
    """Check that k_means with k=1 puts the whole dataset in one cluster."""
    points = read.read_csv(datasetPath)
    result = kmeans.k_means(dataset=points, k=1)

    assert len(result.keys()) == 1
    assert clustered_all_points(result, points) is True

    # The single cluster must be the dataset itself, in its original order.
    groups = [result[label] for label in result]
    assert groups == [points]
Beispiel #5
0
def test_read(dataset):
    """Check that read_csv(dataset) returns the known 49 rows of the fixture."""
    expected_rows = [
        [138, 143], [93, 104], [61, 69], [179, 260], [48, 75],
        [37, 63], [29, 50], [23, 48], [30, 111], [2, 50],
        [38, 52], [46, 53], [71, 79], [25, 57], [298, 317],
        [74, 93], [50, 58], [76, 80], [381, 464], [387, 459],
        [78, 106], [60, 57], [507, 634], [50, 64], [77, 89],
        [64, 77], [40, 60], [136, 139], [243, 291], [256, 288],
        [94, 85], [36, 46], [45, 53], [67, 67], [120, 115],
        [172, 183], [66, 86], [46, 65], [121, 113], [44, 58],
        [64, 63], [56, 142], [40, 64], [116, 130], [87, 105],
        [43, 61], [43, 50], [161, 232], [36, 54],
    ]
    rows = read.read_csv(dataset)

    # Compare sizes first so a missing or extra row is reported directly.
    assert len(rows) == len(expected_rows)
    for index, row in enumerate(rows):
        assert row == expected_rows[index]
Beispiel #6
0
    def kmeans(self, args):
        """Run the kmeans command.

        Reads the dataset from ``args.dataset_file``, runs k-means with
        ``k = int(args.k)`` once plus 100 restarts, keeps the lowest-cost
        clustering, then prints each cluster and writes it to its own CSV
        file named ``<prefix>_k_is_<k>_<assignment>.csv``.

        Args:
            args: parsed command arguments; this method reads
                ``dataset_file``, ``k`` and ``dataset`` from it.
        """
        import csv

        dataset = read.read_csv(args.dataset_file)
        k = int(args.k)
        clustering = kmeans.k_means(dataset=dataset, k=k)
        cost = kmeans.cost_function(clustering)

        for _ in range(100):
            new_clustering = kmeans.k_means(dataset=dataset, k=k)
            # Score the candidate itself. Scoring `clustering` again would
            # make new_cost == cost every time, so no restart could win.
            new_cost = kmeans.cost_function(new_clustering)
            if new_cost < cost:
                clustering = new_clustering
                cost = new_cost

        for assignment, points in clustering.items():
            # NOTE(review): the input is read from args.dataset_file but the
            # output prefix comes from args.dataset -- confirm both exist.
            prefix = str(args.dataset).split(".")[0]
            # str() keeps the concatenation valid even if args.k is an int.
            file_name = (
                prefix + "_k_is_" + str(args.k) + "_" + str(assignment) + ".csv"
            )
            # newline="" lets the csv module control line endings, which
            # avoids blank rows between records on Windows.
            with open(file_name, "w", newline="") as f:
                writer = csv.writer(f)
                print("assignement ", assignment, " is: ", points)
                writer.writerows(points)
Beispiel #7
0
    while assignments != old_assignments:
        new_centers = update_centers(dataset, assignments)
        old_assignments = assignments
        assignments = assign_points(dataset, new_centers)
    clustering = defaultdict(list)
    for assignment, point in zip(assignments, dataset):
        clustering[assignment].append(point)
    return clustering


def k_means(dataset, k):
    """Cluster `dataset` into `k` groups from centers chosen by `generate_k`.

    Args:
        dataset: sized collection of points to cluster.
        k: number of clusters; must lie in range(1, len(dataset) + 1).

    Returns:
        Whatever `_do_lloyds_algo` returns for the generated starting centers.

    Raises:
        ValueError: if `k` is outside the allowed range.
    """
    allowed_k = range(1, len(dataset) + 1)
    if k in allowed_k:
        starting_centers = generate_k(dataset, k)
        return _do_lloyds_algo(dataset, starting_centers)

    raise ValueError("lengths must be in [1, len(dataset)]")


def k_means_pp(dataset, k):
    """Cluster `dataset` into `k` groups from centers chosen by `generate_k_pp`.

    Args:
        dataset: sized collection of points to cluster.
        k: number of clusters; must lie in range(1, len(dataset) + 1).

    Returns:
        Whatever `_do_lloyds_algo` returns for the generated starting centers.

    Raises:
        ValueError: if `k` is outside the allowed range.
    """
    allowed_k = range(1, len(dataset) + 1)
    if k in allowed_k:
        starting_centers = generate_k_pp(dataset, k)
        return _do_lloyds_algo(dataset, starting_centers)

    raise ValueError("lengths must be in [1, len(dataset)]")

if __name__ == '__main__':
    from cs506 import read

    # Ad-hoc manual run: cluster a sample dataset into 4 groups and print
    # the cluster stored under key 0.
    sample_path = 'D:/OneDrive/College Notebook/Boston University/Fall Senior Year/CS 506/CS506-Fall2020/02-library/tests/test_files/dataset_1.csv'
    points = read.read_csv(sample_path)
    clusters = k_means(points, 4)
    print(clusters[0])