Ejemplo n.º 1
0
    def templateLengthProcessWithMetric(path_to_file, initial_medoids, expected_cluster_length, metric, ccore_flag, **kwargs):
        """Run K-Medoids on a sample and verify the sizes of the allocated clusters.

        Parameters:
            path_to_file: path handed to `read_sample` to load the sample.
            initial_medoids: initial medoids passed to `kmedoids`; replaced on every
                attempt when the `initialize_medoids` keyword is given.
            expected_cluster_length: expected cluster sizes (any order), or None to
                skip the size comparison. The list is not modified.
            metric: metric passed to `kmedoids`; when None,
                `distance_metric(type_metric.EUCLIDEAN_SQUARE)` is used.
            ccore_flag: forwarded to `kmedoids` as its fourth positional argument.

        Keyword arguments (read from `kwargs`):
            data_type: 'points' (default) or 'distance_matrix'; for the latter the
                sample is converted with `calculate_distance_matrix`.
            input_type: 'list' (default) or 'numpy'; only consulted for a distance
                matrix, which is then wrapped in `numpy.array`.
            initialize_medoids: when not None, initial medoids are generated by
                `kmeans_plusplus_initializer(sample, initialize_medoids)` and up to
                10 attempts are made, because that initializer is randomized.
            itermax: forwarded to `kmedoids` (default 200). When it is 0 the
                template only checks that no clusters were produced and that the
                medoids equal the initial ones.
        """
        sample = read_sample(path_to_file)
        data_type = kwargs.get('data_type', 'points')
        input_type = kwargs.get('input_type', 'list')
        initialize_medoids = kwargs.get('initialize_medoids', None)
        itermax = kwargs.get('itermax', 200)

        if metric is None:
            metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)

        input_data = sample
        if data_type == 'distance_matrix':
            input_data = calculate_distance_matrix(sample)

            if input_type == 'numpy':
                input_data = numpy.array(input_data)

        # Sort a copy once: the caller's list must not be reordered as a side
        # effect of running the template.
        expected_sizes = None
        if expected_cluster_length is not None:
            expected_sizes = sorted(expected_cluster_length)

        testing_attempts = 1
        if initialize_medoids is not None:  # in case center initializer randomization appears
            testing_attempts = 10

        testing_result = False
        for _ in range(testing_attempts):
            if initialize_medoids is not None:
                initial_medoids = kmeans_plusplus_initializer(sample, initialize_medoids).initialize(return_index=True)

            kmedoids_instance = kmedoids(input_data, initial_medoids, 0.001, ccore_flag, metric=metric, data_type=data_type, itermax=itermax)
            kmedoids_instance.process()

            clusters = kmedoids_instance.get_clusters()
            medoids = kmedoids_instance.get_medoids()

            if itermax == 0:
                assertion.eq([], clusters)
                assertion.eq(medoids, initial_medoids)
                return

            # Any failed check below only invalidates this attempt; the next
            # attempt (if any) starts from freshly initialized medoids.
            if len(clusters) != len(medoids):
                continue

            if len(set(medoids)) != len(medoids):
                continue

            obtained_cluster_sizes = [len(cluster) for cluster in clusters]
            if len(sample) != sum(obtained_cluster_sizes):
                continue

            if expected_sizes is not None and sorted(obtained_cluster_sizes) != expected_sizes:
                continue

            # One successful attempt is enough; further attempts cannot change
            # the outcome.
            testing_result = True
            break

        assertion.true(testing_result)
Ejemplo n.º 2
0
    def templateClusteringDistanceMatrix(path_to_file, radius, neighbors, expected_length_clusters, ccore):
        """Cluster a sample with DBSCAN via a distance matrix and check cluster sizes.

        The sample loaded by `read_sample` is converted with
        `calculate_distance_matrix` and given to `dbscan` with
        `data_type='distance_matrix'`. Afterwards the template checks that
        clusters plus noise cover the whole sample, that the clustered points
        total `sum(expected_length_clusters)`, and that the sorted cluster sizes
        equal `expected_length_clusters` as given.
        """
        points = read_sample(path_to_file)
        matrix = calculate_distance_matrix(points)

        algorithm = dbscan(matrix, radius, neighbors, ccore, data_type='distance_matrix')
        algorithm.process()

        cluster_sizes = [len(cluster) for cluster in algorithm.get_clusters()]
        noise_amount = len(algorithm.get_noise())
        clustered_amount = sum(cluster_sizes)

        assertion.eq(len(points), clustered_amount + noise_amount)
        assertion.eq(sum(expected_length_clusters), clustered_amount)
        assertion.eq(expected_length_clusters, sorted(cluster_sizes))
Ejemplo n.º 3
0
    def templateClusteringResultsSpecificData(data_type, path, radius,
                                              neighbors, amount_clusters,
                                              expected_length_clusters, ccore):
        """Run OPTICS on a sample and validate objects, clusters and ordering.

        For `data_type == 'distance_matrix'` the sample is converted with
        `calculate_distance_matrix`; any other value passes the sample as is.
        Checks performed:
          * every OPTICS object has a distinct `index_object`;
          * core and reachability distances that are set pass `assertion.ge(.., 0)`;
          * clusters plus noise cover the sample and the cluster sizes match
            `expected_length_clusters` (order-insensitive);
          * when `amount_clusters` is not None, the ordering analyser extracts
            as many clusters as expected, with one border fewer than clusters.
        """
        sample = read_sample(path)
        input_data = calculate_distance_matrix(sample) if data_type == 'distance_matrix' else sample

        optics_instance = optics(input_data, radius, neighbors, amount_clusters, ccore, data_type=data_type)
        optics_instance.process()

        clusters = optics_instance.get_clusters()
        noise = optics_instance.get_noise()
        optics_objects = optics_instance.get_optics_objects()

        unique_indexes = {descriptor.index_object for descriptor in optics_objects}
        assertion.eq(len(optics_objects), len(unique_indexes))

        for descriptor in optics_objects:
            if descriptor.core_distance is not None:
                assertion.ge(descriptor.core_distance, 0)

            if descriptor.reachability_distance is not None:
                assertion.ge(descriptor.reachability_distance, 0)

        cluster_sizes = [len(cluster) for cluster in clusters]
        clustered_amount = sum(cluster_sizes)

        assert clustered_amount + len(noise) == len(sample)
        assert len(clusters) == len(expected_length_clusters)
        assert clustered_amount == sum(expected_length_clusters)
        assert sorted(cluster_sizes) == sorted(expected_length_clusters)

        if amount_clusters is None:
            return

        analyser = ordering_analyser(optics_instance.get_ordering())
        assert len(analyser) > 0

        extracted_amount, borders = analyser.extract_cluster_amount(
            optics_instance.get_radius())
        assert extracted_amount == len(expected_length_clusters)
        assert len(borders) == extracted_amount - 1
Ejemplo n.º 4
0
    def correct_scores(sample_path, answer_path, ccore_flag, **kwargs):
        """Compute silhouette scores for a reference clustering and bound-check them.

        The sample comes from `read_sample(sample_path)` and the clusters from
        `answer_reader(answer_path)`. With the keyword `data_type='distance_matrix'`
        (default is 'points') the sample is first converted to a distance matrix
        using the EUCLIDEAN_SQUARE metric. One score per sample entry is
        expected, and each score is checked against the bounds -1.0 and 1.0.

        Returns the scores obtained from `silhouette(...).process().get_score()`.
        """
        data_type = kwargs.get('data_type', 'points')

        data = read_sample(sample_path)
        if data_type == 'distance_matrix':
            square_metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)
            data = calculate_distance_matrix(data, square_metric)

        reference_clusters = answer_reader(answer_path).get_clusters()

        evaluator = silhouette(data, reference_clusters, ccore=ccore_flag, data_type=data_type)
        scores = evaluator.process().get_score()

        assertion.eq(len(data), len(scores))
        for value in scores:
            assertion.le(-1.0, value)
            assertion.ge(1.0, value)

        return scores
Ejemplo n.º 5
0
def template_clustering(start_medoids,
                        path,
                        tolerance=0.25,
                        show=True,
                        **kwargs):
    """Run K-Medoids on a sample file, print a short report and optionally plot it.

    Parameters:
        start_medoids: indexes of the initial medoids (they are also used to
            index the loaded sample when drawing).
        path: sample file path handed to `read_sample`.
        tolerance: third positional argument of `kmedoids`.
        show: the result is visualized only when this is exactly True.

    Keyword arguments (read from `kwargs`):
        ccore: forwarded to `kmedoids` (default True).
        data_type: 'points' (default) or 'distance_matrix'; for the latter the
            sample is converted with `calculate_distance_matrix` before clustering.

    Returns:
        Tuple `(original_data, clusters)`: the sample as read from file and the
        clusters reported by the K-Medoids instance.
    """
    use_ccore = kwargs.get('ccore', True)
    data_type = kwargs.get('data_type', 'points')

    original_data = read_sample(path)
    if data_type == 'distance_matrix':
        input_data = calculate_distance_matrix(original_data)
    else:
        input_data = original_data

    square_metric = distance_metric(type_metric.EUCLIDEAN_SQUARE, data=input_data)

    algorithm = kmedoids(input_data,
                         start_medoids,
                         tolerance,
                         metric=square_metric,
                         ccore=use_ccore,
                         data_type=data_type)
    # Only the elapsed time is reported; the value returned by process() is unused.
    ticks, _ = timedcall(algorithm.process)

    clusters = algorithm.get_clusters()
    print("Iterations:", algorithm.get_iterations())
    print([len(cluster) for cluster in clusters])
    print(clusters)
    medoids = algorithm.get_medoids()
    print("Sample: ", path, "\t\tExecution time: ", ticks, "\n")

    if show is not True:
        return original_data, clusters

    # Draw clusters, then the initial medoids (as points) and the final
    # medoids (as indexes into the original data), both with star markers.
    initial_points = [original_data[index] for index in start_medoids]

    visualizer = cluster_visualizer(1)
    visualizer.append_clusters(clusters, original_data, 0)
    visualizer.append_cluster(initial_points, marker='*', markersize=15)
    visualizer.append_cluster(medoids,
                              data=original_data,
                              marker='*',
                              markersize=15)
    visualizer.show()

    return original_data, clusters
Ejemplo n.º 6
0
    def templateClusterAllocationOneDimensionDataSpecificData(data_type, ccore_flag):
        """Check that DBSCAN separates four random one-dimensional groups, 50 times over.

        Each round builds 40 one-dimensional points: ten random values shifted
        by each of 0, 3, 6 and 9. DBSCAN is run with radius 1.0 and 2 neighbors
        on either the points or their distance matrix, and must return exactly
        4 clusters of 10 points each.

        Raises:
            ValueError: if `data_type` is neither 'distance_matrix' nor 'points'.
        """
        group_offsets = (0, 3, 6, 9)

        for _ in range(50):
            # Offsets are the outer loop so the groups are generated one after
            # another, ten points at a time.
            sample = [[random() + offset] for offset in group_offsets for _point in range(10)]

            if data_type not in ('distance_matrix', 'points'):
                raise ValueError("Incorrect data type '%s' is specified" % data_type)

            input_data = calculate_distance_matrix(sample) if data_type == 'distance_matrix' else sample

            algorithm = dbscan(input_data, 1.0, 2, ccore_flag, data_type=data_type)
            algorithm.process()

            allocated = algorithm.get_clusters()

            assertion.eq(4, len(allocated))
            for group in allocated:
                assertion.eq(10, len(group))
Ejemplo n.º 7
0
    def templateClusterAllocationOneDimensionDataSpecificData(data_type, ccore_flag):
        """Repeatedly cluster four well-separated random 1-D groups with DBSCAN.

        Runs 50 rounds. Every round generates ten random values for each of the
        shifts 0, 3, 6 and 9 (40 single-coordinate points), clusters them with
        `dbscan(..., 1.0, 2, ccore_flag, data_type=data_type)` and requires
        4 clusters containing 10 points each.

        Raises:
            ValueError: if `data_type` is neither 'distance_matrix' nor 'points'.
        """
        for _ in range(50):
            sample = []
            for shift in (0, 3, 6, 9):
                sample.extend([random() + shift] for _index in range(10))

            if data_type == 'points':
                input_data = sample
            elif data_type == 'distance_matrix':
                input_data = calculate_distance_matrix(sample)
            else:
                raise ValueError("Incorrect data type '%s' is specified" % data_type)

            clusterer = dbscan(input_data, 1.0, 2, ccore_flag, data_type=data_type)
            clusterer.process()

            found_clusters = clusterer.get_clusters()

            assertion.eq(4, len(found_clusters))
            for members in found_clusters:
                assertion.eq(10, len(members))
Ejemplo n.º 8
0
    def templateLengthProcessSpecificData(data_type, path_to_file, radius, min_number_neighbors, max_number_neighbors, ccore):
        """Check that DBSCAN accounts for every point across a range of neighbor counts.

        For each neighbor count in
        `range(min_number_neighbors, max_number_neighbors)` the sample is
        clustered and the number of clustered points plus noise points must
        equal the sample size.

        Parameters:
            data_type: 'points' to cluster the sample directly, or
                'distance_matrix' to cluster its distance matrix.
            path_to_file: sample file path handed to `read_sample`.
            radius: second positional argument of `dbscan`.
            min_number_neighbors: first neighbor count to test (inclusive).
            max_number_neighbors: end of the tested range (exclusive).
            ccore: forwarded to `dbscan` as its fourth positional argument.

        Raises:
            ValueError: if `data_type` is neither 'distance_matrix' nor 'points'
                (only raised when the range is non-empty).
        """
        for number_neighbors in range(min_number_neighbors, max_number_neighbors, 1):
            sample = read_sample(path_to_file)

            if data_type == 'distance_matrix':
                input_data = calculate_distance_matrix(sample)
            elif data_type == 'points':
                input_data = sample
            else:
                raise ValueError("Incorrect data type '%s' is specified" % data_type)

            # Use the current neighbor count: passing `min_number_neighbors`
            # here would repeat the same configuration on every iteration.
            dbscan_instance = dbscan(input_data, radius, number_neighbors, ccore, data_type=data_type)
            dbscan_instance.process()

            clusters = dbscan_instance.get_clusters()
            noise = dbscan_instance.get_noise()

            length = len(noise) + sum(len(cluster) for cluster in clusters)

            assertion.eq(len(sample), length)
    def templateClusteringDistanceMatrix(path_to_file, radius, neighbors,
                                         expected_length_clusters, ccore):
        """Verify DBSCAN cluster sizes when the input is a distance matrix.

        The sample from `read_sample(path_to_file)` is turned into a distance
        matrix and clustered with `data_type='distance_matrix'`. The template
        then checks that clusters and noise together cover the sample, that
        the clustered points total `sum(expected_length_clusters)`, and that
        the sorted cluster sizes equal `expected_length_clusters` as given.
        """
        sample = read_sample(path_to_file)

        clusterer = dbscan(calculate_distance_matrix(sample),
                           radius,
                           neighbors,
                           ccore,
                           data_type='distance_matrix')
        clusterer.process()

        found_clusters = clusterer.get_clusters()
        outliers = clusterer.get_noise()

        # Sizes are computed once and reused by all three checks.
        sizes = [len(members) for members in found_clusters]
        clustered_total = sum(sizes)

        assertion.eq(len(sample), clustered_total + len(outliers))
        assertion.eq(sum(expected_length_clusters), clustered_total)
        assertion.eq(expected_length_clusters, sorted(sizes))
Ejemplo n.º 10
0
    def templateLengthProcessSpecificData(data_type, path_to_file, radius, min_number_neighbors, max_number_neighbors, ccore):
        """Check that DBSCAN accounts for every point across a range of neighbor counts.

        For each neighbor count in
        `range(min_number_neighbors, max_number_neighbors)` the sample is
        clustered and the number of clustered points plus noise points must
        equal the sample size.

        Parameters:
            data_type: 'points' to cluster the sample directly, or
                'distance_matrix' to cluster its distance matrix.
            path_to_file: sample file path handed to `read_sample`.
            radius: second positional argument of `dbscan`.
            min_number_neighbors: first neighbor count to test (inclusive).
            max_number_neighbors: end of the tested range (exclusive).
            ccore: forwarded to `dbscan` as its fourth positional argument.

        Raises:
            ValueError: if `data_type` is neither 'distance_matrix' nor 'points'
                (only raised when the range is non-empty).
        """
        for number_neighbors in range(min_number_neighbors, max_number_neighbors, 1):
            sample = read_sample(path_to_file)

            if data_type == 'distance_matrix':
                input_data = calculate_distance_matrix(sample)
            elif data_type == 'points':
                input_data = sample
            else:
                raise ValueError("Incorrect data type '%s' is specified" % data_type)

            # Use the current neighbor count: passing `min_number_neighbors`
            # here would repeat the same configuration on every iteration.
            dbscan_instance = dbscan(input_data, radius, number_neighbors, ccore, data_type=data_type)
            dbscan_instance.process()

            clusters = dbscan_instance.get_clusters()
            noise = dbscan_instance.get_noise()

            length = len(noise) + sum(len(cluster) for cluster in clusters)

            assertion.eq(len(sample), length)
Ejemplo n.º 11
0
    def templateLengthProcessWithMetric(path_to_file, initial_medoids,
                                        expected_cluster_length, metric,
                                        ccore_flag, **kwargs):
        """Run K-Medoids on a sample and verify the sizes of the allocated clusters.

        Parameters:
            path_to_file: path handed to `read_sample` to load the sample.
            initial_medoids: initial medoids passed to `kmedoids`.
            expected_cluster_length: expected cluster sizes in any order; the
                list is not modified.
            metric: metric passed to `kmedoids`; when None,
                `distance_metric(type_metric.EUCLIDEAN_SQUARE)` is used.
            ccore_flag: forwarded to `kmedoids` as its fourth positional argument.

        Keyword arguments (read from `kwargs`):
            data_type: 'points' (default) or 'distance_matrix'; for the latter the
                sample is converted with `calculate_distance_matrix`.
            input_type: 'list' (default) or 'numpy'; only consulted for a distance
                matrix, which is then converted to a NumPy array.
        """
        sample = read_sample(path_to_file)
        data_type = kwargs.get('data_type', 'points')
        input_type = kwargs.get('input_type', 'list')

        if metric is None:
            metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)

        input_data = sample
        if data_type == 'distance_matrix':
            input_data = calculate_distance_matrix(sample)

            if input_type == 'numpy':
                # numpy.matrix is no longer recommended by NumPy; a regular
                # 2-D array also supports plain `data[i][j]` indexing.
                input_data = numpy.array(input_data)

        kmedoids_instance = kmedoids(input_data,
                                     initial_medoids,
                                     0.025,
                                     ccore_flag,
                                     metric=metric,
                                     data_type=data_type)
        kmedoids_instance.process()

        clusters = kmedoids_instance.get_clusters()
        medoids = kmedoids_instance.get_medoids()

        # One medoid per cluster, and no medoid reported twice.
        assertion.eq(len(clusters), len(medoids))
        assertion.eq(len(set(medoids)), len(medoids))

        obtained_cluster_sizes = [len(cluster) for cluster in clusters]
        assertion.eq(len(sample), sum(obtained_cluster_sizes))

        # Compare sorted copies so the caller's expected list keeps its order.
        assertion.eq(sorted(obtained_cluster_sizes),
                     sorted(expected_cluster_length))
Ejemplo n.º 12
0
    def templateClusteringResultsSpecificData(data_type, path, radius, neighbors, amount_clusters, expected_length_clusters, ccore):
        """Validate OPTICS output (objects, clusters, noise, ordering) for a sample.

        The sample is clustered directly, or through its distance matrix when
        `data_type == 'distance_matrix'`. The template checks that OPTICS
        object indexes are distinct, that distances which are set pass
        `assertion.ge(.., 0)`, that clusters and noise cover the sample, and
        that cluster sizes match `expected_length_clusters` regardless of order.
        When `amount_clusters` is not None the cluster-ordering analyser must
        also report the expected cluster count, with one border fewer than
        clusters.
        """
        sample = read_sample(path)

        input_data = sample
        if data_type == 'distance_matrix':
            input_data = calculate_distance_matrix(sample)

        clusterer = optics(input_data, radius, neighbors, amount_clusters, ccore, data_type=data_type)
        clusterer.process()

        found_clusters = clusterer.get_clusters()
        outliers = clusterer.get_noise()
        descriptors = clusterer.get_optics_objects()

        distinct_indexes = {item.index_object for item in descriptors}
        assertion.eq(len(descriptors), len(distinct_indexes))

        for item in descriptors:
            if item.core_distance is not None:
                assertion.ge(item.core_distance, 0)

            if item.reachability_distance is not None:
                assertion.ge(item.reachability_distance, 0)

        sizes = [len(members) for members in found_clusters]
        clustered_total = sum(sizes)

        assert clustered_total + len(outliers) == len(sample)
        assert len(found_clusters) == len(expected_length_clusters)
        assert clustered_total == sum(expected_length_clusters)
        assert sorted(sizes) == sorted(expected_length_clusters)

        if amount_clusters is None:
            return

        analyser = ordering_analyser(clusterer.get_ordering())
        assert len(analyser) > 0

        detected_amount, borders = analyser.extract_cluster_amount(clusterer.get_radius())
        assert detected_amount == len(expected_length_clusters)
        assert len(borders) == detected_amount - 1
Ejemplo n.º 13
0
    def templateLengthProcessWithMetric(path_to_file, initial_medoids,
                                        expected_cluster_length, metric,
                                        ccore_flag, **kwargs):
        """Run K-Medoids on a sample and verify the sizes of the allocated clusters.

        Parameters:
            path_to_file: path handed to `read_sample` to load the sample.
            initial_medoids: initial medoids passed to `kmedoids`; replaced on every
                attempt when the `initialize_medoids` keyword is given.
            expected_cluster_length: expected cluster sizes (any order), or None to
                skip the size comparison. The list is not modified.
            metric: metric passed to `kmedoids`; when None,
                `distance_metric(type_metric.EUCLIDEAN_SQUARE)` is used.
            ccore_flag: forwarded to `kmedoids` as its fourth positional argument.

        Keyword arguments (read from `kwargs`):
            data_type: 'points' (default) or 'distance_matrix'; for the latter the
                sample is converted with `calculate_distance_matrix`.
            input_type: 'list' (default) or 'numpy'; only consulted for a distance
                matrix, which is then wrapped in `numpy.array`.
            initialize_medoids: when not None, initial medoids are generated by
                `kmeans_plusplus_initializer(sample, initialize_medoids)` and up to
                10 attempts are made, because that initializer is randomized.
            itermax: forwarded to `kmedoids` (default 200). When it is 0 the
                template only checks that no clusters were produced and that the
                medoids equal the initial ones.
        """
        sample = read_sample(path_to_file)
        data_type = kwargs.get('data_type', 'points')
        input_type = kwargs.get('input_type', 'list')
        initialize_medoids = kwargs.get('initialize_medoids', None)
        itermax = kwargs.get('itermax', 200)

        if metric is None:
            metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)

        input_data = sample
        if data_type == 'distance_matrix':
            input_data = calculate_distance_matrix(sample)

            if input_type == 'numpy':
                input_data = numpy.array(input_data)

        # Sort a copy once: the caller's list must not be reordered as a side
        # effect of running the template.
        expected_sizes = None
        if expected_cluster_length is not None:
            expected_sizes = sorted(expected_cluster_length)

        testing_attempts = 1
        if initialize_medoids is not None:  # in case center initializer randomization appears
            testing_attempts = 10

        testing_result = False
        for _ in range(testing_attempts):
            if initialize_medoids is not None:
                initial_medoids = kmeans_plusplus_initializer(
                    sample, initialize_medoids).initialize(return_index=True)

            kmedoids_instance = kmedoids(input_data,
                                         initial_medoids,
                                         0.001,
                                         ccore_flag,
                                         metric=metric,
                                         data_type=data_type,
                                         itermax=itermax)
            kmedoids_instance.process()

            clusters = kmedoids_instance.get_clusters()
            medoids = kmedoids_instance.get_medoids()

            if itermax == 0:
                assertion.eq([], clusters)
                assertion.eq(medoids, initial_medoids)
                return

            # Any failed check below only invalidates this attempt; the next
            # attempt (if any) starts from freshly initialized medoids.
            if len(clusters) != len(medoids):
                continue

            if len(set(medoids)) != len(medoids):
                continue

            obtained_cluster_sizes = [len(cluster) for cluster in clusters]
            if len(sample) != sum(obtained_cluster_sizes):
                continue

            if expected_sizes is not None and sorted(obtained_cluster_sizes) != expected_sizes:
                continue

            # One successful attempt is enough; further attempts cannot change
            # the outcome.
            testing_result = True
            break

        assertion.true(testing_result)
Ejemplo n.º 14
0
 def testCalculateMatrixDistanceAsNumPy(self):
     """Distance matrix of three 1-D points given as a NumPy array."""
     points = numpy.array([[0], [2], [4]])
     expected = [[0.0, 2.0, 4.0], [2.0, 0.0, 2.0], [4.0, 2.0, 0.0]]
     self.assertEqual(utils.calculate_distance_matrix(points), expected)
Ejemplo n.º 15
0
 def testCalculateMatrixDistance(self):
     """Distance matrix of three 1-D points given as nested lists."""
     expected = [[0.0, 2.0, 4.0], [2.0, 0.0, 2.0], [4.0, 2.0, 0.0]]
     actual = utils.calculate_distance_matrix([[0], [2], [4]])
     assert actual == expected
Ejemplo n.º 16
0
    def clustering_with_answer(data_file, answer_file, ccore, **kwargs):
        """Cluster a sample with K-Medoids and compare the result with a reference answer.

        The number of medoids equals the number of clusters in the answer file;
        initial medoids come from `kmeans_plusplus_initializer`. All `kwargs`
        are forwarded to both the initializer and `kmedoids`. Recognised here:
            data_type: 'points' (default) or 'distance_matrix'; for the latter
                the sample is converted with `calculate_distance_matrix`.
            metric: metric for that conversion (default: EUCLIDEAN).

        Checks: medoid count and total point count match the answer, medoids
        are unique, no point belongs to two clusters, sorted cluster sizes match
        the answer, and every obtained cluster equals one of the expected ones.
        """
        data_type = kwargs.get('data_type', 'points')
        metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN))

        original_data = read_sample(data_file)
        if data_type == 'distance_matrix':
            data = calculate_distance_matrix(original_data, metric)
        else:
            data = original_data

        reader = answer_reader(answer_file)
        amount_medoids = len(reader.get_clusters())

        initializer = kmeans_plusplus_initializer(data, amount_medoids, **kwargs)
        initial_medoids = initializer.initialize(return_index=True)

        kmedoids_instance = kmedoids(data, initial_medoids, 0.001, ccore, **kwargs)
        kmedoids_instance.process()

        clusters = kmedoids_instance.get_clusters()
        medoids = kmedoids_instance.get_medoids()

        expected_length_clusters = sorted(reader.get_cluster_lengths())
        actual_lengths = [len(cluster) for cluster in clusters]
        clustered_total = sum(actual_lengths)

        assertion.eq(len(expected_length_clusters), len(medoids))
        assertion.eq(len(data), clustered_total)
        assertion.eq(sum(expected_length_clusters), clustered_total)

        # Each medoid is checked against those already seen, so the failure
        # message lists the medoids encountered before the duplicate.
        seen_medoids = set()
        for medoid in medoids:
            duplicate_message = "Medoids '%s' is not unique (actual medoids: '%s')" % (str(medoid), str(seen_medoids))
            assertion.false(medoid in seen_medoids, message=duplicate_message)
            seen_medoids.add(medoid)

        seen_points = set()
        for point in (member for cluster in clusters for member in cluster):
            overlap_message = "Point '%s' is already assigned to one of the clusters." % str(point)
            assertion.false(point in seen_points, message=overlap_message)
            seen_points.add(point)

        assertion.eq(expected_length_clusters, sorted(actual_lengths))

        expected_clusters = reader.get_clusters()
        for actual_cluster in clusters:
            cluster_found = any(actual_cluster == expected_cluster for expected_cluster in expected_clusters)
            missing_message = "Actual cluster '%s' is not found among expected." % str(actual_cluster)
            assertion.true(cluster_found, message=missing_message)
Ejemplo n.º 17
0
 def testCalculateMatrixDistance(self):
     """Pairwise distances between the 1-D points 0, 2 and 4."""
     points = [[0], [2], [4]]
     expected_matrix = [
         [0.0, 2.0, 4.0],
         [2.0, 0.0, 2.0],
         [4.0, 2.0, 0.0],
     ]
     assert utils.calculate_distance_matrix(points) == expected_matrix