コード例 #1
0
    def calculate_elbow(path_to_data, path_to_answer, kmin, kmax, ccore, **kwargs):
        """Run the elbow search up to five times and require one run to match the answer.

        Every run must pass the sanity checks: the elbow lies strictly between
        `kmin` and `kmax`, there are `kmax - kmin` WCE values, and the last WCE
        is below the first. A run whose elbow differs from the number of
        clusters in the answer is retried.

        Args:
            path_to_data: passed to `read_sample`.
            path_to_answer: passed to `answer_reader`.
            kmin: forwarded to `elbow`.
            kmax: forwarded to `elbow`.
            ccore: forwarded to `elbow` as the `ccore` keyword.
            **kwargs: only 'initializer' is read (default `kmeans_plusplus_initializer`).
        """
        # Elbow method randomly chooses initial centers, so one run may miss the expected value.
        max_runs = 5

        center_initializer = kwargs.get('initializer', kmeans_plusplus_initializer)
        points = read_sample(path_to_data)
        expected = answer_reader(path_to_answer)

        matched = False
        for _ in range(max_runs):
            searcher = elbow(points, kmin, kmax, ccore=ccore, initializer=center_initializer)
            searcher.process()

            found_k = searcher.get_amount()
            wce_values = searcher.get_wce()

            assertion.gt(found_k, kmin)
            assertion.lt(found_k, kmax)
            assertion.eq(len(wce_values), kmax - kmin)
            assertion.lt(wce_values[-1], wce_values[0])

            if found_k == len(expected.get_clusters()):
                matched = True
                break

        assertion.true(matched)
コード例 #2
0
    def templateClusteringWithAnswers(sample_path, answer_path, radius, neighbors, ccore, **kwargs):
        """Cluster a sample with DBSCAN and compare cluster sizes and noise with an answer.

        Args:
            sample_path: passed to `read_sample`.
            answer_path: passed to `answer_reader`.
            radius: second positional argument of `dbscan`.
            neighbors: third positional argument of `dbscan`.
            ccore: fourth positional argument of `dbscan`.
            **kwargs: 'random_order' (default False) shuffles the points before
                clustering; 'repeat' (default 1) is the number of rounds.
        """
        random_order = kwargs.get('random_order', False)
        repeat = kwargs.get('repeat', 1)

        for _ in range(repeat):
            sample = read_sample(sample_path)

            # sample_index_map[k] is the original index of the k-th point handed to DBSCAN.
            sample_index_map = list(range(len(sample)))
            if random_order:
                shuffle(sample_index_map)

            sample_shuffled = [sample[i] for i in sample_index_map]

            dbscan_instance = dbscan(sample_shuffled, radius, neighbors, ccore)
            dbscan_instance.process()

            # Translate indexes of the shuffled sample back to the original indexes.
            clusters = [[sample_index_map[i] for i in cluster]
                        for cluster in dbscan_instance.get_clusters()]
            noise = sorted(sample_index_map[i] for i in dbscan_instance.get_noise())

            reader = answer_reader(answer_path)
            expected_noise = sorted(reader.get_noise())
            # The actual lengths are compared in sorted order, so the expected lengths
            # are sorted too; the check must not depend on cluster order in the answer.
            expected_length_clusters = sorted(reader.get_cluster_lengths())

            actual_length_clusters = [len(cluster) for cluster in clusters]

            assertion.eq(len(sample), sum(actual_length_clusters) + len(noise))
            assertion.eq(sum(expected_length_clusters), sum(actual_length_clusters))
            assertion.eq(expected_length_clusters, sorted(actual_length_clusters))
            assertion.eq(expected_noise, noise)
コード例 #3
0
    def template_visualize(self, path_sample, path_answer, filter=None, **kwargs):
        """Show the clusters from an answer on a multidimensional cluster visualizer.

        Args:
            path_sample: passed to `read_sample`.
            path_answer: passed to `answer_reader`; its clusters are displayed.
            filter: first positional argument of `show` (None by default).
            **kwargs: forwarded unchanged to `show`.
        """
        points = read_sample(path_sample)
        expected_clusters = answer_reader(path_answer).get_clusters()

        canvas = cluster_visualizer_multidim()
        canvas.append_clusters(expected_clusters, points)
        canvas.show(filter, **kwargs)
コード例 #4
0
    def templateClusteringWithAnswers(sample_path, answer_path, radius, neighbors, ccore, **kwargs):
        """Cluster a sample with DBSCAN and compare cluster sizes and noise with an answer.

        Args:
            sample_path: passed to `read_sample`.
            answer_path: passed to `answer_reader`.
            radius: second positional argument of `dbscan`.
            neighbors: third positional argument of `dbscan`.
            ccore: fourth positional argument of `dbscan`.
            **kwargs: 'random_order' (default False) shuffles the points before
                clustering; 'repeat' (default 1) is the number of rounds.
        """
        random_order = kwargs.get('random_order', False)
        repeat = kwargs.get('repeat', 1)

        for _ in range(repeat):
            sample = read_sample(sample_path)

            # sample_index_map[k] is the original index of the k-th point handed to DBSCAN.
            sample_index_map = list(range(len(sample)))
            if random_order:
                shuffle(sample_index_map)

            sample_shuffled = [sample[i] for i in sample_index_map]

            dbscan_instance = dbscan(sample_shuffled, radius, neighbors, ccore)
            dbscan_instance.process()

            # Translate indexes of the shuffled sample back to the original indexes.
            clusters = [[sample_index_map[i] for i in cluster]
                        for cluster in dbscan_instance.get_clusters()]
            noise = sorted(sample_index_map[i] for i in dbscan_instance.get_noise())

            reader = answer_reader(answer_path)
            expected_noise = sorted(reader.get_noise())
            # The actual lengths are compared in sorted order, so the expected lengths
            # are sorted too; the check must not depend on cluster order in the answer.
            expected_length_clusters = sorted(reader.get_cluster_lengths())

            actual_length_clusters = [len(cluster) for cluster in clusters]

            assertion.eq(len(sample), sum(actual_length_clusters) + len(noise))
            assertion.eq(sum(expected_length_clusters), sum(actual_length_clusters))
            assertion.eq(expected_length_clusters, sorted(actual_length_clusters))
            assertion.eq(expected_noise, noise)
コード例 #5
0
    def calculate_elbow(path_to_data, path_to_answer, kmin, kmax, ccore, **kwargs):
        """Run the elbow search up to ten times and require one run to match the answer.

        Every run must pass the sanity checks: the elbow lies strictly between
        `kmin` and `kmax`, there are `kmax - kmin` WCE values, and the last WCE
        is below the first plus a small tolerance. Elbows that differ from the
        number of clusters in the answer are collected and reported in the
        final assertion message.

        Args:
            path_to_data: passed to `read_sample`.
            path_to_answer: passed to `answer_reader`.
            kmin: forwarded to `elbow`.
            kmax: forwarded to `elbow`.
            ccore: forwarded to `elbow` as the `ccore` keyword.
            **kwargs: only 'initializer' is read (default `kmeans_plusplus_initializer`).
        """
        # Elbow method randomly chooses initial centers, so one run may miss the expected value.
        max_runs = 10

        center_initializer = kwargs.get('initializer', kmeans_plusplus_initializer)
        points = read_sample(path_to_data)
        expected = answer_reader(path_to_answer)

        mismatched_elbows = []
        matched = False

        for _ in range(max_runs):
            searcher = elbow(points, kmin, kmax, ccore=ccore, initializer=center_initializer)
            searcher.process()

            found_k = searcher.get_amount()
            wce_values = searcher.get_wce()

            assertion.gt(found_k, kmin)
            assertion.lt(found_k, kmax)
            assertion.eq(len(wce_values), kmax - kmin)
            assertion.lt(wce_values[-1], wce_values[0] + 0.0000001)

            if found_k == len(expected.get_clusters()):
                matched = True
                break

            # Keep the wrong elbow so the failure message shows what was obtained.
            mismatched_elbows.append(found_k)

        report = "%s: %s" % (len(expected.get_clusters()), mismatched_elbows)
        assertion.true(matched, message=report)
コード例 #6
0
    def calculate_elbow(path_to_data, path_to_answer, kmin, kmax, ccore, **kwargs):
        """Run the elbow search up to ten times and require one run to match the answer.

        Every run must pass the sanity checks: the elbow lies strictly between
        `kmin` and `kmax`, there are `kmax - kmin` WCE values, and the last WCE
        is below the first plus a small tolerance. Elbows that differ from the
        number of clusters in the answer are collected and reported in the
        final assertion message.

        Args:
            path_to_data: passed to `read_sample`.
            path_to_answer: passed to `answer_reader`.
            kmin: forwarded to `elbow`.
            kmax: forwarded to `elbow`.
            ccore: forwarded to `elbow` as the `ccore` keyword.
            **kwargs: only 'initializer' is read (default `kmeans_plusplus_initializer`).
        """
        # Elbow method randomly chooses initial centers, so one run may miss the expected value.
        max_runs = 10

        center_initializer = kwargs.get('initializer', kmeans_plusplus_initializer)
        points = read_sample(path_to_data)
        expected = answer_reader(path_to_answer)

        mismatched_elbows = []
        matched = False

        for _ in range(max_runs):
            searcher = elbow(points, kmin, kmax, ccore=ccore, initializer=center_initializer)
            searcher.process()

            found_k = searcher.get_amount()
            wce_values = searcher.get_wce()

            assertion.gt(found_k, kmin)
            assertion.lt(found_k, kmax)
            assertion.eq(len(wce_values), kmax - kmin)
            assertion.lt(wce_values[-1], wce_values[0] + 0.0000001)

            if found_k == len(expected.get_clusters()):
                matched = True
                break

            # Keep the wrong elbow so the failure message shows what was obtained.
            mismatched_elbows.append(found_k)

        report = "%s: %s" % (len(expected.get_clusters()), mismatched_elbows)
        assertion.true(matched, message=report)
コード例 #7
0
    def correct_ksearch(sample_path, answer_path, kmin, kmax, algorithm, ccore_flag):
        """Run silhouette K-search up to 15 times until the amount is within one of the answer.

        Every run must yield a score in [-1.0, 1.0] and `kmax - kmin` scores.

        Args:
            sample_path: passed to `read_sample`.
            answer_path: passed to `answer_reader`; only the number of clusters is used.
            kmin: forwarded to `silhouette_ksearch`.
            kmax: forwarded to `silhouette_ksearch`.
            algorithm: forwarded as the `algorithm` keyword.
            ccore_flag: forwarded as the `ccore` keyword.
        """
        max_attempts = 15

        points = read_sample(sample_path)
        expected_clusters = answer_reader(answer_path).get_clusters()

        succeeded = False
        for _ in range(max_attempts):
            result = silhouette_ksearch(points, kmin, kmax, algorithm=algorithm, ccore=ccore_flag).process()

            best_k = result.get_amount()
            best_score = result.get_score()
            all_scores = result.get_scores()

            assertion.le(-1.0, best_score)
            assertion.ge(1.0, best_score)
            assertion.eq(kmax - kmin, len(all_scores))

            # Accept the expected amount with a tolerance of one, but never below one.
            expected_k = len(expected_clusters)
            largest_ok = expected_k + 1
            smallest_ok = max(1, expected_k - 1)

            if smallest_ok <= best_k <= largest_ok:
                succeeded = True
                break

        assertion.true(succeeded)
コード例 #8
0
    def template_visualize(self, path_sample, path_answer, filter=None, **kwargs):
        """Show the clusters from an answer on a multidimensional cluster visualizer.

        Args:
            path_sample: passed to `read_sample`.
            path_answer: passed to `answer_reader`; its clusters are displayed.
            filter: first positional argument of `show` (None by default).
            **kwargs: forwarded unchanged to `show`.
        """
        points = read_sample(path_sample)
        expected_clusters = answer_reader(path_answer).get_clusters()

        canvas = cluster_visualizer_multidim()
        canvas.append_clusters(expected_clusters, points)
        canvas.show(filter, **kwargs)
コード例 #9
0
    def correct_ksearch(sample_path, answer_path, kmin, kmax, algorithm, ccore_flag):
        """Run silhouette K-search up to 10 times until the amount is within one of the answer.

        Every run must yield a score in [-1.0, 1.0] and `kmax - kmin` scores.

        Args:
            sample_path: passed to `read_sample`.
            answer_path: passed to `answer_reader`; only the number of clusters is used.
            kmin: forwarded to `silhouette_ksearch`.
            kmax: forwarded to `silhouette_ksearch`.
            algorithm: forwarded as the `algorithm` keyword.
            ccore_flag: forwarded as the `ccore` keyword.
        """
        max_attempts = 10

        points = read_sample(sample_path)
        expected_clusters = answer_reader(answer_path).get_clusters()

        succeeded = False
        for _ in range(max_attempts):
            result = silhouette_ksearch(points, kmin, kmax, algorithm=algorithm, ccore=ccore_flag).process()

            best_k = result.get_amount()
            best_score = result.get_score()
            all_scores = result.get_scores()

            assertion.le(-1.0, best_score)
            assertion.ge(1.0, best_score)
            assertion.eq(kmax - kmin, len(all_scores))

            # Accept the expected amount with a tolerance of one, but never below one.
            expected_k = len(expected_clusters)
            largest_ok = expected_k + 1
            smallest_ok = max(1, expected_k - 1)

            if smallest_ok <= best_k <= largest_ok:
                succeeded = True
                break

        assertion.true(succeeded)
コード例 #10
0
    def template_visualize_adding_step_by_step(self, path_sample, path_answer, filter=None, **kwargs):
        """Show the answer clusters after adding them to the visualizer one at a time.

        Args:
            path_sample: passed to `read_sample`.
            path_answer: passed to `answer_reader`; its clusters are displayed.
            filter: first positional argument of `show` (None by default).
            **kwargs: forwarded unchanged to `show`.
        """
        points = read_sample(path_sample)
        expected_clusters = answer_reader(path_answer).get_clusters()

        canvas = cluster_visualizer_multidim()
        # Uses the single-cluster API (`append_cluster`) once per cluster.
        for members in expected_clusters:
            canvas.append_cluster(members, points)

        canvas.show(filter, **kwargs)
コード例 #11
0
    def template_visualize_adding_step_by_step(self, path_sample, path_answer, filter=None, **kwargs):
        """Show the answer clusters after adding them to the visualizer one at a time.

        Args:
            path_sample: passed to `read_sample`.
            path_answer: passed to `answer_reader`; its clusters are displayed.
            filter: first positional argument of `show` (None by default).
            **kwargs: forwarded unchanged to `show`.
        """
        points = read_sample(path_sample)
        expected_clusters = answer_reader(path_answer).get_clusters()

        canvas = cluster_visualizer_multidim()
        # Uses the single-cluster API (`append_cluster`) once per cluster.
        for members in expected_clusters:
            canvas.append_cluster(members, points)

        canvas.show(filter, **kwargs)
コード例 #12
0
    def template_visualize(self, path_sample, path_answer, filter=None, **kwargs):
        """Show the clusters from an answer on a multidimensional cluster visualizer.

        Args:
            path_sample: passed to `read_sample`.
            path_answer: passed to `answer_reader`; its clusters are displayed.
            filter: first positional argument of `show` (None by default).
            **kwargs: 'return_type' (default 'list') is handed to `read_sample`;
                the complete mapping, 'return_type' included, is forwarded to `show`.
        """
        sample_format = kwargs.get('return_type', 'list')

        points = read_sample(path_sample, return_type=sample_format)
        expected_clusters = answer_reader(path_answer).get_clusters()

        canvas = cluster_visualizer_multidim()
        canvas.append_clusters(expected_clusters, points)
        canvas.show(filter, **kwargs)
コード例 #13
0
    def correct_scores(sample_path, answer_path, ccore_flag):
        """Check that silhouette yields one score per sample item, each within [-1.0, 1.0].

        Args:
            sample_path: passed to `read_sample`.
            answer_path: passed to `answer_reader`; its clusters are scored.
            ccore_flag: forwarded to `silhouette` as the `ccore` keyword.
        """
        points = read_sample(sample_path)
        expected_clusters = answer_reader(answer_path).get_clusters()

        evaluator = silhouette(points, expected_clusters, ccore=ccore_flag)
        point_scores = evaluator.process().get_score()

        assertion.eq(len(points), len(point_scores))
        for value in point_scores:
            assertion.le(-1.0, value)
            assertion.ge(1.0, value)
コード例 #14
0
    def template_correct_scores(self, sample_path, answer_path):
        """Check that silhouette yields one score per sample item, each within [-1.0, 1.0].

        Args:
            sample_path: passed to `read_sample`.
            answer_path: passed to `answer_reader`; its clusters are scored.
        """
        points = read_sample(sample_path)
        expected_clusters = answer_reader(answer_path).get_clusters()

        evaluator = silhouette(points, expected_clusters)
        point_scores = evaluator.process().get_score()

        assertion.eq(len(points), len(point_scores))
        for value in point_scores:
            assertion.le(-1.0, value)
            assertion.ge(1.0, value)
コード例 #15
0
    def clustering_with_answer(data_file, answer_file, ccore, **kwargs):
        """Run K-Medoids from K-Means++ initial medoids and compare the result with an answer.

        Checks that the number of medoids matches the answer, medoids are unique,
        every point is assigned to exactly one cluster, cluster sizes match the
        answer, and each actual cluster equals one of the expected clusters.

        Args:
            data_file: passed to `read_sample`.
            answer_file: passed to `answer_reader`.
            ccore: fourth positional argument of `kmedoids`.
            **kwargs: forwarded to both `kmeans_plusplus_initializer` and `kmedoids`.
        """
        points = read_sample(data_file)
        answer = answer_reader(answer_file)

        medoid_count = len(answer.get_clusters())

        initializer = kmeans_plusplus_initializer(points, medoid_count, **kwargs)
        start_medoids = initializer.initialize(return_index=True)

        algorithm = kmedoids(points, start_medoids, 0.001, ccore, **kwargs)
        algorithm.process()

        actual_clusters = algorithm.get_clusters()
        actual_medoids = algorithm.get_medoids()
        actual_sizes = [len(members) for members in actual_clusters]

        expected_sizes = sorted(answer.get_cluster_lengths())

        assertion.eq(len(expected_sizes), len(actual_medoids))
        assertion.eq(len(points), sum(actual_sizes))
        assertion.eq(sum(expected_sizes), sum(actual_sizes))

        # The message shows the medoids seen so far, i.e. before the current one is added.
        seen_medoids = set()
        for medoid in actual_medoids:
            duplicate_report = "Medoids '%s' is not unique (actual medoids: '%s')" % (
                str(medoid), str(seen_medoids))
            assertion.false(medoid in seen_medoids, message=duplicate_report)
            seen_medoids.add(medoid)

        seen_points = set()
        for members in actual_clusters:
            for point in members:
                assertion.false(
                    point in seen_points,
                    message="Point '%s' is already assigned to one of the clusters." % str(point))
                seen_points.add(point)

        assertion.eq(expected_sizes, sorted(actual_sizes))

        expected_clusters = answer.get_clusters()
        for members in actual_clusters:
            is_expected = any(members == candidate for candidate in expected_clusters)
            assertion.true(
                is_expected,
                message="Actual cluster '%s' is not found among expected." % str(members))
コード例 #16
0
ファイル: gmeans_templates.py プロジェクト: tomyqg/ASV
    def clustering(self, sample_path, answer_path, amount, ccore):
        """Run G-Means up to ten times until the cluster sizes match the answer.

        The number of centers and the sign of the total WCE are hard assertions
        on every attempt. A mismatch in point coverage or cluster sizes is only
        recorded, and the attempt is repeated. The test fails with the collected
        mismatches when no attempt succeeds.

        Args:
            sample_path: passed to `read_sample`.
            answer_path: passed to `answer_reader`.
            amount: second positional argument of `gmeans`.
            ccore: third positional argument of `gmeans`.
        """
        attempts = 10

        # Neither input changes between attempts, so both files are loaded once.
        data = read_sample(sample_path)
        expected_length_clusters = sorted(answer_reader(answer_path).get_cluster_lengths())
        expected_total_length = sum(expected_length_clusters)

        failures = []

        for _ in range(attempts):
            gmeans_instance = gmeans(data, amount, ccore).process()

            clusters = gmeans_instance.get_clusters()
            centers = gmeans_instance.get_centers()
            wce = gmeans_instance.get_total_wce()

            self.assertEqual(len(expected_length_clusters), len(centers))

            if len(clusters) > 1:
                self.assertGreater(wce, 0.0)
            else:
                self.assertGreaterEqual(wce, 0.0)

            # Every point of the sample must be covered by some cluster.
            unique_indexes = {index_point for cluster in clusters for index_point in cluster}
            if len(data) != len(unique_indexes):
                failures.append("1. %d != %d\n" % (len(data), len(unique_indexes)))
                continue

            actual_length_clusters = sorted(len(cluster) for cluster in clusters)
            actual_total_length = sum(actual_length_clusters)
            if expected_total_length != actual_total_length:
                failures.append("2. %d != %d\n" % (expected_total_length, actual_total_length))
                continue

            if expected_length_clusters != actual_length_clusters:
                failures.append("3. %s != %s\n" % (str(expected_length_clusters),
                                                   str(actual_length_clusters)))
                continue

            return

        self.fail("Expected result is not obtained during %d attempts: %s\n" %
                  (attempts, "".join(failures)))
コード例 #17
0
    def correct_scores(sample_path, answer_path, ccore_flag, **kwargs):
        """Check silhouette scores for a sample given as points or as a distance matrix.

        Args:
            sample_path: passed to `read_sample`.
            answer_path: passed to `answer_reader`; its clusters are scored.
            ccore_flag: forwarded to `silhouette` as the `ccore` keyword.
            **kwargs: 'data_type' (default 'points'); with 'distance_matrix' the
                sample is replaced by its squared-Euclidean distance matrix.

        Returns:
            The value of `get_score()`, after asserting there is one score per
            sample row and each lies within [-1.0, 1.0].
        """
        data_type = kwargs.get('data_type', 'points')

        sample = read_sample(sample_path)
        if data_type == 'distance_matrix':
            metric = distance_metric(type_metric.EUCLIDEAN_SQUARE)
            sample = calculate_distance_matrix(sample, metric)

        expected_clusters = answer_reader(answer_path).get_clusters()

        evaluator = silhouette(sample, expected_clusters, ccore=ccore_flag, data_type=data_type)
        point_scores = evaluator.process().get_score()

        assertion.eq(len(sample), len(point_scores))
        for value in point_scores:
            assertion.le(-1.0, value)
            assertion.ge(1.0, value)

        return point_scores
コード例 #18
0
    def calculate_elbow(path_to_data, path_to_answer, kmin, kmax, ccore, **kwargs):
        """Run the elbow search up to 15 times and require one run to match the expectation.

        Every run must pass the sanity checks: the elbow lies strictly between
        `kmin` and `kmax`, there are `floor((kmax - kmin) / kstep + 1)` WCE
        values, and the last WCE is below the first plus a small tolerance.

        Args:
            path_to_data: passed to `read_sample`.
            path_to_answer: None (any elbow is accepted), an int (the expected
                amount itself), or a value passed to `answer_reader` whose number
                of clusters is expected.
            kmin: forwarded to `elbow`.
            kmax: forwarded to `elbow`.
            ccore: forwarded to `elbow` as the `ccore` keyword.
            **kwargs: forwarded to `elbow`; 'kstep' (default 1) is also used for
                the expected number of WCE values.
        """
        # Elbow method randomly chooses initial centers, so one run may miss the expected value.
        max_runs = 15
        kstep = kwargs.get('kstep', 1)

        points = read_sample(path_to_data)

        if path_to_answer is None:
            expected_k = None
        elif isinstance(path_to_answer, int):
            expected_k = path_to_answer
        else:
            expected_k = len(answer_reader(path_to_answer).get_clusters())

        wrong_elbows = []
        matched = False

        for _ in range(max_runs):
            searcher = elbow(points, kmin, kmax, ccore=ccore, **kwargs)
            searcher.process()

            found_k = searcher.get_amount()
            wce_values = searcher.get_wce()

            assertion.gt(found_k, kmin)
            assertion.lt(found_k, kmax)
            assertion.eq(len(wce_values), math.floor((kmax - kmin) / kstep + 1))
            assertion.lt(wce_values[-1], wce_values[0] + 0.0000001)

            if expected_k is None or found_k == expected_k:
                matched = True
                break

            # Keep the wrong elbow so the failure message shows what was obtained.
            wrong_elbows.append(found_k)

        if expected_k is None:
            report = None
        else:
            report = "%s: %s" % (expected_k, wrong_elbows)

        assertion.true(matched, message=report)
コード例 #19
0
    def template_correct_ksearch(self, sample_path, answer_path, kmin, kmax, algorithm):
        """Run silhouette K-search up to five times until the amount equals the answer.

        Every run must yield a score in [-1.0, 1.0] and `kmax - kmin` scores.

        Args:
            sample_path: passed to `read_sample`.
            answer_path: passed to `answer_reader`; only the number of clusters is used.
            kmin: forwarded to `silhouette_ksearch`.
            kmax: forwarded to `silhouette_ksearch`.
            algorithm: forwarded as the `algorithm` keyword.
        """
        max_attempts = 5

        points = read_sample(sample_path)
        expected_clusters = answer_reader(answer_path).get_clusters()

        succeeded = False
        for _ in range(max_attempts):
            result = silhouette_ksearch(points, kmin, kmax, algorithm=algorithm).process()

            best_k = result.get_amount()
            best_score = result.get_score()
            all_scores = result.get_scores()

            assertion.le(-1.0, best_score)
            assertion.ge(1.0, best_score)
            assertion.eq(kmax - kmin, len(all_scores))

            if best_k == len(expected_clusters):
                succeeded = True
                break

        assertion.true(succeeded)
コード例 #20
0
    def clustering(self, sample_path, answer, amount, ccore, **kwargs):
        """Run G-Means up to ten times until the result matches the expectation.

        The number of centers and the sign of the total WCE are hard assertions
        on every attempt. Mismatches in cluster count, point coverage or cluster
        sizes are only recorded, and the attempt is repeated. The test fails with
        the collected mismatches when no attempt succeeds.

        Args:
            sample_path: passed to `read_sample`.
            answer: a str (passed to `answer_reader`, cluster sizes are checked),
                an int (only the amount of clusters is checked), or any other
                value, which is used directly as the expected cluster sizes.
            amount: second positional argument of `gmeans`.
            ccore: third positional argument of `gmeans`.
            **kwargs: 'k_max' (default -1) and 'random_state' (default None) are
                forwarded to `gmeans`.
        """
        attempts = 10

        k_max = kwargs.get('k_max', -1)
        random_state = kwargs.get('random_state', None)
        data = read_sample(sample_path)

        if isinstance(answer, str):
            expected_sizes = sorted(answer_reader(answer).get_cluster_lengths())
            expected_k = len(expected_sizes)
        elif isinstance(answer, int):
            expected_sizes = None
            expected_k = answer
        else:
            expected_sizes = answer
            expected_k = len(answer)

        failure_log = []

        for _ in range(attempts):
            result = gmeans(data, amount, ccore, k_max=k_max, random_state=random_state).process()

            clusters = result.get_clusters()
            centers = result.get_centers()
            wce = result.get_total_wce()

            self.assertEqual(expected_k, len(centers))

            if len(clusters) > 1:
                self.assertGreater(wce, 0.0)
            else:
                self.assertGreaterEqual(wce, 0.0)

            if len(clusters) != expected_k:
                failure_log.append("1. %d != %d\n" % (len(clusters), expected_k))
                continue

            # Every point of the sample must be covered by some cluster.
            covered = {index_point for cluster in clusters for index_point in cluster}
            if len(data) != len(covered):
                failure_log.append("2. %d != %d\n" % (len(data), len(covered)))
                continue

            # Without expected sizes, a matching amount and full coverage is enough.
            if expected_sizes is None:
                return

            actual_sizes = [len(cluster) for cluster in clusters]

            expected_total = sum(expected_sizes)
            actual_total = sum(actual_sizes)
            if expected_total != actual_total:
                failure_log.append("3. %d != %d\n" % (expected_total, actual_total))
                continue

            actual_sizes.sort()
            if expected_sizes != actual_sizes:
                failure_log.append("4. %s != %s\n" % (str(expected_sizes), str(actual_sizes)))
                continue

            return

        self.fail("Expected result is not obtained during %d attempts: %s\n" %
                  (attempts, "".join(failure_log)))