def clustering(self, sample_path, answer_path, amount, ccore):
    """Cluster a sample with ``gmeans`` and compare cluster sizes with a reference answer.

    The clustering is repeated up to ten times; the check passes as soon as
    one attempt reproduces the expected cluster sizes. If no attempt does,
    the test fails with the list of mismatches collected along the way.

    :param sample_path: Path handed to ``read_sample`` to load the input data.
    :param answer_path: Path handed to ``answer_reader``; its cluster lengths
        are the expected result.
    :param amount: Second positional argument of ``gmeans`` (presumably the
        initial number of clusters - confirm against the ``gmeans`` signature).
    :param ccore: Third positional argument of ``gmeans``, forwarded untouched.
    """
    attempts = 10
    failures = []

    # Neither the sample nor the reference answer changes between attempts,
    # so both files are loaded once instead of on every iteration.
    data = read_sample(sample_path)
    reader = answer_reader(answer_path)
    expected_length_clusters = sorted(reader.get_cluster_lengths())
    expected_total_length = sum(expected_length_clusters)

    for _ in range(attempts):
        gmeans_instance = gmeans(data, amount, ccore).process()

        clusters = gmeans_instance.get_clusters()
        centers = gmeans_instance.get_centers()
        wce = gmeans_instance.get_total_wce()

        # These are hard requirements: a violation fails the test immediately
        # rather than triggering another attempt.
        self.assertEqual(len(expected_length_clusters), len(centers))

        if len(clusters) > 1:
            self.assertGreater(wce, 0.0)
        else:
            self.assertGreaterEqual(wce, 0.0)

        # Every point of the sample must appear in some cluster.
        unique_indexes = {index_point for cluster in clusters for index_point in cluster}
        if len(data) != len(unique_indexes):
            failures.append("1. %d != %d\n" % (len(data), len(unique_indexes)))
            continue

        cluster_lengths = [len(cluster) for cluster in clusters]

        actual_total_length = sum(cluster_lengths)
        if expected_total_length != actual_total_length:
            failures.append("2. %d != %d\n" % (expected_total_length, actual_total_length))
            continue

        actual_length_clusters = sorted(cluster_lengths)
        if expected_length_clusters != actual_length_clusters:
            failures.append("3. %s != %s\n" % (str(expected_length_clusters), str(actual_length_clusters)))
            continue

        # This attempt matched the reference answer.
        return

    self.fail("Expected result is not obtained during %d attempts: %s\n" % (attempts, "".join(failures)))
def template_clustering(sample_path, k_init=1, ccore=True, **kwargs):
    """Cluster the sample at ``sample_path`` with ``gmeans`` and optionally plot the result.

    :param sample_path: Path handed to ``read_sample``.
    :param k_init: Second positional argument of ``gmeans`` (default 1).
    :param ccore: Third positional argument of ``gmeans`` (default True).
    :param kwargs: Only ``visualize`` is read (default True); when truthy the
        clusters and centers are shown with ``cluster_visualizer``.
    :return: Tuple ``(sample, clusters)``.
    """
    sample = read_sample(sample_path)

    algorithm = gmeans(sample, k_init, ccore, repeat=5)
    gmeans_instance = algorithm.process()

    clusters = gmeans_instance.get_clusters()
    centers = gmeans_instance.get_centers()

    if kwargs.get('visualize', True):
        visualizer = cluster_visualizer()
        visualizer.append_clusters(clusters, sample)
        # Centers are drawn as a separate group with a star marker.
        visualizer.append_cluster(centers, None, marker='*', markersize=10)
        visualizer.show()

    return sample, clusters
def template_predict(self, path, amount, points, ccore):
    """Check that ``predict`` assigns each point to a center that is at least as close as any other.

    The sample at ``path`` is clustered with ``gmeans``; then, for every point
    in ``points``, the Euclidean distance to the predicted center must not
    exceed the distance to any other center.

    :param path: Path handed to ``read_sample``.
    :param amount: Second positional argument of ``gmeans``.
    :param points: Points passed to ``predict``.
    :param ccore: Third positional argument of ``gmeans``.
    """
    metric = distance_metric(type_metric.EUCLIDEAN)

    sample = read_sample(path)
    gmeans_instance = gmeans(sample, amount, ccore).process()
    centers = gmeans_instance.get_centers()

    closest_clusters = gmeans_instance.predict(points)
    # One prediction per point is required before the two are paired below.
    self.assertEqual(len(points), len(closest_clusters))

    for point, cluster_index in zip(points, closest_clusters):
        distance = metric(centers[cluster_index], point)

        for center_index, center in enumerate(centers):
            if center_index == cluster_index:
                continue

            other_distance = metric(center, point)
            self.assertLessEqual(distance, other_distance)
def clustering(self, sample_path, answer, amount, ccore, **kwargs):
    """Cluster a sample with ``gmeans`` and validate the result against ``answer``.

    Up to ten attempts are made; the check passes on the first attempt whose
    result matches, otherwise the test fails with all collected mismatches.

    :param sample_path: Path handed to ``read_sample``.
    :param answer: Expected result in one of three forms:
        a ``str`` is given to ``answer_reader`` and its sorted cluster lengths are expected;
        an ``int`` is the expected number of clusters only (sizes are not checked);
        anything else is used as-is as the expected list of cluster sizes
        (it is not sorted here, while the actual sizes are).
    :param amount: Second positional argument of ``gmeans``.
    :param ccore: Third positional argument of ``gmeans``.
    :param kwargs: ``k_max`` (default -1) and ``random_state`` (default None)
        are forwarded to ``gmeans``.
    """
    max_attempts = 10
    problems = []

    k_max = kwargs.get('k_max', -1)
    random_state = kwargs.get('random_state', None)

    data = read_sample(sample_path)

    if isinstance(answer, str):
        expected_sizes = sorted(answer_reader(answer).get_cluster_lengths())
        expected_count = len(expected_sizes)
    elif isinstance(answer, int):
        expected_sizes, expected_count = None, answer
    else:
        expected_sizes, expected_count = answer, len(answer)

    for _ in range(max_attempts):
        algorithm = gmeans(data, amount, ccore, k_max=k_max, random_state=random_state)
        result = algorithm.process()

        clusters = result.get_clusters()
        centers = result.get_centers()
        wce = result.get_total_wce()

        # Hard assertions: a violation fails the test at once, without a retry.
        self.assertEqual(expected_count, len(centers))

        if len(clusters) > 1:
            self.assertGreater(wce, 0.0)
        else:
            self.assertGreaterEqual(wce, 0.0)

        if len(clusters) != expected_count:
            problems.append("1. %d != %d\n" % (len(clusters), expected_count))
            continue

        # Every point of the sample must be covered by some cluster.
        covered = set()
        for cluster in clusters:
            covered.update(cluster)

        if len(data) != len(covered):
            problems.append("2. %d != %d\n" % (len(data), len(covered)))
            continue

        # Only the number of clusters was requested - nothing more to verify.
        if expected_sizes is None:
            return

        sizes = [len(cluster) for cluster in clusters]

        expected_total = sum(expected_sizes)
        actual_total = sum(sizes)
        if expected_total != actual_total:
            problems.append("3. %d != %d\n" % (expected_total, actual_total))
            continue

        actual_sizes = sorted(sizes)
        if expected_sizes != actual_sizes:
            problems.append("4. %s != %s\n" % (str(expected_sizes), str(actual_sizes)))
            continue

        return

    self.fail("Expected result is not obtained during %d attempts: %s\n" % (max_attempts, "".join(problems)))
def test_predict_without_process(self):
    """``predict`` on a ``gmeans`` instance that was never processed must return an empty list."""
    unprocessed = gmeans([[0], [1]])
    prediction = unprocessed.predict([0])
    self.assertEqual([], prediction)