def calculate_elbow(path_to_data, path_to_answer, kmin, kmax, ccore, **kwargs):
    """Run the elbow method and verify that the detected amount of clusters
    matches the answer file.

    The elbow method chooses initial centers randomly, so the check is
    repeated several times before the test is declared failed.

    :param path_to_data: path to the file with the sample to cluster.
    :param path_to_answer: path to the answer file with expected clusters.
    :param kmin: minimum amount of clusters to probe.
    :param kmax: maximum amount of clusters to probe.
    :param ccore: when True, use the C++ core implementation.
    :param kwargs: 'initializer' - center initializer (k-means++ by default).
    """
    repeat = 5  # Elbow method randomly chooses initial centers therefore we need to repeat test if it fails.
    testing_result = False

    initializer = kwargs.get('initializer', kmeans_plusplus_initializer)

    sample = read_sample(path_to_data)
    answer = answer_reader(path_to_answer)

    for _ in range(repeat):
        elbow_instance = elbow(sample, kmin, kmax, ccore=ccore, initializer=initializer)
        elbow_instance.process()

        actual_elbow = elbow_instance.get_amount()
        actual_wce = elbow_instance.get_wce()

        assertion.gt(actual_elbow, kmin)
        assertion.lt(actual_elbow, kmax)
        assertion.eq(len(actual_wce), kmax - kmin)
        # Small tolerance avoids a flaky strict comparison when the WCE curve
        # is essentially flat (consistent with the other calculate_elbow
        # variants in this suite).
        assertion.lt(actual_wce[-1], actual_wce[0] + 0.0000001)

        if actual_elbow != len(answer.get_clusters()):
            continue

        testing_result = True
        break

    assertion.true(testing_result)
def templateClusteringWithAnswers(sample_path, answer_path, radius, neighbors, ccore, **kwargs):
    """Cluster a sample with DBSCAN (optionally in shuffled order) and compare
    clusters and noise against the expected answer file.

    :param sample_path: path to the file with the sample to cluster.
    :param answer_path: path to the answer file with expected results.
    :param radius: DBSCAN connectivity radius.
    :param neighbors: DBSCAN minimum amount of neighbors.
    :param ccore: when True, use the C++ core implementation.
    :param kwargs: 'random_order' - shuffle input points; 'repeat' - amount of runs.
    """
    random_order = kwargs.get('random_order', False)
    repeat = kwargs.get('repeat', 1)

    for _ in range(repeat):
        sample = read_sample(sample_path)

        index_map = list(range(len(sample)))
        if random_order:
            shuffle(index_map)

        shuffled_sample = [sample[index] for index in index_map]

        dbscan_instance = dbscan(shuffled_sample, radius, neighbors, ccore)
        dbscan_instance.process()

        clusters = dbscan_instance.get_clusters()
        noise = dbscan_instance.get_noise()

        # Translate shuffled point indexes back to the original sample order.
        for cluster in clusters:
            for position, point_index in enumerate(cluster):
                cluster[position] = index_map[point_index]

        noise = sorted(index_map[point_index] for point_index in noise)

        reader = answer_reader(answer_path)
        expected_noise = sorted(reader.get_noise())
        expected_length_clusters = reader.get_cluster_lengths()

        actual_lengths = [len(cluster) for cluster in clusters]

        assertion.eq(len(sample), sum(actual_lengths) + len(noise))
        assertion.eq(sum(expected_length_clusters), sum(actual_lengths))
        assertion.eq(expected_length_clusters, sorted(actual_lengths))
        assertion.eq(expected_noise, noise)
def template_visualize(self, path_sample, path_answer, filter=None, **kwargs):
    """Read a sample and its answer clusters, then render them via the
    multi-dimensional cluster visualizer (smoke test for visualization)."""
    points = read_sample(path_sample)
    expected_clusters = answer_reader(path_answer).get_clusters()

    visualizer = cluster_visualizer_multidim()
    visualizer.append_clusters(expected_clusters, points)
    visualizer.show(filter, **kwargs)
def calculate_elbow(path_to_data, path_to_answer, kmin, kmax, ccore, **kwargs):
    """Run the elbow method several times and check the detected amount of
    clusters against the answer file, collecting each mismatch so a failure
    message shows what amounts were actually detected.

    :param path_to_data: path to the file with the sample to cluster.
    :param path_to_answer: path to the answer file with expected clusters.
    :param kmin: minimum amount of clusters to probe.
    :param kmax: maximum amount of clusters to probe.
    :param ccore: when True, use the C++ core implementation.
    :param kwargs: 'initializer' - center initializer (k-means++ by default).
    """
    repeat = 10  # Elbow method randomly chooses initial centers therefore we need to repeat test if it fails.
    testing_result = False

    initializer = kwargs.get('initializer', kmeans_plusplus_initializer)

    sample = read_sample(path_to_data)
    answer = answer_reader(path_to_answer)

    mismatched_amounts = []
    for _ in range(repeat):
        elbow_instance = elbow(sample, kmin, kmax, ccore=ccore, initializer=initializer)
        elbow_instance.process()

        detected_amount = elbow_instance.get_amount()
        wce_sequence = elbow_instance.get_wce()

        assertion.gt(detected_amount, kmin)
        assertion.lt(detected_amount, kmax)
        assertion.eq(len(wce_sequence), kmax - kmin)
        # Tolerance keeps the comparison stable when WCE is essentially flat.
        assertion.lt(wce_sequence[-1], wce_sequence[0] + 0.0000001)

        if detected_amount == len(answer.get_clusters()):
            testing_result = True
            break

        mismatched_amounts.append(detected_amount)

    message = str(len(answer.get_clusters())) + ": " + str(mismatched_amounts)
    assertion.true(testing_result, message=message)
def correct_ksearch(sample_path, answer_path, kmin, kmax, algorithm, ccore_flag):
    """Search the optimal amount of clusters via silhouette k-search and
    check it is within one of the expected amount, retrying on random misses.

    :param sample_path: path to the file with the sample to cluster.
    :param answer_path: path to the answer file with expected clusters.
    :param kmin: minimum amount of clusters to probe.
    :param kmax: maximum amount of clusters to probe.
    :param algorithm: clustering algorithm used by the k-search.
    :param ccore_flag: when True, use the C++ core implementation.
    """
    attempts = 15
    testing_result = False

    sample = read_sample(sample_path)
    expected_clusters = answer_reader(answer_path).get_clusters()

    for _ in range(attempts):
        ksearch_instance = silhouette_ksearch(sample, kmin, kmax, algorithm=algorithm, ccore=ccore_flag).process()

        amount = ksearch_instance.get_amount()
        score = ksearch_instance.get_score()
        scores = ksearch_instance.get_scores()

        # A silhouette score is always within [-1.0, 1.0].
        assertion.le(-1.0, score)
        assertion.ge(1.0, score)
        assertion.eq(kmax - kmin, len(scores))

        # Accept the expected amount plus/minus one (but never below 1).
        upper_limit = len(expected_clusters) + 1
        lower_limit = max(len(expected_clusters) - 1, 1)

        if not (lower_limit <= amount <= upper_limit):
            continue

        testing_result = True
        break

    assertion.true(testing_result)
def correct_ksearch(sample_path, answer_path, kmin, kmax, algorithm, ccore_flag):
    """Verify silhouette k-search finds an amount of clusters within one of
    the expected amount, retrying because results are randomized.

    :param sample_path: path to the file with the sample to cluster.
    :param answer_path: path to the answer file with expected clusters.
    :param kmin: minimum amount of clusters to probe.
    :param kmax: maximum amount of clusters to probe.
    :param algorithm: clustering algorithm used by the k-search.
    :param ccore_flag: when True, use the C++ core implementation.
    """
    attempts = 10
    succeeded = False

    sample = read_sample(sample_path)
    expected_clusters = answer_reader(answer_path).get_clusters()

    # Tolerate a deviation of one cluster from the expected amount.
    lower_limit = max(len(expected_clusters) - 1, 1)
    upper_limit = len(expected_clusters) + 1

    for _ in range(attempts):
        instance = silhouette_ksearch(sample, kmin, kmax, algorithm=algorithm, ccore=ccore_flag).process()

        amount = instance.get_amount()
        score = instance.get_score()
        scores = instance.get_scores()

        # A silhouette score is always within [-1.0, 1.0].
        assertion.le(-1.0, score)
        assertion.ge(1.0, score)
        assertion.eq(kmax - kmin, len(scores))

        if lower_limit <= amount <= upper_limit:
            succeeded = True
            break

    assertion.true(succeeded)
def template_visualize_adding_step_by_step(self, path_sample, path_answer, filter=None, **kwargs):
    """Render answer clusters one at a time with the multi-dimensional
    visualizer (exercises append_cluster instead of append_clusters)."""
    points = read_sample(path_sample)
    expected_clusters = answer_reader(path_answer).get_clusters()

    visualizer = cluster_visualizer_multidim()
    for single_cluster in expected_clusters:
        visualizer.append_cluster(single_cluster, points)

    visualizer.show(filter, **kwargs)
def template_visualize(self, path_sample, path_answer, filter=None, **kwargs):
    """Visualize answer clusters for a sample read with a configurable
    container type ('return_type' keyword, defaults to 'list')."""
    container = kwargs.get('return_type', 'list')

    points = read_sample(path_sample, return_type=container)
    expected_clusters = answer_reader(path_answer).get_clusters()

    visualizer = cluster_visualizer_multidim()
    visualizer.append_clusters(expected_clusters, points)
    visualizer.show(filter, **kwargs)
def correct_scores(sample_path, answer_path, ccore_flag):
    """Calculate silhouette scores for the answer clusters and validate that
    there is one score per point and each lies within [-1.0, 1.0].

    :param sample_path: path to the file with the sample.
    :param answer_path: path to the answer file with expected clusters.
    :param ccore_flag: when True, use the C++ core implementation.
    """
    sample = read_sample(sample_path)
    expected_clusters = answer_reader(answer_path).get_clusters()

    analyser = silhouette(sample, expected_clusters, ccore=ccore_flag)
    scores = analyser.process().get_score()

    assertion.eq(len(sample), len(scores))
    for value in scores:
        assertion.le(-1.0, value)
        assertion.ge(1.0, value)
def template_correct_scores(self, sample_path, answer_path):
    """Verify silhouette scores computed for the answer clusters: one score
    per point, each within [-1.0, 1.0]."""
    sample = read_sample(sample_path)
    expected_clusters = answer_reader(answer_path).get_clusters()

    scores = silhouette(sample, expected_clusters).process().get_score()

    assertion.eq(len(sample), len(scores))
    for value in scores:
        assertion.le(-1.0, value)
        assertion.ge(1.0, value)
def clustering_with_answer(data_file, answer_file, ccore, **kwargs):
    """Cluster a sample with K-Medoids (initialized via k-means++) and verify
    the result against the answer file.

    Checks performed: amount of medoids, total amount of clustered points,
    cluster lengths, medoid uniqueness, unique point assignment, and exact
    cluster content.

    :param data_file: path to the file with the sample to cluster.
    :param answer_file: path to the answer file with expected clusters.
    :param ccore: when True, use the C++ core implementation.
    :param kwargs: forwarded to both the initializer and kmedoids.
    """
    data = read_sample(data_file)
    reader = answer_reader(answer_file)

    # The expected amount of clusters defines how many medoids to seed.
    amount_medoids = len(reader.get_clusters())

    initial_medoids = kmeans_plusplus_initializer(data, amount_medoids, **kwargs).initialize(return_index=True)
    kmedoids_instance = kmedoids(data, initial_medoids, 0.001, ccore, **kwargs)
    kmedoids_instance.process()

    clusters = kmedoids_instance.get_clusters()
    medoids = kmedoids_instance.get_medoids()

    expected_length_clusters = sorted(reader.get_cluster_lengths())

    # One medoid per cluster and every point accounted for exactly once.
    assertion.eq(len(expected_length_clusters), len(medoids))
    assertion.eq(len(data), sum([len(cluster) for cluster in clusters]))
    assertion.eq(sum(expected_length_clusters), sum([len(cluster) for cluster in clusters]))

    # No medoid index may be reported twice.
    unique_medoids = set()
    for medoid in medoids:
        assertion.false(medoid in unique_medoids, message="Medoids '%s' is not unique (actual medoids: '%s')" % (str(medoid), str(unique_medoids)))
        unique_medoids.add(medoid)

    # No point may belong to more than one cluster.
    unique_points = set()
    for cluster in clusters:
        for point in cluster:
            assertion.false(point in unique_points, message="Point '%s' is already assigned to one of the clusters." % str(point))
            unique_points.add(point)

    assertion.eq(expected_length_clusters, sorted([len(cluster) for cluster in clusters]))

    # Finally every actual cluster must match an expected cluster exactly.
    expected_clusters = reader.get_clusters()
    for actual_cluster in clusters:
        cluster_found = False
        for expected_cluster in expected_clusters:
            if actual_cluster == expected_cluster:
                cluster_found = True

        assertion.true(cluster_found, message="Actual cluster '%s' is not found among expected." % str(actual_cluster))
def clustering(self, sample_path, answer_path, amount, ccore):
    """Cluster a sample with G-Means and verify the result against the answer
    file, retrying because the algorithm is randomized.

    Soft checks (point coverage, total length, cluster lengths) trigger a
    retry and are accumulated into a failure report; hard checks (amount of
    centers, WCE sign) fail immediately.

    :param sample_path: path to the file with the sample to cluster.
    :param answer_path: path to the answer file with expected results.
    :param amount: initial amount of clusters for G-Means.
    :param ccore: when True, use the C++ core implementation.
    """
    attempts = 10
    failures = ""

    for _ in range(attempts):
        data = read_sample(sample_path)
        gmeans_instance = gmeans(data, amount, ccore).process()

        reader = answer_reader(answer_path)
        expected_length_clusters = sorted(reader.get_cluster_lengths())

        clusters = gmeans_instance.get_clusters()
        centers = gmeans_instance.get_centers()
        wce = gmeans_instance.get_total_wce()

        self.assertEqual(len(expected_length_clusters), len(centers))
        # WCE must be strictly positive when points are split across clusters.
        if len(clusters) > 1:
            self.assertGreater(wce, 0.0)
        else:
            self.assertGreaterEqual(wce, 0.0)

        # Every point index must appear in exactly one cluster.
        unique_indexes = set()
        for cluster in clusters:
            for index_point in cluster:
                unique_indexes.add(index_point)

        if len(data) != len(unique_indexes):
            failures += "1. %d != %d\n" % (len(data), len(unique_indexes))
            continue

        expected_total_length = sum(expected_length_clusters)
        actual_total_length = sum([len(cluster) for cluster in clusters])
        if expected_total_length != actual_total_length:
            failures += "2. %d != %d\n" % (expected_total_length, actual_total_length)
            continue

        actual_length_clusters = sorted([len(cluster) for cluster in clusters])
        if expected_length_clusters != actual_length_clusters:
            failures += "3. %s != %s\n" % (str(expected_length_clusters), str(actual_length_clusters))
            continue

        # All soft checks passed - the attempt succeeded.
        return

    self.fail("Expected result is not obtained during %d attempts: %s\n" % (attempts, failures))
def correct_scores(sample_path, answer_path, ccore_flag, **kwargs):
    """Compute silhouette scores for the answer clusters using either point
    data or a pre-computed distance matrix, then validate count and range.

    :param sample_path: path to the file with the sample.
    :param answer_path: path to the answer file with expected clusters.
    :param ccore_flag: when True, use the C++ core implementation.
    :param kwargs: 'data_type' - 'points' (default) or 'distance_matrix'.
    :return: the list of silhouette scores for further checks by the caller.
    """
    data_type = kwargs.get('data_type', 'points')

    sample = read_sample(sample_path)
    if data_type == 'distance_matrix':
        sample = calculate_distance_matrix(sample, distance_metric(type_metric.EUCLIDEAN_SQUARE))

    expected_clusters = answer_reader(answer_path).get_clusters()

    analyser = silhouette(sample, expected_clusters, ccore=ccore_flag, data_type=data_type)
    scores = analyser.process().get_score()

    assertion.eq(len(sample), len(scores))
    for value in scores:
        # Each silhouette score lies within [-1.0, 1.0].
        assertion.le(-1.0, value)
        assertion.ge(1.0, value)

    return scores
def calculate_elbow(path_to_data, path_to_answer, kmin, kmax, ccore, **kwargs):
    """Run the elbow method with a configurable k-step and verify the
    detected amount of clusters, retrying on random misses.

    :param path_to_data: path to the file with the sample to cluster.
    :param path_to_answer: path to the answer file, a direct expected amount
            of clusters (int), or None to skip the amount check.
    :param kmin: minimum amount of clusters to probe.
    :param kmax: maximum amount of clusters to probe.
    :param ccore: when True, use the C++ core implementation.
    :param kwargs: 'kstep' - probing step; the rest is forwarded to elbow.
    """
    repeat = 15  # Elbow method randomly chooses initial centers therefore we need to repeat test if it fails.
    testing_result = False

    kstep = kwargs.get('kstep', 1)
    sample = read_sample(path_to_data)

    # Resolve the expected amount of clusters from whichever form was given.
    expected_clusters_amount = None
    if path_to_answer is not None:
        if isinstance(path_to_answer, int):
            expected_clusters_amount = path_to_answer
        else:
            expected_clusters_amount = len(answer_reader(path_to_answer).get_clusters())

    mismatched_amounts = []
    for _ in range(repeat):
        elbow_instance = elbow(sample, kmin, kmax, ccore=ccore, **kwargs)
        elbow_instance.process()

        detected_amount = elbow_instance.get_amount()
        wce_sequence = elbow_instance.get_wce()

        assertion.gt(detected_amount, kmin)
        assertion.lt(detected_amount, kmax)
        # Amount of WCE values follows from the probed range and the step.
        assertion.eq(len(wce_sequence), math.floor((kmax - kmin) / kstep + 1))
        # Tolerance keeps the comparison stable when WCE is essentially flat.
        assertion.lt(wce_sequence[-1], wce_sequence[0] + 0.0000001)

        mismatch = (expected_clusters_amount is not None) and (detected_amount != expected_clusters_amount)
        if mismatch:
            mismatched_amounts.append(detected_amount)
            continue

        testing_result = True
        break

    message = None
    if expected_clusters_amount is not None:
        message = str(expected_clusters_amount) + ": " + str(mismatched_amounts)

    assertion.true(testing_result, message=message)
def template_correct_ksearch(self, sample_path, answer_path, kmin, kmax, algorithm):
    """Run silhouette k-search and check the detected amount of clusters
    equals the expected amount exactly, retrying on random misses."""
    attempts = 5
    testing_result = False

    sample = read_sample(sample_path)
    expected_clusters = answer_reader(answer_path).get_clusters()

    for _ in range(attempts):
        search = silhouette_ksearch(sample, kmin, kmax, algorithm=algorithm).process()

        amount = search.get_amount()
        score = search.get_score()
        scores = search.get_scores()

        # A silhouette score is always within [-1.0, 1.0].
        assertion.le(-1.0, score)
        assertion.ge(1.0, score)
        assertion.eq(kmax - kmin, len(scores))

        if amount == len(expected_clusters):
            testing_result = True
            break

    assertion.true(testing_result)
def clustering(self, sample_path, answer, amount, ccore, **kwargs):
    """Cluster a sample with G-Means and verify the result against a flexible
    answer specification, retrying because the algorithm is randomized.

    The answer may be given as a path to an answer file (str), a plain
    expected amount of clusters (int), or a list of expected cluster lengths.
    Soft checks trigger a retry and are accumulated into a failure report;
    hard checks (amount of centers, WCE sign) fail immediately.

    :param sample_path: path to the file with the sample to cluster.
    :param answer: answer file path, expected amount, or expected lengths.
    :param amount: initial amount of clusters for G-Means.
    :param ccore: when True, use the C++ core implementation.
    :param kwargs: 'k_max' - maximum amount of clusters (-1 means no limit);
            'random_state' - seed for reproducible runs.
    """
    attempts = 10
    failures = ""

    k_max = kwargs.get('k_max', -1)
    random_state = kwargs.get('random_state', None)

    data = read_sample(sample_path)

    # Normalize the answer into an expected amount and (optionally) lengths.
    if isinstance(answer, str):
        reader = answer_reader(answer)
        expected_length_clusters = sorted(reader.get_cluster_lengths())
        amount_clusters = len(expected_length_clusters)
    elif isinstance(answer, int):
        expected_length_clusters = None
        amount_clusters = answer
    else:
        expected_length_clusters = answer
        amount_clusters = len(answer)

    for _ in range(attempts):
        gmeans_instance = gmeans(data, amount, ccore, k_max=k_max, random_state=random_state).process()

        clusters = gmeans_instance.get_clusters()
        centers = gmeans_instance.get_centers()
        wce = gmeans_instance.get_total_wce()

        self.assertEqual(amount_clusters, len(centers))
        # WCE must be strictly positive when points are split across clusters.
        if len(clusters) > 1:
            self.assertGreater(wce, 0.0)
        else:
            self.assertGreaterEqual(wce, 0.0)

        if len(clusters) != amount_clusters:
            failures += "1. %d != %d\n" % (len(clusters), amount_clusters)
            continue

        # Every point index must appear in exactly one cluster.
        unique_indexes = set()
        for cluster in clusters:
            for index_point in cluster:
                unique_indexes.add(index_point)

        if len(data) != len(unique_indexes):
            failures += "2. %d != %d\n" % (len(data), len(unique_indexes))
            continue

        # Without expected lengths there is nothing further to verify.
        if expected_length_clusters is None:
            return

        expected_total_length = sum(expected_length_clusters)
        actual_total_length = sum([len(cluster) for cluster in clusters])
        if expected_total_length != actual_total_length:
            failures += "3. %d != %d\n" % (expected_total_length, actual_total_length)
            continue

        actual_length_clusters = sorted([len(cluster) for cluster in clusters])
        if expected_length_clusters != actual_length_clusters:
            failures += "4. %s != %s\n" % (str(expected_length_clusters), str(actual_length_clusters))
            continue

        # All soft checks passed - the attempt succeeded.
        return

    self.fail("Expected result is not obtained during %d attempts: %s\n" % (attempts, failures))