def clustering(path, threshold1, threshold2, expected, ccore, **kwargs):
    """Run TTSAS clustering on the sample at `path` and validate the result.

    Checks that every point is allocated to exactly one cluster, that the
    number of clusters and representatives matches `expected`, that each
    representative has the data's dimension, and that the sorted cluster
    sizes equal the sorted expected sizes.

    :param path: path to a sample file readable by `read_sample`.
    :param threshold1: TTSAS lower dissimilarity threshold.
    :param threshold2: TTSAS upper dissimilarity threshold.
    :param expected: expected cluster sizes (order-insensitive).
    :param ccore: flag to use the C++ core implementation.
    :param kwargs: optional `metric` (defaults to Euclidean distance).
    """
    metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN))
    sample = read_sample(path)

    ttsas_instance = ttsas(sample, threshold1, threshold2, ccore=ccore, metric=metric)
    ttsas_instance.process()

    clusters = ttsas_instance.get_clusters()
    representatives = ttsas_instance.get_representatives()

    obtained_cluster_length = [len(cluster) for cluster in clusters]
    obtained_length = sum(obtained_cluster_length)

    assertion.eq(len(sample), obtained_length)
    assertion.eq(len(expected), len(clusters))
    assertion.eq(len(expected), len(representatives))
    assertion.ge(len(sample), len(clusters))

    dimension = len(sample[0])
    for rep in representatives:
        assertion.eq(dimension, len(rep))

    # Compare sorted copies instead of sorting in place: the original
    # `expected.sort()` mutated the caller's list, which breaks reuse of
    # a shared expected-sizes fixture across test cases.
    assertion.eq(sorted(expected), sorted(obtained_cluster_length))
def assert_distribution(self, data, sizes, centers, widths):
    """Verify that each cluster's empirical mean lies within `widths` of the
    corresponding expected center.

    `data` is assumed to contain the clusters' points contiguously, in the
    order described by `sizes`.
    """
    dimension = len(data[0])
    actual_means = [[0.0] * dimension for _ in sizes]

    cluster_index = 0
    points_in_cluster = 0
    for point in data:
        # Accumulate this point into the running sum of its cluster.
        for dim in range(dimension):
            actual_means[cluster_index][dim] += point[dim]
        points_in_cluster += 1
        if points_in_cluster == sizes[cluster_index]:
            # Current cluster is complete - move on to the next one.
            points_in_cluster = 0
            cluster_index += 1

    for cluster_index, mean in enumerate(actual_means):
        for dim in range(dimension):
            # Turn the sum into a mean, then check the expected center
            # falls inside the tolerance band around it.
            mean[dim] /= sizes[cluster_index]
            assertion.ge(centers[cluster_index][dim], mean[dim] - widths[cluster_index])
            assertion.le(centers[cluster_index][dim], mean[dim] + widths[cluster_index])
def templateMaxAllocatedClusters(ccore_flag, amount_clusters, size_cluster, offset, kinitial, kmax):
    """Check that X-Means never allocates more than `kmax` clusters on
    randomly generated 2-D data, and that clusters and centers agree."""
    input_data = [
        [random.random() * index * offset, random.random() * index * offset]
        for index in range(amount_clusters)
        for _ in range(size_cluster)
    ]

    initial_centers = random_center_initializer(input_data, kinitial).initialize()
    xmeans_instance = xmeans(input_data, initial_centers, kmax, 0.025,
                             splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore_flag)
    xmeans_instance.process()

    clusters = xmeans_instance.get_clusters()
    centers = xmeans_instance.get_centers()

    # Dump the generated inputs to aid debugging before the equality
    # assertion below fails.
    if len(clusters) != len(centers):
        print(input_data)
        print(initial_centers)

    assertion.ge(kmax, len(clusters))
    assertion.ge(kmax, len(centers))
    assertion.eq(len(clusters), len(centers))
def correct_ksearch(sample_path, answer_path, kmin, kmax, algorithm, ccore_flag):
    """Run silhouette K-search repeatedly; succeed if any attempt finds a
    cluster amount within +/-1 of the known answer."""
    attempts = 15
    sample = read_sample(sample_path)
    clusters = answer_reader(answer_path).get_clusters()

    success = False
    for _ in range(attempts):
        ksearch_instance = silhouette_ksearch(
            sample, kmin, kmax, algorithm=algorithm, ccore=ccore_flag).process()

        amount = ksearch_instance.get_amount()
        score = ksearch_instance.get_score()
        scores = ksearch_instance.get_scores()

        # A silhouette score always lies in [-1, 1].
        assertion.le(-1.0, score)
        assertion.ge(1.0, score)
        assertion.eq(kmax - kmin, len(scores))

        upper_limit = len(clusters) + 1
        lower_limit = max(len(clusters) - 1, 1)
        if lower_limit <= amount <= upper_limit:
            success = True
            break

    assertion.true(success)
def correct_ksearch(sample_path, answer_path, kmin, kmax, algorithm, ccore_flag):
    """Retry silhouette K-search up to a fixed number of attempts and pass
    when the detected cluster amount is within one of the expected amount."""
    attempts = 10
    sample = read_sample(sample_path)
    expected_clusters = answer_reader(answer_path).get_clusters()
    expected_amount = len(expected_clusters)

    found = False
    for _ in range(attempts):
        search = silhouette_ksearch(sample, kmin, kmax,
                                    algorithm=algorithm, ccore=ccore_flag).process()

        amount, score, scores = search.get_amount(), search.get_score(), search.get_scores()

        # Silhouette scores are bounded by [-1, 1]; one score per candidate K.
        assertion.le(-1.0, score)
        assertion.ge(1.0, score)
        assertion.eq(kmax - kmin, len(scores))

        low = expected_amount - 1 if expected_amount - 1 >= 1 else 1
        high = expected_amount + 1
        if low <= amount <= high:
            found = True
            break

    assertion.true(found)
def clustering(path, amount, threshold, expected, ccore, **kwargs):
    """Run MBSAS clustering on the sample at `path` and validate the result.

    Checks that every point is allocated, that the number of clusters and
    representatives matches `expected`, that each representative has the
    data's dimension, and that sorted cluster sizes equal `expected`.

    :param path: path to a sample file readable by `read_sample`.
    :param amount: maximum allowed amount of clusters.
    :param threshold: dissimilarity threshold.
    :param expected: expected cluster sizes (order-insensitive).
    :param ccore: flag to use the C++ core implementation.
    :param kwargs: optional `metric` (defaults to Euclidean distance).
    """
    metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN))
    sample = read_sample(path)

    mbsas_instance = mbsas(sample, amount, threshold, ccore=ccore, metric=metric)
    mbsas_instance.process()

    clusters = mbsas_instance.get_clusters()
    representatives = mbsas_instance.get_representatives()

    obtained_cluster_length = [len(cluster) for cluster in clusters]
    obtained_length = sum(obtained_cluster_length)

    assertion.eq(len(sample), obtained_length)
    assertion.eq(len(expected), len(clusters))
    assertion.eq(len(expected), len(representatives))
    assertion.ge(amount, len(clusters))

    dimension = len(sample[0])
    for rep in representatives:
        assertion.eq(dimension, len(rep))

    # Compare sorted copies: the original sorted `expected` in place,
    # mutating the caller's fixture list. (Also dropped the un-Pythonic
    # semicolon statement separators throughout the function.)
    assertion.eq(sorted(expected), sorted(obtained_cluster_length))
def clustering(path, amount, threshold, expected, ccore, **kwargs):
    """Run BSAS clustering on the sample at `path` and validate the result.

    Checks that every point is allocated, that the number of clusters and
    representatives matches `expected`, that each representative has the
    data's dimension, and that sorted cluster sizes equal `expected`.

    :param path: path to a sample file readable by `read_sample`.
    :param amount: maximum allowed amount of clusters.
    :param threshold: dissimilarity threshold.
    :param expected: expected cluster sizes (order-insensitive).
    :param ccore: flag to use the C++ core implementation.
    :param kwargs: optional `metric` (defaults to Euclidean distance).
    """
    metric = kwargs.get('metric', distance_metric(type_metric.EUCLIDEAN))
    sample = read_sample(path)

    bsas_instance = bsas(sample, amount, threshold, ccore=ccore, metric=metric)
    bsas_instance.process()

    clusters = bsas_instance.get_clusters()
    representatives = bsas_instance.get_representatives()

    obtained_cluster_length = [len(cluster) for cluster in clusters]
    obtained_length = sum(obtained_cluster_length)

    assertion.eq(len(sample), obtained_length)
    assertion.eq(len(expected), len(clusters))
    assertion.eq(len(expected), len(representatives))
    assertion.ge(amount, len(clusters))

    dimension = len(sample[0])
    for rep in representatives:
        assertion.eq(dimension, len(rep))

    # Compare sorted copies: the original sorted `expected` in place,
    # mutating the caller's fixture list. (Also dropped the un-Pythonic
    # semicolon statement separators throughout the function.)
    assertion.eq(sorted(expected), sorted(obtained_cluster_length))
def template_correct_scores(self, sample_path, answer_path):
    """Compute silhouette scores for a sample with a known answer and check
    there is one score per point, each within [-1, 1]."""
    sample = read_sample(sample_path)
    answer_clusters = answer_reader(answer_path).get_clusters()

    analyser = silhouette(sample, answer_clusters).process()
    scores = analyser.get_score()

    # One silhouette score per data point, each bounded by [-1, 1].
    assertion.eq(len(sample), len(scores))
    for value in scores:
        assertion.le(-1.0, value)
        assertion.ge(1.0, value)
def correct_scores(sample_path, answer_path, ccore_flag):
    """Compute silhouette scores (optionally via the C++ core) for a sample
    with a known answer; verify one score per point, each within [-1, 1]."""
    sample = read_sample(sample_path)
    answer_clusters = answer_reader(answer_path).get_clusters()

    analyser = silhouette(sample, answer_clusters, ccore=ccore_flag).process()
    scores = analyser.get_score()

    # One silhouette score per data point, each bounded by [-1, 1].
    assertion.eq(len(sample), len(scores))
    for value in scores:
        assertion.le(-1.0, value)
        assertion.ge(1.0, value)
def templateClusteringResultsSpecificData(data_type, path, radius, neighbors, amount_clusters, expected_length_clusters, ccore):
    """Run OPTICS on points or a distance matrix and validate the clusters,
    noise, optics objects and (optionally) cluster-amount extraction."""
    sample = read_sample(path)
    input_data = calculate_distance_matrix(sample) if data_type == 'distance_matrix' else sample

    optics_instance = optics(input_data, radius, neighbors, amount_clusters, ccore, data_type=data_type)
    optics_instance.process()

    clusters = optics_instance.get_clusters()
    noise = optics_instance.get_noise()
    optics_objects = optics_instance.get_optics_objects()

    # Each point must yield exactly one optics object (indexes are unique).
    unique_indexes = {obj.index_object for obj in optics_objects}
    assertion.eq(len(optics_objects), len(unique_indexes))

    # Distances, when defined, are never negative.
    for obj in optics_objects:
        if obj.core_distance is not None:
            assertion.ge(obj.core_distance, 0)
        if obj.reachability_distance is not None:
            assertion.ge(obj.reachability_distance, 0)

    cluster_sizes = [len(cluster) for cluster in clusters]
    assert sum(cluster_sizes) + len(noise) == len(sample)
    assert len(clusters) == len(expected_length_clusters)
    assert sum(cluster_sizes) == sum(expected_length_clusters)
    assert sorted(cluster_sizes) == sorted(expected_length_clusters)

    if amount_clusters is not None:
        analyser = ordering_analyser(optics_instance.get_ordering())
        assert len(analyser) > 0

        amount_clusters, borders = analyser.extract_cluster_amount(optics_instance.get_radius())
        assert amount_clusters == len(expected_length_clusters)
        assert len(borders) == amount_clusters - 1
def correct_scores(sample_path, answer_path, ccore_flag, **kwargs):
    """Compute and return silhouette scores for points or a distance matrix.

    Supports `data_type` in kwargs ('points' by default); when it is
    'distance_matrix' the sample is converted with squared Euclidean
    distances before scoring. Each score is checked to lie in [-1, 1].
    """
    data_type = kwargs.get('data_type', 'points')

    sample = read_sample(sample_path)
    if data_type == 'distance_matrix':
        sample = calculate_distance_matrix(sample, distance_metric(type_metric.EUCLIDEAN_SQUARE))

    answer_clusters = answer_reader(answer_path).get_clusters()
    analyser = silhouette(sample, answer_clusters, ccore=ccore_flag, data_type=data_type).process()
    scores = analyser.get_score()

    assertion.eq(len(sample), len(scores))
    for value in scores:
        assertion.le(-1.0, value)
        assertion.ge(1.0, value)

    return scores
def templateMaxAllocatedClusters(ccore_flag, amount_clusters, size_cluster, offset, kinitial, kmax):
    """Generate random 2-D clusters and verify X-Means respects the `kmax`
    upper bound and returns matching clusters/centers."""
    points = []
    for group in range(amount_clusters):
        for _ in range(size_cluster):
            x = random.random() * group * offset
            y = random.random() * group * offset
            points.append([x, y])

    starting_centers = random_center_initializer(points, kinitial).initialize()
    instance = xmeans(points, starting_centers, kmax, 0.025,
                      splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore_flag)
    instance.process()

    allocated_clusters = instance.get_clusters()
    allocated_centers = instance.get_centers()

    # Print diagnostics before the mismatch assertion would fail.
    if len(allocated_clusters) != len(allocated_centers):
        print(points)
        print(starting_centers)

    assertion.ge(kmax, len(allocated_clusters))
    assertion.ge(kmax, len(allocated_centers))
    assertion.eq(len(allocated_clusters), len(allocated_centers))
def template_correct_ksearch(self, sample_path, answer_path, kmin, kmax, algorithm):
    """Retry silhouette K-search and pass when some attempt finds exactly
    the expected amount of clusters."""
    attempts = 5
    sample = read_sample(sample_path)
    expected_amount = len(answer_reader(answer_path).get_clusters())

    found = False
    for _ in range(attempts):
        search = silhouette_ksearch(sample, kmin, kmax, algorithm=algorithm).process()

        amount = search.get_amount()
        score = search.get_score()
        scores = search.get_scores()

        # Silhouette scores are bounded by [-1, 1]; one score per candidate K.
        assertion.le(-1.0, score)
        assertion.ge(1.0, score)
        assertion.eq(kmax - kmin, len(scores))

        if amount == expected_amount:
            found = True
            break

    assertion.true(found)
def templateClusteringResultsSpecificData(data_type, path, radius, neighbors, amount_clusters, expected_length_clusters, ccore):
    """Validate OPTICS clustering on either raw points or a precomputed
    distance matrix, including optics-object invariants and, when an amount
    of clusters is requested, the ordering-based cluster-amount extraction."""
    sample = read_sample(path)
    if data_type == 'distance_matrix':
        input_data = calculate_distance_matrix(sample)
    else:
        input_data = sample

    instance = optics(input_data, radius, neighbors, amount_clusters, ccore, data_type=data_type)
    instance.process()

    clusters = instance.get_clusters()
    noise = instance.get_noise()
    objects = instance.get_optics_objects()

    # Object indexes must be unique - one optics object per point.
    assertion.eq(len(objects), len({item.index_object for item in objects}))

    # Core and reachability distances, when assigned, are non-negative.
    for item in objects:
        if item.core_distance is not None:
            assertion.ge(item.core_distance, 0)
        if item.reachability_distance is not None:
            assertion.ge(item.reachability_distance, 0)

    sizes = [len(cluster) for cluster in clusters]
    assert sum(sizes) + len(noise) == len(sample)
    assert len(clusters) == len(expected_length_clusters)
    assert sum(sizes) == sum(expected_length_clusters)
    assert sorted(sizes) == sorted(expected_length_clusters)

    if amount_clusters is not None:
        analyser = ordering_analyser(instance.get_ordering())
        assert len(analyser) > 0

        amount_clusters, borders = analyser.extract_cluster_amount(instance.get_radius())
        assert amount_clusters == len(expected_length_clusters)
        assert len(borders) == amount_clusters - 1