def get_distance(self, entry, type_measurement):
    """!
    @brief Measures how far this clustering feature is from another one using the requested metric.
    @details For CENTROID_EUCLIDEAN_DISTANCE the squared Euclidean distance between the centroids is
              returned; take the square root of the result to obtain the real Euclidean distance.

    @param[in] entry (cfentry): Clustering feature to which the distance is measured.
    @param[in] type_measurement (measurement_type): Metric that should be used for the measurement.

    @return (double) Distance between the two clustering features.

    @exception ValueError when `type_measurement` is not one of the supported metrics.

    """
    # Each supported metric returns straight away; falling through all of them means the metric is unknown.
    if type_measurement is measurement_type.CENTROID_EUCLIDEAN_DISTANCE:
        return euclidean_distance_square(entry.get_centroid(), self.get_centroid())

    if type_measurement is measurement_type.CENTROID_MANHATTAN_DISTANCE:
        return manhattan_distance(entry.get_centroid(), self.get_centroid())

    if type_measurement is measurement_type.AVERAGE_INTER_CLUSTER_DISTANCE:
        return self.__get_average_inter_cluster_distance(entry)

    if type_measurement is measurement_type.AVERAGE_INTRA_CLUSTER_DISTANCE:
        return self.__get_average_intra_cluster_distance(entry)

    if type_measurement is measurement_type.VARIANCE_INCREASE_DISTANCE:
        return self.__get_variance_increase_distance(entry)

    raise ValueError("Unsupported type of measurement '%s' is specified." % type_measurement)
def templateDistanceCalculation(self, cluster1, cluster2, type_measurement):
    """!
    @brief Checks the distance between clustering features that are built from two clusters.
    @details Two properties are asserted: the distance is the same in both directions, and it agrees
              with the stand-alone reference function for the requested metric. Nothing is compared
              against a reference when `type_measurement` matches none of the metrics listed below.

    @param[in] cluster1 (list): Points of the first cluster (presumably a list of coordinate lists - confirm with callers).
    @param[in] cluster2 (list): Points of the second cluster.
    @param[in] type_measurement (measurement_type): Metric whose implementation is verified.

    """
    entry1 = cfentry(len(cluster1), linear_sum(cluster1), square_sum(cluster1))
    entry2 = cfentry(len(cluster2), linear_sum(cluster2), square_sum(cluster2))

    # The distance from 1 to 2 has to be exactly the same as from 2 to 1.
    distance12 = entry1.get_distance(entry2, type_measurement)
    distance21 = entry2.get_distance(entry1, type_measurement)

    assert distance12 == distance21

    # Compare with the reference calculation. Centroid based metrics are expected to match exactly,
    # the other metrics are compared with numpy's default tolerance.
    if type_measurement == measurement_type.CENTROID_EUCLIDIAN_DISTANCE:
        assert distance12 == euclidean_distance_sqrt(entry1.get_centroid(), entry2.get_centroid())

    elif type_measurement == measurement_type.CENTROID_MANHATTAN_DISTANCE:
        assert distance12 == manhattan_distance(entry1.get_centroid(), entry2.get_centroid())

    elif type_measurement == measurement_type.AVERAGE_INTER_CLUSTER_DISTANCE:
        assert numpy.isclose(distance12, average_inter_cluster_distance(cluster1, cluster2))

    elif type_measurement == measurement_type.AVERAGE_INTRA_CLUSTER_DISTANCE:
        assert numpy.isclose(distance12, average_intra_cluster_distance(cluster1, cluster2))

    elif type_measurement == measurement_type.VARIANCE_INCREASE_DISTANCE:
        assert numpy.isclose(distance12, variance_increase_distance(cluster1, cluster2))
def get_distance(self, entry, type_measurement):
    """!
    @brief Calculates distance between two clusters in line with measurement type.
    @details For CENTROID_EUCLIDIAN_DISTANCE the value is the result of `euclidean_distance_sqrt`
              applied to the centroids of both clustering features.

    @param[in] entry (cfentry): Clustering feature to which distance should be obtained.
    @param[in] type_measurement (measurement_type): Distance measurement algorithm between two clusters.

    @return (double) Distance between two clusters.

    @exception AssertionError when `type_measurement` is not one of the supported metrics.

    """
    if type_measurement is measurement_type.CENTROID_EUCLIDIAN_DISTANCE:
        return euclidean_distance_sqrt(entry.get_centroid(), self.get_centroid())

    elif type_measurement is measurement_type.CENTROID_MANHATTAN_DISTANCE:
        return manhattan_distance(entry.get_centroid(), self.get_centroid())

    elif type_measurement is measurement_type.AVERAGE_INTER_CLUSTER_DISTANCE:
        return self.__get_average_inter_cluster_distance(entry)

    elif type_measurement is measurement_type.AVERAGE_INTRA_CLUSTER_DISTANCE:
        return self.__get_average_intra_cluster_distance(entry)

    elif type_measurement is measurement_type.VARIANCE_INCREASE_DISTANCE:
        return self.__get_variance_increase_distance(entry)

    # A bare `assert 0` is removed when Python runs with -O, which would make this method silently
    # return None for an unknown metric. Raising explicitly keeps the failure in every mode; the
    # exception type is kept as AssertionError so that existing callers observe the same error.
    raise AssertionError("Unsupported type of measurement '%s' is specified." % (type_measurement,))
def __update_clusters(self, medoids):
    """!
    @brief Rebuilds clusters by attaching every point to the medoid with the smallest Manhattan distance to it.
    @details `self.__belong` receives, for each point, the position (in `medoids`) of the chosen medoid and
              `self.__clusters` receives the point indexes grouped by that position. When several medoids
              are equally close, the one that appears first in `medoids` is chosen. Clusters that did not
              capture any point are dropped at the end.

    @param[in] medoids (list): Indexes (in the processed data) of the points that are used as medoids.

    """
    self.__belong = [0] * len(self.__pointer_data)
    self.__clusters = [[] for _ in medoids]

    for point_id, point in enumerate(self.__pointer_data):
        nearest_position, nearest_distance = -1, 0.0

        for position, medoid_id in enumerate(medoids):
            distance = manhattan_distance(point, self.__pointer_data[medoid_id])

            # The first medoid is always accepted, the following ones only when they are strictly closer.
            if (position == 0) or (distance < nearest_distance):
                nearest_position, nearest_distance = position, distance

        self.__clusters[nearest_position].append(point_id)
        self.__belong[point_id] = nearest_position

    # If cluster is not able to capture object it should be removed.
    # NOTE(review): positions stored in `self.__belong` were assigned before this filtering - confirm that
    # they stay valid when an empty cluster is actually removed.
    self.__clusters = [members for members in self.__clusters if len(members) > 0]
def templateDistanceCalculation(self, cluster1, cluster2, type_measurement):
    """!
    @brief Checks the distance between clustering features that are built from two clusters.
    @details Two properties are asserted: the distance is the same in both directions, and it agrees
              with the stand-alone reference function for the requested metric. Nothing is compared
              against a reference when `type_measurement` matches none of the metrics listed below.

    @param[in] cluster1 (list): Points of the first cluster (presumably a list of coordinate lists - confirm with callers).
    @param[in] cluster2 (list): Points of the second cluster.
    @param[in] type_measurement (measurement_type): Metric whose implementation is verified.

    """
    entry1 = cfentry(len(cluster1), linear_sum(cluster1), square_sum(cluster1))
    entry2 = cfentry(len(cluster2), linear_sum(cluster2), square_sum(cluster2))

    # The distance from 1 to 2 has to be exactly the same as from 2 to 1.
    distance12 = entry1.get_distance(entry2, type_measurement)
    distance21 = entry2.get_distance(entry1, type_measurement)

    assert distance12 == distance21

    # Compare with the reference calculation. Centroid based metrics are expected to match exactly,
    # the other metrics are compared with numpy's default tolerance.
    if type_measurement == measurement_type.CENTROID_EUCLIDEAN_DISTANCE:
        assert distance12 == euclidean_distance_square(entry1.get_centroid(), entry2.get_centroid())

    elif type_measurement == measurement_type.CENTROID_MANHATTAN_DISTANCE:
        assert distance12 == manhattan_distance(entry1.get_centroid(), entry2.get_centroid())

    elif type_measurement == measurement_type.AVERAGE_INTER_CLUSTER_DISTANCE:
        assert numpy.isclose(distance12, average_inter_cluster_distance(cluster1, cluster2))

    elif type_measurement == measurement_type.AVERAGE_INTRA_CLUSTER_DISTANCE:
        assert numpy.isclose(distance12, average_intra_cluster_distance(cluster1, cluster2))

    elif type_measurement == measurement_type.VARIANCE_INCREASE_DISTANCE:
        assert numpy.isclose(distance12, variance_increase_distance(cluster1, cluster2))
def cluster_distances(path_sample, amount_clusters):
    """!
    @brief Clusters a sample by the agglomerative algorithm and prints the distances between every pair
            of obtained clusters for each supported metric.
    @details Centroid based metrics ('euclidian', 'manhattan') are calculated between cluster centers,
              the remaining ones ('avr-inter', 'avr-intra', 'variance') are calculated from the clusters
              themselves together with the sample.

    @param[in] path_sample (string): Path to the file with the sample (it is read by `utils.read_sample`).
    @param[in] amount_clusters (uint): Amount of clusters that is requested from the agglomerative algorithm.

    """
    sample = utils.read_sample(path_sample)

    clusterer = agglomerative(sample, amount_clusters)
    clusterer.process()

    clusters = clusterer.get_clusters()

    print("Measurements for:", path_sample)

    for first, cluster1 in enumerate(clusters):
        for second in range(first + 1, len(clusters), 1):
            cluster2 = clusters[second]

            center1 = utils.centroid(sample, cluster1)
            center2 = utils.centroid(sample, cluster2)

            # Metric label together with a deferred calculation; each calculation runs only when
            # its line is printed, in the order of this table.
            metrics = (
                ('euclidian', lambda: utils.euclidean_distance(center1, center2)),
                ('manhattan', lambda: utils.manhattan_distance(center1, center2)),
                ('avr-inter', lambda: utils.average_inter_cluster_distance(cluster1, cluster2, sample)),
                ('avr-intra', lambda: utils.average_intra_cluster_distance(cluster1, cluster2, sample)),
                ('variance', lambda: utils.variance_increase_distance(cluster1, cluster2, sample)),
            )

            for label, calculate in metrics:
                value = calculate()
                print("\tDistance", label, "from", first, "to", second, "is:", value)
def __calculate_estimation(self):
    """!
    @brief Calculates estimation (cost) of the current clusters.
    @details The estimation is the sum of Manhattan distances from every clustered point to the current
              medoid of its cluster. The lower the estimation, the better the configuration of clusters.

    @return (double) Estimation of the current clusters.

    """
    total_cost = 0.0

    for cluster_position, members in enumerate(self.__clusters):
        # Medoids are stored in the same order as clusters.
        medoid_id = self.__current[cluster_position]

        for member_id in members:
            total_cost += manhattan_distance(self.__pointer_data[member_id], self.__pointer_data[medoid_id])

    return total_cost
def cluster_distances(path_sample, amount_clusters):
    """!
    @brief Prints distances between all pairs of clusters that are obtained by the agglomerative algorithm.
    @details For every unordered pair of clusters each of the metrics 'euclidian', 'manhattan', 'avr-inter',
              'avr-intra' and 'variance' is calculated and printed on its own line.

    @param[in] path_sample (string): Path to the file with the sample (it is read by `utils.read_sample`).
    @param[in] amount_clusters (uint): Amount of clusters that is requested from the agglomerative algorithm.

    """
    def measure(kind, first_cluster, second_cluster, first_center, second_center):
        # Centroid based metrics use cluster centers, the others use the clusters and the sample.
        if kind == 'euclidian':
            return utils.euclidean_distance(first_center, second_center)
        if kind == 'manhattan':
            return utils.manhattan_distance(first_center, second_center)
        if kind == 'avr-inter':
            return utils.average_inter_cluster_distance(first_cluster, second_cluster, sample)
        if kind == 'avr-intra':
            return utils.average_intra_cluster_distance(first_cluster, second_cluster, sample)
        if kind == 'variance':
            return utils.variance_increase_distance(first_cluster, second_cluster, sample)
        return None

    metric_kinds = ['euclidian', 'manhattan', 'avr-inter', 'avr-intra', 'variance']

    sample = utils.read_sample(path_sample)

    algorithm = agglomerative(sample, amount_clusters)
    algorithm.process()

    groups = algorithm.get_clusters()

    print("Measurements for:", path_sample)

    amount_groups = len(groups)
    pairs = [(left, right) for left in range(amount_groups) for right in range(left + 1, amount_groups, 1)]

    for left, right in pairs:
        left_cluster, right_cluster = groups[left], groups[right]

        left_center = utils.centroid(sample, left_cluster)
        right_center = utils.centroid(sample, right_cluster)

        for kind in metric_kinds:
            value = measure(kind, left_cluster, right_cluster, left_center, right_center)
            print("\tDistance", kind, "from", left, "to", right, "is:", value)
def __find_another_nearest_medoid(self, point_index, current_medoid_index):
    """!
    @brief Finds the nearest medoid for the specified point among current medoids except the specified one.
    @details Manhattan distance is used. When several medoids are equally close, the one that appears
              first in the list of current medoids is returned.

    @param[in] point_index: Index of the point in the data for which the medoid is searched.
    @param[in] current_medoid_index: Index of the medoid that must not be considered as the nearest.

    @return (int) Index of the nearest other medoid, or -1 when there is no medoid except the specified one.

    """
    other_medoid_index = -1
    other_distance_nearest = float('inf')

    for index_medoid in self.__current:
        if index_medoid == current_medoid_index:
            continue

        # The distance has to be measured to the examined medoid. Measuring it to the excluded medoid
        # (as it was done before) gives the same value for every candidate, so the first other medoid
        # was always returned regardless of how far it is from the point.
        other_distance_candidate = manhattan_distance(
            self.__pointer_data[point_index], self.__pointer_data[index_medoid])

        if other_distance_candidate < other_distance_nearest:
            other_distance_nearest = other_distance_candidate
            other_medoid_index = index_medoid

    return other_medoid_index
def display_two_dimensional_cluster_distances(path_sample, amount_clusters):
    """!
    @brief Clusters a sample, draws the clusters and labels every pair of clusters with the distances
            between them for each supported metric.
    @details The sample is clustered by the agglomerative algorithm. Centers of each pair of clusters are
              connected by a double-headed arrow and the distance values are written along it; the same
              values are printed to the standard output as well. Only the first two coordinates of the
              cluster centers are used for drawing.

    @param[in] path_sample (string): Path to the file with the sample (it is read by `utils.read_sample`).
    @param[in] amount_clusters (uint): Amount of clusters that is requested from the agglomerative algorithm.

    """
    distances = ['euclidian', 'manhattan', 'avr-inter', 'avr-intra', 'variance']

    sample = utils.read_sample(path_sample)

    agglomerative_instance = agglomerative(sample, amount_clusters)
    agglomerative_instance.process()

    obtained_clusters = agglomerative_instance.get_clusters()
    stage = utils.draw_clusters(sample, obtained_clusters, display_result=False)

    # The inner loop starts right after the outer index, therefore every unordered pair of clusters is
    # visited exactly once and no additional bookkeeping of processed pairs is required. The amount of
    # really obtained clusters is used so that a shorter result cannot cause an out-of-range access.
    for index_cluster in range(len(obtained_clusters)):
        for index_neighbor_cluster in range(index_cluster + 1, len(obtained_clusters)):
            cluster1 = obtained_clusters[index_cluster]
            cluster2 = obtained_clusters[index_neighbor_cluster]

            center_cluster1 = utils.centroid(sample, cluster1)
            center_cluster2 = utils.centroid(sample, cluster2)

            # Bounds of the segment between the centers along each axis; the direction flag is
            # negative when the first center has the greater (or equal) coordinate.
            if center_cluster2[0] > center_cluster1[0]:
                x_maximum, x_minimum, x_index_maximum = center_cluster2[0], center_cluster1[0], 1
            else:
                x_maximum, x_minimum, x_index_maximum = center_cluster1[0], center_cluster2[0], -1

            if center_cluster2[1] > center_cluster1[1]:
                y_maximum, y_minimum, y_index_maximum = center_cluster2[1], center_cluster1[1], 1
            else:
                y_maximum, y_minimum, y_index_maximum = center_cluster1[1], center_cluster2[1], -1

            print("Cluster 1:", cluster1, ", center:", center_cluster1)
            print("Cluster 2:", cluster2, ", center:", center_cluster2)

            # The annotation text is passed positionally: its keyword was renamed from `s` to `text`
            # in Matplotlib, so the positional form is the one that works with old and new versions.
            stage.annotate('',
                           xy=(center_cluster1[0], center_cluster1[1]),
                           xytext=(center_cluster2[0], center_cluster2[1]),
                           arrowprops=dict(arrowstyle='<->'))

            for index_distance_type, distance_type in enumerate(distances):
                distance = None

                if distance_type == 'euclidian':
                    distance = utils.euclidean_distance(center_cluster1, center_cluster2)

                elif distance_type == 'manhattan':
                    distance = utils.manhattan_distance(center_cluster1, center_cluster2)

                elif distance_type == 'avr-inter':
                    distance = utils.average_inter_cluster_distance(cluster1, cluster2, sample)

                elif distance_type == 'avr-intra':
                    distance = utils.average_intra_cluster_distance(cluster1, cluster2, sample)

                elif distance_type == 'variance':
                    distance = utils.variance_increase_distance(cluster1, cluster2, sample)

                print("\tCluster distance -", distance_type, ":", distance)

                # Labels are spread along the segment between the centers; the order is reversed
                # along an axis whose direction flag is negative.
                x_multiplier = index_distance_type + 3
                if x_index_maximum < 0:
                    x_multiplier = len(distances) - index_distance_type + 3

                y_multiplier = index_distance_type + 3
                if y_index_maximum < 0:
                    y_multiplier = len(distances) - index_distance_type + 3

                x_text = x_multiplier * (x_maximum - x_minimum) / (len(distances) + 6) + x_minimum
                y_text = y_multiplier * (y_maximum - y_minimum) / (len(distances) + 6) + y_minimum

                stage.text(x_text, y_text, distance_type + " {:.3f}".format(distance), fontsize=9, color='blue')

    plt.show()
def __optimize_configuration(self):
    """!
    @brief Finds quasi-optimal medoids and updates clusters in line with them.
    @details On every iteration a random current medoid and a random non-medoid candidate are chosen and
              the cost of replacing the medoid by the candidate is accumulated over all non-medoid points.
              When the cost is negative the candidate replaces the medoid, clusters are rebuilt and the
              counter of examined neighbors is reset; otherwise the counter is increased. The search
              stops when the counter reaches `self.__maxneighbor`.

    """
    index_neighbor = 0
    while (index_neighbor < self.__maxneighbor):
        # get random current medoid that is to be replaced (both bounds of random.randint are inclusive)
        current_medoid_index = self.__current[random.randint(
            0, self.__number_clusters - 1)]
        # value stored in self.__belong for the medoid; it is used below as a position in self.__current
        current_medoid_cluster_index = self.__belong[current_medoid_index]

        # get new candidate to be medoid
        candidate_medoid_index = random.randint(
            0, len(self.__pointer_data) - 1)

        # the candidate must not be one of the current medoids, draw again until it is not
        # NOTE(review): this loop cannot finish if every point is already a medoid - confirm that
        # callers guarantee fewer medoids than points.
        while candidate_medoid_index in self.__current:
            candidate_medoid_index = random.randint(
                0, len(self.__pointer_data) - 1)

        candidate_cost = 0.0
        for point_index in range(0, len(self.__pointer_data)):
            if point_index not in self.__current:
                # get non-medoid point and its medoid
                point_cluster_index = self.__belong[point_index]
                point_medoid_index = self.__current[point_cluster_index]

                # get other medoid that is nearest to the point (the medoid that is to be replaced is excluded)
                other_medoid_index = self.__find_another_nearest_medoid(
                    point_index, current_medoid_index)
                # NOTE(review): the helper returns -1 when there is no other medoid; the index -1 would
                # read the entry of the last point here - confirm that at least two medoids always exist.
                other_medoid_cluster_index = self.__belong[
                    other_medoid_index]

                # for optimization calculate all required distances
                # from the point to current medoid
                distance_current = manhattan_distance(
                    self.__pointer_data[point_index],
                    self.__pointer_data[current_medoid_index])

                # from the point to candidate medoid
                distance_candidate = manhattan_distance(
                    self.__pointer_data[point_index],
                    self.__pointer_data[candidate_medoid_index])

                # from the point to nearest (own) medoid; stays infinite when the condition below is not met
                # NOTE(review): the second comparison checks a medoid (point) index against a cluster
                # index; `current_medoid_index` may have been intended - confirm before changing.
                distance_nearest = float('inf')
                if ((point_medoid_index != candidate_medoid_index) and
                        (point_medoid_index != current_medoid_cluster_index)):
                    distance_nearest = manhattan_distance(
                        self.__pointer_data[point_index],
                        self.__pointer_data[point_medoid_index])

                # apply rules for cost calculation
                if (point_cluster_index == current_medoid_cluster_index):
                    # case 1: the point is in the cluster of the replaced medoid and the candidate
                    # is not closer than 'nearest'
                    if (distance_candidate >= distance_nearest):
                        candidate_cost += distance_nearest - distance_current

                    # case 2: the point is in the cluster of the replaced medoid and the candidate is closer
                    else:
                        candidate_cost += distance_candidate - distance_current

                elif (point_cluster_index == other_medoid_cluster_index):
                    # case 3 ('nearest medoid' is the representative object of that cluster and object
                    # is more similar to 'nearest' than to 'candidate'): the cost is not changed
                    if (distance_candidate > distance_nearest):
                        pass

                    # case 4: the candidate is at least as close as 'nearest'
                    else:
                        candidate_cost += distance_candidate - distance_nearest

        if (candidate_cost < 0):
            # set candidate that has won
            self.__current[
                current_medoid_cluster_index] = candidate_medoid_index

            # recalculate clusters
            self.__update_clusters(self.__current)

            # reset the counter and start the investigation from the beginning
            index_neighbor = 0
        else:
            index_neighbor += 1
def display_two_dimensional_cluster_distances(path_sample, amount_clusters):
    """!
    @brief Draws clusters of a sample and writes distances between each pair of clusters on the picture.
    @details The sample is clustered by the agglomerative algorithm. For every pair of clusters a
              double-headed arrow is drawn between their centers and the value of each metric
              ('euclidian', 'manhattan', 'avr-inter', 'avr-intra', 'variance') is printed and placed as
              a text label along the arrow. Only the first two coordinates of the centers are used.

    @param[in] path_sample (string): Path to the file with the sample (it is read by `utils.read_sample`).
    @param[in] amount_clusters (uint): Amount of clusters that is requested from the agglomerative algorithm.

    """
    distances = ['euclidian', 'manhattan', 'avr-inter', 'avr-intra', 'variance']

    sample = utils.read_sample(path_sample)

    agglomerative_instance = agglomerative(sample, amount_clusters)
    agglomerative_instance.process()

    obtained_clusters = agglomerative_instance.get_clusters()
    stage = utils.draw_clusters(sample, obtained_clusters, display_result=False)

    # Every unordered pair is produced once because the neighbor index always starts after the cluster
    # index, so pairs do not have to be marked as processed. The loops are bounded by the amount of
    # really obtained clusters to avoid an out-of-range access when fewer clusters are returned.
    amount_obtained = len(obtained_clusters)
    for index_cluster in range(amount_obtained):
        for index_neighbor_cluster in range(index_cluster + 1, amount_obtained):
            cluster1 = obtained_clusters[index_cluster]
            cluster2 = obtained_clusters[index_neighbor_cluster]

            center_cluster1 = utils.centroid(sample, cluster1)
            center_cluster2 = utils.centroid(sample, cluster2)

            # Segment bounds per axis and its direction: -1 means that the first center has the
            # greater (or equal) coordinate.
            if center_cluster2[0] > center_cluster1[0]:
                x_maximum, x_minimum, x_index_maximum = center_cluster2[0], center_cluster1[0], 1
            else:
                x_maximum, x_minimum, x_index_maximum = center_cluster1[0], center_cluster2[0], -1

            if center_cluster2[1] > center_cluster1[1]:
                y_maximum, y_minimum, y_index_maximum = center_cluster2[1], center_cluster1[1], 1
            else:
                y_maximum, y_minimum, y_index_maximum = center_cluster1[1], center_cluster2[1], -1

            print("Cluster 1:", cluster1, ", center:", center_cluster1)
            print("Cluster 2:", cluster2, ", center:", center_cluster2)

            # The text argument is given positionally because its keyword name was changed from `s`
            # to `text` in Matplotlib; the positional form is accepted by old and new versions.
            stage.annotate('',
                           xy=(center_cluster1[0], center_cluster1[1]),
                           xytext=(center_cluster2[0], center_cluster2[1]),
                           arrowprops=dict(arrowstyle='<->'))

            for index_distance_type, distance_type in enumerate(distances):
                distance = None

                if distance_type == 'euclidian':
                    distance = utils.euclidean_distance(center_cluster1, center_cluster2)

                elif distance_type == 'manhattan':
                    distance = utils.manhattan_distance(center_cluster1, center_cluster2)

                elif distance_type == 'avr-inter':
                    distance = utils.average_inter_cluster_distance(cluster1, cluster2, sample)

                elif distance_type == 'avr-intra':
                    distance = utils.average_intra_cluster_distance(cluster1, cluster2, sample)

                elif distance_type == 'variance':
                    distance = utils.variance_increase_distance(cluster1, cluster2, sample)

                print("\tCluster distance -", distance_type, ":", distance)

                # Position of the label on the segment; the order of labels is reversed along an
                # axis with negative direction.
                x_multiplier = index_distance_type + 3
                if x_index_maximum < 0:
                    x_multiplier = len(distances) - index_distance_type + 3

                y_multiplier = index_distance_type + 3
                if y_index_maximum < 0:
                    y_multiplier = len(distances) - index_distance_type + 3

                x_text = x_multiplier * (x_maximum - x_minimum) / (len(distances) + 6) + x_minimum
                y_text = y_multiplier * (y_maximum - y_minimum) / (len(distances) + 6) + y_minimum

                stage.text(x_text, y_text, distance_type + " {:.3f}".format(distance), fontsize=9, color='blue')

    plt.show()