def categorize(self, element):
    """Classify *element* by majority vote among the k nearest neighbours.

    element -- feature vector comparable to the rows stored in self.data

    Returns the label occurring most often among the neighbours whose
    distance is within the k-th smallest distance (ties at the cutoff
    distance may admit more than k neighbours, as in the original logic).

    Raises ValueError if there is no training data.
    """
    dists = {
        number_id: Utils.euclidean_distance(self.data[number_id], element)
        for number_id in self.data
    }
    if not dists:
        raise ValueError("categorize() called with no training data")
    s_dists = sorted(dists.values())
    # Clamp k into [1, len(data)] so an oversized or non-positive k can no
    # longer raise IndexError here.
    k_th = s_dists[min(max(self.k, 1), len(s_dists)) - 1]
    k_labels = {}
    for number_id, dist in dists.items():
        if dist <= k_th:
            label = self.labels[number_id]
            k_labels[label] = k_labels.get(label, 0) + 1
    return max(k_labels, key=k_labels.get)
def measure(self):
    """Plot the 5-fold cross-validation error of KNN regression against 1/k.

    Tries every k from 1 to len(self.data); points whose CV error is not
    valid (<= -1) are skipped.
    """
    inverse_k = []
    errors = []
    for k in range(1, len(self.data) + 1):
        model = KNN(self.labels, self.data, k, 'regression')
        cv_error = model.KFoldCV(5)
        # Presumably a value <= -1 signals a failed CV run — only keep
        # valid points.
        if cv_error > -1:
            inverse_k.append(1.0 / k)
            errors.append(cv_error)
    Utils.plot2D(inverse_k, errors)
def measure(self):
    """Plot KNN regression K-fold CV error (y) against 1/k (x), for k = 1..len(data).

    NOTE(review): assumes KFoldCV returns a value <= -1 as a failure
    sentinel — confirm against KNN.KFoldCV.
    """
    x = []
    y = []
    for i in range( len(self.data) ):
        k = i+1
        knn = KNN(self.labels, self.data, k, 'regression')
        KFoldCV_error = knn.KFoldCV( 5 )
        # Only keep runs that produced a valid error value.
        if KFoldCV_error > -1:
            x.append( 1.0 / k )
            y.append( KFoldCV_error )
    Utils.plot2D(x,y)
def absolute_error(self, clusters, means):
    """Total Euclidean distance of every element to its cluster's mean.

    clusters -- dict: cluster id -> list of elements
    means    -- dict: cluster id -> representative point
    """
    total = 0.0
    for cluster_id, members in clusters.items():
        total += sum(
            Utils.euclidean_distance(member, means[cluster_id])
            for member in members
        )
    return total
def absolute_error(self, clusters, means):
    """Return the absolute error of a clustering: the sum over all clusters
    of each element's Euclidean distance to that cluster's mean.

    clusters -- dict: cluster id -> list of elements
    means    -- dict: cluster id -> representative point (mean/medoid)
    """
    ae = 0.0
    for cluster in clusters:
        # Accumulate this cluster's dissimilarity before folding it in.
        sum_dissimilarities = 0
        for element in clusters[cluster]:
            sum_dissimilarities += Utils.euclidean_distance( element, means[cluster])
        ae += sum_dissimilarities
    return ae
def MSE(self, clusters, means):
    """Mean, over clusters, of each cluster's mean squared distance to its mean.

    clusters -- dict: cluster id -> list of elements
    means    -- dict: cluster id -> representative point

    Empty clusters contribute 0.0, matching the original accumulator
    behaviour. Returns 0.0 for an empty clustering instead of raising
    ZeroDivisionError.
    """
    if not clusters:
        return 0.0
    mse = 0.0
    for cluster in clusters:
        members = clusters[cluster]
        if members:
            squared = sum(
                Utils.euclidean_distance(element, means[cluster]) ** 2
                for element in members
            )
            mse += squared / len(members)
    return mse / len(clusters)
def MSE(self, clusters, means):
    """Average per-cluster mean squared Euclidean distance to the cluster mean.

    clusters -- dict: cluster id -> list of elements
    means    -- dict: cluster id -> representative point
    """
    per_cluster = []
    for cluster_id, members in clusters.items():
        if members:
            squared = [
                Utils.euclidean_distance(member, means[cluster_id]) ** 2
                for member in members
            ]
            per_cluster.append(sum(squared) / len(members))
        else:
            # An empty cluster contributes nothing, as in the accumulator form.
            per_cluster.append(0.0)
    return sum(per_cluster) / len(clusters)
def getMedoid(self, cluster_elements):
    """Return the element of *cluster_elements* closest to the cluster centroid.

    Returns self.empty_row for an empty cluster.
    """
    if not cluster_elements:
        return self.empty_row
    centroid = self.getCentroid(cluster_elements)
    # The medoid is the actual data point nearest the (possibly virtual)
    # centroid. min() always returns a real element, unlike the old
    # big_number sentinel, which could fall through to empty_row if every
    # distance exceeded it. (Also drops the unused local N.)
    return min(
        cluster_elements,
        key=lambda element: Utils.euclidean_distance(centroid, element),
    )
def getMedoid(self, cluster_elements):
    """Return the element of *cluster_elements* nearest the cluster centroid,
    or self.empty_row for an empty cluster.
    """
    if len(cluster_elements) == 0:
        return self.empty_row
    # NOTE(review): N is computed but never used.
    N = len(cluster_elements[0])
    centroid = self.getCentroid( cluster_elements )
    # just a big number to start
    min_dist = self.big_number
    medoid = self.empty_row
    # Linear scan for the data point with minimum distance to the centroid.
    for element in cluster_elements:
        dist = Utils.euclidean_distance( centroid, element )
        if dist < min_dist:
            min_dist = dist
            medoid = element
    return medoid
def KMeansCore(self, data, k, mean_method, means):
    """One run of iterative-relocation clustering (k-means / k-medoids).

    data        -- iterable of elements to cluster
    k           -- number of clusters
    mean_method -- self.mean_methods[0]: centroid update, scored by MSE;
                   self.mean_methods[1]: medoid update, scored by absolute error
    means       -- dict cluster-index -> initial mean (mutated in place)

    Returns (best_clustering, best_means, best_error).

    Raises ValueError for an unrecognised mean_method (previously this
    surfaced as a NameError on AE).
    """
    min_AE = self.big_number
    prev_AE = None
    min_means = {}
    min_K = {}
    # Iterative relocation: repeat until the error stops changing.
    while True:
        K = {cluster: [] for cluster in range(k)}
        # Assignment step: each element joins the cluster with the closest mean.
        for element in data:
            min_dist = self.big_number
            closest_cluster = 0
            for cluster in range(k):
                dist = Utils.euclidean_distance(means[cluster], element)
                if dist < min_dist:
                    min_dist = dist
                    closest_cluster = cluster
            K[closest_cluster].append(element)
            # NOTE(review): overwriting the mean during assignment makes
            # later assignments depend on element order; kept for
            # behavioural parity — confirm it is intended.
            means[closest_cluster] = element
        # Update step: recompute each cluster's mean, then score the clustering.
        if mean_method == self.mean_methods[0]:
            for cluster in range(k):
                means[cluster] = self.getCentroid(K[cluster])
            AE = self.MSE(K, means)
        elif mean_method == self.mean_methods[1]:
            for cluster in range(k):
                means[cluster] = self.getMedoid(K[cluster])
            AE = self.absolute_error(K, means)
        else:
            raise ValueError("unknown mean_method: %r" % (mean_method,))
        # Keep the clustering that minimizes the error. Take snapshot
        # copies: `means` and `K`'s lists are mutated/rebuilt on later
        # iterations, so storing the live objects would corrupt the best
        # result found so far.
        if AE < min_AE:
            min_AE = AE
            min_means = dict(means)
            min_K = {cluster: list(members) for cluster, members in K.items()}
        # Converged: nothing changed since the previous pass.
        if AE == prev_AE:
            break
        prev_AE = AE
    return (min_K, min_means, min_AE)
def KMeansCore(self, data, k, mean_method, means):
    """One run of iterative-relocation clustering (k-means / k-medoids).

    data        -- iterable of elements to cluster
    k           -- number of clusters
    mean_method -- self.mean_methods[0]: centroid update, scored by MSE;
                   self.mean_methods[1]: medoid update, scored by absolute error
    means       -- dict cluster-index -> initial mean (mutated in place)

    Returns (min_K, min_means, min_AE): the lowest-error clustering seen.
    """
    K = {}
    min_AE = self.big_number
    prev_AE = 0
    min_means = {}
    min_K = {}
    iter_flag = True
    # do iterative relocation, until nothing changes
    while(iter_flag):
        for cluster in range(k):
            K[cluster] = []
        # assign each element to the cluster which has the closest mean
        for element in data:
            # for start, just set a big max number, and just pick the cluster 0 as the closest cluster
            min_dist = self.big_number
            closest_cluster = 0
            for cluster in range(k):
                dist = Utils.euclidean_distance( means[cluster], element )
                if dist < min_dist:
                    min_dist = dist
                    closest_cluster = cluster
            K[closest_cluster].append( element )
            # NOTE(review): overwriting the mean mid-assignment makes later
            # assignments depend on element order — confirm this is intended.
            means[closest_cluster] = element
        # calculate new mean for each cluster
        if mean_method == self.mean_methods[0]:
            for cluster in range(k):
                means[cluster] = self.getCentroid( K[cluster] )
        elif mean_method == self.mean_methods[1]:
            for cluster in range(k):
                means[cluster] = self.getMedoid( K[cluster] )
        # calculate the absolute error
        # NOTE(review): AE is unbound (NameError) if mean_method matches
        # neither entry of self.mean_methods.
        if mean_method == self.mean_methods[0]:
            AE = self.MSE(K, means)
        elif mean_method == self.mean_methods[1]:
            AE = self.absolute_error(K, means)
        # keep the clustering that minimizes the absolute error
        if AE < min_AE:
            min_AE = AE
            # NOTE(review): these store references to the live dicts, which
            # later iterations keep mutating — the "best" snapshot can be
            # silently overwritten. Copies look intended; verify.
            min_means = means
            min_K = K
        # stop when nothing changes
        if prev_AE == AE:
            iter_flag = False
        else:
            prev_AE = AE
    return (min_K, min_means, min_AE)