def CAclustering(self, constraints, final_n_of_clusters, clusters=None):
    """Constrained agglomerative clustering down to a target cluster count.

    The method runs in three phases:

    1. Must-link closure: for every constraint that has a ``'must-link'``
       key, the clusters returned by ``self.getClusterID`` for
       ``x['point'][0]`` and ``x['must-link'][0]`` are replaced by one new
       ``Cluster`` (``update`` is called with ``0`` as its third argument).
    2. Distance table: ``self.distances`` is rebuilt from scratch with one
       entry per unordered cluster pair, keyed ``"<idA> <idB>"``.
    3. Agglomeration: while more than ``final_n_of_clusters`` clusters
       remain, the pair with the smallest stored distance that is not
       rejected by ``self.check_cannot_link`` is merged.

    ``self.Z`` is reset to an empty array; the first merge stores the plain
    list ``[idA, idB, dist, new_cluster.n]`` and later merges are stacked
    onto it with ``np.vstack``.

    Args:
        constraints: sized iterable of dict-like constraints; also passed
            unchanged to ``self.check_cannot_link``.
        final_n_of_clusters: merging stops once the number of clusters is
            no longer greater than this value.
        clusters: mapping ``cluster id -> Cluster``; it is mutated in place
            and becomes ``self.clusters``. When ``None``, the current
            ``self.clusters`` is used.

    Returns:
        ``self.clusters``. The method returns early (after logging) when
        the closest remaining pair is blocked and already carries the
        ``sys.maxsize`` marker.

    Raises:
        ValueError: if ``self.linkage`` is neither ``"Ward"`` nor
            ``"Average"`` while a pair distance has to be computed.
    """
    if clusters is None:
        # The declared default used to crash in len(None); fall back to the
        # clustering already held by the instance.
        clusters = self.clusters

    # ---- phase 1: must-link closure ---------------------------------
    self.l.log("Creating transitive ML closure...")
    next_id = len(clusters)
    for key in clusters:
        print(clusters[key].clusterId, clusters[key].points)
    for constraint in constraints:
        if 'must-link' not in constraint:
            continue
        print("omejitev: ", constraint)
        first = self.getClusterID(constraint['point'][0], clusters)
        second = self.getClusterID(constraint['must-link'][0], clusters)
        print(first, " | ", second, " | ", next_id, first == second)
        if first == second:
            continue
        merged_points = [clusters[first].points, clusters[second].points]
        clusters.pop(first)
        clusters.pop(second)
        merged = Cluster(next_id)
        merged.update(first, second, 0, merged_points)
        clusters[next_id] = merged
        next_id += 1
    # Ids handed out during agglomeration continue after the closure ids.
    first_free_id = next_id

    # ---- phase 2: pairwise distance table ---------------------------
    self.l.log("Creating distance matrix....")
    self.distances = {}
    self.clusters = clusters
    point_count = 0
    for key in self.clusters:
        print(self.clusters[key].points, self.clusters[key].clusterId)
        for _ in self.clusters[key].points:
            point_count += 1
    print(len(self.clusters), point_count)

    def scatter(coords):
        """Sum of squared distances from the centroid of coords to each row."""
        centroid = np.average(coords, axis=0)
        if self.distance_type == "Cosine":
            metric = spatial.distance.cosine
        elif self.distance_type == "Euclidean":
            metric = spatial.distance.euclidean
        else:
            # NOTE(review): any other distance_type contributes 0, exactly
            # as the previous if/elif chains did - confirm this is intended.
            return 0
        total = 0
        for row in coords:
            total += metric(centroid, row) ** 2
        return total

    ids = [clusters[key].clusterId for key in self.clusters]
    for position, id_a in enumerate(ids):
        # Only visit each unordered pair once; the key keeps the order in
        # which the two clusters appear in the mapping.
        for id_b in ids[position + 1:]:
            key_ab = str(id_a) + " " + str(id_b)
            key_ba = str(id_b) + " " + str(id_a)
            if (id_a == id_b or key_ab in self.distances
                    or key_ba in self.distances):
                continue
            if self.linkage == "Ward":
                coords_a = [p.coords for p in self.clusters[id_a].points]
                coords_b = [p.coords for p in self.clusters[id_b].points]
                # Scatter of the union minus the scatter of each part.
                dist = (scatter(coords_a + coords_b) - scatter(coords_a)
                        - scatter(coords_b))
            elif self.linkage == "Average":
                point_pairs = [(a, b)
                               for a in self.clusters[id_a].points
                               for b in self.clusters[id_b].points]
                dist = self.average_linkage(point_pairs)
            else:
                raise ValueError(
                    "Error creating distance matrix: unsupported linkage %r"
                    % (self.linkage,))
            self.distances[key_ab] = dist

    # ---- phase 3: constrained agglomeration -------------------------
    self.l.log("Finding clusters...")
    print("st. omejitev: ", len(constraints))
    self.Z = np.array([])
    n = len(self.clusters)
    idZ = 0
    # '>' instead of '!=': a target that is not below the current count
    # must not trigger merging all the way down to an empty distance table.
    while n > final_n_of_clusters:
        if not self.distances:
            self.l.log("No cluster pairs left to merge, stopping at %d "
                       "clusters..." % n)
            break

        # Pick the closest pair that no cannot-link constraint forbids.
        while True:
            key = min(self.distances, key=self.distances.get)
            pair = [int(part) for part in key.split(' ')]
            dist = self.distances[key]
            if not self.check_cannot_link(constraints,
                                          self.clusters[pair[0]].points,
                                          self.clusters[pair[1]].points):
                break
            # Push the forbidden pair to the end of the ordering.
            self.distances[key] = sys.maxsize
            if dist == sys.maxsize:
                # The minimum was already a blocked pair: nothing is left.
                self.l.log(
                    "ABHC cannot find clusters under those constraints...")
                return self.clusters
            print(" Cannot link:", pair)

        self.distances.pop(key, None)
        self.izbrisi_razdalje(pair[0])
        self.izbrisi_razdalje(pair[1])

        merged_points = [self.clusters[pair[0]].points,
                         self.clusters[pair[1]].points]
        new_id = first_free_id + idZ
        novCluster = Cluster(new_id)
        novCluster.update(pair[0], pair[1], dist, merged_points)
        novCluster.centroid = novCluster.calculateCentroid()
        self.clusters.pop(pair[0])
        self.clusters.pop(pair[1])
        self.clusters[new_id] = novCluster
        self.dodaj_razdalje(new_id)

        print("par: ", pair, "dist: ", '%.08f' % dist)
        row = [pair[0], pair[1], dist, novCluster.n]
        if idZ == 0:
            self.Z = row
        else:
            self.Z = np.vstack([self.Z, row])
        n = len(self.clusters)
        idZ += 1
    return self.clusters
def ABHclustering(self, constraints, final_n_of_clusters, clusters=None):
    """Constrained agglomerative clustering that also tracks moved examples.

    The method runs in four phases:

    1. Must-link closure: for every constraint that has a ``'must-link'``
       key, the clusters returned by ``self.getClusterID`` for
       ``x['point'][0]`` and ``x['must-link'][0]`` are replaced by one new
       ``Cluster`` (``update`` is called with ``0`` as its third argument).
    2. Distance table: ``self.distances`` is rebuilt from scratch with one
       entry per unordered cluster pair, keyed ``"<idA> <idB>"``.
    3. Agglomeration: while more than ``final_n_of_clusters`` clusters
       remain, the pair with the smallest stored distance that is not
       rejected by ``self.check_cannot_link`` is merged. ``self.Z`` is
       reset to an empty array; the first merge stores the plain list
       ``[idA, idB, dist, new_cluster.n]`` and later merges are stacked
       onto it with ``np.vstack``.
    4. Diff bookkeeping: ``self.diff`` is filled with ``point.reference``
       values by comparing the new grouping against ``self.prev_dict``,
       which is then replaced by ``self.make_dict()``.

    Args:
        constraints: sized iterable of dict-like constraints; also passed
            unchanged to ``self.check_cannot_link``.
        final_n_of_clusters: merging stops once the number of clusters is
            no longer greater than this value.
        clusters: mapping ``cluster id -> Cluster``; it is mutated in place
            and becomes ``self.clusters``. When ``None``, the current
            ``self.clusters`` is used.

    Returns:
        ``self.clusters``. When the closest remaining pair is blocked and
        already carries the ``sys.maxsize`` marker, the method logs and
        returns immediately, skipping phase 4.

    Raises:
        ValueError: if ``self.linkage`` is neither ``"Ward"`` nor
            ``"Average"`` while a pair distance has to be computed.
    """
    if clusters is None:
        # The declared default used to crash in len(None); fall back to the
        # clustering already held by the instance.
        clusters = self.clusters

    # ---- phase 1: must-link closure ---------------------------------
    self.l.log("Creating transitive ML closure...")
    next_id = len(clusters)
    for constraint in constraints:
        if 'must-link' not in constraint:
            continue
        first = self.getClusterID(constraint['point'][0], clusters)
        second = self.getClusterID(constraint['must-link'][0], clusters)
        if first == second:
            continue
        merged_points = [clusters[first].points, clusters[second].points]
        clusters.pop(first)
        clusters.pop(second)
        merged = Cluster(next_id)
        merged.update(first, second, 0, merged_points)
        clusters[next_id] = merged
        next_id += 1
    # Ids handed out during agglomeration continue after the closure ids.
    first_free_id = next_id

    # ---- phase 2: pairwise distance table ---------------------------
    self.l.log("Creating distance matrix....")
    self.distances = {}
    self.clusters = clusters
    point_count = 0
    for key in self.clusters:
        print(self.clusters[key].points, self.clusters[key].clusterId)
        for _ in self.clusters[key].points:
            point_count += 1
    print(len(self.clusters), point_count)

    def scatter(coords):
        """Sum of squared distances from the centroid of coords to each row."""
        centroid = np.average(coords, axis=0)
        if self.distance_type == "Cosine":
            metric = spatial.distance.cosine
        elif self.distance_type == "Euclidean":
            metric = spatial.distance.euclidean
        else:
            # NOTE(review): any other distance_type contributes 0, exactly
            # as the previous if/elif chains did - confirm this is intended.
            return 0
        total = 0
        for row in coords:
            total += metric(centroid, row) ** 2
        return total

    ids = [clusters[key].clusterId for key in self.clusters]
    for position, id_a in enumerate(ids):
        # Only visit each unordered pair once; the key keeps the order in
        # which the two clusters appear in the mapping.
        for id_b in ids[position + 1:]:
            key_ab = str(id_a) + " " + str(id_b)
            key_ba = str(id_b) + " " + str(id_a)
            if (id_a == id_b or key_ab in self.distances
                    or key_ba in self.distances):
                continue
            if self.linkage == "Ward":
                coords_a = [p.coords for p in self.clusters[id_a].points]
                coords_b = [p.coords for p in self.clusters[id_b].points]
                # Scatter of the union minus the scatter of each part.
                dist = (scatter(coords_a + coords_b) - scatter(coords_a)
                        - scatter(coords_b))
            elif self.linkage == "Average":
                point_pairs = [(a, b)
                               for a in self.clusters[id_a].points
                               for b in self.clusters[id_b].points]
                dist = self.average_linkage(point_pairs)
            else:
                raise ValueError(
                    "Error creating distance matrix: unsupported linkage %r"
                    % (self.linkage,))
            self.distances[key_ab] = dist

    # ---- phase 3: constrained agglomeration -------------------------
    self.l.log("Finding clusters...")
    print("st. omejitev: ", len(constraints))
    self.Z = np.array([])
    n = len(self.clusters)
    idZ = 0
    # '>' instead of '!=': a target that is not below the current count
    # must not trigger merging all the way down to an empty distance table.
    while n > final_n_of_clusters:
        if not self.distances:
            self.l.log("No cluster pairs left to merge, stopping at %d "
                       "clusters..." % n)
            break

        # Pick the closest pair that no cannot-link constraint forbids.
        while True:
            key = min(self.distances, key=self.distances.get)
            pair = [int(part) for part in key.split(' ')]
            dist = self.distances[key]
            if not self.check_cannot_link(constraints,
                                          self.clusters[pair[0]].points,
                                          self.clusters[pair[1]].points):
                break
            # Push the forbidden pair to the end of the ordering.
            self.distances[key] = sys.maxsize
            if dist == sys.maxsize:
                # The minimum was already a blocked pair: nothing is left.
                self.l.log(
                    "ABHC cannot find clusters under those constraints...")
                return self.clusters
            print(" Cannot link:", pair)

        self.distances.pop(key, None)
        self.izbrisi_razdalje(pair[0])
        self.izbrisi_razdalje(pair[1])

        merged_points = [self.clusters[pair[0]].points,
                         self.clusters[pair[1]].points]
        new_id = first_free_id + idZ
        novCluster = Cluster(new_id)
        novCluster.update(pair[0], pair[1], dist, merged_points)
        novCluster.centroid = novCluster.calculateCentroid()
        self.clusters.pop(pair[0])
        self.clusters.pop(pair[1])
        self.clusters[new_id] = novCluster
        self.dodaj_razdalje(new_id)

        print("par: ", pair, "dist: ", '%.08f' % dist)
        row = [pair[0], pair[1], dist, novCluster.n]
        if idZ == 0:
            self.Z = row
        else:
            self.Z = np.vstack([self.Z, row])
        n = len(self.clusters)
        idZ += 1

    # ---- phase 4: diff against the previous grouping ----------------
    # Remember the examples that ended up in a different group than in the
    # previous iteration.
    # NOTE(review): the nesting below was reconstructed from a source whose
    # indentation was lost - confirm it against the original file.
    self.diff = []
    claimed_groups = set()
    examples_seen = 0
    for key in self.clusters:
        val = -1  # previous group of this cluster's first point, once seen
        for point in self.clusters[key].points:
            examples_seen += 1
            if val < 0:
                val = self.prev_dict[point.reference]
                if val in claimed_groups:
                    self.diff.append(point.reference)
            elif val != self.prev_dict[point.reference]:
                self.diff.append(point.reference)
        claimed_groups.add(val)
    self.prev_dict = self.make_dict()
    print(len(self.diff))
    print(sorted(self.diff))
    print("stevilo primerov: ", examples_seen)
    return self.clusters
def randomClusters(self, max_clusters):
    """Append ``max_clusters`` fresh clusters to ``self.clusters``.

    Each cluster is created with ``Cluster()`` and its ``centroid`` is set
    to the value returned by ``self.randomPoint()``.
    """
    for _ in range(max_clusters):
        seeded = Cluster()
        seeded.centroid = self.randomPoint()
        self.clusters.append(seeded)
def hierarhicalClustering(self, clusters=None):
    """Agglomerative clustering that merges until a single cluster remains.

    A coordinate table is built from the first point of every entry in
    ``self.clusters`` (one row per cluster, columns named after
    ``self.attributes``). Missing pair distances are then added to the
    existing ``self.distances`` under the key ``"<i> <j>"``, where ``i``
    and ``j`` are row positions in that table; entries already present
    under either key order are left untouched.
    NOTE(review): the merge loop uses the same numbers as keys into
    ``self.clusters``, so this presumably expects the initial cluster ids
    to be ``0..n-1`` in iteration order - confirm against the caller.

    The closest pair is then merged repeatedly. The first merge stores the
    plain list ``[idA, idB, dist, new_cluster.n]`` in ``self.Z``; later
    merges are stacked onto it with ``np.vstack``. New cluster ids start at
    ``len(self.points)``.

    Args:
        clusters: accepted for signature compatibility; not used.

    Returns:
        ``self.clusters``.

    Raises:
        ValueError: if ``self.linkage`` is neither ``"Ward"`` nor
            ``"Average"``, or if Average linkage is requested with a
            ``self.distance_type`` other than ``"Cosine"``/``"Euclidean"``.
    """
    self.l.log("Building distance matrix...")
    n = len(self.points)  # at the start every example is its own cluster

    data = []
    for key in self.clusters:
        coords = [point.coords for point in self.clusters[key].points]
        data.append(coords[0])
    df = pd.DataFrame(data, columns=np.array(list(self.attributes)))
    n_df = df.values
    size = n_df.shape[0]
    self.d_matrix = np.zeros((size, size))

    for i in range(size):
        # j > i visits every unordered pair exactly once.
        for j in range(i + 1, size):
            key_ij = str(i) + ' ' + str(j)
            key_ji = str(j) + ' ' + str(i)
            if key_ij in self.distances or key_ji in self.distances:
                continue
            if self.linkage == "Ward":
                centroid = np.average([n_df[i], n_df[j]], axis=0)
                dist = 0
                if self.distance_type == "Cosine":
                    dist += spatial.distance.cosine(centroid, n_df[i])**2
                    dist += spatial.distance.cosine(centroid, n_df[j])**2
                elif self.distance_type == "Euclidean":
                    dist += spatial.distance.euclidean(centroid, n_df[i])**2
                    # The second term previously repeated n_df[i].
                    dist += spatial.distance.euclidean(centroid, n_df[j])**2
                # NOTE(review): any other distance_type leaves dist at 0,
                # as before - confirm this is intended.
            elif self.linkage == "Average":
                if self.distance_type == "Cosine":
                    dist = spatial.distance.cosine(n_df[i], n_df[j])
                elif self.distance_type == "Euclidean":
                    dist = spatial.distance.euclidean(n_df[i], n_df[j])
                else:
                    # Previously `dist` was left unbound (or stale) here.
                    raise ValueError(
                        "Error creating distance matrix: unsupported "
                        "distance type %r" % (self.distance_type,))
            else:
                raise ValueError(
                    "Error creating distance matrix: unsupported linkage %r"
                    % (self.linkage,))
            self.distances[key_ij] = dist

    idZ = 0
    m = len(self.points)
    self.l.log("Finding clusters...")
    while n > 1:
        if not self.distances:
            # min() on an empty table would raise; stop cleanly instead.
            self.l.log("No cluster pairs left to merge, stopping at %d "
                       "clusters..." % n)
            break
        key = min(self.distances, key=self.distances.get)
        pair = [int(part) for part in key.split(' ')]
        dist = self.distances[key]

        self.distances.pop(key, None)
        self.izbrisi_razdalje(pair[0])
        self.izbrisi_razdalje(pair[1])

        merged_points = [self.clusters[pair[0]].points,
                         self.clusters[pair[1]].points]
        new_id = m + idZ
        novCluster = Cluster(new_id)
        novCluster.update(pair[0], pair[1], dist, merged_points)
        novCluster.centroid = novCluster.calculateCentroid()
        self.clusters.pop(pair[0])
        self.clusters.pop(pair[1])
        self.clusters[new_id] = novCluster
        self.dodaj_razdalje(new_id)

        row = [pair[0], pair[1], dist, novCluster.n]
        if idZ == 0:
            self.Z = row
        else:
            self.Z = np.vstack([self.Z, row])
        n = len(self.clusters)
        print("par: ", pair, ", dist: ", '%.08f' % dist)
        idZ += 1
    self.l.log("Dendrogram created...")
    return self.clusters
parser = argparse.ArgumentParser() parser.add_argument('k', metavar='k', type=int, help='The number of clusters to be used') args = parser.parse_args() k = args.k clusters = [] plants = file_reader.readFile() centroids = random.sample(plants, k) # Inicia os clusters com centroids randômicos for i in range(0, k): cluster = Cluster(i) cluster.centroid = centroids[i] clusters.append(cluster) isConverging = False iterationsLimit = 1000 currentIteration = 0 while (currentIteration < iterationsLimit and not isConverging): for cluster in clusters: cluster.plants = [] for plant in plants: # O primeiro elemento representa o cluster mais próxima e o segundo # representa a distância até o centróide do mesmo closestCluster = [0, 99999999]