def buildCluster(self):
    """Build a binary cluster tree by repeatedly merging the two closest clusters.

    Every row of ``self.__content`` starts as its own leaf
    ``HierachicalCluster`` whose id is the row index.  On each round the pair
    with the smallest ``ClusterBase.pearson_distance`` is replaced by a new
    node whose vector is the element-wise average of the pair and whose
    ``left``/``right`` children are the merged clusters.  Merged nodes get
    negative ids (-1, -2, ...) so they never collide with leaf ids.

    Returns:
        The root ``HierachicalCluster`` of the tree, or ``None`` when there
        is no content to cluster.
    """
    # Initialize each content row as a leaf cluster.
    hiclusters = [
        HierachicalCluster(vec=vec, id=index)
        for index, vec in enumerate(self.__content)
    ]
    if not hiclusters:
        return None

    # Cache of pairwise distances keyed by (id, id); ids are stable across
    # rounds, so a distance is computed at most once per pair.
    distances = {}
    currentclusted = -1

    while len(hiclusters) > 1:
        # Reset the best pair every round.  Starting from +inf (instead of a
        # fixed sentinel such as 2) guarantees a pair is always selected, so
        # stale indices from a previous round can never be reused.
        closest = None
        min_val = float('inf')
        hiclusters_len = len(hiclusters)

        # Find the pair of clusters with the smallest distance.
        for i in range(hiclusters_len - 1):
            for j in range(i + 1, hiclusters_len):
                key = (hiclusters[i].id, hiclusters[j].id)
                d = distances.get(key)
                if d is None:
                    d = ClusterBase.pearson_distance(hiclusters[i].vec,
                                                     hiclusters[j].vec)
                    distances[key] = d
                if closest is None or d < min_val:
                    min_val = d
                    closest = (i, j)

        bic1, bic2 = closest
        left, right = hiclusters[bic1], hiclusters[bic2]
        newvec = [(a + b) / 2 for a, b in zip(left.vec, right.vec)]
        newbic = HierachicalCluster(newvec, left=left, right=right,
                                    distance=min_val, id=currentclusted)
        currentclusted -= 1

        # bic2 > bic1, so delete the higher index first to keep bic1 valid.
        del hiclusters[bic2]
        del hiclusters[bic1]
        hiclusters.append(newbic)

    return hiclusters[0]
def buildCluster(self):
    """Partition the content rows into ``self.__KCluster`` groups (K-means style).

    Steps:
      1. Call ``ClusterBase.buildCluster(self)``.
      2. Place each of the K centroids at a random point inside the
         per-column [min, max] range of ``self.__content``.
      3. For up to 100 rounds, assign every row to the centroid with the
         smallest ``ClusterBase.pearson_distance`` and then move each
         centroid to the mean of its assigned rows.  Iteration stops early
         when the assignment is identical to the previous round.

    The current and previous assignments are reported through
    ``self.printCluster`` on every round.  The method returns ``None``; it
    returns immediately after the base-class call when there is no content.
    """
    ClusterBase.buildCluster(self)

    content = self.__content
    cluster_count = self.__KCluster
    if not content:
        # Nothing to cluster; avoids indexing content[0] below.
        return

    dims = len(content[0])

    # Per-column value range, used to draw the initial centroids.
    bounds = []
    for col in range(dims):
        column = [row[col] for row in content]
        bounds.append((min(column), max(column)))

    # Generate k centroids randomly inside the observed value ranges.
    centers = [
        [random.random() * (high - low) + low for low, high in bounds]
        for _ in range(cluster_count)
    ]

    lastmatchs = [[] for _ in range(cluster_count)]
    rounds = 100
    while rounds > 0:
        matchs = [[] for _ in range(cluster_count)]
        print('round \t', rounds)

        # Assignment step: each row goes to its nearest centroid.
        for row_index, vec in enumerate(content):
            bestmatch_cluster = None
            min_distance = float('inf')
            for j in range(cluster_count):
                dis = ClusterBase.pearson_distance(centers[j], vec)
                # Accept the first candidate unconditionally so a best match
                # is always chosen, whatever range the distance takes.
                if bestmatch_cluster is None or dis < min_distance:
                    min_distance = dis
                    bestmatch_cluster = j
            matchs[bestmatch_cluster].append(row_index)

        self.printCluster(matchs)
        self.printCluster(lastmatchs)
        if matchs == lastmatchs:
            break
        lastmatchs = [list(members) for members in matchs]

        # Update step: move the centroids to the average of their members.
        for j, members in enumerate(matchs):
            if not members:
                # An empty cluster has no mean; keep its previous centroid
                # instead of collapsing it to the zero vector.
                continue
            member_count = len(members)
            centers[j] = [
                sum((content[m][col] for m in members), 0.0) / member_count
                for col in range(dims)
            ]
        rounds -= 1