コード例 #1
0
 def buildCluster(self):
     """Merge the content vectors bottom-up into one hierarchical cluster.

     Each entry of ``self.__content`` starts as its own cluster whose id is
     its index.  On every pass the two clusters with the smallest
     ``ClusterBase.pearson_distance`` are replaced by a new cluster that
     keeps them as ``left``/``right`` and whose vector is the element-wise
     ``(a + b) / 2`` of their vectors.  Merged clusters receive the ids
     -1, -2, ... in creation order.

     Returns:
         The single cluster left after all merges.

     Raises:
         IndexError: if ``self.__content`` is empty.
         ValueError: if no pair of clusters yields a comparable distance
             (for example when every distance is NaN).
     """
     # Every content vector starts out as its own leaf cluster.
     hiclusters = [HierachicalCluster(vec=vec, id=idx)
                   for idx, vec in enumerate(self.__content)]
     # (id, id) -> distance.  Ids are never reused, so entries stay valid
     # across passes and each pair is only measured once.
     distances = {}
     currentclusted = -1
     while len(hiclusters) > 1:
         # Both are reset on every pass: a fixed sentinel of 2 could never be
         # beaten by a distance of exactly 2, and a leftover pair from an
         # earlier pass would point at positions that no longer exist.
         min_val = float('inf')
         flag = None
         hiclusters_len = len(hiclusters)

         # Find the closest pair among the current clusters.
         for i in range(hiclusters_len - 1):
             first = hiclusters[i]
             for j in range(i + 1, hiclusters_len):
                 second = hiclusters[j]
                 key = (first.id, second.id)
                 if key not in distances:
                     distances[key] = ClusterBase.pearson_distance(first.vec, second.vec)
                 d = distances[key]
                 if d < min_val:
                     min_val = d
                     flag = (i, j)

         if flag is None:
             raise ValueError('no comparable distance between the remaining clusters')

         bic1, bic2 = flag
         left, right = hiclusters[bic1], hiclusters[bic2]
         newvec = [(a + b) / 2 for a, b in zip(left.vec, right.vec)]
         newbic = HierachicalCluster(newvec, left=left, right=right,
                                     distance=min_val, id=currentclusted)
         currentclusted -= 1
         # bic2 > bic1, so removing the higher position first keeps the
         # lower one valid.
         del hiclusters[bic2]
         del hiclusters[bic1]
         hiclusters.append(newbic)
     return hiclusters[0]
コード例 #2
0
 def buildCluster(self):
     """Partition the content vectors into ``self.__KCluster`` groups.

     Calls ``ClusterBase.buildCluster(self)`` first, then draws
     ``self.__KCluster`` random centres inside the per-column value range of
     ``self.__content``.  For at most 100 rounds every vector is assigned to
     the centre with the smallest ``ClusterBase.pearson_distance`` and each
     centre is moved to the average of the vectors assigned to it.  The loop
     stops early once a round produces the same assignment as the one
     before it.

     Each round prints the round counter and passes the current and the
     previous assignment to ``self.printCluster``.  The visible code has no
     ``return`` statement; the last assignment is left in the local
     ``matchs``.

     Raises:
         IndexError: if ``self.__content`` is empty.
     """
     ClusterBase.buildCluster(self)
     content = self.__content
     cluster_count = self.__KCluster
     dimension = len(content[0])
     min_max_per_word = [[min(row[i] for row in content), max(row[i] for row in content)]
                         for i in range(dimension)]

     # generate k clusters randomly, each coordinate inside its column's range
     __ClusterCenters = []
     for _ in range(cluster_count):
         cluster = []
         for min_, max_ in min_max_per_word:
             cluster.append(random.random() * (max_ - min_) + min_)
         __ClusterCenters.append(cluster)

     lables = []  # not used in the visible part of this method
     matchs = [[] for _ in range(cluster_count)]
     lastmatchs = [[] for _ in range(cluster_count)]
     rounds = 100
     while rounds > 0:
         matchs = [[] for _ in range(cluster_count)]
         print('round \t', rounds)
         for i, vec in enumerate(content):
             bestmatch_cluster = None
             min_distance = float('inf')
             for j in range(cluster_count):
                 dis = ClusterBase.pearson_distance(__ClusterCenters[j], vec)
                 if dis < min_distance:
                     min_distance = dis
                     bestmatch_cluster = j
             matchs[bestmatch_cluster].append(i)
         self.printCluster(matchs)
         self.printCluster(lastmatchs)
         if matchs == lastmatchs:
             break
         # Copy the member lists so the next round's comparison is against a
         # snapshot rather than the lists being rebuilt.
         lastmatchs = [list(members) for members in matchs]

         # move the centroids to the average of their members
         for j in range(cluster_count):
             members = matchs[j]
             if not members:
                 # Nothing was assigned here; keep the previous centre rather
                 # than collapsing it to an all-zero vector.
                 continue
             avg = [0.0] * dimension
             for m in members:
                 member_vec = content[m]
                 for i in range(dimension):
                     avg[i] += member_vec[i]
             # Average over the number of members, not the number of columns.
             member_count = len(members)
             __ClusterCenters[j] = [total / member_count for total in avg]
         rounds -= 1