def Search(self):
    """Build the profile set by clustering each category, then pruning and replacing.

    For every key of ``self.categories`` the people of that category are
    clustered with ``KMedoidsCluster``; each returned medoid becomes a
    candidate profile and its cluster is remembered. The candidates are then
    passed through ``self.Delete`` (the original comment says it drops the
    surplus ones) and ``self.Replace``.

    :return: whatever ``self.Replace`` returns for the pruned candidates.
    """
    people_by_category = datapre.People(self.features)
    candidates = set()
    cluster_of_medoid = {}

    # Cluster every category on its own.
    for category in self.categories.keys():
        # One more than the truncated product of self.k and the category's value.
        cluster_count = 1 + int(self.k * self.categories[category])
        members = people_by_category[category]
        clusterer = KMedoidsCluster(
            cluster_count,
            datapre.FeaturesById(members, self.features),
            category)
        clusters, medoids = clusterer.Cluster()
        # Record each medoid as a candidate together with its cluster.
        for medoid in medoids:
            candidates.add(medoid)
            cluster_of_medoid[medoid] = clusters[medoid]

    # Single-argument parenthesised print emits the same text on Python 2.
    print("开始删除")
    # Drop the surplus candidates.
    candidates = self.Delete(candidates)
    print("开始替换")
    return self.Replace(candidates, cluster_of_medoid)
def Replace(self, profiles, cluster):
    """Replace every profile that fails the typicality check.

    Each profile rejected by ``metric.checkOneTypical`` is swapped for the
    member of its own cluster with the highest representativeness score
    (read from the per-category matrix file) that does pass the check. When
    no member of the cluster passes, the cluster's members are removed from
    the category's people list and the category is clustered again with the
    same number of medoids; the whole pass is then repeated on the updated
    profile set.

    :param profiles: set of medoid ids selected so far.
    :param cluster: dict keyed by medoid id whose value is the list of ids in
        that medoid's cluster. It is updated in place.
    :return: the profile set after replacement.
    """
    while True:
        settled = True
        new_profiles = deepcopy(profiles)
        for profile in profiles:
            # A re-clustering earlier in this pass may already have dropped
            # this profile; it must not be checked or replaced again.
            if profile not in new_profiles:
                continue
            if metric.checkOneTypical(self.features, profile, new_profiles,
                                      self.epsilon):
                continue
            new_profiles.remove(profile)
            category = self.features[profile][5]

            # Representativeness matrix and the id -> index dictionary for
            # this profile's category.
            R = np.load("new%sRepresentativeMatrix.npy" % category)
            # NOTE(review): the file is opened in the default (text) mode, as
            # before; Python 3 would need "rb" here - confirm how the pickle
            # was written before changing it.
            with open("new%sRepresentativeDictionary.pickle"
                      % category) as dict_file:
                R_dic = pickle.load(dict_file)

            # Score each cluster member against the failed profile and try
            # them from the highest score down.
            scores = {
                member: R[R_dic[member]][R_dic[profile]]
                for member in cluster[profile]
            }
            ranked = sorted(scores.items(), key=lambda item: item[1],
                            reverse=True)
            for candidate, _score in ranked:
                if metric.checkOneTypical(self.features, candidate,
                                          new_profiles, self.epsilon):
                    new_profiles.add(candidate)
                    # pop-then-assign keeps the entry even when the
                    # candidate is the profile itself.
                    cluster[candidate] = cluster.pop(profile)
                    break
            else:
                # No member of the cluster is typical: drop that cluster's
                # members from the category and cluster the category again.
                settled = False
                excluded = set(cluster[profile])
                same_category = [
                    other for other in profiles
                    if self.features[other][5] == category
                ]
                # discard, not remove: the failed profile (and possibly
                # others) has already left new_profiles.
                # NOTE(review): replacements added earlier in this pass for
                # the same category stay in new_profiles, as in the
                # original - confirm that is intended.
                for stale in same_category:
                    new_profiles.discard(stale)

                # People of this category, minus the rejected cluster.
                # Filter in place so the list object returned by
                # datapre.People is the one that gets modified, as before.
                tuples = datapre.People(self.features)[category]
                tuples[:] = [
                    element for element in tuples if element not in excluded
                ]

                # Re-cluster with as many medoids as the category had.
                method = KMedoidsCluster(
                    len(same_category),
                    datapre.FeaturesById(tuples, self.features),
                    category)
                clusters, medoids = method.Cluster()
                for key in clusters.keys():
                    cluster[key] = clusters[key]
                for element in medoids:
                    new_profiles.add(element)
                # new_profiles is up to date again; keep going through the
                # remaining profiles.
        if settled:
            break
        profiles = new_profiles
    return new_profiles