def Replace(self, target, profiles):
        # people maps each domain (category) to its set of users
        people = datapre.People(self.features)
        category = self.features[target][5]

        index = profiles.index(target)
        old_element = profiles[index]

        profile_domain = set(
            [id for id in profiles if self.features[id][5] == category])

        if os.path.exists("new%sRepresentativeMatrix.npy" % category):
            # use the pre-loaded representativeness matrix for this domain
            # (earlier versions re-loaded it here from the .npy / .pickle files)
            R = self.Repre[category]
            # id-to-row-index dictionary for the matrix
            R_dic = self.Repre_id[category]
            # matrix rows of the representative profiles already chosen in this domain
            rows = set([R_dic[id] for id in profile_domain])
            # score each candidate outside the current profile set by the total coverage
            # (column-wise max over the selected rows plus the candidate's row, summed)
            results = {
                element: sum(
                    np.max(np.asarray([R[i] for i in rows | {R_dic[element]}]),
                           axis=0))
                for element in people[category] if element not in set(profiles)
            }
            results = sorted(results.items(),
                             key=lambda dic: dic[1],
                             reverse=True)
            for result in results:
                to_replace = result[0]
                # accept the highest-scoring candidate that still keeps the profile set domain-typical
                if metric.checkOneTypical(self.features, to_replace, profiles,
                                          self.epsilon):
                    self.replace[target] = to_replace
                    profiles[index] = old_element
                    return to_replace
        return None
    def SearchWithoutConstraints(self):
        # greedy search without constraints: each addition is chosen to best improve the coverage objective
        profiles = set()
        people = datapre.People(self.features)
        print "数据集装载完毕"
        for category in self.categories.keys():
            # p_number为该领域需要的人数
            p_number = (int)(self.k * self.categories[category]) + 1
            # tuples为该领域所有的人
            tuples = people[category]

            if os.path.exists("new%sRepresentativeMatrix.npy" % category):
                # use the pre-loaded representativeness matrix for this domain
                # (earlier versions re-loaded it here from the .npy / .pickle files)
                R = self.Repre[category]
            rowN = len(tuples)
            results_vector = np.asarray([0 for i in xrange(rowN)])
            # with the representativeness matrix in hand, greedily pick p_number profiles
            count = 0
            has = {}
            while count < p_number:
                # marginal coverage of each not-yet-selected row i:
                # element-wise max of row R[i] with the current coverage vector, summed
                results = {
                    i: sum(np.max(np.vstack((R[i], results_vector)), axis=0))
                    for i in xrange(rowN) if i not in has
                }
                to_add = (max(results.items(), key=lambda key: key[1]))[0]
                has[to_add] = tuples[to_add]
                profiles.add(tuples[to_add])
                # update the coverage vector with the newly added row
                results_vector = np.max(np.vstack((R[to_add], results_vector)),
                                        axis=0)
                count += 1
                print "the number of profiles is %d" % len(profiles)
        return list(profiles)
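
A note on the selection rule above: each candidate row of the representativeness matrix R is scored by the column-wise maximum against the coverage vector of the rows already chosen, summed over the columns, and the highest-scoring row is added. The standalone sketch below reproduces that greedy loop on a toy matrix; greedy_select is a hypothetical helper for illustration only and does not use the datapre/metric modules or the .npy files assumed by the class.

import numpy as np

def greedy_select(R, p_number):
    # R[i][j]: how well candidate i represents target j; pick p_number rows
    # whose element-wise maximum covers the columns best (greedy coverage).
    coverage = np.zeros(R.shape[1])
    chosen = []
    while len(chosen) < p_number:
        gains = {i: np.maximum(R[i], coverage).sum()
                 for i in range(R.shape[0]) if i not in chosen}
        best = max(gains.items(), key=lambda item: item[1])[0]
        chosen.append(best)
        coverage = np.maximum(R[best], coverage)
    return chosen

# toy example: 4 candidates, 5 targets
R = np.array([[0.9, 0.1, 0.0, 0.2, 0.1],
              [0.1, 0.8, 0.7, 0.0, 0.0],
              [0.0, 0.2, 0.1, 0.9, 0.8],
              [0.5, 0.5, 0.5, 0.5, 0.5]])
print(greedy_select(R, 2))   # -> [3, 2] for this toy matrix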
Example #3
    def Split(self):
        # returns a training set and a test set
        train_set = {}
        test_set = {}
        # for every domain, put 3/10 of its users into train_set and the remaining 7/10 into test_set
        people = datapre.People(self.features)
        categories = datapre.GetUserCategory()
        for category in categories:
            domain_people = people[category]
            train_set_number = int(len(domain_people) * 0.3) + 1
            count = 0
            for id in domain_people:
                if count < train_set_number:
                    train_set[id] = self.features[id]
                    count += 1
                else:
                    break
        # put all remaining users into the test set
        left = set(self.features.keys()) - set(train_set.keys())
        for id in left:
            test_set[id] = self.features[id]
        return train_set, test_set
    def Search(self):
        profiles = set()
        medoids_clusters = {}
        people = datapre.People(self.features)
        # cluster each domain separately
        for category in self.categories.keys():
            # number of medoids (profiles) allotted to this domain
            number = int(self.k * self.categories[category]) + 1
            tuples = people[category]
            method = KMedoidsCluster(
                number, datapre.FeaturesById(tuples, self.features), category)
            clusters, medoids = method.Cluster()
            # add the medoids to the profile set first
            for medoid in medoids:
                profiles.add(medoid)
                medoids_clusters[medoid] = clusters[medoid]
        print "start deleting"
        # delete the surplus profiles
        profiles = self.Delete(profiles)
        print "start replacing"
        profiles = self.Replace(profiles, medoids_clusters)
        return profiles
    def Replace(self, profiles, cluster):
        '''
        :param profiles: the finished set of medoids (profile ids)
        :param cluster: dict keyed by profile id, whose value is that profile's cluster members as a list
        :return: the profiles after replacement
        '''

        # replacement strategy: swap each failing medoid for the element nearest to it (best represented by it) that satisfies the requirement

        while True:
            iteration = True
            new_profiles = deepcopy(profiles)
            for profile in profiles:
                if not metric.checkOneTypical(self.features, profile,
                                              new_profiles, self.epsilon):
                    new_profiles.remove(profile)
                    # replace profile: look in cluster[profile] for the element that profile
                    # represents best and that satisfies the constraint
                    R = np.load("new%sRepresentativeMatrix.npy" %
                                self.features[profile][5])
                    # load the id-to-row-index dictionary
                    open_file = open("new%sRepresentativeDictionary.pickle" %
                                     self.features[profile][5])
                    R_dic = pickle.load(open_file)
                    open_file.close()

                    # within the medoid's own cluster, rank the members by the medoid's representativeness toward them
                    results = {
                        id: R[R_dic[id]][R_dic[profile]]
                        for id in cluster[profile]
                    }
                    results = sorted(results.items(),
                                     key=lambda key: key[1],
                                     reverse=True)
                    flag = False
                    # take the best-represented member that still keeps the profile set domain-typical
                    for result in results:
                        key = result[0]
                        if metric.checkOneTypical(self.features, key,
                                                  new_profiles, self.epsilon):
                            new_profiles.add(key)
                            cluster[key] = cluster[profile]
                            cluster.pop(profile)
                            flag = True
                            break
                    # no domain-typical replacement found: remove this cluster's members from the
                    # domain's original set and re-cluster that whole domain
                    if not flag:
                        iteration = False
                        category = self.features[profile][5]
                        # drop every already-chosen profile of this domain from new_profiles
                        for p in profiles:
                            if self.features[p][5] == category:
                                new_profiles.discard(p)
                        # the domain's people, minus the members of the failing cluster
                        tuples = datapre.People(self.features)[category]
                        bad_members = set(cluster[profile])
                        tuples = [element for element in tuples
                                  if element not in bad_members]
                        # how many medoids this domain currently holds
                        number = 0
                        for p in profiles:
                            if self.features[p][5] == category:
                                number += 1
                        # re-cluster the remaining tuples into that many clusters
                        method = KMedoidsCluster(
                            number,
                            datapre.FeaturesById(tuples,
                                                 self.features), category)
                        clusters, medoids = method.Cluster()
                        for key in clusters.keys():
                            cluster[key] = clusters[key]
                        for element in medoids:
                            new_profiles.add(element)
                        # new_profiles is now up to date; keep checking the remaining medoids

            if iteration:
                break
            else:
                profiles = new_profiles

        return new_profiles
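
The core step of the Replace method above is easier to see in isolation: drop a medoid that fails the typicality test, then promote the member of its own cluster that the medoid represents best and that passes the same test. The sketch below shows just that inner step; replace_medoid, is_typical, and represents are hypothetical stand-ins (for metric.checkOneTypical and the R[R_dic[id]][R_dic[profile]] lookup), it omits the re-clustering fallback, and it is not the class's actual implementation.

def replace_medoid(medoid, members, chosen, is_typical, represents):
    # members: the medoid's cluster; chosen: the current profile set (medoid already removed).
    # Rank members by how strongly the old medoid represents them, best first.
    ranked = sorted(members, key=lambda m: represents(medoid, m), reverse=True)
    for candidate in ranked:
        # promote the first candidate that is still domain-typical w.r.t. the chosen set
        if is_typical(candidate, chosen):
            chosen.add(candidate)
            return candidate
    return None   # caller falls back to re-clustering the domain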