GreedyAlgorithm.py (forked from DuncanZhou/TwitterUsersProfiling)
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''@author:duncan'''
import os
import time
import pickle
from copy import deepcopy

import numpy as np

import DataPrepare as datapre
import Metric as metric
class Greedy:
    def __init__(self, k, features, categories, epsilon):
        '''
        :param k: total size of the representative subset to find
        :param features: global feature set
        :param categories: global domain distribution
        :param epsilon: similarity threshold used to judge domain typicality
        '''
        self.k = k
        self.features = features
        # global domain distribution
        self.categories = categories
        # threshold for the typicality check
        self.epsilon = epsilon
        # best representative subset found so far
        self.best_profiles = set()
        # largest representativeness found so far
        self.max_repre = 0
        # replacements already computed by Replace()
        self.replace = {}
        # load the representativeness matrices and id dictionaries once;
        # skip domains whose matrix file is missing, as the search methods do below
        self.Repre = {}
        self.Repre_id = {}
        for category in categories:
            if not os.path.exists("new%sRepresentativeMatrix.npy" % category):
                continue
            self.Repre[category] = np.load("new%sRepresentativeMatrix.npy" % category)
            open_file = open("new%sRepresentativeDictionary.pickle" % category, "rb")
            self.Repre_id[category] = pickle.load(open_file)
            open_file.close()
    # representativeness within one specific domain
    def AttributeRepresentativeByDomain(self, profiles, domain):
        # representativeness matrix for this domain (preloaded in __init__)
        R = self.Repre[domain]
        # id dictionary for this domain (preloaded in __init__)
        R_dic = self.Repre_id[domain]
        profile_domain = [id for id in profiles if self.features[id][5] == domain]
        # sum the column-wise maxima over the rows selected from this domain
        repre = sum(np.max(np.asarray([R[R_dic[id]] for id in profile_domain]), axis=0))
        return repre
    # attribute representativeness over all domains
    def AttributeRepresentative(self, profiles):
        # compute the representativeness separately within each domain
        repre = 0
        for category in self.categories:
            # profiles belonging to this domain
            profile_domain = [id for id in profiles if self.features[id][5] == category]
            if len(profile_domain) != 0:
                repre += self.AttributeRepresentativeByDomain(profile_domain, category)
        return repre
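    # A minimal, self-contained sketch (toy numbers, not the real pickled matrices) of
    # the score computed above: for a domain matrix R with one row per selected
    # profile, representativeness is the sum over columns of the column-wise maxima.
    @staticmethod
    def _demo_attribute_score():
        R_rows = np.asarray([[0.9, 0.1, 0.0],
                             [0.2, 0.8, 0.3]])
        # the column-wise maxima are [0.9, 0.8, 0.3], so the score is 2.0
        return sum(np.max(R_rows, axis=0))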
    # greedy search, ignoring the domain-typicality constraint for now
    def SearchWithoutConstraints(self):
        # at each step add the profile that maximises the objective
        profiles = set()
        people = datapre.People(self.features)
        print "dataset loaded"
        for category in self.categories.keys():
            # p_number is the number of profiles required for this domain
            p_number = int(self.k * self.categories[category]) + 1
            # tuples holds all people of this domain
            tuples = people[category]
            if not os.path.exists("new%sRepresentativeMatrix.npy" % category):
                continue
            # representativeness matrix for this domain (preloaded in __init__)
            R = self.Repre[category]
            rowN = len(tuples)
            results_vector = np.asarray([0 for i in xrange(rowN)])
            # greedily pick rows from the representativeness matrix
            count = 0
            has = {}
            while count < p_number:
                results = {i: sum(np.max(np.vstack((R[i], results_vector)), axis=0)) for i in xrange(rowN) if i not in has}
                to_add = (max(results.items(), key=lambda item: item[1]))[0]
                has[to_add] = tuples[to_add]
                profiles.add(tuples[to_add])
                # update the running vector of column-wise maxima
                results_vector = np.max(np.vstack((R[to_add], results_vector)), axis=0)
                count += 1
        print "the number of profiles is %d" % len(profiles)
        return list(profiles)
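    # A minimal sketch (hypothetical 3x3 matrix) of one greedy step above: the gain of
    # adding row i is the score of the element-wise maximum of R[i] and the running
    # results_vector; the best row is taken and the vector is updated.
    @staticmethod
    def _demo_greedy_step():
        R = np.asarray([[0.9, 0.1, 0.0],
                        [0.2, 0.8, 0.3],
                        [0.5, 0.5, 0.5]])
        results_vector = np.asarray([0.0, 0.0, 0.0])
        chosen = []
        for _ in range(2):
            gains = {i: sum(np.max(np.vstack((R[i], results_vector)), axis=0))
                     for i in range(len(R)) if i not in chosen}
            best = max(gains.items(), key=lambda item: item[1])[0]
            chosen.append(best)
            results_vector = np.max(np.vstack((R[best], results_vector)), axis=0)
        # picks row 2 (gain 1.5) first, then row 0, leaving the vector [0.9, 0.5, 0.5]
        return chosen, results_vector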
    # remove the surplus profiles
    def Delete(self, profiles):
        print "start removing surplus profiles"
        # count the profiles per domain first, to know which domains can lose members
        categories = self.DomainDistribution(profiles)
        # for each candidate, measure how much the objective drops if it is excluded,
        # and exclude the candidates whose loss stays smallest
        to_delete = len(profiles) - self.k
        has_category = set()
        count = 0
        results = {}
        for category in categories.keys():
            if categories[category] == 1 or categories[category] == int(self.categories[category] * self.k):
                # this domain cannot lose any profile
                continue
            profile_domain = set([id for id in profiles if self.features[id][5] == category])
            if os.path.exists("new%sRepresentativeMatrix.npy" % category):
                # representativeness matrix for this domain (preloaded in __init__)
                R = self.Repre[category]
                # id dictionary for this domain (preloaded in __init__)
                R_dic = self.Repre_id[category]
                # matrix rows of the representative profiles in this domain
                rows = set([R_dic[id] for id in profile_domain])
                original = sum(np.max(np.asarray([R[i] for i in rows]), axis=0))
                subresults = {profile: (original - sum(np.max(np.asarray([R[i] for i in (rows - {R_dic[profile]})]), axis=0))) for profile in profile_domain}
                to_delete_id = (min(subresults.items(), key=lambda item: item[1]))[0]
                results[to_delete_id] = subresults[to_delete_id]
        results = sorted(results.items(), key=lambda item: item[1])
        for result in results:
            profiles.remove(result[0])
            print "the number of profiles is %d" % len(profiles)
            has_category.add(self.features[result[0]][5])
            count += 1
            if count == to_delete:
                break
        return profiles
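    # A minimal sketch (toy rows) of the leave-one-out loss used above: the cost of
    # dropping a row is the full score minus the score of the remaining rows, and the
    # cheapest rows are dropped first.
    @staticmethod
    def _demo_delete_loss():
        R_rows = np.asarray([[0.9, 0.1, 0.0],
                             [0.2, 0.8, 0.3],
                             [0.8, 0.7, 0.1]])
        full = sum(np.max(R_rows, axis=0))
        losses = {i: full - sum(np.max(np.delete(R_rows, i, axis=0), axis=0))
                  for i in range(len(R_rows))}
        # dropping row 2 costs nothing here, so it would be removed first
        return min(losses.items(), key=lambda item: item[1])[0]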
    # replace an element that is not domain-typical
    def Replace(self, target, profiles):
        # people maps each domain to its set of users
        people = datapre.People(self.features)
        category = self.features[target][5]
        index = profiles.index(target)
        old_element = profiles[index]
        profile_domain = set([id for id in profiles if self.features[id][5] == category])
        if os.path.exists("new%sRepresentativeMatrix.npy" % category):
            # representativeness matrix for this domain (preloaded in __init__)
            R = self.Repre[category]
            # id dictionary for this domain (preloaded in __init__)
            R_dic = self.Repre_id[category]
            # matrix rows of the representative profiles in this domain
            rows = set([R_dic[id] for id in profile_domain])
            results = {element: sum(np.max(np.asarray([R[i] for i in rows | {R_dic[element]}]), axis=0)) for element in people[category] if element not in set(profiles)}
            results = sorted(results.items(), key=lambda dic: dic[1], reverse=True)
            for result in results:
                to_replace = result[0]
                if metric.checkOneTypical(self.features, to_replace, profiles, self.epsilon):
                    # remember the replacement; profiles is left unchanged and the caller applies it
                    self.replace[target] = to_replace
                    profiles[index] = old_element
                    return to_replace
        return None
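    # A minimal sketch (toy rows, hypothetical candidate) of the swap-in score used
    # above: a candidate outside the current set is scored by the column-wise maxima
    # of the current rows together with the candidate's own row.
    @staticmethod
    def _demo_swap_in_score():
        current_rows = np.asarray([[0.9, 0.1, 0.0],
                                   [0.2, 0.8, 0.3]])
        candidate_row = np.asarray([0.4, 0.4, 0.9])
        merged = np.vstack((current_rows, candidate_row))
        # the column maxima become [0.9, 0.8, 0.9], lifting the score from 2.0 to 2.6
        return sum(np.max(merged, axis=0))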
    # remove from the edge set every edge whose source is the given vertex
    @staticmethod
    def DeleteEdges(edges, vertex):
        newedges = deepcopy(edges)
        for edge in edges:
            if edge[0] == vertex:
                newedges.remove(edge)
        return newedges

    # check whether no edge in the edge set still touches the vertex
    @staticmethod
    def CheckEdgeCase1(edges, vertex):
        for edge in edges:
            if edge[1] == vertex or edge[0] == vertex:
                return False
        return True

    # check whether some other element is still connected to the vertex
    @staticmethod
    def CheckEdgeCase2(edges, vertex):
        for edge in edges:
            if edge[1] == vertex:
                # still connected
                return True
        return False

    # count how many selected profiles fall in each domain
    def DomainDistribution(self, profiles):
        categories = datapre.DomainDistribution(profiles, self.features)
        return categories
    # recursively replace the elements that are not domain-typical
    def SearchRecursion(self, index, current_profiles, noneTypical, edges):
        # termination: every element is already domain-typical
        if metric.checkAllTypical(self.features, current_profiles, self.epsilon):
            temp = self.AttributeRepresentative(set(current_profiles))
            if self.max_repre == 0 or temp > self.max_repre:
                self.max_repre = temp
                self.best_profiles = set(current_profiles)
                print self.max_repre
            return
        # all non-typical elements have been handled, stop searching this branch
        if index == len(noneTypical):
            return
        # prune: if the current (still non-typical) solution scores no better than
        # the best one found so far, further replacement cannot help
        if self.AttributeRepresentative(set(current_profiles)) <= self.max_repre:
            return
        # three cases: keep the element, replace it, or branch on both
        i = noneTypical[index]
        # case 1: no remaining edge touches i, so keep it
        if self.CheckEdgeCase1(edges, i) == True:
            self.SearchRecursion(index + 1, current_profiles, noneTypical, edges)
        elif self.CheckEdgeCase2(edges, i) == True:
            # case 2: i must be replaced
            new_profiles = deepcopy(current_profiles)
            replacement = self.Replace(current_profiles[i], new_profiles)
            if replacement == None:
                print "no feasible replacement"
                return
            new_profiles[i] = replacement
            # drop the edges incident to i
            newedges = self.DeleteEdges(edges, i)
            # keep searching after the replacement
            self.SearchRecursion(index + 1, new_profiles, noneTypical, newedges)
        else:
            # case 3: branch on both replacing and keeping
            # branch 1: replace
            new_profiles = deepcopy(current_profiles)
            replacement = self.Replace(current_profiles[i], new_profiles)
            if replacement == None:
                print "no feasible replacement"
                return
            new_profiles[i] = replacement
            # drop the edges incident to i
            newedges = self.DeleteEdges(edges, i)
            self.SearchRecursion(index + 1, new_profiles, noneTypical, newedges)
            # branch 2: keep the element and continue searching
            self.SearchRecursion(index + 1, current_profiles, noneTypical, edges)
        return
    # greedy search ignoring typicality first, then replace to find the best value
    def SearchWithReplace(self):
        # step 1: greedy optimum without the domain-typicality constraint
        current_profiles = self.SearchWithoutConstraints()
        print self.AttributeRepresentative(set(current_profiles))
        # greedily remove the surplus profiles
        self.best_profiles = self.Delete(set(current_profiles))
        # step 2: find the elements of the greedy solution that are not typical enough
        best_profiles = list(self.best_profiles)
        print "attribute representativeness of the plain greedy solution:"
        print self.AttributeRepresentative(self.best_profiles)
        # indices of the profiles that are not typical enough
        NoneTypical = []
        edges = []
        i = 0
        while i < len(best_profiles):
            j = i + 1
            while j < len(best_profiles):
                if metric.Similarity(self.features, best_profiles[i], best_profiles[j]) > self.epsilon:
                    edges.append((i, j))
                j += 1
            i += 1
        for profile in best_profiles:
            if not metric.checkOneTypical(self.features, profile, self.best_profiles, self.epsilon):
                NoneTypical.append(best_profiles.index(profile))
        print NoneTypical
        print "start replacing the profiles that are not domain-typical enough"
        self.SearchRecursion(0, best_profiles, list(NoneTypical), edges)
        return self.best_profiles
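
# A minimal sketch (hypothetical similarity values) of the conflict graph built inside
# SearchWithReplace: an edge (i, j) is recorded whenever two selected profiles are more
# similar than epsilon, and the indices touched by edges become replacement candidates.
def _demo_conflict_edges():
    pairwise = {(0, 1): 0.05, (0, 2): 0.20, (1, 2): 0.01}
    epsilon = 0.1555
    edges = [pair for pair, sim in pairwise.items() if sim > epsilon]
    # only (0, 2) exceeds epsilon here, so profiles 0 and 2 are candidates for replacement
    return edges
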
def test():
    to_run = [40, 60, 80, 100]
    for i in to_run:
        start_time = time.time()
        method = Greedy(i, datapre.Features(), datapre.CategoriesDistribution(), 0.1555)
        profiles = method.SearchWithReplace()
        end_time = time.time()
        # write the results to a file
        with open("%dGB_results" % i, "wb") as f:
            f.write("cost %f s" % (end_time - start_time))
            f.write("\n")
            f.write("Attribute Representativeness is:")
            f.write(str(method.AttributeRepresentative(profiles)))
            f.write("\n")
            for profile in profiles:
                f.write(profile + "\t")

# test()