gen_dynamic_feature.py
#coding=utf8
"""
Build (1) a comment tree and (2) reply-to relations from the comment data,
and use them to compute various dynamic features.

Dynamic features:
1. comment tree:
    max depth
2. author reply-to graph:
    mean degree, clustering coefficient

Computation strategy:
For convenient lookup later on, the dynamic features are recomputed after
every single comment is posted, which may make the computation very slow.
"""
import codecs
import os.path

from igraph import *

from utils import load_id_list

#import ipdb  # only needed for the commented-out debugging hooks below

# maximum number of comments processed per post
MAX_COMMENT = 2000
def main(section_id, base_path):
    """ Generate the dynamic features.
    """
    post_list_path = base_path + section_id + '-post-list.txt'
    topic_list = load_id_list(post_list_path)
    topic_list = list(set(topic_list))  # drop duplicate post ids
    target_base_path = 'data-dynamic/'
    #topic_list = ['1377621']
    # topic_id is not renamed to post_id here because feature_dict
    # uses 'topic_id' as a key
    valid_topic_list = []  # post ids whose features were actually extracted
    for topic_id in topic_list:
        path = base_path + section_id + '/' + topic_id + '-info.txt'
        if not os.path.exists(path):
            continue
        tpath = target_base_path + section_id + '/' + topic_id + '.txt'
        # skip if the target output already exists
        if os.path.exists(tpath):
            continue
        f = codecs.open(path, 'r', 'utf8')
        print 'Extracting features from post file: ', path
        # accumulate the output string; it may grow large (up to several MB)
        result_string = ""
        # the first line holds the post's own info
        line = f.readline().strip()
        seg_list = line.split('[=]')
        if len(seg_list) != 8:
            print 'Error in the first line of topic file: ', path
            f.close()
            continue
        lz = seg_list[2]  # OP id for the author reply-to graph; topic_id is used in the comment tree
        lz_user_name = seg_list[3]  # user name of the OP (thread starter)
        pubdate = seg_list[5]
        num_comment = int(seg_list[6])  # NOTE: the file may hold fewer comments than this, due to earlier crawling errors
        # first line: topic info
        feature_dict = dict()
        feature_dict['topic_id'] = topic_id
        feature_dict['lz'] = lz
        feature_dict['pubdate'] = pubdate
        feature_dict['num_comment'] = num_comment
        #tf.write(str(feature_dict) + '\n')
        result_string += (str(feature_dict) + '\n')
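        # Note on the output format: every line written to tpath is the repr of
        # a Python dict -- the first line describes the post itself, and each
        # following line holds the feature snapshot taken right after one more
        # comment is added.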
        # NOTE: the crawl may have been saved incompletely, so the number of
        # comments actually present can be smaller than the first line claims.
        # This is checked below; posts that fail the check are discarded.
        # build two graphs
        comment_tree = Graph(directed=True)
        author_reply = Graph(directed=True)
        # Build a two-mode network with two vertex classes: comments (including
        # the original post) and authors. A comment and an author are linked if
        # 1) the author wrote the comment (including the OP writing the post), or
        # 2) the author commented on the comment (where "comment" includes the post).
        # Comment vertices have type=False, author vertices have type=True.
        comment_author_bigraph = Graph(directed=False)
        comment_author_bigraph.add_vertex(topic_id, type=False)
        comment_author_bigraph.add_vertex(lz, type=True)
        comment_author_bigraph.add_edge(topic_id, lz)
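        # (The boolean 'type' attribute follows igraph's bipartite-graph
        # convention, so bipartite routines such as projections could be
        # applied to this graph later if needed.)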
        comment_dict = dict()  # map comment id to graph index
        comment_dict[topic_id] = comment_tree.vcount()
        comment_tree.add_vertex(topic_id, date=pubdate, author=lz, depth=0)
        author_dict = dict()  # map author id to graph index
        author_dict[lz] = author_reply.vcount()
        author_reply.add_vertex(lz)
        flag = True  # marks whether this file is well-formed
        max_depth = 0
        # measures how heated the comment-tree discussion is: the root counts 0,
        # each node at depth 1 contributes 1, and so on
        weighted_depth_sum = 0
        current_comment_count = 0  # number of comments seen so far
        for line in f:
            # put every feature into feature_dict; order does not matter
            feature_dict = dict()
            line = line.strip()
            seg_list = line.split('[=]')
            if len(seg_list) != 8:
                print 'Error in the comment line of topic file: ', path
                flag = False
                break
            cid = seg_list[0]      # comment id
            pid = seg_list[3]      # author id
            uname = seg_list[4]    # author nickname
            pubdate = seg_list[5]  # publication date
            replyto = seg_list[6]  # quoted (replied-to) comment
            feature_dict['cid'] = cid
            feature_dict['pid'] = pid
            feature_dict['pubdate'] = pubdate
            feature_dict['replyto'] = replyto  # cid of the comment replied to
            comment_dict[cid] = comment_tree.vcount()
            comment_tree.add_vertex(cid, date=pubdate, author=pid)
            current_comment_count += 1
            feature_dict['current_comment_count'] = current_comment_count
            comment_author_bigraph.add_vertex(cid, type=False)
            # if this author has commented before, it is already in author_dict
            if pid not in author_dict:
                author_dict[pid] = author_reply.vcount()
                author_reply.add_vertex(pid)
                comment_author_bigraph.add_vertex(pid, type=True)
            # the author-of relationship
            comment_author_bigraph.add_edge(pid, cid)
            replyto_pid = ''
            commenton_cid = ''
            parent_index = 0  # index of this node's parent in the comment tree
            if replyto == '':
                commenton_cid = topic_id
                parent_index = comment_dict[commenton_cid]
                replyto_pid = lz
            else:
                commenton_cid = replyto
                parent_index = comment_dict[commenton_cid]
                replyto_pid = comment_tree.vs[parent_index]['author']
            comment_tree.add_edge(cid, commenton_cid)
            comment_author_bigraph.add_edge(pid, commenton_cid)
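            # Edges in the comment tree point from each comment to its parent
            # (the comment or post it replies to), so the root is the post.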
            # set the depth attribute of node cid
            index = comment_dict[cid]
            if comment_tree.vs[parent_index]['depth'] is None:
                #TODO: unknown error; example topic: 3211910
                flag = False
                break
            else:
                current_depth = comment_tree.vs[index]['depth'] = comment_tree.vs[parent_index]['depth'] + 1
                weighted_depth_sum += current_depth
                avg_weighted_depth_sum = weighted_depth_sum * 1.0 / current_comment_count
                if current_depth > max_depth:
                    max_depth = current_depth
            # ignore self-replies
            if pid != replyto_pid:
                # if the edge pid -> replyto_pid already exists, do not add it again
                v1 = author_dict[pid]
                v2 = author_dict[replyto_pid]
                if author_reply.get_eid(v1, v2, directed=True, error=False) == -1:
                    author_reply.add_edge(v1, v2)
            # number of current participating commenters
            num_authors = author_reply.vcount()
            feature_dict['num_authors'] = num_authors
            # write statistics into the target file
            mean_degree = sum(author_reply.degree()) * 1.0 / author_reply.vcount()
            avg_local_transitivity = author_reply.transitivity_avglocal_undirected(mode='nan')  # average local transitivity
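            # Note: with mode='nan', vertices with fewer than two neighbours are
            # excluded from the average, and the result itself is NaN when no
            # vertex qualifies -- downstream readers must tolerate NaN here.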
            clustering_coefficient = author_reply.transitivity_undirected(mode='zero')
            assortativity = author_reply.assortativity_degree(directed=False)
            num_componnet = len(author_reply.components(mode=WEAK))
            reply_density = author_reply.density(loops=True)
            # neither cohesion nor adhesion is suitable here, since almost
            # every graph has a pendant vertex of degree 1
            #cohesion = author_reply.cohesion(neighbors='ignore')
            #adhesion = author_reply.adhesion()
            # Ref: http://igraph.sourceforge.net/doc/python/igraph.Graph-class.html#cohesive_blocks
            # cohesive_blocks only works on undirected graphs
            #author_reply_cohesive_block = author_reply.cohesive_blocks()
            #author_reply_max_cohesions = author_reply_cohesive_block.max_cohesions()
            feature_dict['mean_degree'] = mean_degree
            feature_dict['avg_local_transitivity'] = avg_local_transitivity
            feature_dict['clustering_coefficient'] = clustering_coefficient
            feature_dict['assortativity'] = assortativity
            feature_dict['num_componnet'] = num_componnet
            feature_dict['reply_density'] = reply_density
            # author-reply graph group cohesiveness
            #feature_dict['author_reply_max_cohesions'] = author_reply_max_cohesions
            # dynamic factors from Bao et al., WWW'13
            tree_density = comment_tree.density(loops=False)
            average_path_length = comment_tree.average_path_length(directed=False)  # do not consider edge directions
            #diffusion_depth = comment_tree.diameter(directed=True)  # diffusion depth of a tree, i.e. the depth of the tree
            diffusion_depth = max_depth
            # comment-tree related factors
            feature_dict['tree_density'] = tree_density
            feature_dict['diffusion_depth'] = diffusion_depth
            feature_dict['avg_weighted_depth_sum'] = avg_weighted_depth_sum
            feature_dict['avg_path_length'] = average_path_length  # Wiener index: the average distance between all pairs of nodes in a cascade
            # properties of the comment-author two-mode network
            ca_mean_degree = sum(comment_author_bigraph.degree()) * 1.0 / comment_author_bigraph.vcount()
            ca_avg_local_transitivity = comment_author_bigraph.transitivity_avglocal_undirected(mode='nan')  # average local transitivity
            ca_clustering_coefficient = comment_author_bigraph.transitivity_undirected(mode='zero')
            ca_assortativity = comment_author_bigraph.assortativity_degree(directed=False)
            ca_num_componnet = len(comment_author_bigraph.components(mode=WEAK))
            ca_reply_density = comment_author_bigraph.density(loops=True)
            #comment_author_cohesive_block = comment_author_bigraph.cohesive_blocks()
            #ca_max_cohesions = comment_author_cohesive_block.max_cohesions()
            feature_dict['ca_mean_degree'] = ca_mean_degree
            feature_dict['ca_avg_local_transitivity'] = ca_avg_local_transitivity
            feature_dict['ca_clustering_coefficient'] = ca_clustering_coefficient
            feature_dict['ca_assortativity'] = ca_assortativity
            feature_dict['ca_num_componnet'] = ca_num_componnet
            feature_dict['ca_reply_density'] = ca_reply_density
            #feature_dict['ca_max_cohesions'] = ca_max_cohesions
            # write the feature dict to the output
            #tf.write(str(feature_dict) + '\n')
            result_string += (str(feature_dict) + '\n')
            # skip the rest once a thread reaches MAX_COMMENT comments
            if current_comment_count >= MAX_COMMENT:
                break
        # if the maximum was not reached and the number of comments read falls
        # short of the total claimed in the first line, discard this post
        if current_comment_count < MAX_COMMENT and num_comment != current_comment_count:
            flag = False
        # print dynamic features
        #plot(comment_tree)
        #plot(author_reply)
        #ipdb.set_trace()
        #print author_reply.transitivity_undirected(mode='zero')
        #print author_reply.transitivity_avglocal_undirected(mode='zero')  # average local transitivity
        #print author_reply.assortativity_degree(False)
        f.close()
        if flag:
            valid_topic_list.append(topic_id)
            # write the result to file
            print 'Saving: ', tpath
            tf = codecs.open(tpath, 'w', 'utf8')
            tf.write(result_string)
            tf.close()
    # save the ids of all valid posts
    path = target_base_path + section_id + '-post-list-feature.txt'
    f = codecs.open(path, 'w', 'utf8')
    for topic_id in valid_topic_list:
        f.write(topic_id + '\n')
    f.close()
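# Sketch only (an assumption about downstream use, not part of the original
# pipeline): each line of a feature file is the repr of a Python dict, so it
# can be rebuilt with eval(); a binding for nan is supplied because NaN-valued
# features (e.g. avg_local_transitivity) print as "nan".
def _load_features(tpath):
    f = codecs.open(tpath, 'r', 'utf8')
    dicts = [eval(line, {'nan': float('nan')}) for line in f]
    f.close()
    return dicts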
if __name__ == '__main__':
    import sys
    section_id = sys.argv[1]
    #section_id = 'free'
    base_path = '/home/kongqingchao/dataset/tianya-forum/'
    main(section_id, base_path)
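
# Usage (assumed invocation, with 'free' as an example section id):
#   python gen_dynamic_feature.py free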