/
plot_growth.py
121 lines (97 loc) · 3.54 KB
/
plot_growth.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#coding=utf8
"""
从已经抓取的topic中找到那些较为popular的帖子,将其评论数(看作时间序列)增长趋势画出来
"""
import codecs
from datetime import datetime, timedelta
import math
from matplotlib.dates import date2num
import matplotlib.pyplot as plt
import pandas as pd
from utils import load_id_list
threshold = 300 # 考虑的最少评论数
def plot_popularity(date_list):
""" 根据评论的时间列表画出增长示意图
"""
x = date2num(date_list)
count = len(date_list)
y = range(count)
for i in range(count):
y[i] = math.log(y[i]+1)
plt.figure()
plt.plot(x, y)
plt.show()
def plot_granularity_popularity(date_list, freq = 'D'):
""" 以不同的时间粒度为单位画出流行度曲线
date_list[0] is the topic publishing date
Note: 这里假设date_list中的时间是按照时间前后排序后的
freq 的取值有:{'S', 'M', 'H', 'D', 'M', 'Y', 'W'}
"""
start = date_list[0]
end = date_list[-1]
delta = timedelta(days=30*6) # 构造一个时间差为半年的timedelta对象
if end > start + delta:
end = start + delta # 只考虑半年内的评论
# create a time series with specified freq
ts = pd.Series(0, pd.date_range(start, end, freq = freq))
index = 0
for date in date_list:
if date > ts.index[index]:
index = index + 1
if index >= len(ts):
break
ts[index] = ts[index] + 1
print 'Number of %s(s): %d' % (freq, len(ts))
# caculate the cumulative sum
cum_count = ts.cumsum()
for i in range(len(ts)):
ts[i] = math.log(ts[i] + 1)
cum_count[i] = math.log(cum_count[i])
plt.figure()
#plt.plot(ts, color='blue', hold=True, ls=':')
#plt.plot(cum_count, color='red', ls='--')
ts.plot(color='blue', ls='-')
cum_count.plot(color='red', ls='--')
plt.show()
def main(group_id):
# 读取topic id list
path = 'data/' + group_id + '/' + group_id + '-TopicList.txt'
topic_id_list = load_id_list(path)
#topic_id_list = ['34029324']
for topic_id in topic_id_list:
path = 'data/' + group_id + '/' + topic_id + '-content.txt'
topic_pubdate = None
try:
with codecs.open(path, 'r', 'utf-8') as f:
content = f.read()
seg_list = content.split('[=]')
num_comment = int(seg_list[5])
# topic publishing date
topic_puddate = datetime.strptime(seg_list[4], '%Y-%m-%d %H:%M:%S')
if num_comment < threshold:
continue
except IOError:
continue
path = 'data/' + group_id + '/' + topic_id + '-comment.txt'
f = codecs.open(path, 'r', 'utf-8')
date_list = [topic_puddate]
row = ''
for line in f:
if line != '[*ROWEND*]\n':
row = row + line
else:
seg_list = row.split('[=]')
date = seg_list[4]
date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
date_list.append(date)
row = ''
sorted(date_list, reverse = True)
count = len(date_list)
print 'topic id:', topic_id
print 'Number of comments: %d\n' % (count-1)
#plot_popularity(date_list)
plot_granularity_popularity(date_list, freq = 'D')
if __name__ == '__main__':
import sys
group_id = sys.argv[1]
main(group_id)