-
Notifications
You must be signed in to change notification settings - Fork 1
/
blogselector.py
136 lines (102 loc) · 2.81 KB
/
blogselector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#! /usr/bin/env python
# -*- coding: utf-8 -*-
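'''
Export per-user comment-count statistics, select microblogs whose
comment counts fall within a per-user range, and sample the selection.

Author: Xihao Liang
Created: 2016.03.11
'''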
import sys
reload(sys)
sys.setdefaultencoding('utf8')
import cPickle
import numpy as np
import blogger
import commdatica
from commdatica import BlogInfo
from utils import progbar
def pkl_dump(d, fname):
    '''
    Pickle `d` into the file `fname`, overwriting any existing content.

    The file is opened in text mode ('w'), which is consistent with the
    'r' mode used by pkl_load and with cPickle's default ASCII protocol.
    '''
    # `with` guarantees the handle is flushed and closed, even if dump() raises
    with open(fname, 'w') as fobj:
        cPickle.dump(d, fobj)
def pkl_load(fname):
    '''
    Load and return the object pickled in the file `fname`.

    The file is opened in text mode ('r'), which is consistent with the
    'w' mode used by pkl_dump.

    NOTE: unpickling can execute arbitrary code; only load trusted files.
    '''
    # `with` closes the handle as soon as the object has been read
    with open(fname, 'r') as fobj:
        return cPickle.load(fobj)
def export_unmv():
    '''
    Compute per-user comment-count statistics and pickle them to
    'output/unmv.pkl'.

    Reads (user_id, comments_count) of every row in `microblogs` with
    comments_count > 0 and dumps a list of tuples (u, n, m, v):
        u - user_id
        n - number of such rows for the user
        m - mean of the user's comments_count values
        v - sqrt of the variance (np.var) of the user's comments_count values
    '''
    import db

    # user_id -> list of comments_count
    uc = {}

    con = db.connect()
    try:
        cur = con.cursor()
        try:
            cur.execute('select user_id, comments_count from microblogs where comments_count > 0')
            for uid, cc in cur:
                uc.setdefault(uid, []).append(cc)
        finally:
            # release the cursor even if the query or the iteration fails
            cur.close()
    finally:
        con.close()

    # u for user_id, n for n_blogs, m for mean(count), v for sqrt(var(count))
    unmv = [(uid, len(cc), np.mean(cc), np.sqrt(np.var(cc))) for uid, cc in uc.items()]
    pkl_dump(unmv, 'output/unmv.pkl')
def select():
    '''
    Select blogs per user by comment count and write them to
    'output/umtc.txt', one repr(BlogInfo) per line.

    For every (u, n, m, v) in 'output/unmv.pkl' with m <= 50 and v <= 100,
    query at most n blogs of user u whose comments_count lies between
    thr_min (5) and m + v, keep those accepted by
    blogger.is_valid(text, check_emo = False), and retain the 100 with the
    highest comments_count. Stops visiting users once 400000 blogs have
    been collected.
    '''
    import db

    unmv = pkl_load('output/unmv.pkl')

    thr_min = 5
    # t for threshold_max
    unt = [(u, n, m + v) for u, n, m, v in unmv if m <= 50 and v <= 100]
    umtc = []

    con = db.connect()
    try:
        cur = con.cursor()
        try:
            for u, n, thr_max in unt:
                # NOTE(review): thr_max is m + v and presumably a float; '%d'
                # truncates it towards zero when building the query
                cur.execute('select mid, text, comments_count from microblogs where user_id = %s and comments_count >= %d and comments_count <= %d limit %d'%(u, thr_min, thr_max, n))

                tmp_umtc = [(u, m, t, c) for m, t, c in cur if blogger.is_valid(t, check_emo = False)]

                # most commented first; slicing is a no-op for short lists
                tmp_umtc.sort(key = lambda k: -k[3])
                umtc.extend(tmp_umtc[:100])

                if len(umtc) >= 400000:
                    break
        finally:
            # the cursor and connection were previously never released
            cur.close()
    finally:
        con.close()

    with open('output/umtc.txt', 'w') as fobj:
        for u, m, t, c in umtc:
            fobj.write(repr(BlogInfo(u, m, t, c)) + '\n')
def sample():
blogs = commdatica.load('output/umtc.txt')
has_emo = []
no_emo = []
target = 1000
i = 0
pbar = progbar.start(target)
for blog in blogs:
if blogger.is_valid(blog.text):
if not len(has_emo) >= 500:
has_emo.append(blog)
i += 1
elif blogger.is_valid(blog.text, check_emo = False):
if not len(no_emo) >= 500:
no_emo.append(blog)
i += 1
pbar.update(i)
pbar.finish()
print 'writing to umtc_yes_emo.txt ....',
open('output/umtc_yes_emo.txt', 'w').write('\n'.join([repr(blog) for blog in has_emo]))
print 'OK'
print 'writing to umtc_no_emo.txt ....',
open('output/umtc_no_emo.txt', 'w').write('\n'.join([repr(blog) for blog in no_emo]))
print 'OK'
bs = commdatica.load('output/umtc_yes_emo.txt')
print len(bs)
def main():
blogs = commdatica.load('output/umtc.txt')
print '%d in total'%(len(blogs))
pbar = progbar.start(len(blogs))
c = 0
for i, blog in enumerate(blogs):
if blogger.is_valid(blog.text, check_emo = False):
c += 1
pbar.update(i + 1)
pbar.finish()
print '%.2f%%'%(100. * c / len(blogs))
if __name__ == '__main__':
    # The stages below are run one at a time by (un)commenting the calls;
    # each stage reads the file written by the previous one:
    #   export_unmv() writes output/unmv.pkl
    #   select()      reads output/unmv.pkl, writes output/umtc.txt
    #   sample()      reads output/umtc.txt, writes output/umtc_yes_emo.txt
    #                 and output/umtc_no_emo.txt
    #   main()        reads output/umtc.txt and prints the share of valid blogs
    #export_unmv()
    #select()
    #sample()
    main()