/
grab_chn.py
137 lines (128 loc) · 4.41 KB
/
grab_chn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/usr/bin/env python
from tweepy import TweepError
from twitter_api import get_api2
from tweepy import Cursor
from time import sleep
from twitter_user import TwitterUser
import re
import db
from db import save_non_chn, is_in_no_chn
# Twitter API handle, created once at import time by twitter_api.get_api2();
# used by init() and save_user_followers() below.
api = get_api2()
# Each *_search name is the bound ``search`` method of a pre-compiled character
# class: it returns a match object when the text contains at least one
# character in the listed ranges, and None otherwise.
# U+4E00-U+9FA5: CJK unified ideographs.
chn_search = re.compile(ur"[\u4e00-\u9fa5]").search
# U+3040-U+309F (Hiragana) and U+30A0-U+30FF (Katakana).
jpn_search = re.compile(ur"[\u3040-\u309F\u30A0-\u30FF]").search
# U+1100-U+11FF (Hangul Jamo), U+3130-U+318F (Hangul Compatibility Jamo) and
# U+AC00-U+D7AF (Hangul Syllables).
krn_search = re.compile(ur"[\u1100-\u11FF\u3130-\u318F\uAC00-\uD7AF]").search
def init():
    """Prepare the database and store the crawl's starting user.

    Calls ``db.init()`` and then saves the user returned by ``api.me()``
    through ``TwitterUser.save_tweepy_user``.
    """
    db.init()
    TwitterUser.save_tweepy_user(api.me())
def fetch():
current_user = TwitterUser.get_next_unscanned()
if current_user:
print "analyzing ", current_user.scrn_name, "......"
save_user_followers(current_user)
fetch()
else:
print "done!"
def save_user_followers(user):
try:
c = Cursor(api.followers,user.user_id)
except TweepError:
print "tweep breaks!"
print TweepError.message
while(True):
try:
print 'taking a rest before move to next page'
sleep(10)
page = c.pages().next()
print "start a new page of user ", user.scrn_name, \
'page', c.pages().count
except TweepError:
print "tweep breaks!"
print TweepError.message
continue
except StopIteration:
print "Move to next unscanned"
break
for tweepy_user in page:
print "follower -----", tweepy_user.screen_name, "----- found......"
if TwitterUser.get_by_id(tweepy_user.id) or \
is_in_no_chn(tweepy_user.id):
print 'ALREADY in DB!!, skip'
continue
try:
if not tweepy_user.protected or \
(tweepy_user.protected and tweepy_user.following):
if is_chn(tweepy_user):
print "and speaks Chinese! Saving...."
TwitterUser.save_tweepy_user(tweepy_user)
else:
save_non_chn(tweepy_user.id)
print "pitty, s/he is not Chinese Speaker, next..."
continue
except TweepError:
print "tweep breaks!"
print TweepError.message
try:
print "the remaining hit is ", \
api.rate_limit_status()['remaining_hits']
except TweepError:
print "tweep breaks!"
print TweepError.message
page =[]
user.update_scanned()
def is_chn_by_timeline(tweepy_user):
print 'has to check timeline...'
is_chn = False
try:
for status in tweepy_user.timeline():
if text_is_chn(status.text):
print 'Chinese words found in timeline'
is_chn = True
print is_chn
break
except TweepError:
print "tweep breaks!"
print TweepError.message
print 'taking a rest'
sleep(10)
return is_chn
def is_chn(tweepy_user):
print 'Check if speak Chinese..'
is_chn = False
is_jpn = False
is_krn = False
print 'checking most recent status...'
if hasattr(tweepy_user, 'status'):
if text_is_chn(tweepy_user.status.text):
is_chn = True
elif jpn_search(tweepy_user.status.text):
print "has jpn word!"
is_jpn = True
elif krn_search(tweepy_user.status.text):
print "has krn word!"
is_krn = True
print 'trying user description'
if hasattr(tweepy_user, 'description') and tweepy_user.description:
if text_is_chn(tweepy_user.description):
is_chn = True
elif jpn_search(tweepy_user.description):
print "has jpn word!"
is_jpn = True
elif krn_search(tweepy_user.description):
print "has krn word!"
is_krn = True
print 'Checking name...'
if text_is_chn(tweepy_user.name):
print 'Chinese name!'
is_chn = True
if tweepy_user.statuses_count > 10 and not is_chn \
and not is_jpn and not is_krn:
is_chn = is_chn_by_timeline(tweepy_user)
return (is_chn and not is_jpn and not is_krn)
def text_is_chn(text):
    """Return True if *text* matches chn_search but not jpn_search or krn_search.

    Empty or ``None`` text is reported as not Chinese instead of raising from
    the regex search.
    """
    if not text:
        return False
    return bool(
        chn_search(text)
        and not jpn_search(text)
        and not krn_search(text)
    )
# Script entry point: running the file directly starts the crawl loop.
# Only fetch() is called here; init() is not.
if __name__ == "__main__":
    fetch()