#encoding=utf-8
"""
by Roi @IS10.PKU
It is a script used to crawl sina Weibo contents,
which are used as raw data for the purpose of my graduation thesis only.
IE is called in the script through python win32com module, so as to get
XMLHttpRequest data in Weibo pages. Therefore it could be further applied
to Twitter, Facebook, Renren etc., with the intent of reasch restrictly.
To run the crawler, class IdCollector is to crawl target user ids,
while the other is the content crawler scanning target id list.
Jan 2013
"""
from win32com.client import DispatchEx
import re
import time
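# Note on the imports above: DispatchEx comes from pywin32's win32com and is
# used below to launch a separate Internet Explorer COM instance; re and time
# are from the standard library.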
'''
This implementation reuses the default session of the IE browser
(cookies included).
The browser can also run in the background; set the visibility flag
in the __init__ method.
For the alternative approach, refer to the selenium-based version.
'''
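# A minimal usage sketch of the two runners described in the module docstring.
# It assumes the data directory F:\IS10\grad\data\ and the id files it reads
# (collected.ids, collector.ids, doing.ids, done.content.ids) already exist,
# since the loaders below open them for reading.
#
#   runIdCraw('http://weibo.com/10pku')   # phase 1: collect target user ids
#   runCtnCraw()                          # phase 2: crawl content for collected ids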
def bkup_cookie_zhihu():
    '''
    selenium-based helper: load a pickled Zhihu cookie jar into a Firefox
    session; not used by the Weibo crawler itself
    '''
    import hashlib
    import pickle
    from bs4 import BeautifulSoup
    from selenium import webdriver
    # oper = Opener()  # Opener is not defined in this script
    u_set0 = set()
    u_set1 = set()
    b = webdriver.Firefox()
    b.get('http://zhihu.com')
    cookies = pickle.load(open('/Users/r/Downloads/zhihu.pickle', 'r'))
    for cookie in cookies:
        b.add_cookie(cookie)
class IdCollector:
    def __init__(self, url='about:blank'):
        self.ie = DispatchEx('InternetExplorer.Application')
        self.ie.Visible = 0
        time.sleep(0.5)
    def openFullPage(self, url):
        '''
        open a page and wait until the html is complete;
        this one is for pages without XHR.
        page title and page url are printed for debugging
        '''
        self.ie.Navigate(url)
        while self.ie.Busy:
            time.sleep(0.1)
        print self.ie.LocationName, self.ie.LocationURL
    def initSets(self):
        '''
        init the 3 user-id sets held by the instance:
        doing_set holds the current seeds,
        collector_set the current results, collected_set the crawled seeds
        '''
        self.collected_set, self.collector_set, self.doing_set = \
            self.loadIdFilesToSets('collected.ids', 'collector.ids', 'doing.ids')
        self.set_size = len(self.collected_set)+len(self.collector_set)+len(self.doing_set)
        print 'there are', str(self.set_size), 'entries in the sets in total'
    def backupMem(self):
        '''backup the 3 sets from memory to file once they have grown by more than 1000 entries'''
        if len(self.collected_set)+len(self.collector_set)+len(self.doing_set)-self.set_size > 1000:
            self.writeToIdFiles('collected.ids', 'collector.ids', 'doing.ids')
            self.set_size = len(self.collected_set)+len(self.collector_set)+len(self.doing_set)
    def getIdsOnePage(self, html):
        '''return a set of all ids in given html'''
        uids = re.compile(r'uid=(\d+)').findall(html)
        page_id_set = set()
        for uid in uids:
            page_id_set.add(int(uid.encode('ascii')))
        return page_id_set
    def getIdFollowUrl(self, uid, page):
        '''return url of follower page given uid and page'''
        return ''.join(['http://weibo.com/', str(uid), '/follow?page=', str(page)])
    def combineSet(self, current_set):
        new_id_set = current_set - self.collected_set - self.doing_set
        self.collector_set = self.collector_set.union(new_id_set)
    def currentHtml(self):
        return self.ie.Document.body.outerHTML
    def getIdPage(self, uid, page=1):
        '''
        get ids from the follow page of a given uid and merge them into the
        collector set; the default is breadth-first, reading only the 1st page
        '''
        url = self.getIdFollowUrl(uid, page)
        self.ie.Navigate(url)
        while self.ie.Busy:  # wait for the page before reading its html
            time.sleep(0.1)
        html = self.currentHtml()
        this_set = self.getIdsOnePage(html)
        self.combineSet(this_set)
        print len(self.collector_set), len(self.collected_set), len(self.doing_set)
        '''
        To crawl ids depth-first instead, use the following code:
        if html.find('page='+str(page+1)) > 1:
            page = page+1
            self.getIdPage(uid, page)
        '''
    def loadIdFilesToSets(self, fcollected, fcollector, fdoing, d='F:\\IS10\\grad\\data\\'):
        '''init the 3 sets from file'''
        [fcollected, fcollector, fdoing] = [''.join([d, str(f)]) for f in (fcollected, fcollector, fdoing)]
        f1, f2, f3 = open(fcollected, 'r'), open(fcollector, 'r'), open(fdoing, 'r')
        fh = (f1, f2, f3)
        scollected, scollector, sdoing = set(), set(), set()
        sh = (scollected, scollector, sdoing)
        print 'Time before load to set:', str(time.ctime())
        for i in range(3):
            for ele in fh[i].read().split(' '):
                sh[i].add(ele)
            fh[i].close()
        print 'Time after loading to set:', str(time.ctime())
        return (scollected, scollector, sdoing)
    def writeToIdFiles(self, fcollected, fcollector, fdoing, d='F:\\IS10\\grad\\data\\'):
        '''save id sets to file'''
        [fcollected, fcollector, fdoing] = [d+f for f in (fcollected, fcollector, fdoing)]
        f1, f2, f3 = open(fcollected, 'w'), open(fcollector, 'w'), open(fdoing, 'w')
        fh = (f1, f2, f3)
        sh = (self.collected_set, self.collector_set, self.doing_set)
        for i in range(3):
            print 'T0 saving set to file:', str(time.ctime())
            fh[i].write(' '.join([str(ele) for ele in sh[i]]))
            print 'T1 saving set to file:', str(time.ctime())
            fh[i].close()
class Wcontent:
    def __init__(self):
        self.ie = DispatchEx('InternetExplorer.Application')
        self.ie.Visible = 1
        time.sleep(0.5)
    def openFullPage(self, url):
        self.ie.Navigate(url)
        while self.ie.Busy:
            time.sleep(0.5)
        print self.ie.LocationName, self.ie.LocationURL
    def openXHRFull(self, url):
        '''
        get the complete html of an XHR content page:
        keep scrolling down until the lazy-loaded content (the pager) appears
        '''
        self.ie.Navigate(url)
        while self.ie.Busy:
            time.sleep(0.1)
        print self.ie.LocationName, self.ie.LocationURL
        t0 = time.time()
        while self.currentHtml().find('page=') == -1:
            self.ie.Document.parentWindow.execScript('window.scrollTo(0,document.body.scrollHeight-500)')
            time.sleep(0.1)
            if time.time() - t0 > 5:
                self.ie.Refresh()
                t0 = time.time()  # restart the stall timer after a refresh
    def initSet(self):
        '''init target/done id set from file'''
        self.preUrl = 'http://weibo.com/u/'
        self.data_dir = 'F:\\IS10\\grad\\data\\'
        self.uid_set = set()
        self.done_content_set = set()
        self.loadIdFilesToSet(['collected.ids'], 'done.content.ids')
    def backupDoneSet(self):
        '''save done id set to file'''
        print 'saving done set to file'
        self.bf = open(self.data_dir+'done.content.ids', 'a')
        self.bf.write(' '+' '.join(str(ele) for ele in self.done_content_set))
        self.bf.close()
    def currentHtml(self):
        return self.ie.Document.body.outerHTML
    def getPageText(self, url):
        '''
        return plain Weibo text from given url
        using div=WB_text to filter
        '''
        self.openXHRFull(url)
        html = self.currentHtml()
        txt = re.compile(r'WB_text(.*?)div').findall(html)
        if txt:
            for i, line in enumerate(txt):
                txt[i] = ''.join(re.compile(r'>(.*?)<').findall(line)).split('//')[0]
            return '\n'.join(txt)
        else:
            return ''
    def getIdContentUrl(self, uid, page=1):
        return ''.join([self.preUrl, str(uid), '?page=', str(page)])
    def pageCnts(self, html):
        '''return the Weibo owner's nickname and the follow/fans/weibo counts'''
        fname = re.compile(r'fname=(.*?)&').search(html)
        if fname != None:
            wb_cnt = re.compile(r'tagweibo[^>]*>[^>]*>(\d+)<').search(html).group(1)
            fans_cnt = re.compile(r'tagfans[^>]*>[^>]*>(\d+)<').search(html).group(1)
            follow_cnt = re.compile(r'tagfollow[^>]*>[^>]*>(\d+)<').search(html).group(1)
            return fname.group(1), follow_cnt, fans_cnt, wb_cnt
        '''
        For the counts on e.weibo & media.weibo pages:
        fname = re.compile(r'title_big..[\n](.*?)[\n]').search(html)
        if fname != None:
            cseg = re.compile(r'strong>(\d+)<').findall(html)
            self.preUrl = 'http://e.weibo.com/'
            return '#e#'+fname.group(1), cseg[0], cseg[2], cseg[4]
        '''
        # fall back to a placeholder so the caller can retry or skip this uid
        return '##', 0, 0, 0
    def getIdContent(self, uid):
        '''
        Main entry point for crawling the content of a uid;
        how many pages are fetched is controlled by the wb_cnt of the uid
        '''
        url = self.getIdContentUrl(uid)
        self.openFullPage(url)
        if self.ie.LocationURL.find('media.weibo') == 7 or self.ie.LocationURL.find('e.weibo') == 7:
            return (self.ie.LocationName, 0, 0, 0), ''
        html = self.currentHtml()
        # id_cnts = (fname, follow_cnt, fans_cnt, wb_cnt)
        id_cnts = self.pageCnts(html)
        print id_cnts
        wb_cnt = id_cnts[3]
        if int(wb_cnt) < 45:
            return (id_cnts, '')
        if int(wb_cnt) > 900:
            #self.getIdOriContent(uid,id_cnts)
            wb_cnt = 2000
        content = ''
        for page in range(1, int(wb_cnt)/45+2):
            url = self.getIdContentUrl(uid, page)
            #page_content = self.browser.get_text('class=WB_feed')
            page_content = self.getPageText(url)
            content = content + page_content
        return (id_cnts, content)
    def getIdOriContentUrl(self, uid, page=1):
        return ''.join([self.preUrl, str(uid), '?ori=1', '&page=', str(page)])
    def getIdOriContent(self, uid, id_cnts):
        '''
        When the user reposts too much (too noisy),
        crawl only her original content pages
        '''
        url = self.getIdOriContentUrl(uid)
        self.openXHRFull(url)
        time.sleep(1)
        html = self.currentHtml()
        page = self.getMaxPage(html)
        print page
        content = self.getPageText(url)
        for page in range(2, int(page)+1):
            url = self.getIdOriContentUrl(uid, page)
            page_content = self.getPageText(url)
            content = content + page_content
        return (id_cnts, content)
    def getMaxPage(self, html):
        '''return the largest page number referenced in the html, or 1'''
        pages = re.compile(r'page=(\d+)').findall(html)
        if pages:
            page = max(int(p) for p in pages)
        else:
            page = 1
        return page
    def loadIdFilesToSet(self, uid_files, done_file):
        '''load the target uid set and the already-done id set from file'''
        uid_file_paths = [self.data_dir+f for f in uid_files]
        done_file_hdl = open(self.data_dir+done_file, 'r')
        print 'Time before load to set:', str(time.ctime())
        for file_path in uid_file_paths:
            file_handle = open(file_path, 'r')
            t_set = set()
            for ele in file_handle.read().split(' '):
                t_set.add(ele)
            self.uid_set = self.uid_set.union(t_set)
            file_handle.close()
        for ele in done_file_hdl.read().split(' '):
            self.done_content_set.add(ele)
        done_file_hdl.close()
        print 'Time after loading to set:', str(time.ctime())
        print len(self.uid_set)
    def writeIdContent(self, uid):
        '''crawl one uid and write its counts and text to a file named after the uid'''
        fh = open(self.data_dir+str(uid), 'w')
        cnt, text = self.getIdContent(uid)
        fh.write(cnt[0].encode('utf-8')+' '+' '.join(str(c) for c in cnt[1:])+'\n')
        fh.write(text.encode('utf-8'))
        fh.close()
        self.done_content_set.add(uid)  # ''.join([str(uid),'||',cnt[0].encode('utf-8')])
def runIdCraw(url='http://weibo.com/10pku'):
    '''
    Runner for the id crawler;
    kept as a plain function, since the win32com browser object
    cannot be wrapped as a class instance here
    '''
    rpm = IdCollector()
    rpm.initSets()
    if len(rpm.doing_set) < 2 and len(rpm.collector_set) < 2:
        # bootstrap the seed set from the start page
        rpm.openFullPage(url)
        html = rpm.currentHtml()
        rpm.doing_set = rpm.getIdsOnePage(html)
        print 'sdoing', str(len(rpm.doing_set))
    while len(rpm.collected_set)+len(rpm.collector_set)+len(rpm.doing_set) < 10000:
        if len(rpm.doing_set) <= 1 and len(rpm.collector_set) > 1:
            rpm.collector_set, rpm.doing_set = rpm.doing_set, rpm.collector_set
        for i in rpm.doing_set:
            rpm.collected_set.add(i)
            rpm.getIdPage(i)  # weak diversity if crawling all pages
            rpm.backupMem()
        rpm.doing_set = set()
    rpm.backupMem()
def runCtnCraw():
    '''
    Runner for the Weibo content crawler;
    any exception (including page opening errors) makes it skip the uid
    '''
    dbg = Wcontent()
    id_size = 0
    t = time.time()
    dbg.initSet()
    working_set = dbg.uid_set - dbg.done_content_set
    for uid in working_set:
        try:
            dbg.writeIdContent(uid)
        except:
            continue
        if id_size % 10 == 0:
            dbg.backupDoneSet()
            print time.time()-t
        id_size = id_size + 1
    dbg.backupDoneSet()
if __name__ == '__main__':
    #runIdCraw()
    runCtnCraw()
    #dbg = Wcontent()