/
server.py
478 lines (431 loc) · 18.7 KB
/
server.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
__author__ = 'multiangle'
"""
NAME: server.py
PY_VERSION: python3.4
FUNCTION:
This is the server part of a distributed microblog spider.
The responsibilities of the server can be divided into 3 parts.
1. Proxy manager. Scraping web pages at high speed needs a lot of HTTP proxy IPs.
The server maintains a proxy pool and provides proxies to the clients.
2. Task manager. Clients request tasks from the server. The task list is fetched
from the SQL server and stored in memory.
3. Store returned info. When a client has finished collecting user information from Sina,
it returns this info to the server. If the data is too long, the client
separates it into several parts and sends them individually. The server should combine
these data packages together.
Besides, the server should check whether a received user already exists in the database.
The server has to ensure that no duplicate data exists in the database. Connecting to the
databases is a heavy task for the server.
VERSION:
_0.5_
"""
#======================================================================
#----------------import package--------------------------
# import python package
import threading
import time
import sys,os
from random import Random
from pprint import pprint
# import from outer package
from pymongo import MongoClient
import pymongo
import tornado.web
import tornado.ioloop
import tornado.options
from tornado.options import define,options
# import from this folder
from server_proxy import proxy_pool,proxy_manager
import server_config as config
from server_database import DB_manager,deal_cache_user_info,deal_cache_attends
import File_Interface as FI
from DB_Interface import MySQL_Interface
from server_data import DataServer
#=======================================================================
# Command-line option "port": the port the main Application listens on (default 8000).
define('port',default=8000,help='run on the given port',type=int)
class Application(tornado.web.Application):
    """Tornado application that maps every URL of the server to its handler."""

    def __init__(self):
        # (url pattern, handler class) pairs served by this application.
        routes = [
            (r'/auth', AuthHandler),
            (r'/proxy/', ProxyHandler),
            (r'/task/', TaskHandler),
            (r'/proxy_size', ProxySize),
            (r'/proxy_empty', ProxyEmpty),
            (r'/proxy_return', ProxyReturn),
            (r'/info_return', InfoReturn),
            (r'/history_report', HistoryReport),
            (r'/update_report', UpdateReport),
        ]
        # The only application setting used is debug=True.
        super().__init__(routes, debug=True)
class AuthHandler(tornado.web.RequestHandler):
    """Connectivity probe: every GET is answered with a fixed confirmation text."""

    def get(self):
        # finish(chunk) writes the chunk and then ends the response.
        self.finish('connection valid')
class ProxyHandler(tornado.web.RequestHandler):
    """Serve ``num`` proxies from the module-level ``proxy`` pool.

    The reply is ``url,timedelay;url,timedelay;...`` (no trailing separator),
    or the text ``no valid proxy`` when the pool holds fewer than ``num``
    entries or an entry cannot be formatted.
    """

    def get(self):
        wanted = int(self.get_argument('num'))

        # Guard clause: the pool cannot satisfy the request.
        if wanted > proxy.size():
            self.finish('no valid proxy')
            return

        entries = proxy.get(wanted)
        try:
            # Each entry is indexed as [0] = url, [1] = time delay.
            formatted = [
                '{url},{timedelay}'.format(url=entry[0], timedelay=entry[1])
                for entry in entries
            ]
        except Exception as err:
            self.finish('no valid proxy')
            print('ERROR:server->ProxyHandler:')
            print(err)
            return

        self.finish(';'.join(formatted))
class TaskHandler(tornado.web.RequestHandler):
    """Assign one crawl task per GET request, selected by the ``uuid`` argument.

    Plain-text replies:
      * ``no task`` -- unknown uuid, proxy pool too small, or nothing to do
      * ``<uid>,connect`` -- task 1
      * ``<container_id>;<blog_num>,history`` -- tasks 2 and 3
      * ``<cid>-<update_ts>-<latest_ts>;...;<mission_id>,update`` -- tasks 4, 5, 100

    NOTE(review): in every branch the reply is sent before the selected rows
    are flagged as in progress, so two concurrent requests could receive the
    same rows -- confirm whether that matters for the deployment.
    """

    _TIME_FORMAT = '%Y-%m-%d %H:%M:%S'
    # uuid sent by the client -> task id.
    _TASK_BY_UUID = {
        '1': 1,      # get social web
        '2': 2,      # get history weibo, blog_num below config.HISTORY_TASK_VALVE
        '3': 3,      # get history weibo, blog_num at or above config.HISTORY_TASK_VALVE
        '4': 4,      # update weibo
        '5': 5,
        '100': 100,
    }
    # Batch size of an update mission per task id; any other update task uses 10.
    _UPDATE_BATCH_SIZE = {4: 100, 5: 200}

    def get(self):
        global proxy
        uuid = str(self.get_argument('uuid'))
        task_id = self.task_assign(uuid)
        # No work is handed out while the average proxy size is below 30,
        # nor to a uuid that maps to no task.
        if proxy.get_ave_proxy_size() < 30 or task_id == -1:
            self._reply_no_task()
            return
        if task_id == 1:
            self._assign_connect_task()
        elif task_id == 2:
            self._assign_history_task('<')
        elif task_id == 3:
            self._assign_history_task('>=')
        else:  # 4, 5 or 100 -- this part is in test
            self._assign_update_task(task_id)

    def task_assign(self, uuid):
        """Return the task id registered for ``uuid``, or -1 if there is none."""
        return self._TASK_BY_UUID.get(uuid, -1)

    def _reply_no_task(self):
        """Send the ``no task`` reply and end the response."""
        self.write('no task')
        self.finish()

    def _format_time(self, epoch_seconds):
        """Format an epoch timestamp as a local ``YYYY-mm-dd HH:MM:SS`` string."""
        return time.strftime(self._TIME_FORMAT, time.localtime(epoch_seconds))

    def _assign_connect_task(self):
        """Task 1: hand out the unfetched ``ready_to_get`` user with most fans."""
        dbi = MySQL_Interface()
        query = 'select * from ready_to_get where is_fetching is null order by fans_num desc limit 1;'
        res = dbi.select_asQuery(query)
        if len(res) == 0:
            self._reply_no_task()
            return
        col_info = dbi.get_col_name('ready_to_get')
        uid = res[0][col_info.index('uid')]
        self.write('{uid},connect'.format(uid=uid))
        self.finish()
        # Stamp the row so it is not selected again.
        query = "update ready_to_get set is_fetching='{t_time}' where uid={uid} ;" \
            .format(t_time=self._format_time(time.time()), uid=uid)
        dbi.update_asQuery(query)

    def _assign_history_task(self, comparison):
        """Tasks 2 and 3: hand out one user whose blog history is not fetched yet.

        ``comparison`` is the SQL operator applied between ``blog_num`` and
        ``config.HISTORY_TASK_VALVE`` ('<' for task 2, '>=' for task 3).
        """
        dbi = MySQL_Interface()
        query = ('select container_id,blog_num from user_info_table '
                 'where (isGettingBlog is null and update_time is null and blog_num{op}{valve} and blog_num>100)'
                 'order by fans_num desc limit 1 ;') \
            .format(op=comparison, valve=config.HISTORY_TASK_VALVE)
        res = dbi.select_asQuery(query)
        # Both history tasks answer "no task" on an empty result instead of
        # failing on res[0].
        if len(res) == 0:
            self._reply_no_task()
            return
        [container_id, blog_num] = res[0]
        self.write('{c_id};{blog},history'
                   .format(c_id=container_id, blog=blog_num))
        self.finish()
        query = "update user_info_table set isGettingBlog='{t_time}' where container_id={cid} ;" \
            .format(t_time=self._format_time(time.time()), cid=container_id)
        dbi.update_asQuery(query)

    def _assign_update_task(self, task_id):
        """Tasks 4, 5 and 100: hand out a batch of users whose blogs need updating."""
        dbi = MySQL_Interface()
        # Only users whose update_time is more than 24 hours old are selected.
        target_time_stick = self._format_time(time.time() - 60 * 60 * 24 * 1)
        batch_size = self._UPDATE_BATCH_SIZE.get(task_id, 10)
        query = ('select container_id,update_time,latest_blog from user_info_table '
                 "where update_time<'{target_time}' and isGettingBlog is null and blog_num>10 order by fans_num desc limit {batch}") \
            .format(target_time=target_time_stick, batch=batch_size)
        print(query)
        res = dbi.select_asQuery(query)
        # Convert the rows to [container_id, update timestamp, latest blog timestamp].
        # NOTE(review): assumes update_time and latest_blog are datetime values;
        # a NULL latest_blog would raise here -- confirm against the schema.
        users = [[line[0],
                  int(time.mktime(line[1].timetuple())),
                  int(time.mktime(line[2].timetuple()))] for line in res]
        if len(users) == 0:  # if no task, then return "no task"
            print('*** warning: no avaliable update mission ***')
            self._reply_no_task()
            return
        # Command sent to the client:
        #   ContainerId-UpdateTime-LatestBlog;...;MissionId,update
        inn = ';'.join('{0}-{1}-{2}'.format(*line) for line in users)
        mission_id = random_str(15)
        commend = '{list};{task_id},update'.format(list=inn, task_id=mission_id)
        self.write(commend)
        self.finish()
        # Store the user list, the mission id and the mission start time in mongodb.
        u_list = [dict(container_id=x[0], update_time=x[1], latest_blog=x[2])
                  for x in users]
        data_toMongo = dict(
            mission_id=mission_id,
            user_list=u_list,
            mission_start=int(time.time())
        )
        client = MongoClient('localhost', 27017)
        try:
            client['microblog_spider'].update_mission.insert_one(data_toMongo)
        finally:
            # Release the connection opened for this request.
            client.close()
        # Flag the selected users as in progress (isGettingBlog) in mysql.
        user_list_str = ','.join("'{cid}'".format(cid=line[0]) for line in users)
        query = "update user_info_table set isGettingBlog='{time}' where container_id in ({ulist})" \
            .format(time=self._format_time(time.time()), ulist=user_list_str)
        dbi.update_asQuery(query)
class ProxySize(tornado.web.RequestHandler):
    """Report the current size of the module-level ``proxy`` pool as text."""

    def get(self):
        pool_size = proxy.size()
        self.finish(str(pool_size))
class ProxyEmpty(tornado.web.RequestHandler):
    """Empty the module-level ``proxy`` pool on GET."""

    def get(self):
        proxy.empty()
        # Success is only reported when fewer than 2 proxies remain;
        # otherwise the response body stays empty.
        emptied = proxy.size() < 2
        if emptied:
            self.write('empty proxy success')
        self.finish()
class ProxyReturn(tornado.web.RequestHandler):
    """Accept proxies handed back by a client and add them to the pool.

    The ``data`` argument is split on ';' into entries and each entry is
    split on ',' into its fields.
    """

    def post(self):
        global proxy
        payload = self.get_argument('data')
        print('proxy data:', payload)
        returned = [entry.split(',') for entry in payload.split(';')]
        if len(returned) > 0:
            proxy.add(returned)
            print('Success to receive returned proxy')
            for fields in returned:
                print(fields)
        self.finish('return success')
class InfoReturn(tornado.web.RequestHandler):
    """Receive a user's basic info and attends list and cache them in MySQL.

    The POST arguments ``user_basic_info`` and ``user_attends`` are evaluated
    into Python objects, the client is answered immediately, and then three
    independent best-effort steps store the data in ``cache_attends``,
    ``cache_user_info`` and ``cache_atten_web``. A step that fails pickles its
    data to a file through ``FI.save_pickle`` instead.
    """

    def post(self):
        try:
            user_basic_info = self.get_argument('user_basic_info')
            attends = self.get_argument('user_attends')
            # SECURITY NOTE(review): eval() executes arbitrary expressions
            # taken from the request body. Left unchanged because the payload
            # format is not visible here -- consider ast.literal_eval or JSON.
            user_basic_info = eval(user_basic_info)
            attends = eval(attends)
            self.write('success to return user info')
            self.finish()
        except Exception:
            self.write('fail to return user info')
            self.finish()
            return

        try:
            dbi = MySQL_Interface()
        except Exception:
            # Keep going: with dbi set to None every step below fails inside
            # its own try block and falls back to pickling its data.
            dbi = None
            print('unable to connect to MySql DB')

        attends_col_info = None
        try:
            if len(attends) > 0:  # store attends info
                table_name = 'cache_attends'
                attends_col_info = dbi.get_col_name(table_name)
                keys = attends[0].keys()
                # Reorder every attend into the table's column order,
                # using '' for columns the client did not send.
                attends = [[line[i] if i in keys else '' for i in attends_col_info]
                           for line in attends]
                fans_col_pos = attends_col_info.index('fans_num')
                # Only attends with more than 1000 fans are inserted.
                insert_attends = [line for line in attends
                                  if line[fans_col_pos] > 1000]
                dbi.insert_asList(table_name, insert_attends, unique=True)
                print('Success : attends of {uid} is stored in {tname}'
                      .format(uid=user_basic_info['uid'], tname=table_name))
        except Exception as e:
            print(e)
            path = "temp" + os.sep + "{uid}_attends.pkl".format(uid=user_basic_info['uid'])
            print('unable to store attends of {uid}, it will be stored '
                  .format(uid=user_basic_info['uid']))
            FI.save_pickle(attends, path)

        try:
            # NOTE(review): the value is unused, but the lookup is kept because
            # a missing 'attends_num' key currently sends this step to the
            # pickle fallback.
            atten_num_real = user_basic_info['attends_num']
            # 'accuracy' holds the number of attends actually received.
            user_basic_info['accuracy'] = len(attends)
            col_info = dbi.get_col_name('cache_user_info')  # store user basic info
            keys = user_basic_info.keys()
            data = [user_basic_info[i] if i in keys else '' for i in col_info]
            dbi.insert_asList('cache_user_info', [data], unique=True)
            print('Success : basic info of {uid} is stored in cache_user_info'
                  .format(uid=user_basic_info['uid']))
        except Exception as e:
            print(e)
            path = 'temp' + os.sep + '{uid}_basic_info.pkl'.format(uid=user_basic_info['uid'])
            print('unable to store basic info of {uid} , it will be stored'
                  .format(uid=user_basic_info['uid']))
            FI.save_pickle(user_basic_info, path)

        # Defined before the try block so the fallback always has its own
        # data to pickle, never an undefined name or the previous step's row.
        atten_web = []
        try:
            if len(attends) > 0:  # store atten connection web
                from_uid = user_basic_info['uid']
                from_fans_num = user_basic_info['fans_num']
                from_blog_num = user_basic_info['blog_num']
                # Look the column positions up once, not once per row.
                uid_pos = attends_col_info.index('uid')
                fans_pos = attends_col_info.index('fans_num')
                blog_pos = attends_col_info.index('blog_num')
                atten_web = [[from_uid, from_fans_num, from_blog_num,
                              str(x[uid_pos]), str(x[fans_pos]), str(x[blog_pos])]
                             for x in attends]
                dbi.insert_asList('cache_atten_web', atten_web)
                print('Success : conn web of {uid} is stored in cache_atten_web'
                      .format(uid=user_basic_info['uid']))
        except Exception as e:
            print(e)
            # NOTE(review): unlike the two fallbacks above, this path has no
            # "temp" directory prefix -- kept as is, confirm whether intended.
            path = '{uid}_atten_web.pkl'.format(uid=user_basic_info['uid'])
            print('unable to store atten web of {uid} , it will be stored'
                  .format(uid=user_basic_info['uid']))
            FI.save_pickle(atten_web, path)
class HistoryReport(tornado.web.RequestHandler):
    """Record a client's history report as one row of ``cache_history``."""

    def post(self):
        # Read the report fields from the request and answer the client.
        try:
            latest_time = self.get_argument('latest_time')
            latest_timestamp = self.get_argument('latest_timestamp')
            container_id = self.get_argument('container_id')
            self.finish('success')
            print('Success: to get data from web')
        except Exception as err:
            self.finish('fail to return user history')
            print('Error:server-HistoryReturn:'
                  'Unable to get value from http package,Reason:')
            print(err)
            return

        dbi = MySQL_Interface()
        record = {
            'latest_time': latest_time,
            'latest_timestamp': latest_timestamp,
            'container_id': container_id,
            'checkin_timestamp': int(time.time()),
        }
        # Lay the record out in the table's column order; columns without a
        # matching field are filled with None.
        columns = dbi.get_col_name('cache_history')
        row = [record.get(column) for column in columns]
        dbi.insert_asList('cache_history', [row])
class UpdateReport(tornado.web.RequestHandler):
    """Mark an update mission as reported in the mongodb ``update_mission`` collection."""

    def post(self):
        # Read the mission id from the request and answer the client.
        try:
            mission_id = self.get_argument('mission_id')
            self.write('success')
            self.finish()
            print('Success: to get update report from web')
        except Exception as e:
            self.write('fail to return user update')
            self.finish()
            print('Error:server-UpdateReturn:'
                  'Unable to get value from http package,Reason:')
            print(e)
            return

        # Stamp the mission document with the time the report arrived.
        client = MongoClient('localhost', 27017)
        try:
            collec = client['microblog_spider'].update_mission
            collec.update_one({'mission_id': mission_id},
                              {'$set': {'isReported': int(time.time())}})
        finally:
            # Release the connection opened for this request.
            client.close()
def random_str(randomlength=8):
    """Return a random string of ``randomlength`` ASCII letters and digits.

    A ``randomlength`` of 0 or less yields an empty string. The generator is
    ``random.Random``, which is not suitable for security-sensitive tokens.
    """
    chars = 'AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz0123456789'
    rng = Random()
    return ''.join(rng.choice(chars) for _ in range(randomlength))
def auto_index():
    """Create the ``user_id`` and ``id`` indexes on mongodb user collections.

    Every collection of the ``microblog_spider`` database whose name contains
    'user' is checked; one with fewer than 3 indexes gets a descending index
    on ``user_id`` and one on ``id``.
    """
    client = MongoClient('localhost', 27017)
    try:
        db = client['microblog_spider']
        collec_list = [name for name in db.collection_names() if 'user' in name]
        print('** start to check the index station of collections in mongodb **')
        for name in collec_list:
            collec = db.get_collection(name)
            indexs = list(collec.list_indexes())
            # Fewer than 3 indexes is treated as "not indexed yet".
            if len(indexs) < 3:
                print('{n} do not have indexes yet, ready to craete'.format(n=name))
                collec.create_index([('user_id', pymongo.DESCENDING)])
                collec.create_index([('id', pymongo.DESCENDING)])
        print('** all indexes is created **')
    finally:
        # Release the connection even if a mongodb call fails.
        client.close()
def start_selfcheck():
    """Run the startup self-check: probe MySQL and mongodb, then check indexes."""
    print('\n\n********* start to selfcheck *********\n')

    # MySQL: report success only when the interface exposes a truthy cursor.
    mysql_interface = MySQL_Interface()
    if mysql_interface.cur:
        print('mysql is connected')

    # mongodb: build a client, report, and close it again.
    mongo_client = MongoClient('localhost', 27017)
    print('mongodb is connected')
    mongo_client.close()

    auto_index()
    print('\n********* selfcheck success *********\n')
if __name__ == '__main__':
    # Startup self-check.
    start_selfcheck()

    # Proxy thread: the pool is the module-level ``proxy`` used by the handlers.
    proxy_lock = threading.Lock()
    proxy = proxy_pool()
    pm = proxy_manager(proxy, proxy_lock)
    pm.start()

    # Database thread.
    db_thread = DB_manager()
    db_thread.start()

    # Tornado: main application on the port given on the command line.
    tornado.options.parse_command_line()
    Application().listen(options.port)

    # nginx uses port 8001 and forwards to the data servers on 8002, 8003, 8004.
    for data_port in (8002, 8003, 8004):
        DataServer().listen(data_port)

    tornado.ioloop.IOLoop.instance().start()