# sender.py
import tornado.gen
import tornado.ioloop
from tornado.httpclient import AsyncHTTPClient, HTTPRequest
from tornado.options import options
from queue import Queue
from cohash import Hash
import logging
import datetime
from util import Singleton
import fetcher
class Sender(metaclass=Singleton):
    '''
    URL dispatcher.

    Queues URLs and hands each one to the crawler server chosen by a
    consistent-hash ring, which doubles as the load-balancing mechanism.
    '''

    def __init__(self, ioloop=None, server_list=None, replicas=20):
        '''
        Build the send queue and the consistent-hash ring.

        Args:
            ioloop: IOLoop used to schedule work; defaults to
                ``tornado.ioloop.IOLoop.instance()``.
            server_list: Non-empty list of servers to place on the ring.
                Each entry is later concatenated into
                ``'http://' + server + '/crawler/' + url``.
            replicas: Replica count forwarded to ``Hash``.

        Raises:
            ValueError: If ``server_list`` is missing or empty.
        '''
        super().__init__()
        self.ioloop = ioloop or tornado.ioloop.IOLoop.instance()
        self.send_url_queue = Queue()
        # Number of URLs handed to do_work() that have not finished yet.
        self.sending = 0
        self.server_list = server_list or []
        if not self.server_list:
            raise ValueError("server_list is None.")
        self.replicas = replicas
        self.ring = Hash(self.server_list, replicas=self.replicas)

    def add_url(self, url):
        '''Put ``url`` on the queue; ``run`` picks it up on a later tick.'''
        logging.debug("send url to queue %s", url)
        self.send_url_queue.put(url)

    @tornado.gen.coroutine
    def send(self, server, url):
        '''
        Pass ``url`` to the server it hashed to so that server crawls it.

        When ``server`` equals ``options.local`` the URL is placed straight
        on the local ``Fetcher`` queue; otherwise it is forwarded with an
        HTTP request to ``http://<server>/crawler/<url>``.

        Errors raised by the HTTP fetch propagate to the caller.
        '''
        if server == options.local:
            fetcher.Fetcher().fetch_queue.put(url)
            return

        # NOTE: a leftover ``1/0`` debugging tripwire used to sit here; it
        # made every remote dispatch fail with ZeroDivisionError.
        http_client = AsyncHTTPClient()
        target_url = 'http://' + server + '/crawler/' + url
        logging.info("target_url: %s", target_url)
        request = HTTPRequest(
            url=target_url.encode('utf-8'),
            connect_timeout=options.timeout,
            request_timeout=options.timeout,
        )
        yield http_client.fetch(request)

    @tornado.gen.coroutine
    def do_work(self, url):
        '''
        Resolve the target server for ``url`` and send it there.

        Failures are logged (HTTP errors are additionally appended to
        ``httperrorwithServer.txt``) and never propagate from the send
        itself. The in-flight counter is always released, even when the
        ring lookup or the error reporting fails.
        '''
        logging.debug("sender do_work with url %s", url)
        server = None  # stays None in the log if the ring lookup fails
        try:
            server = self.ring.get_node(url)
            yield self.send(server, url)
        except tornado.httpclient.HTTPError as e:
            # logging.exception records the traceback along with the message.
            logging.exception(
                "Send Url: %s to Server:%s HTTPError: %s \n", url, server, e.code
            )
            # UTF-8 so a non-ASCII URL cannot raise inside the handler.
            with open('httperrorwithServer.txt', "a", encoding="utf-8") as f:
                f.write(
                    "Send Url: %s to Server:%s HTTPError: %s \n"
                    % (url, server, e.code)
                )
        except Exception:
            logging.exception(
                "Send Url: %s to Server:%s Unknow Error\n", url, server
            )
        finally:
            # Release the slot reserved by run() no matter what happened.
            self.sending -= 1

    def run(self):
        '''
        Get url from send_url_queue to send to crawlers.

        Dispatches queued URLs until ``options.max_send_clients`` sends are
        in flight or the queue is empty, then reschedules itself to run
        again in one second.
        '''
        # Local import: only ``Queue`` is imported at module level.
        from queue import Empty

        logging.info(
            "sending: %s and %s urls waiting in queue",
            self.sending,
            self.send_url_queue.qsize(),
        )
        # Strict ``<`` keeps the in-flight count at or below the maximum.
        while self.sending < options.max_send_clients:
            try:
                # Non-blocking so the IOLoop can never stall on the queue.
                url = self.send_url_queue.get_nowait()
            except Empty:
                break
            self.sending += 1
            self.ioloop.add_callback(self.do_work, url)
        self.ioloop.add_timeout(datetime.timedelta(seconds=1), self.run)