Beispiel #1
0
    def __init__(self, ioloop=None, server_list=None, replicas=20):
        """Set up the dispatcher state and build the hash ring.

        Args:
            ioloop: Loop to use; when falsy, ``tornado.ioloop.IOLoop.instance()``
                is used instead.
            server_list: Servers handed to ``Hash``; must be non-empty.
            replicas: Passed through to ``Hash`` as its ``replicas`` argument.

        Raises:
            ValueError: If ``server_list`` is ``None`` or empty.
        """
        super().__init__()

        if ioloop:
            self.ioloop = ioloop
        else:
            self.ioloop = tornado.ioloop.IOLoop.instance()

        # Pending URLs and the count of sends currently in flight.
        self.send_url_queue = Queue()
        self.sending = 0

        if server_list:
            self.server_list = server_list
        else:
            # The attribute is still set (to an empty list) before failing.
            self.server_list = []
            raise ValueError("server_list is None.")

        self.replicas = replicas
        self.ring = Hash(self.server_list, replicas=self.replicas)
Beispiel #2
0
class Sender(metaclass=Singleton):
    '''
        URL dispatcher class.
        Also provides load balancing implemented with consistent hashing.
    '''

    def __init__(self, ioloop=None, server_list=None, replicas=20):
        '''
        Set up the dispatcher state and build the hash ring.

        ioloop: loop to use; when falsy, tornado.ioloop.IOLoop.instance()
            is used instead.
        server_list: servers handed to Hash; must be non-empty.
        replicas: passed through to Hash as its ``replicas`` argument.

        Raises ValueError if server_list is None or empty.
        '''
        super().__init__()

        self.ioloop = ioloop or tornado.ioloop.IOLoop.instance()
        # URLs waiting to be dispatched.
        self.send_url_queue = Queue()
        # Number of sends currently in flight (see run / do_work).
        self.sending = 0
        self.server_list = server_list or []
        if not self.server_list:
            raise ValueError("server_list is None.")

        self.replicas = replicas

        self.ring = Hash(self.server_list, replicas=self.replicas)

    def add_url(self, url):
        '''
        Queue a url for dispatching by run().
        '''
        logging.debug("send url to queue %s", url)

        self.send_url_queue.put(url)

    @tornado.gen.coroutine
    def send(self, server, url):
        '''
        Hand the url to the server chosen for it.

        If ``server`` is not ``options.local`` the url is forwarded with an
        HTTP request to ``http://<server>/crawler/<url>``; otherwise it is put
        on the fetch queue of a ``fetcher.Fetcher()``.

        Errors from the HTTP fetch propagate to the caller.
        '''
        if server != options.local:
            http_client = AsyncHTTPClient()

            target_url = 'http://' + server + '/crawler/' + url

            logging.info("target_url: %s", target_url)

            request = HTTPRequest(
                url=target_url.encode('utf-8'),
                connect_timeout=options.timeout,
                request_timeout=options.timeout,
            )

            yield http_client.fetch(request)

        else:
            fetch = fetcher.Fetcher()
            fetch.fetch_queue.put(url)

    @tornado.gen.coroutine
    def do_work(self, url):
        '''
        Pick the server for ``url`` from the hash ring and send it there.

        Failures are logged and not re-raised. The in-flight counter
        incremented by run() is always released, even on failure.
        '''
        import traceback

        logging.debug("sender do_work with url %s", url)

        try:
            server = self.ring.get_node(url)
        except Exception:
            # run() already counted this url as in flight; give the slot back
            # before propagating, exactly once.
            self.sending -= 1
            raise

        try:
            yield self.send(server, url)
        except tornado.httpclient.HTTPError as e:
            traceback.print_exc()

            # Keep a persistent record of HTTP failures next to the log output.
            with open('httperrorwithServer.txt', "a") as f:
                f.write("Send Url: %s to Server:%s HTTPError: %s \n" % (url, server, e.code))

            logging.error("Send Url: %s to Server:%s HTTPError: %s \n", url, server, e.code)

        except Exception:
            traceback.print_exc()
            logging.error("Send Url: %s to Server:%s Unknow Error\n", url, server)

        finally:
            # Release the slot even if a handler above raised (e.g. the error
            # file could not be opened); otherwise the counter would leak.
            self.sending -= 1

    def run(self):
        '''
            Get url from send_url_queue to send to crawlers.

            Schedules do_work for queued urls while the in-flight limit
            allows, then re-schedules itself to run again in one second.
        '''
        # NOTE(review): this routine status line is logged at ERROR level;
        # confirm whether INFO/DEBUG was intended.
        logging.error("sending: %s and %s urls waiting in queue", self.sending, self.send_url_queue.qsize())

        # NOTE(review): `<=` lets up to max_send_clients + 1 sends be in
        # flight at once; confirm whether `<` was intended.
        while not self.send_url_queue.empty() and self.sending <= options.max_send_clients:
            url = self.send_url_queue.get()

            self.sending += 1

            self.ioloop.add_callback(self.do_work, url)

        self.ioloop.add_timeout(datetime.timedelta(seconds=1), self.run)