Beispiel #1
0
 def wrapper(*args, **kwargs):
     """Call the wrapped ``func`` and log, rather than propagate, any Exception.

     Returns whatever ``func`` returns. If ``func`` raises an ``Exception``
     the traceback is logged at info level and ``None`` is returned.
     """
     try:
         return func(*args, **kwargs)
     except Exception:
         trace = traceback.format_exc()
         message = '【%s】error:%s' % (func.__name__, trace)
         logger.info(message)
         return None
Beispiel #2
0
 def wrapper(*args, **kwargs):
     """Re-yield every item produced by ``func`` and log the iteration time.

     ``func(*args, **kwargs)`` is expected to return an iterable. The
     elapsed time is logged at info level only after the iterable has been
     fully consumed; a consumer that stops early gets no log entry.
     """
     rets = func(*args, **kwargs)
     # time.clock() was removed in Python 3.8; perf_counter() is the
     # documented replacement for measuring short durations.
     start = time.perf_counter()
     for ret in rets:
         yield ret
     logger.info(func.__name__ + ' run time: ' +
                 '{:.9f}'.format(time.perf_counter() - start))
Beispiel #3
0
 def wrapper(self, response):
     """Guard a response callback: re-queue failed downloads, time good ones.

     If ``response.m_response`` is falsy, the failure is logged and
     ``response.request`` is yielded back so the caller can schedule it
     again. Otherwise ``func(self, response)`` is called and, when it
     returns something other than ``None``, each of its items is yielded.
     An ``Exception`` raised while iterating is logged and ends iteration
     without propagating.
     """
     def _as_text(content):
         # The body may be bytes (as with a requests response — TODO confirm
         # for every downloader); bytes cannot be concatenated to str on
         # Python 3, so decode leniently for logging.
         if isinstance(content, bytes):
             return content.decode('utf-8', 'replace')
         return str(content)

     if not response.m_response:
         if response.m_response is None:
             logger.error('response.m_response is None and url : ' +
                          response.request.url +
                          ' and request has been push to queue again!')
         else:
             logger.error('response.m_response is failed 【' +
                          str(response.m_response.status_code) +
                          '】 and url : ' + response.request.url +
                          ' content:' +
                          _as_text(response.m_response.content) +
                          ' and request has been push to queue again!')
         yield response.request
     else:
         process = func(self, response)
         if process is not None:
             try:
                 # time.clock() was removed in Python 3.8.
                 start = time.perf_counter()
                 for callback in process:
                     yield callback
                 logger.info(func.__name__ + ' run time: ' +
                             '{:.9f}'.format(time.perf_counter() - start))
             except Exception:
                 logger.error('process error: ' + response.request.url +
                              '\r\n' +
                              _as_text(response.m_response.content) +
                              '\r\n' + traceback.format_exc())
Beispiel #4
0
 def download(self, request):
     """Load ``request.url`` in a pooled web driver and return the page HTML.

     A driver is taken from ``self.web_driver_pool``, navigated to the URL,
     and the document's outer HTML is wrapped in a ``Response`` together
     with the originating request. The driver is always put back into the
     pool, even if navigation or script execution raises; the exception
     then propagates to the caller.
     """
     web = self.web_driver_pool.get()  # type:WebDriver
     try:
         web.get(request.url)
         html = web.execute_script(
             "return document.documentElement.outerHTML")
         response = Response(content=html, request=request)
     finally:
         # Without this, a failed page load would leak the driver and
         # eventually drain the pool.
         self.web_driver_pool.put(web)
     logger.info("selenium download success:" + request.url)
     return response
Beispiel #5
0
 def stop(self):
     """Request the spider to stop and block until it has left 'stopping'.

     * status ``'stopped'``: log success and return immediately.
     * status ``'start'``: switch the status to ``'stopping'``, then wait.
     * status ``'stopping'``: wait.

     Any other status returns without doing anything. Waiting ends as soon
     as ``self._spider_status`` is no longer ``'stopping'``; something else
     (presumably the crawl loop in another thread) must change it.
     """
     import time

     if self._spider_status == 'stopped':
         logger.info("STOP %s SUCCESS" % self._spider_id)
         return
     if self._spider_status == 'start':
         self._spider_status = 'stopping'
     while self._spider_status == 'stopping':
         # Sleep between polls instead of spinning on ``pass``: a tight
         # loop pins a CPU core and competes for the GIL with the very
         # code that has to finish before the status can change.
         time.sleep(0.05)
Beispiel #6
0
 def start(self):
     """Run the crawl loop until the batches run out or a stop is requested.

     Sets the status to ``'start'``, builds a fresh ``PriorityQueue`` for
     the processor, pushes every start request accepted by
     ``self._should_follow`` (with ``duplicate_remove`` set to ``False``),
     then crawls each non-empty batch from ``self._batch_requests()``.
     The loop ends early once the status is ``'stopping'``. The status is
     always ``'stopped'`` afterwards; an ``Exception`` is logged at info
     level and not re-raised.
     """
     try:
         logger.info("START %s SUCCESS" % self._spider_id)
         self._spider_status = 'start'
         processor = self._processor
         self._queue = PriorityQueue(processor)

         if len(processor.start_requests) > 0:
             for seed in processor.start_requests:
                 if not self._should_follow(seed):
                     continue
                 seed.duplicate_remove = False
                 self._queue.push(seed)
                 logger.info("start request:" + str(seed))

         for pending in self._batch_requests():
             if len(pending) > 0:
                 self._crawl(pending)
             if self._spider_status == 'stopping':
                 break

         self._spider_status = 'stopped'
         logger.info("STOP %s SUCCESS" % self._spider_id)
     except Exception:
         failure = "%s -- Exception -- Stopped -- %s" % (
             self._spider_id, traceback.format_exc())
         logger.info(failure)
         self._spider_status = 'stopped'
Beispiel #7
0
 def init_pool(self):
     """Create the web driver pool and store it on ``self.web_driver_pool``.

     Calls ``get_web_driver_pool(1)`` (the argument is presumably the pool
     size — confirm against that helper) and logs before and after.
     """
     logger.info('init web driver pool...')
     pool = get_web_driver_pool(1)
     self.web_driver_pool = pool
     logger.info('init web driver pool success...')
Beispiel #8
0
 def __init__(self):
     """Build the web driver pool sized by ``default_settings.DRIVER_POOL_SIZE``.

     The pool is stored on ``self.web_driver_pool``; progress is logged
     before and after creation.
     """
     logger.info("init web driver pool...")
     pool_size = default_settings.DRIVER_POOL_SIZE
     self.web_driver_pool = get_web_driver_pool(pool_size)
     logger.info("init web driver pool success")
Beispiel #9
0
    def download(self, batch):
        """Send a batch of requests concurrently and wrap the results.

        Each GET or POST request in ``batch`` gets its own session (with
        ``self._request_retry`` mounted for http and https) and is queued as
        a grequests call; all queued calls are then executed with
        ``grequests.map``. Requests without headers are given
        ``self._headers``. When ``self.use_proxy`` is set, a proxy is
        fetched from ``self.proxy_pool`` for every queued request.

        Requests whose method is neither GET nor POST are not sent. They
        are logged and left out of the result, so the returned list can be
        shorter than ``batch``.

        Returns:
            A list of ``Response`` objects, one per request that was sent,
            each pairing the ``grequests.map`` result with the request that
            produced it.
        """
        senders = {"GET": grequests.get, "POST": grequests.post}
        # Requests that were actually queued, kept in the same order as
        # ``pending`` so results can be matched back to them.
        dispatched = []
        pending = []

        for request in batch:
            use_default_headers = not request.headers
            if use_default_headers:
                request.headers = self._headers

            method = request.method.upper()
            send = senders.get(method)
            if send is None:
                logger.error('unsupported request method ' + method +
                             ' and url : ' + str(request.url) +
                             ' request has been skipped')
                continue

            session = requests.session()
            session.mount('https://', self._request_retry)
            session.mount('http://', self._request_retry)
            if use_default_headers:
                session.headers = self._headers

            options = {
                'session': session,
                'url': request.url,
                'headers': request.headers,
                'cookies': self._cookies,
                'verify': False,
                'allow_redirects': request.allow_redirects,
                'timeout': request.timeout,
            }
            if method == "POST":
                options['data'] = request.data
                options['json'] = request.json
            if self.use_proxy:
                options['proxies'] = self.proxy_pool.getProxy()

            pending.append(send(**options))
            dispatched.append(request)

        rets = grequests.map(pending,
                             exception_handler=exception_handler)

        # Pair results with the requests that were sent. Indexing into
        # ``batch`` by position would attach responses to the wrong request
        # as soon as one unsupported request had been skipped.
        true_responses = []
        for request, ret in zip(dispatched, rets):
            true_response = Response(
                m_response=ret,
                request=request,
            )
            true_responses.append(true_response)
            logger.info(true_response)

        return true_responses
Beispiel #10
0
 def wrapper(*args, **kwargs):
     """Call the wrapped ``func``, log how long it took, return its result.

     The duration is logged at info level only when ``func`` returns
     normally; an exception from ``func`` propagates without a log entry.
     """
     # time.clock() was removed in Python 3.8; perf_counter() is the
     # documented replacement for measuring short durations.
     start = time.perf_counter()
     ret = func(*args, **kwargs)
     logger.info(func.__name__ + ' run time: ' +
                 '{:.9f}'.format(time.perf_counter() - start))
     return ret