Beispiel #1
0
 def wrapper(*args, **kwargs):
     """Yield everything ``func`` yields, then log how long iteration took.

     The timer starts after ``func(*args, **kwargs)`` has returned its
     iterable, so only the iteration itself is measured. The log line is
     written once the iterable is exhausted; a consumer that stops early
     never triggers it.
     """
     # time.clock() was removed in Python 3.8. Prefer perf_counter() and fall
     # back to clock() only on interpreters that do not provide it.
     timer = getattr(time, 'perf_counter', None) or time.clock
     rets = func(*args, **kwargs)
     start = timer()
     for ret in rets:
         yield ret
     logger.info(func.__name__ + ' run time: ' +
                 '{:.9f}'.format(timer() - start))
Beispiel #2
0
 def wrapper(*args, **kwargs):
     """Relay the items produced by ``func``, logging any exception.

     An exception raised while calling ``func`` or while iterating over its
     result is not propagated: the traceback is written to the log together
     with the function name, and the generator simply ends.
     """
     try:
         for item in func(*args, **kwargs):
             yield item
     except Exception:
         details = (func.__name__, traceback.format_exc())
         logger.info('【%s】error:%s' % details)
Beispiel #3
0
 def wrapper(self, response):
     """Run the wrapped callback on a good response, or hand back a bad one.

     When ``response.m_response`` is falsy the failure is logged and
     ``response.request`` is yielded (the log text says the request is
     queued again; the queueing itself happens outside this function).
     Otherwise ``func(self, response)`` is called and, if it returns
     something other than ``None``, every item it produces is yielded and
     the iteration time is logged. Exceptions raised while iterating are
     logged with the request URL and response content instead of being
     propagated.
     """
     # time.clock() was removed in Python 3.8. Prefer perf_counter() and fall
     # back to clock() only on interpreters that do not provide it.
     timer = getattr(time, 'perf_counter', None) or time.clock

     def as_text(content):
         # A bytes payload cannot be concatenated onto a str on Python 3;
         # decode it so that building a log message never raises. On
         # Python 2 bytes is str, so the value passes through unchanged.
         if isinstance(content, bytes) and not isinstance(content, str):
             return content.decode('utf-8', 'replace')
         return content

     if not response.m_response:
         if response.m_response is None:
             logger.error('response.m_response is None and url : ' +
                          response.request.url +
                          ' and request has been push to queue again!')
         else:
             logger.error('response.m_response is failed 【' +
                          str(response.m_response.status_code) +
                          '】 and url : ' + response.request.url +
                          ' content:' +
                          as_text(response.m_response.content) +
                          ' and request has been push to queue again!')
         yield response.request
     else:
         process = func(self, response)
         if process is not None:
             try:
                 start = timer()
                 for callback in process:
                     yield callback
                 logger.info(func.__name__ + ' run time: ' +
                             '{:.9f}'.format(timer() - start))
             except Exception:
                 logger.error('process error: ' + response.request.url +
                              '\r\n' +
                              as_text(response.m_response.content) +
                              '\r\n' + traceback.format_exc())
Beispiel #4
0
 def __init__(self, driver_pool_size=None):
     """Remember the requested pool size and build the web driver pool.

     :param driver_pool_size: number of drivers to create; a falsy value
         (``None`` or ``0``) selects ``default_settings.DRIVER_POOL_SIZE``.
     """
     self.driver_pool_size = driver_pool_size
     logger.info("init web driver pool...")
     # Fall back to the configured default when no usable size was given.
     pool_size = driver_pool_size or default_settings.DRIVER_POOL_SIZE
     self.web_driver_pool = get_web_driver_pool(pool_size)
     logger.info("init web driver pool success")
Beispiel #5
0
 def download(self, request):
     """Load ``request.url`` in a pooled web driver and wrap the page HTML.

     A driver is taken from ``self.web_driver_pool``, pointed at the URL,
     and asked for the outer HTML of the document element. The driver is
     returned to the pool even when navigation or the script fails, so a
     failing page cannot drain the pool.

     :param request: object exposing the target address as ``request.url``.
     :return: a ``Response`` built from the page HTML and ``request``.
     """
     web = self.web_driver_pool.get()  # type:WebDriver
     try:
         web.get(request.url)
         html = web.execute_script(
             "return document.documentElement.outerHTML")
     finally:
         # Always hand the driver back; otherwise every failed download
         # permanently removes one driver from the pool.
         self.web_driver_pool.put(web)
     response = Response(content=html, request=request)
     logger.info("selenium download success:" + request.url)
     return response
Beispiel #6
0
 def stop(self):
     """Ask the spider to stop and wait until it is no longer 'stopping'.

     * status ``'stopped'``: log success and return immediately.
     * status ``'start'``: switch the status to ``'stopping'`` and wait.
     * status ``'stopping'``: wait.
     * any other status: return without doing anything.

     Waiting ends when something else changes ``self._spider_status`` away
     from ``'stopping'``; this method never does that itself.
     """
     import time  # local import: only needed for the polling delay below

     if self._spider_status == 'stopped':
         logger.info("STOP %s SUCCESS" % self._spider_id)
         return
     if self._spider_status == 'start':
         self._spider_status = 'stopping'
     # Sleep between polls instead of spinning, so the waiting caller does
     # not monopolise a CPU core while the status is being changed elsewhere.
     while self._spider_status == 'stopping':
         time.sleep(0.05)
Beispiel #7
0
 def start(self):
     """Seed the queue with the start requests and crawl batch by batch.

     The status is set to ``'start'``, a fresh ``PriorityQueue`` is created
     for the processor, and every start request accepted by
     ``_should_follow`` is pushed with duplicate removal switched off.
     Batches from ``_batch_requests`` are then crawled until the batches run
     out or the status becomes ``'stopping'``, after which the status is set
     to ``'stopped'``. Any exception is logged and also ends in
     ``'stopped'``.
     """
     try:
         logger.info("START %s SUCCESS" % self._spider_id)
         self._spider_status = 'start'
         self._queue = PriorityQueue(self._processor)
         if len(self._processor.start_requests) > 0:
             for seed in self._processor.start_requests:
                 if not self._should_follow(seed):
                     continue
                 seed.duplicate_remove = False
                 self._queue.push(seed)
                 logger.info("start request:" + str(seed))
         for pending in self._batch_requests():
             if len(pending) > 0:
                 self._crawl(pending)
                 # NOTE(review): in test mode this returns with the status
                 # still 'start' (the 'stopped' assignment below is skipped)
                 # -- confirm that this is intended.
                 if self.test and self._process_count > 0:
                     return
             if self._spider_status == 'stopping':
                 break
         self._spider_status = 'stopped'
         logger.info("STOP %s SUCCESS" % self._spider_id)
     except Exception:
         report = "%s -- Exception -- Stopped -- %s" % (
             self._spider_id, traceback.format_exc())
         logger.info(report)
         self._spider_status = 'stopped'
Beispiel #8
0
    def download(self, batch):
        """Download every request in ``batch`` through a worker pool.

        A ``Pool`` sized by ``self.driver_pool_size`` (or
        ``default_settings.DRIVER_POOL_SIZE`` when that is falsy) runs
        ``self.download_one`` once per request. The pool is closed and
        joined before any result is read.

        :param batch: iterable of requests to download.
        :return: list of the ``download_one`` results, in batch order.
        """
        worker_count = (self.driver_pool_size
                        or default_settings.DRIVER_POOL_SIZE)
        pool = Pool(processes=worker_count)

        pending = [pool.apply_async(self.download_one, (request, ))
                   for request in batch]
        pool.close()
        pool.join()

        true_responses = []
        for handle in pending:
            outcome = handle.get()
            true_responses.append(outcome)
            logger.info(outcome)

        return true_responses
Beispiel #9
0
 def wrapper(*args, **kwargs):
     """Call ``func``, log how long the call took, and return its result.

     If ``func`` raises, the exception propagates and nothing is logged.
     """
     # time.clock() was removed in Python 3.8. Prefer perf_counter() and fall
     # back to clock() only on interpreters that do not provide it.
     timer = getattr(time, 'perf_counter', None) or time.clock
     start = timer()
     ret = func(*args, **kwargs)
     logger.info(func.__name__ + ' run time: ' + '{:.9f}'.format(timer() - start))
     return ret
Beispiel #10
0
    def download(self, batch):
        """Send the GET/POST requests in ``batch`` concurrently via grequests.

        Every request gets its own ``requests`` session with
        ``self._request_retry`` mounted for both http and https. A request
        without headers receives ``self._headers`` (on the request and on
        its session). When ``self.use_proxy`` is set, a proxy is fetched from
        ``self.proxy_pool`` for each request that is sent.

        Requests whose method is neither GET nor POST are not sent; they are
        logged and produce no entry in the result.

        :param batch: iterable of request objects (``url``, ``method``,
            ``headers``, ``allow_redirects``, ``timeout`` and, for POST,
            ``data`` and ``json`` are read).
        :return: list of ``Response`` objects, one per request that was
            sent, each paired with the request that produced it.
        """
        senders = {"GET": grequests.get, "POST": grequests.post}
        batch_requests = []
        # Requests actually handed to grequests, kept in step with
        # batch_requests. Pairing results with this list (instead of with
        # ``batch`` by position) keeps every response attached to its own
        # request even when an unsupported method was skipped earlier.
        dispatched = []

        for request in batch:
            use_default_headers = not request.headers
            if use_default_headers:
                request.headers = self._headers

            method = request.method.upper()
            sender = senders.get(method)
            if sender is None:
                logger.error("unsupported request method, request skipped: " +
                             str(request))
                continue

            session = requests.session()
            session.mount('https://', self._request_retry)
            session.mount('http://', self._request_retry)
            if use_default_headers:
                session.headers = self._headers

            call_kwargs = {
                'session': session,
                'url': request.url,
                'headers': request.headers,
                'cookies': self._cookies,
                # Certificate verification stays disabled, as before.
                'verify': False,
                'allow_redirects': request.allow_redirects,
                'timeout': request.timeout,
            }
            if method == "POST":
                call_kwargs['data'] = request.data
                call_kwargs['json'] = request.json
            if self.use_proxy:
                call_kwargs['proxies'] = self.proxy_pool.getProxy()

            batch_requests.append(sender(**call_kwargs))
            dispatched.append(request)

        # grequests.map is expected to return one entry per request sent,
        # in the same order.
        rets = grequests.map(batch_requests, exception_handler=exception_handler)

        true_responses = []
        for request, ret in zip(dispatched, rets):
            true_response = Response(
                    m_response=ret,
                    request=request,
            )
            true_responses.append(true_response)
            logger.info(true_response)

        return true_responses
Beispiel #11
0
 def init_pool(self):
     """Create a web driver pool of size 1 and store it on the instance."""
     logger.info('init web driver pool...')
     single_driver_pool = get_web_driver_pool(1)
     self.web_driver_pool = single_driver_pool
     logger.info('init web driver pool success...')
Beispiel #12
0
 def __init__(self):
     """Build the web driver pool using the configured default size."""
     logger.info("init web driver pool...")
     pool_size = default_settings.DRIVER_POOL_SIZE
     self.web_driver_pool = get_web_driver_pool(pool_size)
     logger.info("init web driver pool success")