Ejemplo n.º 1
0
 def process_exception(self, request, exception, spider):
     """Run the stock retry handling, then record the failed request.

     ``RetryMiddleware.process_exception`` is called first and its result
     is returned unchanged.  After that call the request URL is stored
     under ``request.meta['url']`` and the failure is handed to
     ``self.record_failed`` together with the file name ``'failed.txt'``
     and the meta key ``'url'``.
     """
     parent_result = RetryMiddleware.process_exception(self, request,
                                                       exception, spider)

     # Hook point: adapt the lines below to customise the retry middleware.
     failed_url = request.url
     request.meta['url'] = failed_url
     self.record_failed('failed.txt', request, exception, 'url')

     return parent_result
Ejemplo n.º 2
0
 def process_exception(self, request, exception, spider):
     """Delegate to the stock retry logic and log the failing request.

     The value produced by ``RetryMiddleware.process_exception`` is
     returned as-is.  Once the parent has run, ``request.meta['url']`` is
     set to the request URL and ``self.record_failed`` is called with
     ``'failed.txt'``, the request, the exception and the key ``'url'``.
     """
     parent_result = RetryMiddleware.process_exception(
         self, request, exception, spider)

     # Hook point: adapt the lines below to customise the retry middleware.
     failed_url = request.url
     request.meta['url'] = failed_url
     self.record_failed('failed.txt', request, exception, 'url')

     return parent_result
Ejemplo n.º 3
0
    def _retry(self, request, reason, spider):
        """Retry ``request`` through the next proxy, or abandon it on a 404.

        Steps, in order:

        * A ``TCPTimedOutError`` reason has its ``args`` replaced by
          ``(u'...',)``.
        * If the reason text contains ``404`` and the request callback is
          bound to ``CarDetailSpider``, the ``CarInfo`` row matching the
          ``seqid`` found in the request cookies is flagged offline,
          ``retry_times`` is set to ``self.max_retry_times`` and the
          request is handed to ``RetryMiddleware._retry``.
        * Otherwise, while retries remain, a copy of the request is routed
          through the next proxy from ``get_valid_proxy`` (or has its
          ``proxy`` meta entry removed when no proxy is returned).  When no
          retries remain the original request is passed on untouched.

        Returns whatever ``RetryMiddleware._retry`` returns.
        """
        if isinstance(reason, TCPTimedOutError):
            reason.args = (u'...',)

        retries = request.meta.get('retry_times', 0)

        # A callback that is not a bound method (e.g. ``None``) has no
        # ``im_class``; treat it as "not a car-detail request" instead of
        # failing with AttributeError.
        callback_class = getattr(request.callback, 'im_class', None)
        if '404' in str(reason) and callback_class == CarDetailSpider:
            ci = request.cookies[FetchConstant.CarInfo]

            fs = FetchSession()
            try:
                ci_exist = fs.query(CarInfo).filter(CarInfo.seqid == ci.seqid).first()
                if ci_exist:
                    try:
                        ci_exist.statustype = CarInfoValueConst.offline
                        ci_exist.offlinedatetime = datetime.datetime.today()
                        fs.commit()
                        msg = (u'[404] seqid: %s ,url not exist %s') % (ci.seqid, request.url,)
                        spider.log(msg, log.INFO)
                    except Exception:
                        # Best effort: a failed status update must not
                        # prevent the request from being handed on.
                        fs.rollback()
            finally:
                # Close the session even when no matching row was found.
                fs.close()

            # Presumably this makes the parent treat the retry budget as
            # exhausted -- verify against RetryMiddleware._retry.
            request.meta['retry_times'] = self.max_retry_times

            return RetryMiddleware._retry(self, request, reason, spider)

        # Start from the original request so the final call never sees an
        # unbound name once the retry budget is used up.
        rs = request
        if retries <= self.max_retry_times - 1:
            next_proxy = get_valid_proxy.next()
            rs = request.copy()
            if next_proxy:
                proxy_str = next_proxy.build_literal()
                rs = rs.replace(dont_filter=True)
                rs.meta['proxy'] = proxy_str
                msg = (u'use to %s access %s ') % (proxy_str, rs.url)
                spider.log(msg, log.DEBUG)
            else:
                try:
                    del rs.meta[u'proxy']
                except KeyError:
                    # No proxy was set on the request; nothing to remove.
                    pass
                else:
                    msg = (u'use self ip asscess %s') % (rs.url)
                    spider.log(msg, log.DEBUG)

        return RetryMiddleware._retry(self, rs, reason, spider)
class RetryTest(unittest.TestCase):
    """Checks for ``RetryMiddleware`` configured with two allowed retries."""

    def setUp(self):
        # Fresh spider and middleware for every test; retries capped at two.
        self.spider = BaseSpider()
        middleware = RetryMiddleware()
        middleware.max_retry_times = 2
        self.mw = middleware

    def test_priority_adjust(self):
        # A retried request must come back with a lower priority.
        url = 'http://www.scrapytest.org/503'
        original = Request(url)
        response = Response(url, body='', status=503)
        retried = self.mw.process_response(original, response, self.spider)
        assert retried.priority < original.priority

    def test_404(self):
        url = 'http://www.scrapytest.org/404'
        request = Request(url)
        response = Response(url, body='', status=404)

        # 404 responses are handed back untouched instead of being retried.
        result = self.mw.process_response(request, response, self.spider)
        assert result is response

    def test_503(self):
        url = 'http://www.scrapytest.org/503'
        current = Request(url)
        response = Response(url, body='', status=503)

        # The first two failures each yield a new request with a bumped counter.
        for expected_retries in (1, 2):
            current = self.mw.process_response(current, response, self.spider)
            assert isinstance(current, Request)
            self.assertEqual(current.meta['retry_times'], expected_retries)

        # The third failure gives up and returns the response itself.
        assert self.mw.process_response(current, response, self.spider) is response

    def test_twistederrors(self):
        error_classes = (ServerTimeoutError, DNSLookupError,
                         ConnectionRefusedError, ConnectionDone,
                         ConnectError, ConnectionLost)
        for error_class in error_classes:
            request = Request('http://www.scrapytest.org/%s' % error_class.__name__)
            self._test_retry_exception(request, error_class())

    def _test_retry_exception(self, req, exception):
        # The first two failures each yield a new request with a bumped counter.
        for expected_retries in (1, 2):
            req = self.mw.process_exception(req, exception, self.spider)
            assert isinstance(req, Request)
            self.assertEqual(req.meta['retry_times'], expected_retries)

        # The third failure is dropped: the middleware returns None.
        req = self.mw.process_exception(req, exception, self.spider)
        self.assertEqual(req, None)
Ejemplo n.º 5
0
 def _retry(self, request, reason, spider):
     """Request a new Tor identity, wait, then delegate to the stock retry.

     A TorCtl control connection is opened, ``signal newnym`` is sent, the
     connection is closed and the method sleeps for three seconds before
     calling ``RetryMiddleware._retry`` and returning its result.

     NOTE(review): ``log.message`` is kept as found; confirm that the
     ``log`` object in scope really exposes ``message``.
     """
     log.message('Changing proxy')
     conn = TorCtl.connect(passphrase="1234")
     try:
         conn.sendAndRecv('signal newnym\r\n')
     finally:
         # Release the control connection even if sending the signal fails.
         conn.close()
     time.sleep(3)
     log.message("renewed")

     return RetryMiddleware._retry(self, request, reason, spider)
Ejemplo n.º 6
0
    def _retry(self, request, reason, spider):
        """Switch the request to the spider's next proxy before retrying.

        While retries remain (``retry_times`` below ``self.max_retry_times``)
        the next proxy is obtained from ``spider.get_next_proxy`` using the
        request cookies, the switch is logged and ``request.meta['proxy']``
        is updated.  If fetching the next proxy raises, a warning is logged
        and the request is retried without changing its proxy.

        Returns whatever ``RetryMiddleware._retry`` returns.
        """
        retries = request.meta.get('retry_times', 0)
        proxy = request.meta.get(u'proxy')
        if retries <= self.max_retry_times - 1:
            try:
                next_proxy = spider.get_next_proxy(request.cookies)
            except Exception:
                # Fill the placeholder with the cookies so the warning shows
                # what was inspected instead of a literal "%s".
                msg = (u'there is no proxy list in cookies %s ,please '
                       u'check') % (request.cookies,)
                spider.log(msg, log.WARNING)
                return RetryMiddleware._retry(self, request, reason, spider)
            if proxy:
                msg = (u'proxy %s fail, use %s for the %srd time '
                       'retry') % (proxy, next_proxy, retries)
            else:
                msg = (u'request without proxy , use %s for the %srd time '
                       'retry') % (next_proxy, retries)
            spider.log(msg, log.INFO)
            request.meta[u'proxy'] = next_proxy

        return RetryMiddleware._retry(self, request, reason, spider)
Ejemplo n.º 7
0
 def _retry(self, request, reason, spider):
     """Switch the request to the spider's next proxy before retrying.

     While retries remain (``retry_times`` below ``self.max_retry_times``)
     the next proxy is obtained from ``spider.get_next_proxy`` using the
     request cookies, the switch is logged and ``request.meta['proxy']``
     is updated.  If fetching the next proxy raises, a warning is logged
     and the request is retried without changing its proxy.

     Returns whatever ``RetryMiddleware._retry`` returns.
     """
     retries = request.meta.get('retry_times', 0)
     proxy = request.meta.get(u'proxy')
     if retries <= self.max_retry_times - 1:
         try:
             next_proxy = spider.get_next_proxy(request.cookies)
         except Exception:
             # Fill the placeholder with the cookies so the warning shows
             # what was inspected instead of a literal "%s".
             msg = (u'there is no proxy list in cookies %s ,please '
                    u'check') % (request.cookies,)
             spider.log(msg, log.WARNING)
             return RetryMiddleware._retry(self, request, reason, spider)
         if proxy:
             msg = (u'proxy %s fail, use %s for the %srd time '
                    'retry') % (proxy, next_proxy, retries)
         else:
             msg = (u'request without proxy , use %s for the %srd time '
                    'retry') % (next_proxy, retries)
         spider.log(msg, log.INFO)
         request.meta[u'proxy'] = next_proxy

     return RetryMiddleware._retry(self, request, reason, spider)
Ejemplo n.º 8
0
 def _retry(self, request, reason, spider):
     """Ask Tor for a new identity over its control port, then retry.

     Connects to ``127.0.0.1:9051`` with telnet, authenticates, sends
     ``signal NEWNYM`` and ``quit``, closes the connection, sleeps three
     seconds and finally delegates to ``RetryMiddleware._retry``, whose
     result is returned.
     """
     log.msg('Changing-proxy')
     tn = telnetlib.Telnet('127.0.0.1', 9051)
     try:
         tn.read_until("Escape character is '^]'.", 2)
         tn.write('AUTHENTICATE "267765"\r\n')
         tn.read_until("250 OK", 2)
         tn.write("signal NEWNYM\r\n")
         tn.read_until("250 OK", 2)
         tn.write("quit\r\n")
     finally:
         # Always release the socket, even when the exchange fails midway.
         tn.close()
     time.sleep(3)
     log.msg('Proxychanged')
     return RetryMiddleware._retry(self, request, reason, spider)
Ejemplo n.º 9
0
    def _retry(self, request, reason, spider):
        """Rotate the Tor identity (rate-limited) and retry string-reason failures.

        On first use a TorCtl control connection is opened from the
        ``TOR_HOST`` / ``TOR_PORT`` / ``TOR_PASSW`` settings and cached on
        ``RetryChangeProxyMiddleware`` together with ``TOR_CHANGE_LIMIT``.
        When ``reason`` is a string, a ``NEWNYM`` signal is sent if more
        than that limit (compared against ``time.time()`` differences) has
        elapsed since the previous signal, and the request is then handed
        to ``RetryMiddleware._retry``.

        NOTE(review): for a non-string ``reason`` this method returns
        ``None`` without calling the parent -- confirm that skipping the
        retry in that case is intended.
        """
        settings = spider.settings
        shared = RetryChangeProxyMiddleware

        # Lazily set up the class-level connection and throttling state.
        if shared.conn is None:
            shared.conn = TorCtl.connect(
                controlAddr=settings.get('TOR_HOST'),
                controlPort=settings.get('TOR_PORT'),
                passphrase=settings.get('TOR_PASSW'))
            shared.last = 0
            shared.timelimit = settings.get('TOR_CHANGE_LIMIT')

        if not isinstance(reason, basestring):
            return None

        now = time.time()
        elapsed = now - shared.last
        if shared.conn and elapsed > shared.timelimit:
            TorCtl.Connection.send_signal(shared.conn, "NEWNYM")
            shared.last = now
            stamp = time.strftime("%H:%M:%S")
            log.msg('Proxy changed for reason %s. New last: %s' % (reason, stamp), log.INFO)
        return RetryMiddleware._retry(self, request, reason, spider)
Ejemplo n.º 10
0
 def __init__(self, settings):
     """Initialise the middleware by delegating to ``RetryMiddleware.__init__`` with ``settings``."""
     RetryMiddleware.__init__(self, settings)
 def setUp(self):
     # Fresh spider and middleware for every test; retries capped at two.
     self.spider = BaseSpider()
     middleware = RetryMiddleware()
     middleware.max_retry_times = 2
     self.mw = middleware
Ejemplo n.º 12
0
def _retry_proxy(self, request, reason, spider):
    """Call ``change_proxy`` (with logging on), then delegate to the stock retry.

    Returns whatever ``RetryMiddleware._retry`` returns.
    """
    change_proxy(log_msg=True)
    retry_result = RetryMiddleware._retry(self, request, reason, spider)
    return retry_result
Ejemplo n.º 13
0
 def setUp(self):
     # Build the spider and the middleware from one crawler; cap retries at two.
     test_crawler = get_crawler(Spider)
     self.spider = test_crawler._create_spider('foo')
     middleware = RetryMiddleware.from_crawler(test_crawler)
     middleware.max_retry_times = 2
     self.mw = middleware
Ejemplo n.º 14
0
def _retry_proxy(self, request, reason, spider):
    """Call ``change_proxy`` (with logging on), then delegate to the stock retry.

    Returns whatever ``RetryMiddleware._retry`` returns.
    """
    change_proxy(log_msg=True)
    retry_result = RetryMiddleware._retry(self, request, reason, spider)
    return retry_result
 def setUp(self):
     # Fresh spider named 'foo' and a middleware capped at two retries.
     self.spider = BaseSpider('foo')
     middleware = RetryMiddleware()
     middleware.max_retry_times = 2
     self.mw = middleware
 def setUp(self):
     # Middleware comes from a default crawler; the spider is built directly.
     test_crawler = get_crawler()
     self.spider = Spider("foo")
     middleware = RetryMiddleware.from_crawler(test_crawler)
     middleware.max_retry_times = 2
     self.mw = middleware
 def _retry(self, request, exception, spider):
     """Refresh the Tor circuit on a refused connection, then retry.

     Only a ``ConnectionRefusedError`` triggers the refresh, which is
     followed by a three second sleep and a log line.  Every call ends by
     delegating to ``RetryMiddleware._retry`` and returning its result.
     """
     connection_refused = isinstance(exception, ConnectionRefusedError)
     if connection_refused:
         tor_manager = TorManager.get_instance()
         tor_manager.refresh_circuit()
         time.sleep(3)
         log.msg('Connection refused and tor circuit refreshed')
     return RetryMiddleware._retry(self, request, exception, spider)
Ejemplo n.º 18
0
 def process_response(self, request, response, spider):
     """Retry responses whose body is reported as busy; otherwise defer to the parent.

     When ``busy`` is truthy for the response body, ``self._retry`` is
     called with the reason ``'tora request failed'``; if it returns
     something falsy the original response is returned instead.  All
     other responses go through ``RetryMiddleware.process_response``.
     """
     if busy(response.body_as_unicode()):
         retry_request = self._retry(request, 'tora request failed', spider)
         return retry_request or response
     return RetryMiddleware.process_response(self, request, response, spider)
Ejemplo n.º 19
0
 def process_response(self, request, response, spider):
     """Retry responses whose body is reported as busy; otherwise defer to the parent.

     When ``busy`` is truthy for the response body, ``self._retry`` is
     called with the reason ``'tora request failed'``; if it returns
     something falsy the original response is returned instead.  All
     other responses go through ``RetryMiddleware.process_response``.
     """
     if busy(response.body_as_unicode()):
         retry_request = self._retry(request, 'tora request failed', spider)
         return retry_request or response
     return RetryMiddleware.process_response(self, request, response, spider)
class RetryTest(unittest.TestCase):
    """Checks for ``RetryMiddleware`` configured with two allowed retries."""

    def setUp(self):
        # Fresh spider named 'foo' and a middleware capped at two retries.
        self.spider = BaseSpider('foo')
        middleware = RetryMiddleware()
        middleware.max_retry_times = 2
        self.mw = middleware

    def test_priority_adjust(self):
        # A retried request must come back with a lower priority.
        url = 'http://www.scrapytest.org/503'
        original = Request(url)
        response = Response(url, body='', status=503)
        retried = self.mw.process_response(original, response, self.spider)
        assert retried.priority < original.priority

    def test_404(self):
        url = 'http://www.scrapytest.org/404'
        request = Request(url)
        response = Response(url, body='', status=404)

        # 404 responses are handed back untouched instead of being retried.
        result = self.mw.process_response(request, response, self.spider)
        assert result is response

    def test_dont_retry(self):
        url = 'http://www.scrapytest.org/503'
        request = Request(url, meta={'dont_retry': True})
        response = Response(url, body='', status=503)

        # With dont_retry set, even a 503 is returned as-is.
        result = self.mw.process_response(request, response, self.spider)
        assert result is response

    def test_dont_retry_exc(self):
        url = 'http://www.scrapytest.org/503'
        request = Request(url, meta={'dont_retry': True})
        rsp = Response(url, body='', status=503)

        # With dont_retry set, an exception is not turned into a retry.
        result = self.mw.process_exception(request, DNSLookupError(), self.spider)
        assert result is None

    def test_503(self):
        url = 'http://www.scrapytest.org/503'
        current = Request(url)
        response = Response(url, body='', status=503)

        # The first two failures each yield a new request with a bumped counter.
        for expected_retries in (1, 2):
            current = self.mw.process_response(current, response, self.spider)
            assert isinstance(current, Request)
            self.assertEqual(current.meta['retry_times'], expected_retries)

        # The third failure gives up and returns the response itself.
        assert self.mw.process_response(current, response, self.spider) is response

    def test_twistederrors(self):
        error_classes = (ServerTimeoutError, DNSLookupError,
                         ConnectionRefusedError, ConnectionDone,
                         ConnectError, ConnectionLost)
        for error_class in error_classes:
            request = Request('http://www.scrapytest.org/%s' % error_class.__name__)
            self._test_retry_exception(request, error_class())

    def _test_retry_exception(self, req, exception):
        # The first two failures each yield a new request with a bumped counter.
        for expected_retries in (1, 2):
            req = self.mw.process_exception(req, exception, self.spider)
            assert isinstance(req, Request)
            self.assertEqual(req.meta['retry_times'], expected_retries)

        # The third failure is dropped: the middleware returns None.
        req = self.mw.process_exception(req, exception, self.spider)
        self.assertEqual(req, None)
Ejemplo n.º 21
0
 def process_response(self, request, response, spider):
     """Apply the stock retry logic, then force a retry for bad 'tora' responses.

     ``RetryMiddleware.process_response`` always runs first.  Its result
     is returned unless the spider is named ``'tora'`` and ``good`` is
     falsy for the response; in that case ``self._retry`` is called with
     the reason ``'tora request failed'`` and the original response is
     returned when that call yields something falsy.
     """
     parent_result = RetryMiddleware.process_response(self, request, response, spider)
     if spider.name == 'tora' and not good(response):
         retry_request = self._retry(request, 'tora request failed', spider)
         return retry_request or response
     return parent_result
Ejemplo n.º 22
0
 def __init__(self, settings):
     """Initialise the middleware by delegating to ``RetryMiddleware.__init__`` with ``settings``."""
     RetryMiddleware.__init__(self, settings)
Ejemplo n.º 23
0
 def process_response(self, request, response, spider):
     """Apply the stock retry logic, then force a retry for bad 'tora' responses.

     ``RetryMiddleware.process_response`` always runs first.  Its result
     is returned unless the spider is named ``'tora'`` and ``good`` is
     falsy for the response; in that case ``self._retry`` is called with
     the reason ``'tora request failed'`` and the original response is
     returned when that call yields something falsy.
     """
     parent_result = RetryMiddleware.process_response(self, request, response, spider)
     if spider.name == 'tora' and not good(response):
         retry_request = self._retry(request, 'tora request failed', spider)
         return retry_request or response
     return parent_result
Ejemplo n.º 24
0
 def setUp(self):
     # Middleware comes from a default crawler; the spider is built directly.
     test_crawler = get_crawler()
     self.spider = Spider('foo')
     middleware = RetryMiddleware.from_crawler(test_crawler)
     middleware.max_retry_times = 2
     self.mw = middleware
Ejemplo n.º 25
0
 def _retry(self, request, exception, spider):
     """Refresh the Tor circuit on a refused connection, then retry.

     Only a ``ConnectionRefusedError`` triggers the refresh, which is
     followed by a three second sleep and a log line.  Every call ends by
     delegating to ``RetryMiddleware._retry`` and returning its result.
     """
     connection_refused = isinstance(exception, ConnectionRefusedError)
     if connection_refused:
         tor_manager = TorManager.get_instance()
         tor_manager.refresh_circuit()
         time.sleep(3)
         log.msg('Connection refused and tor circuit refreshed')
     return RetryMiddleware._retry(self, request, exception, spider)