Example #1
class TestHttpErrorMiddlewareIntegrational(TrialTestCase):
    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_middleware_works(self):
        spider = _HttpErrorSpider()
        yield docrawl(spider)
        assert not spider.skipped, spider.skipped
        self.assertEqual(spider.parsed, {'200'})
        self.assertEqual(spider.failed, {'404', '402', '500'})

    @defer.inlineCallbacks
    def test_logging(self):
        spider = _HttpErrorSpider(bypass_status_codes={402})
        yield docrawl(spider)
        # print(get_testlog())
        self.assertEqual(spider.parsed, {'200', '402'})
        self.assertEqual(spider.skipped, {'402'})
        self.assertEqual(spider.failed, {'404', '500'})

        log = get_testlog()
        self.assertIn('Ignoring response <404', log)
        self.assertIn('Ignoring response <500', log)
        self.assertNotIn('Ignoring response <200', log)
        self.assertNotIn('Ignoring response <402', log)
Example #2
class TestHttpErrorMiddlewareIntegrational(TrialTestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_middleware_works(self):
        spider = _HttpErrorSpider()
        yield docrawl(spider)
        assert not spider.skipped, spider.skipped
        self.assertEqual(spider.parsed, {'200'})
        self.assertEqual(spider.failed, {'404', '402', '500'})

    @defer.inlineCallbacks
    def test_logging(self):
        spider = _HttpErrorSpider(bypass_status_codes={402})
        yield docrawl(spider)
        # print(get_testlog())
        self.assertEqual(spider.parsed, {'200', '402'})
        self.assertEqual(spider.skipped, {'402'})
        self.assertEqual(spider.failed, {'404', '500'})

        log = get_testlog()
        self.assertIn('Ignoring response <404', log)
        self.assertIn('Ignoring response <500', log)
        self.assertNotIn('Ignoring response <200', log)
        self.assertNotIn('Ignoring response <402', log)
Example #3
 def setUp(self):
     self.mockserver = MockServer()
     self.mockserver.__enter__()
     self._oldenv = os.environ.copy()
     self._proxy = HTTPSProxy(8888)
     self._proxy.start()
     # Wait for the proxy to start.
     time.sleep(1.0)
     os.environ['http_proxy'] = 'http://*****:*****@localhost:8888'
     os.environ['https_proxy'] = 'http://*****:*****@localhost:8888'
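These fixtures drive MockServer's context-manager protocol by hand: __enter__ in setUp and __exit__ in tearDown, so the mock HTTP server lives for exactly one test. Outside of unittest fixtures the same server can be run with a plain with block, as Example #11 further down does. A minimal sketch, assuming MockServer comes from Scrapy's own test suite (tests/mockserver.py) rather than the public API:

# Sketch only: MockServer is an internal test helper, not public Scrapy API.
from tests.mockserver import MockServer

with MockServer():
    # The mock HTTP server is listening here (e.g. on localhost:8998);
    # start a crawl against it, then let the with block shut it down.
    pass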
Example #4
class TestCloseSpider(TestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_closespider_itemcount(self):
        spider = ItemSpider()
        close_on = 5
        yield docrawl(spider, {'CLOSESPIDER_ITEMCOUNT': close_on})
        reason = spider.meta['close_reason']
        self.assertEqual(reason, 'closespider_itemcount')
        itemcount = spider.crawler.stats.get_value('item_scraped_count')
        self.assertTrue(itemcount >= close_on)

    @defer.inlineCallbacks
    def test_closespider_pagecount(self):
        spider = FollowAllSpider()
        close_on = 5
        yield docrawl(spider, {'CLOSESPIDER_PAGECOUNT': close_on})
        reason = spider.meta['close_reason']
        self.assertEqual(reason, 'closespider_pagecount')
        pagecount = spider.crawler.stats.get_value('response_received_count')
        self.assertTrue(pagecount >= close_on)

    @defer.inlineCallbacks
    def test_closespider_errorcount(self):
        spider = ErrorSpider(total=1000000)
        close_on = 5
        yield docrawl(spider, {'CLOSESPIDER_ERRORCOUNT': close_on})
        self.flushLoggedErrors(spider.exception_cls)
        reason = spider.meta['close_reason']
        self.assertEqual(reason, 'closespider_errorcount')
        key = 'spider_exceptions/{name}'\
                .format(name=spider.exception_cls.__name__)
        errorcount = spider.crawler.stats.get_value(key)
        self.assertTrue(errorcount >= close_on)

    @defer.inlineCallbacks
    def test_closespider_timeout(self):
        spider = FollowAllSpider(total=1000000)
        close_on = 0.1
        yield docrawl(spider, {'CLOSESPIDER_TIMEOUT': close_on})
        reason = spider.meta['close_reason']
        self.assertEqual(reason, 'closespider_timeout')
        stats = spider.crawler.stats
        start = stats.get_value('start_time')
        stop = stats.get_value('finish_time')
        diff = stop - start
        total_seconds = diff.seconds + diff.microseconds
        self.assertTrue(total_seconds >= close_on)
Example #5
class TestCloseSpider(TestCase):
    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_closespider_itemcount(self):
        spider = ItemSpider()
        close_on = 5
        yield docrawl(spider, {'CLOSESPIDER_ITEMCOUNT': close_on})
        reason = spider.meta['close_reason']
        self.assertEqual(reason, 'closespider_itemcount')
        itemcount = spider.crawler.stats.get_value('item_scraped_count')
        self.assertTrue(itemcount >= close_on)

    @defer.inlineCallbacks
    def test_closespider_pagecount(self):
        spider = FollowAllSpider()
        close_on = 5
        yield docrawl(spider, {'CLOSESPIDER_PAGECOUNT': close_on})
        reason = spider.meta['close_reason']
        self.assertEqual(reason, 'closespider_pagecount')
        pagecount = spider.crawler.stats.get_value('response_received_count')
        self.assertTrue(pagecount >= close_on)

    @defer.inlineCallbacks
    def test_closespider_errorcount(self):
        spider = ErrorSpider(total=1000000)
        close_on = 5
        yield docrawl(spider, {'CLOSESPIDER_ERRORCOUNT': close_on})
        self.flushLoggedErrors(spider.exception_cls)
        reason = spider.meta['close_reason']
        self.assertEqual(reason, 'closespider_errorcount')
        key = 'spider_exceptions/{name}'\
                .format(name=spider.exception_cls.__name__)
        errorcount = spider.crawler.stats.get_value(key)
        self.assertTrue(errorcount >= close_on)

    @defer.inlineCallbacks
    def test_closespider_timeout(self):
        spider = FollowAllSpider(total=1000000)
        close_on = 0.1
        yield docrawl(spider, {'CLOSESPIDER_TIMEOUT': close_on})
        reason = spider.meta['close_reason']
        self.assertEqual(reason, 'closespider_timeout')
        stats = spider.crawler.stats
        start = stats.get_value('start_time')
        stop = stats.get_value('finish_time')
        diff = stop - start
        total_seconds = diff.seconds + diff.microseconds
        self.assertTrue(total_seconds >= close_on)
Example #6
 def setUp(self):
     self.mockserver = MockServer()
     self.mockserver.__enter__()
     self._oldenv = os.environ.copy()
     self._proxy = HTTPSProxy(8888)
     self._proxy.start()
     # Wait for the proxy to start.
     time.sleep(1.0)
     os.environ['http_proxy'] = 'http://*****:*****@localhost:8888'
     os.environ['https_proxy'] = 'http://*****:*****@localhost:8888'
Example #7
class CrawlTestCase(TestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_follow_all(self):
        spider = FollowAllSpider()
        yield docrawl(spider)
        self.assertEqual(len(spider.urls_visited), 11)  # 10 + start_url

    @defer.inlineCallbacks
    def test_delay(self):
        # short to long delays
        yield self._test_delay(0.2, False)
        yield self._test_delay(1, False)
        # randoms
        yield self._test_delay(0.2, True)
        yield self._test_delay(1, True)

    @defer.inlineCallbacks
    def _test_delay(self, delay, randomize):
        settings = {"DOWNLOAD_DELAY": delay, 'RANDOMIZE_DOWNLOAD_DELAY': randomize}
        spider = FollowAllSpider(maxlatency=delay * 2)
        yield docrawl(spider, settings)
        t = spider.times
        totaltime = t[-1] - t[0]
        avgd = totaltime / (len(t) - 1)
        tolerance = 0.6 if randomize else 0.2
        self.assertTrue(avgd > delay * (1 - tolerance),
                        "download delay too small: %s" % avgd)

    @defer.inlineCallbacks
    def test_timeout_success(self):
        spider = DelaySpider(n=0.5)
        yield docrawl(spider)
        self.assertTrue(spider.t1 > 0)
        self.assertTrue(spider.t2 > 0)
        self.assertTrue(spider.t2 > spider.t1)

    @defer.inlineCallbacks
    def test_timeout_failure(self):
        spider = DelaySpider(n=0.5)
        yield docrawl(spider, {"DOWNLOAD_TIMEOUT": 0.35})
        self.assertTrue(spider.t1 > 0)
        self.assertTrue(spider.t2 == 0)
        self.assertTrue(spider.t2_err > 0)
        self.assertTrue(spider.t2_err > spider.t1)
        # server hangs after receiving response headers
        spider = DelaySpider(n=0.5, b=1)
        yield docrawl(spider, {"DOWNLOAD_TIMEOUT": 0.35})
        self.assertTrue(spider.t1 > 0)
        self.assertTrue(spider.t2 == 0)
        self.assertTrue(spider.t2_err > 0)
        self.assertTrue(spider.t2_err > spider.t1)

    @defer.inlineCallbacks
    def test_retry_503(self):
        spider = SimpleSpider("http://localhost:8998/status?n=503")
        yield docrawl(spider)
        self._assert_retried()

    @defer.inlineCallbacks
    def test_retry_conn_failed(self):
        spider = SimpleSpider("http://localhost:65432/status?n=503")
        yield docrawl(spider)
        self._assert_retried()

    @defer.inlineCallbacks
    def test_retry_dns_error(self):
        with mock.patch('socket.gethostbyname',
                        side_effect=socket.gaierror(-5, 'No address associated with hostname')):
            spider = SimpleSpider("http://example.com/")
            yield docrawl(spider)
            self._assert_retried()

    @defer.inlineCallbacks
    def test_start_requests_bug_before_yield(self):
        spider = BrokenStartRequestsSpider(fail_before_yield=1)
        yield docrawl(spider)
        errors = self.flushLoggedErrors(ZeroDivisionError)
        self.assertEqual(len(errors), 1)

    @defer.inlineCallbacks
    def test_start_requests_bug_yielding(self):
        spider = BrokenStartRequestsSpider(fail_yielding=1)
        yield docrawl(spider)
        errors = self.flushLoggedErrors(ZeroDivisionError)
        self.assertEqual(len(errors), 1)

    @defer.inlineCallbacks
    def test_start_requests_lazyness(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        spider = BrokenStartRequestsSpider()
        yield docrawl(spider, settings)
        #self.assertTrue(False, spider.seedsseen)
        #self.assertTrue(spider.seedsseen.index(None) < spider.seedsseen.index(99),
        #                spider.seedsseen)

    @defer.inlineCallbacks
    def test_unbounded_response(self):
        # Completeness of responses without Content-Length or Transfer-Encoding
        # can not be determined, we treat them as valid but flagged as "partial"
        from urllib import urlencode
        query = urlencode({'raw': '''\
HTTP/1.1 200 OK
Server: Apache-Coyote/1.1
X-Powered-By: Servlet 2.4; JBoss-4.2.3.GA (build: SVNTag=JBoss_4_2_3_GA date=200807181417)/JBossWeb-2.0
Set-Cookie: JSESSIONID=08515F572832D0E659FD2B0D8031D75F; Path=/
Pragma: no-cache
Expires: Thu, 01 Jan 1970 00:00:00 GMT
Cache-Control: no-cache
Cache-Control: no-store
Content-Type: text/html;charset=UTF-8
Content-Language: en
Date: Tue, 27 Aug 2013 13:05:05 GMT
Connection: close

foo body
with multiples lines
'''})
        spider = SimpleSpider("http://localhost:8998/raw?{0}".format(query))
        yield docrawl(spider)
        log = get_testlog()
        self.assertEqual(log.count("Got response 200"), 1)

    @defer.inlineCallbacks
    def test_retry_conn_lost(self):
        # connection lost after receiving data
        spider = SimpleSpider("http://localhost:8998/drop?abort=0")
        yield docrawl(spider)
        self._assert_retried()

    @defer.inlineCallbacks
    def test_retry_conn_aborted(self):
        # connection lost before receiving data
        spider = SimpleSpider("http://localhost:8998/drop?abort=1")
        yield docrawl(spider)
        self._assert_retried()

    def _assert_retried(self):
        log = get_testlog()
        self.assertEqual(log.count("Retrying"), 2)
        self.assertEqual(log.count("Gave up retrying"), 1)

    @defer.inlineCallbacks
    def test_referer_header(self):
        """Referer header is set by RefererMiddleware unless it is already set"""
        req0 = Request('http://localhost:8998/echo?headers=1&body=0', dont_filter=1)
        req1 = req0.replace()
        req2 = req0.replace(headers={'Referer': None})
        req3 = req0.replace(headers={'Referer': 'http://example.com'})
        req0.meta['next'] = req1
        req1.meta['next'] = req2
        req2.meta['next'] = req3
        spider = SingleRequestSpider(seed=req0)
        yield docrawl(spider)
        # basic asserts in case of weird communication errors
        self.assertIn('responses', spider.meta)
        self.assertNotIn('failures', spider.meta)
        # start requests doesn't set Referer header
        echo0 = json.loads(spider.meta['responses'][0].body)
        self.assertNotIn('Referer', echo0['headers'])
        # following request sets Referer to start request url
        echo1 = json.loads(spider.meta['responses'][1].body)
        self.assertEqual(echo1['headers'].get('Referer'), [req0.url])
        # next request avoids Referer header
        echo2 = json.loads(spider.meta['responses'][2].body)
        self.assertNotIn('Referer', echo2['headers'])
        # last request explicitly sets a Referer header
        echo3 = json.loads(spider.meta['responses'][3].body)
        self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com'])

    @defer.inlineCallbacks
    def test_engine_status(self):
        from scrapy.utils.engine import get_engine_status
        est = []

        def cb(response):
            est.append(get_engine_status(spider.crawler.engine))

        spider = SingleRequestSpider(seed='http://localhost:8998/', callback_func=cb)
        yield docrawl(spider)
        self.assertEqual(len(est), 1, est)
        s = dict(est[0])
        self.assertEqual(s['engine.spider.name'], spider.name)
        self.assertEqual(s['len(engine.scraper.slot.active)'], 1)
Example #8
 def setUp(self):
     self.mockserver = MockServer()
     self.mockserver.__enter__()
Example #9
class CrawlTestCase(TestCase):
    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_follow_all(self):
        spider = FollowAllSpider()
        yield docrawl(spider)
        self.assertEqual(len(spider.urls_visited), 11)  # 10 + start_url

    @defer.inlineCallbacks
    def test_delay(self):
        # short to long delays
        yield self._test_delay(0.2, False)
        yield self._test_delay(1, False)
        # randoms
        yield self._test_delay(0.2, True)
        yield self._test_delay(1, True)

    @defer.inlineCallbacks
    def _test_delay(self, delay, randomize):
        settings = {
            "DOWNLOAD_DELAY": delay,
            'RANDOMIZE_DOWNLOAD_DELAY': randomize
        }
        spider = FollowAllSpider(maxlatency=delay * 2)
        yield docrawl(spider, settings)
        t = spider.times
        totaltime = t[-1] - t[0]
        avgd = totaltime / (len(t) - 1)
        tolerance = 0.6 if randomize else 0.2
        self.assertTrue(avgd > delay * (1 - tolerance),
                        "download delay too small: %s" % avgd)

    @defer.inlineCallbacks
    def test_timeout_success(self):
        spider = DelaySpider(n=0.5)
        yield docrawl(spider)
        self.assertTrue(spider.t1 > 0)
        self.assertTrue(spider.t2 > 0)
        self.assertTrue(spider.t2 > spider.t1)

    @defer.inlineCallbacks
    def test_timeout_failure(self):
        spider = DelaySpider(n=0.5)
        yield docrawl(spider, {"DOWNLOAD_TIMEOUT": 0.35})
        self.assertTrue(spider.t1 > 0)
        self.assertTrue(spider.t2 == 0)
        self.assertTrue(spider.t2_err > 0)
        self.assertTrue(spider.t2_err > spider.t1)
        # server hangs after receiving response headers
        spider = DelaySpider(n=0.5, b=1)
        yield docrawl(spider, {"DOWNLOAD_TIMEOUT": 0.35})
        self.assertTrue(spider.t1 > 0)
        self.assertTrue(spider.t2 == 0)
        self.assertTrue(spider.t2_err > 0)
        self.assertTrue(spider.t2_err > spider.t1)

    @defer.inlineCallbacks
    def test_retry_503(self):
        spider = SimpleSpider("http://localhost:8998/status?n=503")
        yield docrawl(spider)
        self._assert_retried()

    @defer.inlineCallbacks
    def test_retry_conn_failed(self):
        spider = SimpleSpider("http://localhost:65432/status?n=503")
        yield docrawl(spider)
        self._assert_retried()

    @defer.inlineCallbacks
    def test_retry_dns_error(self):
        spider = SimpleSpider("http://localhost666/status?n=503")
        yield docrawl(spider)
        self._assert_retried()

    @defer.inlineCallbacks
    def test_start_requests_bug_before_yield(self):
        spider = BrokenStartRequestsSpider(fail_before_yield=1)
        yield docrawl(spider)
        errors = self.flushLoggedErrors(ZeroDivisionError)
        self.assertEqual(len(errors), 1)

    @defer.inlineCallbacks
    def test_start_requests_bug_yielding(self):
        spider = BrokenStartRequestsSpider(fail_yielding=1)
        yield docrawl(spider)
        errors = self.flushLoggedErrors(ZeroDivisionError)
        self.assertEqual(len(errors), 1)

    @defer.inlineCallbacks
    def test_start_requests_lazyness(self):
        settings = {"CONCURRENT_REQUESTS": 1}
        spider = BrokenStartRequestsSpider()
        yield docrawl(spider, settings)
        #self.assertTrue(False, spider.seedsseen)
        #self.assertTrue(spider.seedsseen.index(None) < spider.seedsseen.index(99),
        #                spider.seedsseen)

    @defer.inlineCallbacks
    def test_unbounded_response(self):
        # Completeness of responses without Content-Length or Transfer-Encoding
        # can not be determined, we treat them as valid but flagged as "partial"
        from urllib import urlencode
        query = urlencode({
            'raw':
            '''\
HTTP/1.1 200 OK
Server: Apache-Coyote/1.1
X-Powered-By: Servlet 2.4; JBoss-4.2.3.GA (build: SVNTag=JBoss_4_2_3_GA date=200807181417)/JBossWeb-2.0
Set-Cookie: JSESSIONID=08515F572832D0E659FD2B0D8031D75F; Path=/
Pragma: no-cache
Expires: Thu, 01 Jan 1970 00:00:00 GMT
Cache-Control: no-cache
Cache-Control: no-store
Content-Type: text/html;charset=UTF-8
Content-Language: en
Date: Tue, 27 Aug 2013 13:05:05 GMT
Connection: close

foo body
with multiples lines
'''
        })
        spider = SimpleSpider("http://localhost:8998/raw?{0}".format(query))
        yield docrawl(spider)
        log = get_testlog()
        self.assertEqual(log.count("Got response 200"), 1)

    @defer.inlineCallbacks
    def test_retry_conn_lost(self):
        # connection lost after receiving data
        spider = SimpleSpider("http://localhost:8998/drop?abort=0")
        yield docrawl(spider)
        self._assert_retried()

    @defer.inlineCallbacks
    def test_retry_conn_aborted(self):
        # connection lost before receiving data
        spider = SimpleSpider("http://localhost:8998/drop?abort=1")
        yield docrawl(spider)
        self._assert_retried()

    def _assert_retried(self):
        log = get_testlog()
        self.assertEqual(log.count("Retrying"), 2)
        self.assertEqual(log.count("Gave up retrying"), 1)

    @defer.inlineCallbacks
    def test_referer_header(self):
        """Referer header is set by RefererMiddleware unless it is already set"""
        req0 = Request('http://localhost:8998/echo?headers=1&body=0',
                       dont_filter=1)
        req1 = req0.replace()
        req2 = req0.replace(headers={'Referer': None})
        req3 = req0.replace(headers={'Referer': 'http://example.com'})
        req0.meta['next'] = req1
        req1.meta['next'] = req2
        req2.meta['next'] = req3
        spider = SingleRequestSpider(seed=req0)
        yield docrawl(spider)
        # basic asserts in case of weird communication errors
        self.assertIn('responses', spider.meta)
        self.assertNotIn('failures', spider.meta)
        # start requests doesn't set Referer header
        echo0 = json.loads(spider.meta['responses'][0].body)
        self.assertNotIn('Referer', echo0['headers'])
        # following request sets Referer to start request url
        echo1 = json.loads(spider.meta['responses'][1].body)
        self.assertEqual(echo1['headers'].get('Referer'), [req0.url])
        # next request avoids Referer header
        echo2 = json.loads(spider.meta['responses'][2].body)
        self.assertNotIn('Referer', echo2['headers'])
        # last request explicitly sets a Referer header
        echo3 = json.loads(spider.meta['responses'][3].body)
        self.assertEqual(echo3['headers'].get('Referer'),
                         ['http://example.com'])
Example #10
class ProxyConnectTestCase(TestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        self._oldenv = os.environ.copy()
        self._proxy = HTTPSProxy(8888)
        self._proxy.start()
        # Wait for the proxy to start.
        time.sleep(1.0)
        os.environ['http_proxy'] = 'http://*****:*****@localhost:8888'
        os.environ['https_proxy'] = 'http://*****:*****@localhost:8888'

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)
        self._proxy.shutdown()
        os.environ = self._oldenv

    @defer.inlineCallbacks
    def test_https_connect_tunnel(self):
        spider = SimpleSpider("https://localhost:8999/status?n=200")
        yield docrawl(spider)
        self._assert_got_response_code(200)

    @defer.inlineCallbacks
    def test_https_noconnect(self):
        os.environ['https_proxy'] = 'http://*****:*****@localhost:8888?noconnect'
        spider = SimpleSpider("https://*****:*****@localhost:8888'

    @defer.inlineCallbacks
    def test_https_connect_tunnel_error(self):
        spider = SimpleSpider("https://localhost:99999/status?n=200")
        yield docrawl(spider)
        self._assert_got_tunnel_error()

    @defer.inlineCallbacks
    def test_https_tunnel_auth_error(self):
        os.environ['https_proxy'] = 'http://*****:*****@localhost:8888'
        spider = SimpleSpider("https://*****:*****@localhost:8888'

    @defer.inlineCallbacks
    def test_https_noconnect_auth_error(self):
        os.environ['https_proxy'] = 'http://*****:*****@localhost:8888?noconnect'
        spider = SimpleSpider("https://localhost:8999/status?n=200")
        yield docrawl(spider)
        self._assert_got_response_code(407)

    def _assert_got_response_code(self, code):
        log = get_testlog()
        self.assertEqual(log.count('Crawled (%d)' % code), 1)

    def _assert_got_tunnel_error(self):
        log = get_testlog()
        self.assertEqual(log.count('TunnelError'), 1)
Example #11
 def run(self, args, opts):
     with MockServer():
         spider = FollowAllSpider(total=100000)
         crawler = self.crawler_process.create_crawler()
         crawler.crawl(spider)
         self.crawler_process.start()
Example #12
class CrawlTestCase(TestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_follow_all(self):
        spider = FollowAllSpider()
        yield docrawl(spider)
        self.assertEqual(len(spider.urls_visited), 11)  # 10 + start_url

    @defer.inlineCallbacks
    def test_delay(self):
        # short to long delays
        yield self._test_delay(0.2, False)
        yield self._test_delay(1, False)
        # randoms
        yield self._test_delay(0.2, True)
        yield self._test_delay(1, True)

    @defer.inlineCallbacks
    def _test_delay(self, delay, randomize):
        settings = {"DOWNLOAD_DELAY": delay, 'RANDOMIZE_DOWNLOAD_DELAY': randomize}
        spider = FollowAllSpider(maxlatency=delay * 2)
        yield docrawl(spider, settings)
        t = spider.times
        totaltime = t[-1] - t[0]
        avgd = totaltime / (len(t) - 1)
        tolerance = 0.6 if randomize else 0.2
        self.assertTrue(avgd > delay * (1 - tolerance),
                        "download delay too small: %s" % avgd)

    @defer.inlineCallbacks
    def test_timeout_success(self):
        spider = DelaySpider(n=0.5)
        yield docrawl(spider)
        self.assertTrue(spider.t1 > 0)
        self.assertTrue(spider.t2 > 0)
        self.assertTrue(spider.t2 > spider.t1)

    @defer.inlineCallbacks
    def test_timeout_failure(self):
        spider = DelaySpider(n=0.5)
        yield docrawl(spider, {"DOWNLOAD_TIMEOUT": 0.35})
        self.assertTrue(spider.t1 > 0)
        self.assertTrue(spider.t2 == 0)
        self.assertTrue(spider.t2_err > 0)
        self.assertTrue(spider.t2_err > spider.t1)
        # server hangs after receiving response headers
        spider = DelaySpider(n=0.5, b=1)
        yield docrawl(spider, {"DOWNLOAD_TIMEOUT": 0.35})
        self.assertTrue(spider.t1 > 0)
        self.assertTrue(spider.t2 == 0)
        self.assertTrue(spider.t2_err > 0)
        self.assertTrue(spider.t2_err > spider.t1)

    @defer.inlineCallbacks
    def test_retry_503(self):
        spider = SimpleSpider("http://localhost:8998/status?n=503")
        yield docrawl(spider)
        self._assert_retried()

    @defer.inlineCallbacks
    def test_retry_conn_failed(self):
        spider = SimpleSpider("http://localhost:65432/status?n=503")
        yield docrawl(spider)
        self._assert_retried()

    @defer.inlineCallbacks
    def test_retry_dns_error(self):
        spider = SimpleSpider("http://localhost666/status?n=503")
        yield docrawl(spider)
        self._assert_retried()

    @defer.inlineCallbacks
    def test_unbounded_response(self):
        # Completeness of responses without Content-Length or Transfer-Encoding
        # can not be determined, we treat them as valid but flagged as "partial"
        from urllib import urlencode
        query = urlencode({'raw': '''\
HTTP/1.1 200 OK
Server: Apache-Coyote/1.1
X-Powered-By: Servlet 2.4; JBoss-4.2.3.GA (build: SVNTag=JBoss_4_2_3_GA date=200807181417)/JBossWeb-2.0
Set-Cookie: JSESSIONID=08515F572832D0E659FD2B0D8031D75F; Path=/
Pragma: no-cache
Expires: Thu, 01 Jan 1970 00:00:00 GMT
Cache-Control: no-cache
Cache-Control: no-store
Content-Type: text/html;charset=UTF-8
Content-Language: en
Date: Tue, 27 Aug 2013 13:05:05 GMT
Connection: close

foo body
with multiples lines
'''})
        spider = SimpleSpider("http://localhost:8998/raw?{0}".format(query))
        yield docrawl(spider)
        log = get_testlog()
        self.assertEqual(log.count("Got response 200"), 1)

    @defer.inlineCallbacks
    def test_retry_conn_lost(self):
        # connection lost after receiving data
        spider = SimpleSpider("http://localhost:8998/drop?abort=0")
        yield docrawl(spider)
        self._assert_retried()

    @defer.inlineCallbacks
    def test_retry_conn_aborted(self):
        # connection lost before receiving data
        spider = SimpleSpider("http://localhost:8998/drop?abort=1")
        yield docrawl(spider)
        self._assert_retried()

    def _assert_retried(self):
        log = get_testlog()
        self.assertEqual(log.count("Retrying"), 2)
        self.assertEqual(log.count("Gave up retrying"), 1)
Example #13
class CrawlTestCase(TestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_follow_all(self):
        spider = FollowAllSpider()
        yield docrawl(spider)
        self.assertEqual(len(spider.urls_visited), 11)  # 10 + start_url

    @defer.inlineCallbacks
    def test_delay(self):
        # short to long delays
        yield self._test_delay(0.2, False)
        yield self._test_delay(1, False)
        # randoms
        yield self._test_delay(0.2, True)
        yield self._test_delay(1, True)

    @defer.inlineCallbacks
    def _test_delay(self, delay, randomize):
        settings = {"DOWNLOAD_DELAY": delay, 'RANDOMIZE_DOWNLOAD_DELAY': randomize}
        spider = FollowAllSpider(maxlatency=delay * 2)
        yield docrawl(spider, settings)
        t = spider.times
        totaltime = t[-1] - t[0]
        avgd = totaltime / (len(t) - 1)
        tolerance = 0.6 if randomize else 0.2
        self.assertTrue(avgd > delay * (1 - tolerance),
                        "download delay too small: %s" % avgd)

    @defer.inlineCallbacks
    def test_timeout_success(self):
        spider = DelaySpider(n=0.5)
        yield docrawl(spider)
        self.assertTrue(spider.t1 > 0)
        self.assertTrue(spider.t2 > 0)
        self.assertTrue(spider.t2 > spider.t1)

    @defer.inlineCallbacks
    def test_timeout_failure(self):
        spider = DelaySpider(n=0.5)
        yield docrawl(spider, {"DOWNLOAD_TIMEOUT": 0.35})
        self.assertTrue(spider.t1 > 0)
        self.assertTrue(spider.t2 == 0)
        self.assertTrue(spider.t2_err > 0)
        self.assertTrue(spider.t2_err > spider.t1)
        # server hangs after receiving response headers
        spider = DelaySpider(n=0.5, b=1)
        yield docrawl(spider, {"DOWNLOAD_TIMEOUT": 0.35})
        self.assertTrue(spider.t1 > 0)
        self.assertTrue(spider.t2 == 0)
        self.assertTrue(spider.t2_err > 0)
        self.assertTrue(spider.t2_err > spider.t1)

    @defer.inlineCallbacks
    def test_retry_503(self):
        spider = SimpleSpider("http://localhost:8998/status?n=503")
        yield docrawl(spider)
        self._assert_retried()

    @defer.inlineCallbacks
    def test_retry_conn_failed(self):
        spider = SimpleSpider("http://localhost:65432/status?n=503")
        yield docrawl(spider)
        self._assert_retried()

    @defer.inlineCallbacks
    def test_retry_dns_error(self):
        spider = SimpleSpider("http://localhost666/status?n=503")
        yield docrawl(spider)
        self._assert_retried()

    @defer.inlineCallbacks
    def test_retry_conn_lost(self):
        # connection lost after receiving data
        spider = SimpleSpider("http://localhost:8998/drop?abort=0")
        yield docrawl(spider)
        self._assert_retried()

    @defer.inlineCallbacks
    def test_retry_conn_aborted(self):
        # connection lost before receiving data
        spider = SimpleSpider("http://localhost:8998/drop?abort=1")
        yield docrawl(spider)
        self._assert_retried()

    def _assert_retried(self):
        log = get_testlog()
        self.assertEqual(log.count("Retrying"), 2)
        self.assertEqual(log.count("Gave up retrying"), 1)
Example #14
class CrawlTestCase(TestCase):
    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_follow_all(self):
        spider = FollowAllSpider()
        yield docrawl(spider)
        self.assertEqual(len(spider.urls_visited), 11)  # 10 + start_url

    @defer.inlineCallbacks
    def test_delay(self):
        # short to long delays
        yield self._test_delay(0.2, False)
        yield self._test_delay(1, False)
        # randoms
        yield self._test_delay(0.2, True)
        yield self._test_delay(1, True)

    @defer.inlineCallbacks
    def _test_delay(self, delay, randomize):
        settings = {
            "DOWNLOAD_DELAY": delay,
            'RANDOMIZE_DOWNLOAD_DELAY': randomize
        }
        spider = FollowAllSpider(maxlatency=delay * 2)
        yield docrawl(spider, settings)
        t = spider.times
        totaltime = t[-1] - t[0]
        avgd = totaltime / (len(t) - 1)
        tolerance = 0.6 if randomize else 0.2
        self.assertTrue(avgd > delay * (1 - tolerance),
                        "download delay too small: %s" % avgd)

    @defer.inlineCallbacks
    def test_timeout_success(self):
        spider = DelaySpider(n=0.5)
        yield docrawl(spider)
        self.assertTrue(spider.t1 > 0)
        self.assertTrue(spider.t2 > 0)
        self.assertTrue(spider.t2 > spider.t1)

    @defer.inlineCallbacks
    def test_timeout_failure(self):
        spider = DelaySpider(n=0.5)
        yield docrawl(spider, {"DOWNLOAD_TIMEOUT": 0.35})
        self.assertTrue(spider.t1 > 0)
        self.assertTrue(spider.t2 == 0)
        self.assertTrue(spider.t2_err > 0)
        self.assertTrue(spider.t2_err > spider.t1)
        # server hangs after receiving response headers
        spider = DelaySpider(n=0.5, b=1)
        yield docrawl(spider, {"DOWNLOAD_TIMEOUT": 0.35})
        self.assertTrue(spider.t1 > 0)
        self.assertTrue(spider.t2 == 0)
        self.assertTrue(spider.t2_err > 0)
        self.assertTrue(spider.t2_err > spider.t1)

    @defer.inlineCallbacks
    def test_retry_503(self):
        spider = SimpleSpider("http://localhost:8998/status?n=503")
        yield docrawl(spider)
        self._assert_retried()

    @defer.inlineCallbacks
    def test_retry_conn_failed(self):
        spider = SimpleSpider("http://localhost:65432/status?n=503")
        yield docrawl(spider)
        self._assert_retried()

    @defer.inlineCallbacks
    def test_retry_dns_error(self):
        spider = SimpleSpider("http://localhost666/status?n=503")
        yield docrawl(spider)
        self._assert_retried()

    @defer.inlineCallbacks
    def test_retry_conn_lost(self):
        # connection lost after receiving data
        spider = SimpleSpider("http://localhost:8998/drop?abort=0")
        yield docrawl(spider)
        self._assert_retried()

    @defer.inlineCallbacks
    def test_retry_conn_aborted(self):
        # connection lost before receiving data
        spider = SimpleSpider("http://localhost:8998/drop?abort=1")
        yield docrawl(spider)
        self._assert_retried()

    def _assert_retried(self):
        log = get_testlog()
        self.assertEqual(log.count("Retrying"), 2)
        self.assertEqual(log.count("Gave up retrying"), 1)
Example #15
class CrawlTestCase(TestCase):
    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_follow_all(self):
        spider = FollowAllSpider()
        yield docrawl(spider)
        self.assertEqual(len(spider.urls_visited), 11)  # 10 + start_url

    @defer.inlineCallbacks
    def test_delay(self):
        spider = FollowAllSpider()
        yield docrawl(spider, {"DOWNLOAD_DELAY": 0.3})
        t = spider.times[0]
        for t2 in spider.times[1:]:
            self.assertTrue(t2 - t > 0.15,
                            "download delay too small: %s" % (t2 - t))
            t = t2

    @defer.inlineCallbacks
    def test_timeout_success(self):
        spider = DelaySpider(n=0.5)
        yield docrawl(spider)
        self.assertTrue(spider.t1 > 0)
        self.assertTrue(spider.t2 > 0)
        self.assertTrue(spider.t2 > spider.t1)

    @defer.inlineCallbacks
    def test_timeout_failure(self):
        spider = DelaySpider(n=0.5)
        yield docrawl(spider, {"DOWNLOAD_TIMEOUT": 0.35})
        self.assertTrue(spider.t1 > 0)
        self.assertTrue(spider.t2 == 0)
        self.assertTrue(spider.t2_err > 0)
        self.assertTrue(spider.t2_err > spider.t1)

    @defer.inlineCallbacks
    def test_retry_503(self):
        spider = SimpleSpider("http://localhost:8998/status?n=503")
        yield docrawl(spider)
        self._assert_retried()

    @defer.inlineCallbacks
    def test_retry_conn_failed(self):
        spider = SimpleSpider("http://localhost:65432/status?n=503")
        yield docrawl(spider)
        self._assert_retried()

    @defer.inlineCallbacks
    def test_retry_dns_error(self):
        spider = SimpleSpider("http://localhost666/status?n=503")
        yield docrawl(spider)
        self._assert_retried()

    def _assert_retried(self):
        log = get_testlog()
        self.assertEqual(log.count("Retrying"), 2)
        self.assertEqual(log.count("Gave up retrying"), 1)
Example #16
class CrawlTestCase(TestCase):

    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)

    @defer.inlineCallbacks
    def test_follow_all(self):
        spider = FollowAllSpider()
        yield docrawl(spider)
        self.assertEqual(len(spider.urls_visited), 11) # 10 + start_url

    @defer.inlineCallbacks
    def test_delay(self):
        spider = FollowAllSpider()
        yield docrawl(spider, {"DOWNLOAD_DELAY": 1})
        t = spider.times[0]
        for t2 in spider.times[1:]:
            self.assertTrue(t2-t > 0.45, "download delay too small: %s" % (t2-t))
            t = t2

    @defer.inlineCallbacks
    def test_timeout_success(self):
        spider = DelaySpider(n=0.5)
        yield docrawl(spider)
        self.assertTrue(spider.t1 > 0)
        self.assertTrue(spider.t2 > 0)
        self.assertTrue(spider.t2 > spider.t1)

    @defer.inlineCallbacks
    def test_timeout_failure(self):
        spider = DelaySpider(n=0.5)
        yield docrawl(spider, {"DOWNLOAD_TIMEOUT": 0.35})
        self.assertTrue(spider.t1 > 0)
        self.assertTrue(spider.t2 == 0)
        self.assertTrue(spider.t2_err > 0)
        self.assertTrue(spider.t2_err > spider.t1)
        # server hangs after receiving response headers
        spider = DelaySpider(n=0.5, b=1)
        yield docrawl(spider, {"DOWNLOAD_TIMEOUT": 0.35})
        self.assertTrue(spider.t1 > 0)
        self.assertTrue(spider.t2 == 0)
        self.assertTrue(spider.t2_err > 0)
        self.assertTrue(spider.t2_err > spider.t1)

    @defer.inlineCallbacks
    def test_retry_503(self):
        spider = SimpleSpider("http://localhost:8998/status?n=503")
        yield docrawl(spider)
        self._assert_retried()

    @defer.inlineCallbacks
    def test_retry_conn_failed(self):
        spider = SimpleSpider("http://localhost:65432/status?n=503")
        yield docrawl(spider)
        self._assert_retried()

    @defer.inlineCallbacks
    def test_retry_dns_error(self):
        spider = SimpleSpider("http://localhost666/status?n=503")
        yield docrawl(spider)
        self._assert_retried()

    def _assert_retried(self):
        log = get_testlog()
        self.assertEqual(log.count("Retrying"), 2)
        self.assertEqual(log.count("Gave up retrying"), 1)
Example #17
class ProxyConnectTestCase(TestCase):
    def setUp(self):
        self.mockserver = MockServer()
        self.mockserver.__enter__()
        self._oldenv = os.environ.copy()
        self._proxy = HTTPSProxy(8888)
        self._proxy.start()
        # Wait for the proxy to start.
        time.sleep(1.0)
        os.environ['http_proxy'] = 'http://*****:*****@localhost:8888'
        os.environ['https_proxy'] = 'http://*****:*****@localhost:8888'

    def tearDown(self):
        self.mockserver.__exit__(None, None, None)
        self._proxy.shutdown()
        os.environ = self._oldenv

    @defer.inlineCallbacks
    def test_https_connect_tunnel(self):
        spider = SimpleSpider("https://localhost:8999/status?n=200")
        yield docrawl(spider)
        self._assert_got_response_code(200)

    @defer.inlineCallbacks
    def test_https_noconnect(self):
        os.environ[
            'https_proxy'] = 'http://*****:*****@localhost:8888?noconnect'
        spider = SimpleSpider("https://*****:*****@localhost:8888'

    @defer.inlineCallbacks
    def test_https_connect_tunnel_error(self):
        spider = SimpleSpider("https://localhost:99999/status?n=200")
        yield docrawl(spider)
        self._assert_got_tunnel_error()

    @defer.inlineCallbacks
    def test_https_tunnel_auth_error(self):
        os.environ['https_proxy'] = 'http://*****:*****@localhost:8888'
        spider = SimpleSpider("https://*****:*****@localhost:8888'

    @defer.inlineCallbacks
    def test_https_noconnect_auth_error(self):
        os.environ[
            'https_proxy'] = 'http://*****:*****@localhost:8888?noconnect'
        spider = SimpleSpider("https://localhost:8999/status?n=200")
        yield docrawl(spider)
        self._assert_got_response_code(407)

    def _assert_got_response_code(self, code):
        log = get_testlog()
        self.assertEqual(log.count('Crawled (%d)' % code), 1)

    def _assert_got_tunnel_error(self):
        log = get_testlog()
        self.assertEqual(log.count('TunnelError'), 1)
Example #18
 def setUp(self):
     self.mockserver = MockServer()
     self.mockserver.__enter__()
Example #19
 def run(self, args, opts):
     with MockServer():
         spider = FollowAllSpider(total=100000)
         self.crawler.crawl(spider)
         self.crawler.start()
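
All of the examples above rely on helpers defined elsewhere in the test suite and not shown here: docrawl(spider, settings) runs a single spider to completion under Twisted's trial reactor, and get_testlog() returns the captured log output as a string. A rough, hypothetical stand-in for such a crawl helper, written against Scrapy's public CrawlerRunner API (the real in-tree helper accepts an already-instantiated spider and may differ in detail):

# Hypothetical sketch of a docrawl()-style helper; not the helper used above.
from scrapy.crawler import CrawlerRunner


def docrawl(spider_cls, settings=None, **spider_kwargs):
    """Crawl one spider and return the Deferred that fires when it closes."""
    runner = CrawlerRunner(settings or {})
    # CrawlerRunner.crawl() expects a Spider class; the snippets above pass
    # spider instances to an older internal helper with a looser signature.
    return runner.crawl(spider_cls, **spider_kwargs)

In an inlineCallbacks test method, the returned Deferred would be yielded exactly as the examples do.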