Code example #1
    def test_url_is_from_spider_with_allowed_domains(self):
        spider = BaseSpider(name='example.com',
                            allowed_domains=['example.org', 'example.net'])
        self.assertTrue(
            url_is_from_spider('http://www.example.com/some/page.html',
                               spider))
        self.assertTrue(
            url_is_from_spider('http://sub.example.com/some/page.html',
                               spider))
        self.assertTrue(
            url_is_from_spider('http://example.com/some/page.html', spider))
        self.assertTrue(
            url_is_from_spider('http://www.example.org/some/page.html',
                               spider))
        self.assertTrue(
            url_is_from_spider('http://www.example.net/some/page.html',
                               spider))
        self.assertFalse(
            url_is_from_spider('http://www.example.us/some/page.html', spider))

        spider = BaseSpider(name='example.com',
                            allowed_domains=set(
                                ('example.com', 'example.net')))
        self.assertTrue(
            url_is_from_spider('http://www.example.com/some/page.html',
                               spider))

        spider = BaseSpider(name='example.com',
                            allowed_domains=('example.com', 'example.net'))
        self.assertTrue(
            url_is_from_spider('http://www.example.com/some/page.html',
                               spider))
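
The test above exercises url_is_from_spider, which treats a URL as belonging to a spider when its host matches the spider's name or one of its allowed_domains. A minimal standalone sketch of the same check, with import paths assumed from the legacy Scrapy 0.x layout these tests target (scrapy.spider.BaseSpider and scrapy.utils.url.url_is_from_spider):

# Sketch only; the import paths are assumptions based on the legacy Scrapy 0.x layout.
from scrapy.spider import BaseSpider
from scrapy.utils.url import url_is_from_spider

spider = BaseSpider(name='example.com', allowed_domains=['example.org'])
assert url_is_from_spider('http://sub.example.com/page.html', spider)      # host matches the spider name
assert url_is_from_spider('http://www.example.org/page.html', spider)      # host is in allowed_domains
assert not url_is_from_spider('http://www.example.us/page.html', spider)   # neither the name nor an allowed domain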
Code example #2
 def _assert_stores(self, storage, path):
     yield storage.store(StringIO("content"), BaseSpider("default"))
     self.failUnless(os.path.exists(path))
     self.failUnlessEqual(open(path).read(), "content")
     # again, to check files are overwritten properly
     yield storage.store(StringIO("new content"), BaseSpider("default"))
     self.failUnlessEqual(open(path).read(), "new content")
Code example #3
 def setUp(self):
     self.spider1 = BaseSpider('name1')
     self.spider2 = BaseSpider('name2')
     open_spiders = set([self.spider1, self.spider2])
     crawler = CrawlerMock(open_spiders)
     self.spref = SpiderReferencer(crawler)
     self.encoder = ScrapyJSONEncoder(spref=self.spref)
     self.decoder = ScrapyJSONDecoder(spref=self.spref)
Code example #4
    def test_host_header_seted_in_request_headers(self):
        def _test(response):
            self.assertEquals(response.body, 'example.com')
            self.assertEquals(request.headers.get('Host'), 'example.com')

        request = Request(self.getURL('host'), headers={'Host': 'example.com'})
        return self.download_request(request, BaseSpider('foo')).addCallback(_test)
Code example #5
 def test_payload(self):
     body = '1' * 100  # PayloadResource requires body length to be 100
     request = Request(self.getURL('payload'), method='POST', body=body)
     d = self.download_request(request, BaseSpider('foo'))
     d.addCallback(lambda r: r.body)
     d.addCallback(self.assertEquals, body)
     return d
Code example #6
    def test_host_header_not_in_request_headers(self):
        def _test(response):
            self.assertEquals(response.body, '127.0.0.1:%d' % self.portno)
            self.assertEquals(request.headers, {})

        request = Request(self.getURL('host'))
        return self.download_request(request, BaseSpider('foo')).addCallback(_test)
Code example #7
    def test_rules_manager_callbacks(self):
        mycallback = lambda: True

        spider = BaseSpider('foo')
        spider.parse_item = lambda: True

        response1 = HtmlResponse('http://example.org')
        response2 = HtmlResponse('http://othersite.org')

        rulesman = RulesManager(
            [Rule('example', mycallback),
             Rule('othersite', 'parse_item')],
            spider, default_matcher=UrlRegexMatcher)

        rule1 = rulesman.get_rule_from_response(response1)
        rule2 = rulesman.get_rule_from_response(response2)

        self.failUnlessEqual(rule1.callback, mycallback)
        self.failUnlessEqual(rule2.callback, spider.parse_item)

        # an unknown callback name should raise AttributeError
        self.assertRaises(AttributeError, RulesManager,
                          [Rule(BaseMatcher(), 'mycallback')], spider)
        # a callback name that is not callable should also raise AttributeError
        spider.not_callable = True
        self.assertRaises(AttributeError, RulesManager,
                          [Rule(BaseMatcher(), 'not_callable')], spider)
Code example #8
    def test_rules_manager_callback_with_arguments(self):
        spider = BaseSpider('foo')
        response = HtmlResponse('http://example.org')

        kwargs = {'a': 1}

        def myfunc(**mykwargs):
            return mykwargs

        # sanity check: myfunc returns its keyword arguments unchanged
        self.failUnlessEquals(kwargs, myfunc(**kwargs))

        # test callback w/o arguments
        rulesman = RulesManager([
            Rule(BaseMatcher(), myfunc),
        ], spider)
        rule = rulesman.get_rule_from_response(response)

        # without arguments, the rule keeps the original callback object
        self.failUnlessEqual(rule.callback, myfunc)

        # test callback w/ arguments
        rulesman = RulesManager([
            Rule(BaseMatcher(), myfunc, **kwargs),
        ], spider)
        rule = rulesman.get_rule_from_response(response)

        # with arguments, the rule returns a functools.partial wrapping the callback
        self.failUnless(isinstance(rule.callback, partial))
        self.failUnlessEquals(kwargs, rule.callback())
Code example #9
 def test_store(self):
     out = StringIO()
     storage = StdoutFeedStorage('stdout:', _stdout=out)
     file = storage.open(BaseSpider("default"))
     file.write("content")
     yield storage.store(file)
     self.assertEqual(out.getvalue(), "content")
Code example #10
    def test_scheduler_persistent(self):
        messages = []
        spider = BaseSpider('myspider')
        spider.log = lambda *args, **kwargs: messages.append([args, kwargs])

        self.scheduler.persist = True
        self.scheduler.open(spider)

        self.assertEqual(messages, [])

        self.scheduler.enqueue_request(Request('http://example.com/page1'))
        self.scheduler.enqueue_request(Request('http://example.com/page2'))

        self.assertTrue(self.scheduler.has_pending_requests())
        self.scheduler.close('finish')

        self.scheduler.open(spider)
        self.assertEqual(messages, [
            [('Resuming crawl (2 requests scheduled)', ), {}],
        ])
        self.assertEqual(len(self.scheduler), 2)

        self.scheduler.persist = False
        self.scheduler.close('finish')

        self.assertEqual(len(self.scheduler), 0)
Code example #11
    def setUp(self):
        self.spider = BaseSpider('scrapytest.org')

        self.stats = StatsCollector()
        self.stats.open_spider(self.spider)

        self.mw = DepthMiddleware(1, self.stats, True)
Code example #12
 def get_defaults_spider_mw(self):
     crawler = get_crawler()
     spider = BaseSpider('foo')
     spider.set_crawler(crawler)
     defaults = dict([(k, [v]) for k, v in \
         crawler.settings.get('DEFAULT_REQUEST_HEADERS').iteritems()])
     return defaults, spider, DefaultHeadersMiddleware()
Code example #13
 def _assert_stores(self, storage, path):
     spider = BaseSpider("default")
     file = storage.open(spider)
     file.write("content")
     yield storage.store(file)
     self.failUnless(os.path.exists(path))
     self.failUnlessEqual(open(path).read(), "content")
Code example #14
    def test_download_without_proxy(self):
        def _test(response):
            self.assertEquals(response.status, 200)
            self.assertEquals(response.url, request.url)
            self.assertEquals(response.body, '/path/to/resource')

        request = Request(self.getURL('path/to/resource'))
        return self.download_request(request, BaseSpider('foo')).addCallback(_test)
Code example #15
 def test_state_attribute(self):
     # the state attribute must be present even when jobdir is not set, to
     # provide a consistent interface
     spider = BaseSpider(name='default')
     ss = SpiderState()
     ss.spider_opened(spider)
     self.assertEqual(spider.state, {})
     ss.spider_closed(spider)
Code example #16
    def test_filter(self):
        spider = BaseSpider('foo')
        filter = NullDupeFilter()
        filter.open_spider(spider)

        r1 = Request('http://scrapytest.org/1')
        assert not filter.request_seen(spider, r1)
        filter.close_spider(spider)
Code example #17
    def setUp(self):
        self.spider = BaseSpider('scrapytest.org')
        self.mw = DownloaderStats()

        stats.open_spider(self.spider)

        self.req = Request('http://scrapytest.org')
        self.res = Response('scrapytest.org', status=400)
Code example #18
    def setUp(self):
        self.spider = BaseSpider('scrapytest.org')

        self.stats = StatsCollector()
        self.stats.open_spider(self.spider)

        self.mw = DepthMiddleware(1, self.stats)
        self.assertEquals(self.stats.get_value('envinfo/request_depth_limit'), 1)
Code example #19
    def test_store_load(self):
        jobdir = self.mktemp()
        os.mkdir(jobdir)
        spider = BaseSpider(name='default')
        dt = datetime.now()

        ss = SpiderState(jobdir)
        ss.spider_opened(spider)
        spider.state['one'] = 1
        spider.state['dt'] = dt
        ss.spider_closed(spider)

        spider2 = BaseSpider(name='default')
        ss2 = SpiderState(jobdir)
        ss2.spider_opened(spider2)
        # the state persisted for the first spider should have been loaded into spider2
        self.assertEqual(spider2.state, {'one': 1, 'dt': dt})
        ss2.spider_closed(spider2)
Code example #20
 def setUp(self):
     self.crawler = get_crawler(self.settings_dict)
     self.spider = BaseSpider('foo')
     self.spider.set_crawler(self.crawler)
     self.mwman = DownloaderMiddlewareManager.from_crawler(self.crawler)
     # some middlewares depend on the stats collector
     self.crawler.stats.open_spider(self.spider)
     return self.mwman.open_spider(self.spider)
Code example #21
 def _schedule(self, request, spider):
     if spider is None:
         spider = create_spider_for_request(self.crawler.spiders, request, \
             BaseSpider('default'), log_multiple=True)
     spider.set_crawler(self.crawler)
     self.crawler.engine.open_spider(spider)
     d = self.crawler.engine.schedule(request, spider)
     d.addCallback(lambda x: (x, spider))
     return d
Code example #22
    def setUp(self):
        self.spider = BaseSpider('foo')
        self.mw = HttpErrorMiddleware()
        self.req = Request('http://scrapytest.org')

        self.res200 = Response('http://scrapytest.org', status=200)
        self.res200.request = self.req
        self.res404 = Response('http://scrapytest.org', status=404)
        self.res404.request = self.req
Code example #23
    def test_rules_manager_empty_rule(self):
        spider = BaseSpider('foo')
        response = HtmlResponse('http://example.org')

        rulesman = RulesManager([Rule(follow=True)], spider)

        rule = rulesman.get_rule_from_response(response)
        # when no matcher is given, BaseMatcher is used as the default
        self.failUnless(isinstance(rule.matcher, BaseMatcher))
Code example #24
    def test_download_with_proxy(self):
        def _test(response):
            self.assertEquals(response.status, 200)
            self.assertEquals(response.url, request.url)
            self.assertEquals(response.body, 'https://example.com')

        http_proxy = self.getURL('')
        request = Request('https://example.com', meta={'proxy': http_proxy})
        return self.download_request(request, BaseSpider('foo')).addCallback(_test)
Code example #25
    def test_download(self):
        def _test(response):
            self.assertEquals(response.url, request.url)
            self.assertEquals(response.status, 200)
            self.assertEquals(response.body, '0123456789')

        request = Request(path_to_file_uri(self.tmpname + '^'))
        assert request.url.upper().endswith('%5E')
        return self.download_request(request, BaseSpider('foo')).addCallback(_test)
Code example #26
 def _assert_stores(self, storage, path):
     spider = BaseSpider("default")
     file = storage.open(spider)
     file.write("content")
     yield storage.store(file)
     self.failUnless(os.path.exists(path))
     self.failUnlessEqual(open(path).read(), "content")
     # again, to check that S3 objects are overwritten properly
     yield storage.store(StringIO("new content"))
     self.failUnlessEqual(open(path).read(), "new content")
Code example #27
 def _open_spider(self, request, spider):
     if self.spider:
         return self.spider
     if spider is None:
         spider = create_spider_for_request(self.crawler.spiders, request, \
             BaseSpider('default'), log_multiple=True)
     spider.set_crawler(self.crawler)
     self.crawler.engine.open_spider(spider, close_if_idle=False)
     self.spider = spider
     return spider
Code example #28
    def setUp(self):
        self.spider = BaseSpider('foo')
        self.mw = HttpErrorMiddleware(Settings({'HTTPERROR_ALLOW_ALL': True}))
        self.req = Request('http://scrapytest.org')

        self.res200 = Response('http://scrapytest.org', status=200)
        self.res200.request = self.req
        self.res404 = Response('http://scrapytest.org', status=404)
        self.res404.request = self.req
        self.res402 = Response('http://scrapytest.org', status=402)
        self.res402.request = self.req
Code example #29
 def setUp(self):
     self.crawler = get_crawler()
     self.spider = BaseSpider('example.com')
     self.tmpdir = tempfile.mkdtemp()
     self.request = Request('http://www.example.com',
                            headers={'User-Agent': 'test'})
     self.response = Response('http://www.example.com',
                              headers={'Content-Type': 'text/html'},
                              body='test body',
                              status=202)
     self.crawler.stats.open_spider(self.spider)
Code example #30
 def test_timeout_download_from_spider(self):
     spider = BaseSpider('foo')
     meta = {'download_timeout': 0.2}
     # client connects but no data is received
     request = Request(self.getURL('wait'), meta=meta)
     d = self.download_request(request, spider)
     yield self.assertFailure(d, defer.TimeoutError, error.TimeoutError)
     # client connects, server sends headers and some body bytes, then hangs
     request = Request(self.getURL('hang-after-headers'), meta=meta)
     d = self.download_request(request, spider)
     yield self.assertFailure(d, defer.TimeoutError, error.TimeoutError)
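
Across these examples the same setup pattern recurs: construct a BaseSpider with a name, bind it to a crawler with set_crawler, and open the per-spider stats before exercising downloader or spider middleware (see examples #20 and #29). A condensed sketch of that pattern, assuming the legacy helpers used above (get_crawler from scrapy.utils.test and BaseSpider from scrapy.spider):

# Sketch of the recurring setup pattern; the import paths are assumptions
# based on the legacy Scrapy 0.x layout that these tests target.
from scrapy.spider import BaseSpider
from scrapy.utils.test import get_crawler

crawler = get_crawler()                # crawler built from default settings
spider = BaseSpider('example.com')     # a spider is identified by its name
spider.set_crawler(crawler)            # bind the spider to its crawler
crawler.stats.open_spider(spider)      # several middlewares rely on per-spider stats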