def test_url_is_from_spider_with_allowed_domains(self):
    spider = BaseSpider(name='example.com',
                        allowed_domains=['example.org', 'example.net'])
    self.assertTrue(url_is_from_spider('http://www.example.com/some/page.html', spider))
    self.assertTrue(url_is_from_spider('http://sub.example.com/some/page.html', spider))
    self.assertTrue(url_is_from_spider('http://example.com/some/page.html', spider))
    self.assertTrue(url_is_from_spider('http://www.example.org/some/page.html', spider))
    self.assertTrue(url_is_from_spider('http://www.example.net/some/page.html', spider))
    self.assertFalse(url_is_from_spider('http://www.example.us/some/page.html', spider))

    # allowed_domains may also be a set or a tuple
    spider = BaseSpider(name='example.com',
                        allowed_domains=set(('example.com', 'example.net')))
    self.assertTrue(url_is_from_spider('http://www.example.com/some/page.html', spider))

    spider = BaseSpider(name='example.com',
                        allowed_domains=('example.com', 'example.net'))
    self.assertTrue(url_is_from_spider('http://www.example.com/some/page.html', spider))
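# The test above exercises domain matching against both the spider name and
# its allowed_domains, including subdomains. A minimal sketch of the logic
# being tested, assuming suffix matching on the URL's host (illustrative
# re-implementation, not Scrapy's actual code):
from urlparse import urlparse

def _url_is_from_spider_sketch(url, spider):
    """Return True if the URL's host equals, or is a subdomain of, the
    spider name or any entry in spider.allowed_domains."""
    host = urlparse(url).netloc.lower()
    domains = [spider.name] + list(getattr(spider, 'allowed_domains', None) or [])
    return any(host == d or host.endswith('.' + d) for d in domains)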
def _assert_stores(self, storage, path):
    yield storage.store(StringIO("content"), BaseSpider("default"))
    self.failUnless(os.path.exists(path))
    self.failUnlessEqual(open(path).read(), "content")
    # again, to check files are overwritten properly
    yield storage.store(StringIO("new content"), BaseSpider("default"))
    self.failUnlessEqual(open(path).read(), "new content")
def setUp(self):
    self.spider1 = BaseSpider('name1')
    self.spider2 = BaseSpider('name2')
    open_spiders = set([self.spider1, self.spider2])
    crawler = CrawlerMock(open_spiders)
    self.spref = SpiderReferencer(crawler)
    self.encoder = ScrapyJSONEncoder(spref=self.spref)
    self.decoder = ScrapyJSONDecoder(spref=self.spref)
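# The fixture above wires a SpiderReferencer through JSON encoding and
# decoding: live spider objects are serialized as lightweight references and
# resolved back to the crawler's open spiders on decode. A sketch of that
# round trip (the token format and method names are purely illustrative):
class _SpiderReferencerSketch(object):
    def __init__(self, open_spiders):
        self._by_name = dict((s.name, s) for s in open_spiders)

    def get_reference_from_spider(self, spider):
        return 'spider:%s' % spider.name

    def get_spider_from_reference(self, ref):
        return self._by_name[ref.split(':', 1)[1]]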
def test_host_header_set_in_request_headers(self):
    def _test(response):
        self.assertEquals(response.body, 'example.com')
        self.assertEquals(request.headers.get('Host'), 'example.com')

    request = Request(self.getURL('host'), headers={'Host': 'example.com'})
    return self.download_request(request, BaseSpider('foo')).addCallback(_test)
def test_payload(self):
    body = '1' * 100  # PayloadResource requires body length to be 100
    request = Request(self.getURL('payload'), method='POST', body=body)
    d = self.download_request(request, BaseSpider('foo'))
    d.addCallback(lambda r: r.body)
    d.addCallback(self.assertEquals, body)
    return d
def test_host_header_not_in_request_headers(self):
    def _test(response):
        self.assertEquals(response.body, '127.0.0.1:%d' % self.portno)
        self.assertEquals(request.headers, {})

    request = Request(self.getURL('host'))
    return self.download_request(request, BaseSpider('foo')).addCallback(_test)
def test_rules_manager_callbacks(self):
    mycallback = lambda: True
    spider = BaseSpider('foo')
    spider.parse_item = lambda: True
    response1 = HtmlResponse('http://example.org')
    response2 = HtmlResponse('http://othersite.org')
    rulesman = RulesManager([
        Rule('example', mycallback),
        Rule('othersite', 'parse_item'),
    ], spider, default_matcher=UrlRegexMatcher)
    rule1 = rulesman.get_rule_from_response(response1)
    rule2 = rulesman.get_rule_from_response(response2)
    self.failUnlessEqual(rule1.callback, mycallback)
    self.failUnlessEqual(rule2.callback, spider.parse_item)
    # fail on unknown callback
    self.assertRaises(AttributeError, RulesManager,
                      [Rule(BaseMatcher(), 'mycallback')], spider)
    # fail on callback that is not callable
    spider.not_callable = True
    self.assertRaises(AttributeError, RulesManager,
                      [Rule(BaseMatcher(), 'not_callable')], spider)
def test_rules_manager_callback_with_arguments(self):
    spider = BaseSpider('foo')
    response = HtmlResponse('http://example.org')
    kwargs = {'a': 1}

    def myfunc(**mykwargs):
        return mykwargs

    # sanity check: myfunc returns the keyword arguments it receives
    self.failUnlessEqual(kwargs, myfunc(**kwargs))

    # a callback without arguments should be returned as-is
    rulesman = RulesManager([
        Rule(BaseMatcher(), myfunc),
    ], spider)
    rule = rulesman.get_rule_from_response(response)
    self.failUnlessEqual(rule.callback, myfunc)

    # a callback with arguments should be returned as a partial
    rulesman = RulesManager([
        Rule(BaseMatcher(), myfunc, **kwargs),
    ], spider)
    rule = rulesman.get_rule_from_response(response)
    self.failUnless(isinstance(rule.callback, partial))
    self.failUnlessEqual(kwargs, rule.callback())
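# The two rules-manager tests above pin down how callbacks are resolved:
# strings are looked up as spider attributes (raising AttributeError when
# missing or not callable), and extra keyword arguments are bound with
# functools.partial. A sketch of that resolution step under those
# assumptions (_resolve_callback is a hypothetical helper, not the real API):
from functools import partial

def _resolve_callback(spider, callback, **kwargs):
    if isinstance(callback, basestring):
        callback = getattr(spider, callback)  # AttributeError if missing
    if not callable(callback):
        raise AttributeError("callback %r is not callable" % callback)
    return partial(callback, **kwargs) if kwargs else callback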
def test_store(self):
    out = StringIO()
    storage = StdoutFeedStorage('stdout:', _stdout=out)
    file = storage.open(BaseSpider("default"))
    file.write("content")
    yield storage.store(file)
    self.assertEqual(out.getvalue(), "content")
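# test_store above assumes a storage whose open() hands back the stdout
# stream itself and whose store() has nothing left to move into place. A
# minimal sketch of such a storage (assumed shape, inferred only from the
# open/store calls the test makes):
import sys

class _StdoutFeedStorageSketch(object):
    def __init__(self, uri, _stdout=sys.stdout):
        self._stdout = _stdout

    def open(self, spider):
        return self._stdout

    def store(self, file):
        pass  # data was already written directly to stdout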
def test_scheduler_persistent(self):
    messages = []
    spider = BaseSpider('myspider')
    spider.log = lambda *args, **kwargs: messages.append([args, kwargs])

    self.scheduler.persist = True
    self.scheduler.open(spider)
    self.assertEqual(messages, [])

    self.scheduler.enqueue_request(Request('http://example.com/page1'))
    self.scheduler.enqueue_request(Request('http://example.com/page2'))
    self.assertTrue(self.scheduler.has_pending_requests())
    self.scheduler.close('finish')

    self.scheduler.open(spider)
    self.assertEqual(messages, [
        [('Resuming crawl (2 requests scheduled)', ), {}],
    ])
    self.assertEqual(len(self.scheduler), 2)

    self.scheduler.persist = False
    self.scheduler.close('finish')
    self.assertEqual(len(self.scheduler), 0)
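# test_scheduler_persistent fixes the contract: with persist=True the queue
# survives a close/open cycle (and the reopen logs a "Resuming crawl"
# message); with persist=False, close() discards pending requests. A sketch
# of the close() side of that contract, assuming a disk-backed queue with a
# clear() method (names illustrative, not the real scheduler API):
class _PersistentSchedulerSketch(object):
    def __init__(self, queue, persist=False):
        self.queue = queue
        self.persist = persist

    def close(self, reason):
        if not self.persist:
            self.queue.clear()  # drop pending requests from disk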
def setUp(self):
    self.spider = BaseSpider('scrapytest.org')
    self.stats = StatsCollector()
    self.stats.open_spider(self.spider)
    self.mw = DepthMiddleware(1, self.stats, True)
def get_defaults_spider_mw(self):
    crawler = get_crawler()
    spider = BaseSpider('foo')
    spider.set_crawler(crawler)
    defaults = dict([(k, [v]) for k, v in
                     crawler.settings.get('DEFAULT_REQUEST_HEADERS').iteritems()])
    return defaults, spider, DefaultHeadersMiddleware()
def _assert_stores(self, storage, path): spider = BaseSpider("default") file = storage.open(spider) file.write("content") yield storage.store(file) self.failUnless(os.path.exists(path)) self.failUnlessEqual(open(path).read(), "content")
def test_download_without_proxy(self):
    def _test(response):
        self.assertEquals(response.status, 200)
        self.assertEquals(response.url, request.url)
        self.assertEquals(response.body, '/path/to/resource')

    request = Request(self.getURL('path/to/resource'))
    return self.download_request(request, BaseSpider('foo')).addCallback(_test)
def test_state_attribute(self):
    # the state attribute must be present even when jobdir is not set,
    # to provide a consistent interface
    spider = BaseSpider(name='default')
    ss = SpiderState()
    ss.spider_opened(spider)
    self.assertEqual(spider.state, {})
    ss.spider_closed(spider)
def test_filter(self):
    spider = BaseSpider('foo')
    dupefilter = NullDupeFilter()
    dupefilter.open_spider(spider)
    r1 = Request('http://scrapytest.org/1')
    assert not dupefilter.request_seen(spider, r1)
    dupefilter.close_spider(spider)
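# NullDupeFilter, as exercised above, never reports a request as seen, so
# nothing is ever filtered out. A sketch of the whole contract the test
# relies on (open/close hooks plus request_seen), assuming this shape:
class _NullDupeFilterSketch(object):
    def open_spider(self, spider):
        pass

    def close_spider(self, spider):
        pass

    def request_seen(self, spider, request):
        return False  # never dedupe anything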
def setUp(self):
    self.spider = BaseSpider('scrapytest.org')
    self.mw = DownloaderStats()
    stats.open_spider(self.spider)
    self.req = Request('http://scrapytest.org')
    self.res = Response('http://scrapytest.org', status=400)
def setUp(self):
    self.spider = BaseSpider('scrapytest.org')
    self.stats = StatsCollector()
    self.stats.open_spider(self.spider)
    self.mw = DepthMiddleware(1, self.stats)
    self.assertEquals(self.stats.get_value('envinfo/request_depth_limit'), 1)
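# The two DepthMiddleware fixtures above (maxdepth=1, with and without the
# extra flag, which presumably toggles verbose depth stats) point at a
# spider middleware that stamps each outgoing request with its parent's
# depth + 1 and drops requests past the limit. A sketch of that core rule,
# assuming depth travels in request.meta (function name illustrative):
def _filter_by_depth_sketch(requests, response, maxdepth=1):
    parent_depth = response.meta.get('depth', 0)
    for request in requests:
        request.meta['depth'] = parent_depth + 1
        if maxdepth and request.meta['depth'] > maxdepth:
            continue  # too deep: drop it
        yield request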
def test_store_load(self):
    jobdir = self.mktemp()
    os.mkdir(jobdir)
    spider = BaseSpider(name='default')
    dt = datetime.now()

    ss = SpiderState(jobdir)
    ss.spider_opened(spider)
    spider.state['one'] = 1
    spider.state['dt'] = dt
    ss.spider_closed(spider)

    # a fresh spider using the same jobdir must get the persisted state back
    spider2 = BaseSpider(name='default')
    ss2 = SpiderState(jobdir)
    ss2.spider_opened(spider2)
    self.assertEqual(spider2.state, {'one': 1, 'dt': dt})
    ss2.spider_closed(spider2)
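# The SpiderState tests (test_state_attribute and test_store_load) are
# consistent with a helper that pickles spider.state into the job directory
# on close and restores it on open, defaulting to an empty dict when there
# is no jobdir or no saved file. A minimal sketch under those assumptions
# (the file name is illustrative):
import os
import cPickle as pickle

class _SpiderStateSketch(object):
    def __init__(self, jobdir=None):
        self.statefn = os.path.join(jobdir, 'spider.state') if jobdir else None

    def spider_opened(self, spider):
        if self.statefn and os.path.exists(self.statefn):
            with open(self.statefn, 'rb') as f:
                spider.state = pickle.load(f)
        else:
            spider.state = {}

    def spider_closed(self, spider):
        if self.statefn:
            with open(self.statefn, 'wb') as f:
                pickle.dump(spider.state, f, protocol=2)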
def setUp(self):
    self.crawler = get_crawler(self.settings_dict)
    self.spider = BaseSpider('foo')
    self.spider.set_crawler(self.crawler)
    self.mwman = DownloaderMiddlewareManager.from_crawler(self.crawler)
    # some middlewares depend on the stats collector
    self.crawler.stats.open_spider(self.spider)
    return self.mwman.open_spider(self.spider)
def _schedule(self, request, spider):
    if spider is None:
        spider = create_spider_for_request(self.crawler.spiders, request,
                                           BaseSpider('default'), log_multiple=True)
    spider.set_crawler(self.crawler)
    self.crawler.engine.open_spider(spider)
    d = self.crawler.engine.schedule(request, spider)
    d.addCallback(lambda x: (x, spider))
    return d
def setUp(self):
    self.spider = BaseSpider('foo')
    self.mw = HttpErrorMiddleware()
    self.req = Request('http://scrapytest.org')
    self.res200 = Response('http://scrapytest.org', status=200)
    self.res200.request = self.req
    self.res404 = Response('http://scrapytest.org', status=404)
    self.res404.request = self.req
def test_rules_manager_empty_rule(self):
    spider = BaseSpider('foo')
    response = HtmlResponse('http://example.org')
    rulesman = RulesManager([Rule(follow=True)], spider)
    rule = rulesman.get_rule_from_response(response)
    # when no matcher is given, the default BaseMatcher is used
    self.failUnless(isinstance(rule.matcher, BaseMatcher))
def test_download_with_proxy(self):
    def _test(response):
        self.assertEquals(response.status, 200)
        self.assertEquals(response.url, request.url)
        self.assertEquals(response.body, 'https://example.com')

    http_proxy = self.getURL('')
    request = Request('https://example.com', meta={'proxy': http_proxy})
    return self.download_request(request, BaseSpider('foo')).addCallback(_test)
def test_download(self):
    def _test(response):
        self.assertEquals(response.url, request.url)
        self.assertEquals(response.status, 200)
        self.assertEquals(response.body, '0123456789')

    request = Request(path_to_file_uri(self.tmpname + '^'))
    assert request.url.upper().endswith('%5E')
    return self.download_request(request, BaseSpider('foo')).addCallback(_test)
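# test_download above relies on the path_to_file_uri helper percent-encoding
# unsafe characters ('^' becomes %5E). A sketch of that conversion, assuming
# it delegates to urllib's pathname2url (illustrative, not the real helper):
import os
from urllib import pathname2url

def _path_to_file_uri_sketch(path):
    # pathname2url percent-encodes characters like '^' as %5E
    return 'file:///%s' % pathname2url(os.path.abspath(path)).lstrip('/')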
def _assert_stores(self, storage, path): spider = BaseSpider("default") file = storage.open(spider) file.write("content") yield storage.store(file) self.failUnless(os.path.exists(path)) self.failUnlessEqual(open(path).read(), "content") # again, to check s3 objects are overwritten yield storage.store(StringIO("new content")) self.failUnlessEqual(open(path).read(), "new content")
def _open_spider(self, request, spider):
    if self.spider:
        return self.spider
    if spider is None:
        spider = create_spider_for_request(self.crawler.spiders, request,
                                           BaseSpider('default'), log_multiple=True)
    spider.set_crawler(self.crawler)
    self.crawler.engine.open_spider(spider, close_if_idle=False)
    self.spider = spider
    return spider
def setUp(self):
    self.spider = BaseSpider('foo')
    self.mw = HttpErrorMiddleware(Settings({'HTTPERROR_ALLOW_ALL': True}))
    self.req = Request('http://scrapytest.org')
    self.res200 = Response('http://scrapytest.org', status=200)
    self.res200.request = self.req
    self.res404 = Response('http://scrapytest.org', status=404)
    self.res404.request = self.req
    self.res402 = Response('http://scrapytest.org', status=402)
    self.res402.request = self.req
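# The two HttpErrorMiddleware fixtures (default, and with HTTPERROR_ALLOW_ALL
# enabled) suggest a spider middleware that lets 2xx responses through and
# drops the rest unless explicitly allowed. A sketch of that decision only;
# the tests above fix the pass/drop rule, not the filtering mechanism itself
# (class and method names are illustrative):
class _HttpErrorSketch(object):
    def __init__(self, allow_all=False, allowed_codes=()):
        self.allow_all = allow_all
        self.allowed_codes = set(allowed_codes)

    def should_pass(self, response):
        if 200 <= response.status < 300:  # common case: success
            return True
        return self.allow_all or response.status in self.allowed_codes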
def setUp(self):
    self.crawler = get_crawler()
    self.spider = BaseSpider('example.com')
    self.tmpdir = tempfile.mkdtemp()
    self.request = Request('http://www.example.com',
                           headers={'User-Agent': 'test'})
    self.response = Response('http://www.example.com',
                             headers={'Content-Type': 'text/html'},
                             body='test body', status=202)
    self.crawler.stats.open_spider(self.spider)
def test_timeout_download_from_spider(self):
    spider = BaseSpider('foo')
    meta = {'download_timeout': 0.2}

    # client connects but no data is received
    request = Request(self.getURL('wait'), meta=meta)
    d = self.download_request(request, spider)
    yield self.assertFailure(d, defer.TimeoutError, error.TimeoutError)

    # client connects, server sends headers and some body bytes but then hangs
    request = Request(self.getURL('hang-after-headers'), meta=meta)
    d = self.download_request(request, spider)
    yield self.assertFailure(d, defer.TimeoutError, error.TimeoutError)