def test_crawl(tmpdir):
    settings = {'CRAWL_ONCE_PATH': str(tmpdir)}
    crawler = get_crawler(settings_dict=settings)
    req1 = scrapy.Request('http://example.com/1', meta={'crawl_once': True})
    req2 = scrapy.Request('http://example.com/2')
    req3 = scrapy.Request('http://example.com/3', meta={'crawl_once': True})

    resp1 = Response(req1.url, request=req1)
    resp2 = Response(req2.url, request=req2)

    with opened_middleware(crawler) as mw:

        # 1. check spider middleware interface
        assert len(mw.db) == 0
        assert crawler.stats.get_value('crawl_once/initial') == 0
        output = [{}, scrapy.Request('http://example.com')]

        # crawl_once is False
        res = list(mw.process_spider_output(resp2, output, crawler.spider))
        assert res == output
        assert len(mw.db) == 0

        # crawl_once is True
        res = list(mw.process_spider_output(resp1, output, crawler.spider))
        assert res == output
        assert len(mw.db) == 1
        assert crawler.stats.get_value('crawl_once/initial') == 0
        assert crawler.stats.get_value('crawl_once/stored') == 1

        # 2. check downloader middleware interface
        assert mw.process_request(req2, crawler.spider) is None
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 0

        with pytest.raises(IgnoreRequest):
            mw.process_request(req1, crawler.spider)
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 1

        assert mw.process_request(req3, crawler.spider) is None
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 1
        assert crawler.stats.get_value('crawl_once/initial') == 0

    crawler = get_crawler(settings_dict=settings)
    with opened_middleware(crawler) as mw2:
        # it reuses the same file, so there are records
        assert len(mw2.db) == 1
        assert crawler.stats.get_value('crawl_once/initial') == 1

        assert mw2.process_request(req2, crawler.spider) is None
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 0

        with pytest.raises(IgnoreRequest):
            mw2.process_request(req1, crawler.spider)
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 1

        assert mw2.process_request(req3, crawler.spider) is None
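# --- Hedged usage sketch (not part of the original test) ---
# The test above drives a crawl-once middleware through both the spider and
# downloader middleware interfaces. Assuming it is scrapy-crawl-once's
# CrawlOnceMiddleware, it would be enabled in a project's settings.py roughly
# like this; the middleware path and priorities follow that library's docs,
# so treat them as assumptions here:
SPIDER_MIDDLEWARES = {
    'scrapy_crawl_once.CrawlOnceMiddleware': 100,
}
DOWNLOADER_MIDDLEWARES = {
    'scrapy_crawl_once.CrawlOnceMiddleware': 50,
}
# Requests opt in with meta={'crawl_once': True}; fingerprints of completed
# requests are persisted on disk under CRAWL_ONCE_PATH, which is why the
# second crawler in the test sees one stored record.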
def test_log_formatter_scrapy_1():
    middleware = get_test_middleware()
    logformatter = CrawleraFetchLogFormatter()
    formatter = Formatter()

    for case in get_test_requests():
        original = case["original"]
        response = Response(original.url)

        processed = middleware.process_request(original, foo_spider)

        crawlera_meta = original.meta.get("crawlera_fetch") or {}
        if crawlera_meta.get("skip"):
            assert processed is None
            continue

        # crawled
        result = logformatter.crawled(processed, response, foo_spider)
        assert result["args"]["request"] == str(original)

        record = LogRecord(name="logger", pathname="n/a", lineno=1, exc_info=None, **result)
        logstr = formatter.format(record)
        expected = "Crawled (200) {request} ['original url: {url}'] (referer: None)".format(
            request=original, url=original.url)
        assert logstr == expected
def errback_broken_link(self, failure: Failure) -> Iterator[FoundLink]:
    """
    Handles behavior for links which cause Twisted failures - which is most of
    the broken links this spider hopes to find.

    :param failure: a Twisted Failure raised, e.g., by the Retry middleware
    :return: yields FoundLink items produced by ``parse_broken_link``
    """
    # Structure of this function heavily inspired by:
    # https://docs.scrapy.org/en/latest/topics/request-response.html#topics-request-response-ref-errbacks

    # If it's a TCP or DNS error, short-circuit to the pipeline
    if failure.check(DNSLookupError, TCPTimedOutError):
        self.logger.info(f'Handled DNS/TCP related error. {failure.request.url}')
        request = failure.request
        dummy_response = Response(
            url=request.url,
            status=404,  # Kind of a lie: no HTTP response was ever received
            request=request,
        )
        yield from self.parse_broken_link(dummy_response)

    # If the client timed out, report that
    elif failure.check(TimeoutError):
        self.logger.info(f'Client timeout. {failure.request.url}')
        self.logger.error(repr(failure))
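# Hedged sketch (not from the original spider): an errback like the one above
# is attached per-request via Scrapy's errback parameter. The start_urls
# attribute and parse callback named here are illustrative assumptions.
def start_requests(self):
    for url in self.start_urls:
        yield scrapy.Request(
            url,
            callback=self.parse,
            errback=self.errback_broken_link,  # invoked on download failures
        )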
def process_request(self, request, spider):
    if spider.name == 'Douyu_info':
        spider.driver.get(request.url)
        print(request.url)  # debug output
        # Optionally scroll down in steps so lazily loaded content renders:
        # for x in range(1, 11, 2):
        #     i = float(x) / 10
        #     js = "document.body.scrollTop = document.body.scrollHeight * %f" % i
        #     spider.driver.execute_script(js)
        #     time.sleep(1)
        response = Response(
            url=request.url,
            body=spider.driver.page_source.encode('utf-8'),
            request=request,
        )
        return response
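# Hedged alternative (not in the original): returning HtmlResponse instead of
# the base Response lets spider callbacks use response.css()/response.xpath(),
# and it accepts the page source as a str together with an encoding:
from scrapy.http import HtmlResponse

def process_request(self, request, spider):  # sketch of the same hook
    if spider.name == 'Douyu_info':
        spider.driver.get(request.url)
        return HtmlResponse(
            url=request.url,
            body=spider.driver.page_source,
            encoding='utf-8',
            request=request,
        )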
def test_log_formatter_scrapy_2():
    middleware = get_test_middleware()
    logformatter = CrawleraFetchLogFormatter()
    formatter = Formatter()
    spider = Spider("foo")

    for case in deepcopy(test_requests):
        original = case["original"]
        response = Response(original.url)

        processed = middleware.process_request(original, spider)

        crawlera_meta = original.meta.get("crawlera_fetch") or {}
        if crawlera_meta.get("skip"):
            assert processed is None
            continue

        # crawled
        result = logformatter.crawled(processed, response, spider)
        assert result["args"]["request"] == str(original)
        record = LogRecord(name="logger", pathname="n/a", lineno=1, exc_info=None, **result)
        logstr = formatter.format(record)
        assert logstr == "Crawled (200) %s (referer: None)" % str(original)

        # spider_error
        result = logformatter.spider_error(Failure(Exception("exc")), processed, response, spider)
        assert result["args"]["request"] == str(original)
        record = LogRecord(name="logger", pathname="n/a", lineno=1, exc_info=None, **result)
        logstr = formatter.format(record)
        assert logstr == "Spider error processing %s (referer: None)" % str(original)

        # download_error
        result = logformatter.download_error(Failure(Exception("exc")), processed, spider, "foo")
        assert result["args"]["request"] == str(original)
        record = LogRecord(name="logger", pathname="n/a", lineno=2, exc_info=None, **result)
        logstr = formatter.format(record)
        assert logstr == "Error downloading %s: foo" % str(original)
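# Hedged sketch (an assumption, not from the original tests): both
# log-formatter tests above exercise CrawleraFetchLogFormatter, which in a
# real project would be installed through Scrapy's LOG_FORMATTER setting.
# The module path below is inferred from the crawlera-fetch plugin's naming
# and should be verified against that package:
LOG_FORMATTER = 'crawlera_fetch.CrawleraFetchLogFormatter'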
def test_parse(self):
    graph_photo_spider = FacebookPhotoGraphSpider(
        target_username='******', access_token='testAccessToken')

    # created_time and some other metadata are placed in the Request object
    req = Request('https://m.facebook.com/testUsername/1', meta={
        'id': '1',
        'created_time': '2016-04-10T12:47:36+0000',
        'name': 'This is a name text'
    })
    resp = Response('https://m.facebook.com/testUsername/1',
                    body=photo_page_html_resp, request=req)

    with requests_mock.Mocker() as m:
        # Mock the reactions and comments graph api request response.
        m.get(
            'https://graph.facebook.com/v2.7/1/reactions'
            '?access_token=testAccessToken',
            content=id_reactions_resp,
            headers={'content-type': 'json'})
        m.get(
            'https://graph.facebook.com/v2.7/1/comments'
            '?access_token=testAccessToken',
            content=id_comments_resp,
            headers={'content-type': 'json'})

        result = graph_photo_spider.parse(resp)
        self.assertEqual(result, {
            'comments': [{
                'created_time': '2017-07-20T03:18:29+0000',
                'from': {'id': '1234568', 'name': 'People B'},
                'id': '123456',
                'message': 'comment1'
            }, {
                'created_time': '2017-07-19T13:43:14+0000',
                'from': {'id': '1234567', 'name': 'People A'},
                'id': '123456',
                'message': 'comment2'
            }],
            'comments_num': 2,
            'created_time': '2016-04-10T12:47:36+0000',
            'fb_id': '1',
            'image_url': 'https://justexampleurl.com/test.jpg',
            'name': 'This is a name text',
            'reactions': [
                {'id': '61', 'name': 'Irvan', 'type': 'LIKE'},
                {'id': '62', 'name': 'Raymond', 'type': 'WOW'},
                {'id': '63', 'name': 'Ida', 'type': 'HAHA'},
            ],
            'reactions_num': 3,
            'url': 'https://m.facebook.com/testUsername/1'
        })
def process_exception(self, request, exception, spider):
    if isinstance(exception, self.ALL_EXCEPTIONS):
        print('Got exception: %s' % exception)
        # Returning a Response here stops the exception from propagating.
        # Note that 'exception' is not a well-formed URL; a placeholder such
        # as 'http://localhost/exception' would be cleaner.
        response = Response(url='exception')
        return response
def process_exception(self, request, exception, spider):
    # Swallow the downloader exception and hand the spider a synthetic
    # response whose body names the exception class.
    return Response(
        url="http://localhost/",
        body=b"Caught " + exception.__class__.__name__.encode("utf-8"),
    )
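# Hedged sketch (not from the original): a process_exception hook like either
# of the two middlewares above is activated through DOWNLOADER_MIDDLEWARES in
# settings.py; the module path, class name, and priority below are
# illustrative assumptions, not names taken from the source.
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.CatchExceptionMiddleware': 543,
}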