Example #1
def test_crawl(tmpdir):
    settings = {'CRAWL_ONCE_PATH': str(tmpdir)}
    crawler = get_crawler(settings_dict=settings)
    req1 = scrapy.Request('http://example.com/1', meta={'crawl_once': True})
    req2 = scrapy.Request('http://example.com/2')
    req3 = scrapy.Request('http://example.com/3', meta={'crawl_once': True})

    resp1 = Response(req1.url, request=req1)
    resp2 = Response(req2.url, request=req2)

    with opened_middleware(crawler) as mw:

        # 1. check spider middleware interface
        assert len(mw.db) == 0
        assert crawler.stats.get_value('crawl_once/initial') == 0
        output = [{}, scrapy.Request('http://example.com')]

        # crawl_once is False
        res = list(mw.process_spider_output(resp2, output, crawler.spider))
        assert res == output
        assert len(mw.db) == 0

        # crawl_once is True
        res = list(mw.process_spider_output(resp1, output, crawler.spider))
        assert res == output
        assert len(mw.db) == 1
        assert crawler.stats.get_value('crawl_once/initial') == 0
        assert crawler.stats.get_value('crawl_once/stored') == 1

        # 2. check downloader middleware interface
        assert mw.process_request(req2, crawler.spider) is None
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 0

        with pytest.raises(IgnoreRequest):
            mw.process_request(req1, crawler.spider)
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 1

        assert mw.process_request(req3, crawler.spider) is None
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 1
        assert crawler.stats.get_value('crawl_once/initial') == 0

    crawler = get_crawler(settings_dict=settings)
    with opened_middleware(crawler) as mw2:
        # it reuses the same file, so there are records
        assert len(mw2.db) == 1
        assert crawler.stats.get_value('crawl_once/initial') == 1
        assert mw2.process_request(req2, crawler.spider) is None
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 0
        with pytest.raises(IgnoreRequest):
            mw2.process_request(req1, crawler.spider)
        assert crawler.stats.get_value('crawl_once/ignored', 0) == 1
        assert mw2.process_request(req3, crawler.spider) is None
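The test above relies on a few imports and an opened_middleware helper that are not shown. The following is a rough sketch of that setup, assuming the scrapy-crawl-once middleware; the helper's internals and the import path are assumptions, not the project's actual code:

from contextlib import contextmanager

import pytest
import scrapy
from scrapy.exceptions import IgnoreRequest
from scrapy.http import Response
from scrapy.utils.test import get_crawler

from scrapy_crawl_once import CrawlOnceMiddleware  # assumed import path


@contextmanager
def opened_middleware(crawler):
    # Build the middleware from the crawler and simulate spider_opened/spider_closed,
    # so the on-disk request database is opened and flushed around the test body.
    crawler.spider = crawler._create_spider('spider')  # assumed setup step
    mw = CrawlOnceMiddleware.from_crawler(crawler)
    mw.spider_opened(crawler.spider)
    try:
        yield mw
    finally:
        mw.spider_closed(crawler.spider)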
Example #2
def test_log_formatter_scrapy_1():
    middleware = get_test_middleware()
    logformatter = CrawleraFetchLogFormatter()
    formatter = Formatter()

    for case in get_test_requests():
        original = case["original"]
        response = Response(original.url)
        processed = middleware.process_request(original, foo_spider)

        crawlera_meta = original.meta.get("crawlera_fetch") or {}
        if crawlera_meta.get("skip"):
            assert processed is None
            continue

        # crawled
        result = logformatter.crawled(processed, response, foo_spider)
        assert result["args"]["request"] == str(original)
        record = LogRecord(name="logger",
                           pathname="n/a",
                           lineno=1,
                           exc_info=None,
                           **result)
        logstr = formatter.format(record)
        expected = "Crawled (200) {request} ['original url: {url}'] (referer: None)".format(
            request=original, url=original.url)
        assert logstr == expected
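For reference, a formatter like CrawleraFetchLogFormatter is normally activated through Scrapy's LOG_FORMATTER setting; a minimal sketch, where the exact import path is an assumption:

# settings.py (sketch; the formatter's module path is assumed)
LOG_FORMATTER = "crawlera_fetch.CrawleraFetchLogFormatter"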
Example #3
    def errback_broken_link(self, failure: Failure) -> Iterator[FoundLink]:  # Iterator imported from typing
        """
        Handles links that cause Twisted failures, which covers most of the broken links this
        spider hopes to find.

        :param failure: a Twisted Failure raised by the Retry middleware
        :return: yields FoundLink items for DNS/TCP errors; yields nothing otherwise
        """
        # Structure of this function heavily inspired by:
        # https://docs.scrapy.org/en/latest/topics/request-response.html#topics-request-response-ref-errbacks

        # If it's a TCP or DNS error, short-circuit to the pipeline
        if failure.check(DNSLookupError, TCPTimedOutError):
            self.logger.info(f'Handled DNS/TCP related error. {failure.request.url}')
            request = failure.request
            dummy_response = Response(
                url=request.url,
                status=404,  # Kind of a lie
                request=request
            )
            yield from self.parse_broken_link(dummy_response)

        # If the client timed out, report that
        elif failure.check(TimeoutError):
            self.logger.info(f'Client timeout. {failure.request.url}')
            self.logger.error(repr(failure))
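For context, an errback like the one above is attached per request; a rough sketch of how the spider might wire it up (start_urls and the parse callback are illustrative):

    def start_requests(self):
        # Attach the errback so broken links are routed to errback_broken_link
        for url in self.start_urls:
            yield scrapy.Request(url,
                                 callback=self.parse,
                                 errback=self.errback_broken_link)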
Example #4
    def process_request(self, request, spider):
        if spider.name == 'Douyu_info':
            spider.driver.get(request.url)
            # debug output (the original printed a placeholder string here)
            print(request.url)
            # Optionally scroll the page in steps so lazy-loaded content renders:
            # for x in range(1, 11, 2):
            #     i = float(x) / 10
            #     js = "document.body.scrollTop=document.body.scrollHeight * %f" % i
            #     spider.driver.execute_script(js)
            #     time.sleep(1)
            response = Response(url=request.url,
                                body=spider.driver.page_source.encode('utf-8'),
                                request=request)
            return response
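If the rendered page is meant to be parsed with Scrapy selectors, an HtmlResponse is usually a better fit than a bare Response; a minimal sketch of that variant, assuming the same Selenium driver attached to the spider:

    from scrapy.http import HtmlResponse

    def process_request(self, request, spider):
        if spider.name == 'Douyu_info':
            spider.driver.get(request.url)
            # HtmlResponse accepts a text body when an encoding is given,
            # and returning it here replaces the normal download for this request.
            return HtmlResponse(url=request.url,
                                body=spider.driver.page_source,
                                encoding='utf-8',
                                request=request)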
Example #5
def test_log_formatter_scrapy_2():
    middleware = get_test_middleware()
    logformatter = CrawleraFetchLogFormatter()
    formatter = Formatter()
    spider = Spider("foo")

    for case in deepcopy(test_requests):
        original = case["original"]
        response = Response(original.url)
        processed = middleware.process_request(original, spider)

        crawlera_meta = original.meta.get("crawlera_fetch") or {}
        if crawlera_meta.get("skip"):
            assert processed is None
            continue

        # crawled
        result = logformatter.crawled(processed, response, spider)
        assert result["args"]["request"] == str(original)
        record = LogRecord(name="logger",
                           pathname="n/a",
                           lineno=1,
                           exc_info=None,
                           **result)
        logstr = formatter.format(record)
        assert logstr == "Crawled (200) %s (referer: None)" % str(original)

        # spider_error
        result = logformatter.spider_error(Failure(Exception("exc")),
                                           processed, response, spider)
        assert result["args"]["request"] == str(original)
        record = LogRecord(name="logger",
                           pathname="n/a",
                           lineno=1,
                           exc_info=None,
                           **result)
        logstr = formatter.format(record)
        assert logstr == "Spider error processing %s (referer: None)" % str(
            original)

        # download_error
        result = logformatter.download_error(Failure(Exception("exc")),
                                             processed, spider, "foo")
        assert result["args"]["request"] == str(original)
        record = LogRecord(name="logger",
                           pathname="n/a",
                           lineno=2,
                           exc_info=None,
                           **result)
        logstr = formatter.format(record)
        assert logstr == "Error downloading %s: foo" % str(original)
Example #6
    def test_parse(self):
        graph_photo_spider = FacebookPhotoGraphSpider(
            target_username='******', access_token='testAccessToken')

        # created_time and some other metadata are placed in the Request object
        req = Request('https://m.facebook.com/testUsername/1',
                      meta={
                          'id': '1',
                          'created_time': '2016-04-10T12:47:36+0000',
                          'name': 'This is a name text'
                      })
        resp = Response('https://m.facebook.com/testUsername/1',
                        body=photo_page_html_resp,
                        request=req)

        with requests_mock.Mocker() as m:
            # Mock the reactions and comments graph api request response.
            m.get(
                'https://graph.facebook.com/v2.7/1/reactions'
                '?access_token=testAccessToken',
                content=id_reactions_resp,
                headers={'content-type': 'json'})
            m.get(
                'https://graph.facebook.com/v2.7/1/comments'
                '?access_token=testAccessToken',
                content=id_comments_resp,
                headers={'content-type': 'json'})

            result = graph_photo_spider.parse(resp)
            self.assertEqual(
                result, {
                    'comments': [{
                        'created_time': '2017-07-20T03:18:29+0000',
                        'from': {
                            'id': '1234568',
                            'name': 'People B'
                        },
                        'id': '123456',
                        'message': 'comment1'
                    }, {
                        'created_time': '2017-07-19T13:43:14+0000',
                        'from': {
                            'id': '1234567',
                            'name': 'People A'
                        },
                        'id': '123456',
                        'message': 'comment2'
                    }],
                    'comments_num': 2,
                    'created_time': '2016-04-10T12:47:36+0000',
                    'fb_id': '1',
                    'image_url': 'https://justexampleurl.com/test.jpg',
                    'name': 'This is a name text',
                    'reactions': [{
                        'id': '61',
                        'name': 'Irvan',
                        'type': 'LIKE'
                    }, {
                        'id': '62',
                        'name': 'Raymond',
                        'type': 'WOW'
                    }, {
                        'id': '63',
                        'name': 'Ida',
                        'type': 'HAHA'
                    }],
                    'reactions_num': 3,
                    'url': 'https://m.facebook.com/testUsername/1'
                })
Example #7
 def process_exception(self, request, exception, spider):
     if isinstance(exception, self.ALL_EXCEPTIONS):
         print('Got exception: %s' % exception)
         # Return a placeholder Response so the exception is swallowed
         return Response(url='exception')
     # Returning None lets other middlewares (or the default handler) deal with the exception
     return None
Example #8
 def process_exception(self, request, exception, spider):
     return Response(
         url="http://localhost/",
         body=b"Caught " + exception.__class__.__name__.encode("utf-8"),
     )
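As a closing note, downloader middlewares like the ones in Examples #7 and #8 are activated through the project settings; a minimal sketch, where the module path and class name are placeholders:

# settings.py (sketch; "myproject.middlewares.CatchExceptionsMiddleware" is a placeholder path)
DOWNLOADER_MIDDLEWARES = {
    "myproject.middlewares.CatchExceptionsMiddleware": 543,
}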