class TestHttpErrorMiddlewareHandleAll(TestCase):
    """Checks for HttpErrorMiddleware built with ``HTTPERROR_ALLOW_ALL`` on."""

    def setUp(self):
        """Create the spider, the middleware and 200/404/402 responses."""
        self.spider = BaseSpider("foo")
        self.mw = HttpErrorMiddleware(Settings({"HTTPERROR_ALLOW_ALL": True}))
        self.req = Request("http://scrapytest.org")

        self.res200 = Response("http://scrapytest.org", status=200)
        self.res200.request = self.req
        self.res404 = Response("http://scrapytest.org", status=404)
        self.res404.request = self.req
        self.res402 = Response("http://scrapytest.org", status=402)
        self.res402.request = self.req

    def test_process_spider_input(self):
        """Both the 200 and the 404 response are accepted (None is returned)."""
        # assertIsNone replaces the deprecated assertEquals(None, ...) alias
        self.assertIsNone(self.mw.process_spider_input(self.res200, self.spider))
        self.assertIsNone(self.mw.process_spider_input(self.res404, self.spider))

    def test_meta_overrides_settings(self):
        """``handle_httpstatus_list`` in request meta wins over the setting.

        With only 404 listed in meta, the 404 response is accepted while the
        402 response raises HttpError even though allow-all is configured.
        """
        request = Request("http://scrapytest.org", meta={"handle_httpstatus_list": [404]})
        res404 = self.res404.copy()
        res404.request = request
        res402 = self.res402.copy()
        res402.request = request

        self.assertIsNone(self.mw.process_spider_input(res404, self.spider))
        self.assertRaises(HttpError, self.mw.process_spider_input, res402, self.spider)
 def test_request_cacheability(self):
     """Verify how request ``Cache-Control`` directives affect caching.

     A ``no-store`` request must leave nothing in storage, a plain request
     gets its response cached, and a ``no-cache`` request must not be
     answered from the cache although the new response is still stored.
     """
     origin = Response(self.request.url, status=200,
                       headers={'Expires': self.tomorrow})
     plain_req = Request('http://example.com')
     nostore_req = plain_req.replace(headers={'Cache-Control': 'no-store'})
     nocache_req = plain_req.replace(headers={'Cache-Control': 'no-cache'})
     with self._middleware() as mw:
         # no-store: the response comes back but nothing is written to storage
         passthrough = self._process_requestresponse(mw, nostore_req, origin)
         self.assertEqualResponse(passthrough, origin)
         assert mw.storage.retrieve_response(self.spider, nostore_req) is None
         # the same exchange without no-store is expected to populate the cache
         first_hit = self._process_requestresponse(mw, plain_req, origin)
         assert 'cached' not in first_hit.flags
         replayed = mw.process_request(plain_req, self.spider)
         assert 'cached' in replayed.flags
         self.assertEqualResponse(first_hit, replayed)
         # no-cache: the cached copy must be bypassed, yet the fresh
         # response may replace what is stored
         refreshed = origin.replace(body=b'foo')
         bypassed = self._process_requestresponse(mw, nocache_req, refreshed)
         self.assertEqualResponse(bypassed, refreshed)
         assert 'cached' not in bypassed.flags
         from_cache = self._process_requestresponse(mw, plain_req, None)
         self.assertEqualResponse(from_cache, refreshed)
         assert 'cached' in from_cache.flags
def _responses(request, status_codes):
    """Return a list with one Response per status code, bound to ``request``.

    Every response is created for ``request.url`` and has its ``request``
    attribute set to the given request object.
    """
    def _bound_response(status):
        # build a single response and tie it back to the originating request
        reply = Response(request.url, status=status)
        reply.request = request
        return reply

    return [_bound_response(status) for status in status_codes]
 def test_cached_and_stale(self):
     """Check how already-expired cached responses are handled.

     Each ``sampledata`` entry is a ``(status, headers)`` pair describing a
     response that is stale as soon as it is stored. For every entry the
     test asserts, in order:

     * the first and the second exchange both return the supplied response
       without the ``cached`` flag;
     * when ``ETag`` or ``Last-Modified`` is present, a 304 reply yields the
       previously supplied response flagged ``cached``, and a 500 reply
       yields the cached copy unless ``must-revalidate`` is in the
       response's ``Cache-Control``;
     * a request carrying ``max-stale`` gets the cached copy unless the
       response's ``Cache-Control`` contains ``no-cache`` or
       ``must-revalidate``.

     NOTE(review): ``_process_requestresponse``, ``_middleware`` and
     ``assertEqualResponse`` are defined outside this block; their exact
     semantics are assumed from how they are used here.
     """
     sampledata = [
         (200, {'Date': self.today, 'Expires': self.yesterday}),
         (200, {'Date': self.today, 'Expires': self.yesterday, 'Last-Modified': self.yesterday}),
         (200, {'Expires': self.yesterday}),
         (200, {'Expires': self.yesterday, 'ETag': 'foo'}),
         (200, {'Expires': self.yesterday, 'Last-Modified': self.yesterday}),
         (200, {'Expires': self.tomorrow, 'Age': '86405'}),
         (200, {'Cache-Control': 'max-age=86400', 'Age': '86405'}),
         # no-cache forces expiration, also revalidation if validators exists
         (200, {'Cache-Control': 'no-cache'}),
         (200, {'Cache-Control': 'no-cache', 'ETag': 'foo'}),
         (200, {'Cache-Control': 'no-cache', 'Last-Modified': self.yesterday}),
         (200, {'Cache-Control': 'no-cache,must-revalidate', 'Last-Modified': self.yesterday}),
         (200, {'Cache-Control': 'must-revalidate', 'Expires': self.yesterday, 'Last-Modified': self.yesterday}),
         (200, {'Cache-Control': 'max-age=86400,must-revalidate', 'Age': '86405'}),
     ]
     with self._middleware() as mw:
         for idx, (status, headers) in enumerate(sampledata):
             # a distinct URL per sample keeps the samples from sharing a request
             req0 = Request('http://example-%d.com' % idx)
             res0a = Response(req0.url, status=status, headers=headers)
             # cache expired response
             res1 = self._process_requestresponse(mw, req0, res0a)
             self.assertEqualResponse(res1, res0a)
             assert 'cached' not in res1.flags
             # Same request but as cached response is stale a new response must
             # be returned
             res0b = res0a.replace(body=b'bar')
             res2 = self._process_requestresponse(mw, req0, res0b)
             self.assertEqualResponse(res2, res0b)
             assert 'cached' not in res2.flags
             # Cache-Control of the sample response, '' when the header is absent
             cc = headers.get('Cache-Control', '')
             # Previous response expired too, subsequent request to same
             # resource must revalidate and succeed on 304 if validators
             # are present
             if 'ETag' in headers or 'Last-Modified' in headers:
                 res0c = res0b.replace(status=304)
                 res3 = self._process_requestresponse(mw, req0, res0c)
                 self.assertEqualResponse(res3, res0b)
                 assert 'cached' in res3.flags
                 # get cached response on server errors unless must-revalidate
                 # in cached response
                 res0d = res0b.replace(status=500)
                 res4 = self._process_requestresponse(mw, req0, res0d)
                 if 'must-revalidate' in cc:
                     assert 'cached' not in res4.flags
                     self.assertEqualResponse(res4, res0d)
                 else:
                     assert 'cached' in res4.flags
                     self.assertEqualResponse(res4, res0b)
             # Requests with max-stale can fetch expired cached responses
             # unless cached response has must-revalidate
             req1 = req0.replace(headers={'Cache-Control': 'max-stale'})
             res5 = self._process_requestresponse(mw, req1, res0b)
             self.assertEqualResponse(res5, res0b)
             if 'no-cache' in cc or 'must-revalidate' in cc:
                 assert 'cached' not in res5.flags
             else:
                 assert 'cached' in res5.flags
# Example no. 5
# 0
def pytest_funcarg__mock_response(request):
    """
    Fake response to the scrape request -- we only fill out the fields used by
    the middleware for testing purposes.

    ``request`` is the pytest request object; the ``scrape_request`` value is
    looked up through it and attached to the returned Response as its
    ``request`` attribute.
    """
    # ``getfuncargvalue`` is the deprecated spelling of ``getfixturevalue``;
    # prefer the current name and fall back so older pytest keeps working.
    lookup = getattr(request, "getfixturevalue", None)
    if lookup is None:
        lookup = request.getfuncargvalue
    scrape_request = lookup("scrape_request")
    mock_response = Response('http://test.com')
    mock_response.request = scrape_request
    return mock_response
# Example no. 6
# 0
    def test_empty_content_type(self):
        """Parsing a Response built from only a URL and a body must not raise.

        The response is constructed without any headers and is tied to the
        first start request of the ``ebay4`` spider before being parsed.
        """
        name = "ebay4"
        spider = self.smanager.create(name)
        generic_form_request = list(spider.start_requests())[0]

        # read the fixture through a context manager so the file handle is
        # closed deterministically instead of being left to the GC
        sample_path = join(_PATH, "data", "ebay_advanced_search.html")
        with open(sample_path) as sample:
            body = sample.read()

        response = Response(url="http://www.ebay.com/sch/ebayadvsearch/?rt=nc",
                            body=body)
        response.request = generic_form_request
        # must not raise an error
        for result in spider.parse(response):
            pass
    def setUp(self):
        """Prepare an allow-all middleware plus 200, 404 and 402 responses."""
        url = "http://scrapytest.org"
        self.spider = BaseSpider("foo")
        self.mw = HttpErrorMiddleware(Settings({"HTTPERROR_ALLOW_ALL": True}))
        self.req = Request(url)

        def _reply(status):
            # every fixture response points back at the shared request
            reply = Response(url, status=status)
            reply.request = self.req
            return reply

        self.res200 = _reply(200)
        self.res404 = _reply(404)
        self.res402 = _reply(402)
    def setUp(self):
        """Build the spider, an allow-all middleware and three responses.

        The responses are stored as ``res200``, ``res404`` and ``res402``
        and all share ``self.req`` as their request.
        """
        site = 'http://scrapytest.org'
        self.spider = Spider('foo')
        self.mw = HttpErrorMiddleware(Settings({'HTTPERROR_ALLOW_ALL': True}))
        self.req = Request(site)

        for code in (200, 404, 402):
            reply = Response(site, status=code)
            reply.request = self.req
            # exposes the response as self.res<code>
            setattr(self, 'res%d' % code, reply)
def test_hs_mware_process_spider_output_filter_request(hs_mware):
    """Only Request objects in the spider output receive the parent id."""
    parent = Response('http://resp-url')
    # the spider output mixes a response (with its own request) and a request
    yielded_response = Response('http://resp-url-child')
    yielded_response.request = Request('http://resp-url-child-req')
    yielded_request = Request('http://req-url-child')
    hs_mware._seen = WeakKeyDictionary({parent: 'riq'})
    spider_output = [yielded_response, yielded_request]
    result = list(
        hs_mware.process_spider_output(parent, spider_output, Spider('test'))
    )
    assert len(result) == 2
    # the response's meta stays untouched, the request gets the parent id
    assert result[0].meta.get(HS_PARENT_ID_KEY) is None
    assert result[1].meta[HS_PARENT_ID_KEY] == 'riq'
def test_hs_middlewares(hs_downloader_middleware, hs_spider_middleware):
    """Walk a parent request and two child requests through both middlewares.

    Asserts that both middlewares share one ``_seen_requests`` mapping, that
    request ids are assigned as 0, 1, 2 on ``process_response``, and that
    children yielded from the spider carry the parent's id.
    """
    dl_mw = hs_downloader_middleware
    sp_mw = hs_spider_middleware
    # both start empty and refer to the very same mapping object
    assert sp_mw._seen_requests == WeakKeyDictionary()
    assert dl_mw._seen_requests == WeakKeyDictionary()
    assert sp_mw._seen_requests is dl_mw._seen_requests

    spider = Spider('test')
    url = 'http://resp-url'
    parent_request = Request(url)
    parent_response = Response(url)

    dl_mw.process_request(parent_request, spider)

    # process_request alone neither tags the request nor records it
    assert HS_REQUEST_ID_KEY not in parent_request.meta
    assert HS_PARENT_ID_KEY not in parent_request.meta
    assert len(sp_mw._seen_requests) == 0
    assert len(dl_mw._seen_requests) == 0

    dl_mw.process_response(parent_request, parent_response, spider)

    assert parent_request.meta[HS_REQUEST_ID_KEY] == 0
    assert parent_request.meta[HS_PARENT_ID_KEY] is None
    assert sp_mw._seen_requests[parent_request] == 0

    parent_response.request = parent_request
    child_a = Request(url)
    child_b = Request(url)
    dict_item = {}
    scrapy_item = Item()
    output = [child_a, child_b, dict_item, scrapy_item]
    processed_output = list(
        sp_mw.process_spider_output(parent_response, output, spider)
    )

    assert processed_output[0] is child_a
    assert child_a.meta[HS_PARENT_ID_KEY] == 0
    assert processed_output[1] is child_b
    assert child_b.meta[HS_PARENT_ID_KEY] == 0
    assert processed_output[2] is dict_item
    assert processed_output[3] is scrapy_item

    # each child is downloaded in turn: ids continue at 1 and 2, parent is 0
    for expected_id, child in enumerate((child_a, child_b), 1):
        child_response = Response(url)
        dl_mw.process_request(child, spider)
        dl_mw.process_response(child, child_response, spider)
        assert child.meta[HS_REQUEST_ID_KEY] == expected_id
        assert child.meta[HS_PARENT_ID_KEY] == 0
class TestHttpErrorMiddleware(TestCase):
    """Checks for HttpErrorMiddleware built from empty settings."""

    def setUp(self):
        """Create the spider, the middleware and a 200 and a 404 response."""
        self.spider = BaseSpider("foo")
        self.mw = HttpErrorMiddleware(Settings({}))
        self.req = Request("http://scrapytest.org")

        self.res200 = Response("http://scrapytest.org", status=200)
        self.res200.request = self.req
        self.res404 = Response("http://scrapytest.org", status=404)
        self.res404.request = self.req

    def test_process_spider_input(self):
        """A 200 response is accepted, a 404 response raises HttpError."""
        # assertIsNone replaces the deprecated assertEquals(None, ...) alias
        self.assertIsNone(self.mw.process_spider_input(self.res200, self.spider))
        self.assertRaises(HttpError, self.mw.process_spider_input, self.res404, self.spider)

    def test_process_spider_exception(self):
        """HttpError is turned into an empty result; other errors give None."""
        self.assertEqual([], self.mw.process_spider_exception(self.res404, HttpError(self.res404), self.spider))
        self.assertIsNone(self.mw.process_spider_exception(self.res404, Exception(), self.spider))

    def test_handle_httpstatus_list(self):
        """404 is accepted when listed in request meta or on the spider."""
        res = self.res404.copy()
        res.request = Request("http://scrapytest.org", meta={"handle_httpstatus_list": [404]})
        self.assertIsNone(self.mw.process_spider_input(res, self.spider))

        self.spider.handle_httpstatus_list = [404]
        self.assertIsNone(self.mw.process_spider_input(self.res404, self.spider))
class TestHttpErrorMiddleware(TestCase):
    """Checks for HttpErrorMiddleware constructed without arguments."""

    def setUp(self):
        """Create the spider, the middleware and a 200 and a 404 response."""
        self.spider = BaseSpider()
        self.mw = HttpErrorMiddleware()
        self.req = Request('http://scrapytest.org')

        self.res200 = Response('http://scrapytest.org', status=200)
        self.res200.request = self.req
        self.res404 = Response('http://scrapytest.org', status=404)
        self.res404.request = self.req

    def test_process_spider_input(self):
        """A 200 response yields None; a 404 response yields an empty list."""
        # assertIsNone/assertEqual replace the deprecated assertEquals alias
        self.assertIsNone(self.mw.process_spider_input(self.res200, self.spider))

        self.assertEqual(self.mw.process_spider_input(self.res404, self.spider),
                         [])

    def test_handle_httpstatus_list(self):
        """404 is accepted when listed in request meta or on the spider."""
        res = self.res404.copy()
        res.request = Request('http://scrapytest.org',
                              meta={'handle_httpstatus_list': [404]})

        self.assertIsNone(self.mw.process_spider_input(res, self.spider))

        self.spider.handle_httpstatus_list = [404]
        self.assertIsNone(self.mw.process_spider_input(self.res404, self.spider))
def test_hs_mware_process_spider_input(hs_mware):
    """process_spider_input writes one request record and tracks the response.

    The record is checked through the keyword arguments of the single
    ``pipe_writer.write_request`` call, and the response must end up in
    ``_seen`` with id 0.
    """
    response = Response('http://resp-url')
    response.request = Request('http://req-url')
    hs_mware.process_spider_input(response, Spider('test'))

    write_request = hs_mware.pipe_writer.write_request
    assert write_request.call_count == 1
    # call_args[1] holds the keyword arguments of the recorded call
    recorded_kwargs = write_request.call_args[1]
    expected_kwargs = {
        'duration': 0,
        'fp': request_fingerprint(response.request),
        'method': 'GET',
        'parent': None,
        'rs': 0,
        'status': 200,
        'url': 'http://resp-url'
    }
    assert recorded_kwargs == expected_kwargs
    assert hs_mware._seen == WeakKeyDictionary({response: 0})
 def test_response_cacheability(self):
     """Check which responses are stored in the cache and which are not.

     Each ``responses`` entry is ``(shouldcache, status, headers)``. For
     every entry the response is processed twice for the same request: the
     second exchange supplies a 304 variant when the entry is expected to be
     cached, otherwise the original response again. Both results must equal
     the original response. Storage is then queried directly: cacheable
     entries must be retrievable and the second result must carry the
     ``cached`` flag with a non-304 status; non-cacheable entries must be
     absent from storage and unflagged.

     NOTE(review): ``_process_requestresponse``, ``_middleware`` and
     ``assertEqualResponse`` are defined outside this block; their exact
     semantics are assumed from how they are used here.
     """
     responses = [
         # 304 is not cacheable no matter what servers sends
         (False, 304, {}),
         (False, 304, {'Last-Modified': self.yesterday}),
         (False, 304, {'Expires': self.tomorrow}),
         (False, 304, {'Etag': 'bar'}),
         (False, 304, {'Cache-Control': 'max-age=3600'}),
         # Always obey no-store cache control
         (False, 200, {'Cache-Control': 'no-store'}),
         # invalid
         (False, 200, {'Cache-Control': 'no-store, max-age=300'}),
         # invalid
         (False, 200, {
          'Cache-Control': 'no-store', 'Expires': self.tomorrow}),
         # Ignore responses missing expiration and/or validation headers
         (False, 200, {}),
         (False, 302, {}),
         (False, 307, {}),
         (False, 404, {}),
         # Cache responses with expiration and/or validation headers
         (True, 200, {'Last-Modified': self.yesterday}),
         (True, 203, {'Last-Modified': self.yesterday}),
         (True, 300, {'Last-Modified': self.yesterday}),
         (True, 301, {'Last-Modified': self.yesterday}),
         (True, 401, {'Last-Modified': self.yesterday}),
         (True, 404, {'Cache-Control': 'public, max-age=600'}),
         (True, 302, {'Expires': self.tomorrow}),
         (True, 200, {'Etag': 'foo'}),
     ]
     with self._middleware() as mw:
         for idx, (shouldcache, status, headers) in enumerate(responses):
             # a distinct URL per sample keeps the samples from sharing a request
             req0 = Request('http://example-%d.com' % idx)
             res0 = Response(req0.url, status=status, headers=headers)
             res1 = self._process_requestresponse(mw, req0, res0)
             # second exchange: 304 for cacheable samples, the same
             # response again otherwise
             res304 = res0.replace(status=304)
             res2 = self._process_requestresponse(
                 mw, req0, res304 if shouldcache else res0)
             self.assertEqualResponse(res1, res0)
             self.assertEqualResponse(res2, res0)
             # inspect the storage backend directly, bypassing the middleware
             resc = mw.storage.retrieve_response(self.spider, req0)
             if shouldcache:
                 self.assertEqualResponse(resc, res1)
                 assert 'cached' in res2.flags and res2.status != 304
             else:
                 self.assertFalse(resc)
                 assert 'cached' not in res2.flags
    def test_parse_declaration_doc(self):
        """The .doc body is converted and passed on as an XmlResponse.

        ``doc2xml`` is replaced by a stub and ``parse_declaration_xml`` by a
        mock; the mock must be called once with an XmlResponse that carries
        the converted body and the original ``year`` meta value.
        """
        response = Response('http://old.vtek.lt/vtek/.../deklaracija2012.doc', body='msword msword msword')
        doc_request = scrapy.Request(response.url)
        response.request = doc_request
        doc_request.meta['year'] = '2012'

        def fake_doc2xml(msword):
            # the spider must hand over the untouched response body
            assert msword == 'msword msword msword'
            return 'xml xml xml'

        with mock.patch('manoseimas.scrapy.spiders.lobbyist_declarations.doc2xml', fake_doc2xml), \
                mock.patch.object(self.spider, 'parse_declaration_xml') as xml_parser:
            list(self.spider.parse_declaration_doc(response))
            assert xml_parser.call_count == 1
            # first positional argument of the recorded call
            forwarded = xml_parser.call_args[0][0]
            assert forwarded.meta['year'] == '2012'
            assert forwarded.body == 'xml xml xml'
            assert isinstance(forwarded, XmlResponse)
 def test_cached_and_fresh(self):
     """Check that still-fresh cached responses are served from the cache.

     Each ``sampledata`` entry is a ``(status, headers)`` pair describing a
     response that should still be fresh once stored. For every entry the
     test asserts that the first exchange is not flagged ``cached``, that a
     second exchange with no network response (``None``) is answered with
     the cached copy, and that a ``max-age=0`` request is not answered by
     ``process_request`` but ends with the cached copy after a 304 reply.
     """
     sampledata = [
         (200, {'Date': self.yesterday, 'Expires': self.tomorrow}),
         (200, {'Date': self.yesterday, 'Cache-Control': 'max-age=86405'}),
         (200, {'Age': '299', 'Cache-Control': 'max-age=300'}),
         # Obey max-age if present over any others
         (200, {'Date': self.today,
                'Age': '86405',
                'Cache-Control': 'max-age=' + str(86400 * 3),
                'Expires': self.yesterday,
                'Last-Modified': self.yesterday,
                }),
         # obey Expires if max-age is not present
         (200, {'Date': self.yesterday,
                'Age': '86400',
                'Cache-Control': 'public',
                'Expires': self.tomorrow,
                'Last-Modified': self.yesterday,
                }),
         # Default missing Date header to right now
         (200, {'Expires': self.tomorrow}),
         # Firefox - Expires if age is greater than 10% of (Date - Last-Modified)
         # Floor division keeps the Age header an integer string ('8639');
         # true division under Python 3 would produce '8639.0'.
         (200, {'Date': self.today, 'Last-Modified': self.yesterday, 'Age': str(86400 // 10 - 1)}),
         # Firefox - Set one year maxage to permanent redirects missing expiration info
         (300, {}), (301, {}), (308, {}),
     ]
     with self._middleware() as mw:
         for idx, (status, headers) in enumerate(sampledata):
             # a distinct URL per sample keeps the samples from sharing a request
             req0 = Request('http://example-%d.com' % idx)
             res0 = Response(req0.url, status=status, headers=headers)
             # cache fresh response
             res1 = self._process_requestresponse(mw, req0, res0)
             self.assertEqualResponse(res1, res0)
             assert 'cached' not in res1.flags
             # return fresh cached response without network interaction
             res2 = self._process_requestresponse(mw, req0, None)
             self.assertEqualResponse(res1, res2)
             assert 'cached' in res2.flags
             # validate cached response if request max-age set as 0
             req1 = req0.replace(headers={'Cache-Control': 'max-age=0'})
             res304 = res0.replace(status=304)
             assert mw.process_request(req1, self.spider) is None
             res3 = self._process_requestresponse(mw, req1, res304)
             self.assertEqualResponse(res1, res3)
             assert 'cached' in res3.flags
# Example no. 17
# 0
def test_spider_crawls_links(spider, scrape_request, html_headers,
                             mock_html_twolinks):
    """The spider must emit requests for exactly the two links on the page."""
    # a single user agent keeps the set of generated requests easy to count
    agent = factories.BatchUserAgentFactory.build(ua_string='Firefox / 11.0')
    spider.batch_user_agents = [agent]

    # hand-built response wrapping the two-link HTML fixture
    page = Response('http://test:12345', body=mock_html_twolinks)
    page.request = scrape_request
    page.headers = html_headers
    page.meta['user_agent'] = agent
    page.status = 200
    page.encoding = u'utf-8'
    page.flags = []

    spider_output = spider.parse(page)

    wanted_urls = set([
        page.url + '/link1.html',
        page.url + '/link2.html',
    ])

    # anything that is not a Request is ignored
    found_urls = set(
        produced.url
        for produced in spider_output
        if isinstance(produced, Request)
    )

    assert wanted_urls == found_urls
def test_hs_mware_process_spider_input(hs_mware):
    """process_spider_input registers the request and tracks the response.

    ``hsref.job.requests.add`` is primed to return ``'riq'``; it must be
    called once with a float ``ts`` plus the expected keyword arguments, and
    the response must be mapped to ``'riq'`` in ``_seen``.
    """
    response = Response('http://resp-url')
    response.request = Request('http://req-url')
    hs_mware.hsref.job.requests.add.return_value = 'riq'
    hs_mware.process_spider_input(response, Spider('test'))

    add_request = hs_mware.hsref.job.requests.add
    assert add_request.call_count == 1
    # call_args[1] holds the keyword arguments of the recorded call
    recorded_kwargs = add_request.call_args[1]
    # the timestamp varies per run, so only its type is checked
    ts = recorded_kwargs.pop('ts', None)
    assert isinstance(ts, float)
    expected_kwargs = {
        'duration': 0,
        'fp': request_fingerprint(response.request),
        'method': 'GET',
        'parent': None,
        'rs': 0,
        'status': 200,
        'url': 'http://resp-url'}
    assert recorded_kwargs == expected_kwargs
    assert hs_mware._seen == WeakKeyDictionary({response: 'riq'})
    def setUp(self):
        """Create the spider, a default middleware and 200/404 responses."""
        target = 'http://scrapytest.org'
        self.spider = BaseSpider('foo')
        self.mw = HttpErrorMiddleware()
        self.req = Request(target)

        ok_response = Response(target, status=200)
        missing_response = Response(target, status=404)
        # both fixture responses originate from the same request
        ok_response.request = self.req
        missing_response.request = self.req
        self.res200 = ok_response
        self.res404 = missing_response
    def test_process_spider_output(self):
        """Check output filtering and the depth stats the middleware records.

        With no depth on the originating request the yielded request passes
        through and ``request_depth_count/1`` becomes 1. Once the
        originating request is marked as depth 1 the same output is dropped
        and ``request_depth_max`` reads 1.

        NOTE(review): the depth limit itself is configured outside this
        method (presumably 1) -- confirm against the fixture setup.
        """
        req = Request('http://scrapytest.org')
        resp = Response('http://scrapytest.org')
        resp.request = req
        result = [Request('http://scrapytest.org')]

        # assertEqual replaces the deprecated assertEquals alias throughout
        out = list(self.mw.process_spider_output(resp, result, self.spider))
        self.assertEqual(out, result)

        rdc = self.stats.get_value('request_depth_count/1', spider=self.spider)
        self.assertEqual(rdc, 1)

        req.meta['depth'] = 1

        out2 = list(self.mw.process_spider_output(resp, result, self.spider))
        self.assertEqual(out2, [])

        rdm = self.stats.get_value('request_depth_max', spider=self.spider)
        self.assertEqual(rdm, 1)
class TestHttpErrorMiddlewareSettings(TestCase):
    """Similar test, but with settings"""

    def setUp(self):
        """Create a middleware that allows 402 plus 200/404/402 responses."""
        self.spider = Spider('foo')
        self.mw = HttpErrorMiddleware(Settings({'HTTPERROR_ALLOWED_CODES': (402,)}))
        self.req = Request('http://scrapytest.org')

        self.res200 = Response('http://scrapytest.org', status=200)
        self.res200.request = self.req
        self.res404 = Response('http://scrapytest.org', status=404)
        self.res404.request = self.req
        self.res402 = Response('http://scrapytest.org', status=402)
        self.res402.request = self.req

    def test_process_spider_input(self):
        """200 and the allowed 402 are accepted; 404 raises HttpError."""
        # assertIsNone replaces the deprecated assertEquals(None, ...) alias
        self.assertIsNone(
            self.mw.process_spider_input(self.res200, self.spider))
        self.assertRaises(HttpError,
                          self.mw.process_spider_input, self.res404, self.spider)
        self.assertIsNone(
            self.mw.process_spider_input(self.res402, self.spider))

    def test_meta_overrides_settings(self):
        """``handle_httpstatus_list`` in request meta wins over the setting.

        With only 404 listed in meta, 404 is accepted and 402 raises even
        though the settings allow 402.
        """
        request = Request('http://scrapytest.org',
                          meta={'handle_httpstatus_list': [404]})
        res404 = self.res404.copy()
        res404.request = request
        res402 = self.res402.copy()
        res402.request = request

        self.assertIsNone(
            self.mw.process_spider_input(res404, self.spider))
        self.assertRaises(HttpError,
                          self.mw.process_spider_input, res402, self.spider)

    def test_spider_override_settings(self):
        """``handle_httpstatus_list`` on the spider wins over the setting."""
        self.spider.handle_httpstatus_list = [404]
        self.assertIsNone(
            self.mw.process_spider_input(self.res404, self.spider))
        self.assertRaises(HttpError,
                          self.mw.process_spider_input, self.res402, self.spider)
    def _getresponse(self, coding):
        """Build a Response whose body is the sample file for ``coding``.

        ``coding`` must be a key of ``FORMAT``; the entry supplies the sample
        file name and the ``Content-Encoding`` header value. The response is
        tied to a request that advertises ``gzip,deflate``.

        Raises ValueError when ``coding`` is not a key of ``FORMAT``.
        """
        if coding not in FORMAT:
            raise ValueError()

        sample_name, content_encoding = FORMAT[coding]
        sample_path = join(SAMPLEDIR, sample_name)

        with open(sample_path, "rb") as handle:
            payload = handle.read()

        reply = Response(
            "http://scrapytest.org/",
            body=payload,
            headers={
                "Server": "Yaws/1.49 Yet Another Web Server",
                "Date": "Sun, 08 Mar 2009 00:41:03 GMT",
                "Content-Length": len(payload),
                "Content-Type": "text/html",
                "Content-Encoding": content_encoding,
            },
        )
        reply.request = Request("http://scrapytest.org", headers={"Accept-Encoding": "gzip,deflate"})
        return reply
    def _getresponse(self, coding):
        """Return a sample Response for the given ``coding`` key of ``FORMAT``.

        The body is read in binary mode from the sample file named by the
        ``FORMAT`` entry, whose second element becomes the
        ``Content-Encoding`` header. Raises ValueError for unknown codings.
        """
        if coding not in FORMAT:
            raise ValueError()

        filename, encoding_header = FORMAT[coding]

        with open(join(SAMPLEDIR, filename), 'rb') as source:
            raw = source.read()

        # header values mirror a recorded server reply; length tracks the body
        header_map = {}
        header_map['Server'] = 'Yaws/1.49 Yet Another Web Server'
        header_map['Date'] = 'Sun, 08 Mar 2009 00:41:03 GMT'
        header_map['Content-Length'] = len(raw)
        header_map['Content-Type'] = 'text/html'
        header_map['Content-Encoding'] = encoding_header

        result = Response('http://scrapytest.org/', body=raw, headers=header_map)
        result.request = Request('http://scrapytest.org', headers={'Accept-Encoding': 'gzip,deflate'})
        return result
# Example no. 24
# 0
  def _test_404_middleware(self):
    """Drive Handle404.process_spider_output with several response kinds.

    Covers 404 pages built from each ``HTML_404_STRING`` entry (no output
    expected), 404 pages with an unrelated body, a 200 response yielding an
    item (returned unchanged), a response flagged with an exception in meta,
    and responses with status 503, 204 and 500.
    """
    from twcrawler.middleware.handle_404 import Handle404
    handler = Handle404.from_crawler(self.crawler)
    url = 'http://example.com/404'
    req = Request(url)
    req.meta['origin_url'] = url
    req.meta['proxy'] = 'xx.xx.xx.xx:404'

    def build(body, status):
      # every response targets the same URL and originating request
      return Response(url, body=body, status=status, request=req)

    def drain(response, output):
      # exhaust the middleware's generator so all of its work actually runs
      return list(handler.process_spider_output(response, output, self.spider))

    for markers in handler.settings.get('HTML_404_STRING'):

      known_404_body = '\n'.join(['<p>%s</p>' % marker for marker in markers])
      assert not drain(build(known_404_body, 404), [])

      drain(build('bad_string', 404), [])
      # TODO, after add the new request to redis, no item return
      #self.assertEqual(ret[0].url, url)

    ok_response = build('bad_string', 200)
    from scrapy import Item, Field
    class TestItem(Item):
      uid = Field()
    item = TestItem()
    item['uid'] = 'uid_test'
    passed_through = drain(ok_response, [item])
    self.assertEqual(item, passed_through[0])

    ok_response.meta['exception'] = 'test exception'
    drain(ok_response, [ok_response])
    # TODO, after add the new request to redis, no item return
    #self.assertEqual(ret[0].url, url)

    for status in [503, 204, 500]:
      failing = build('bad_string', status)
      drain(failing, [failing])
# Example no. 25
# 0
def test_useragents_spider(spider, scrape_request, html_headers,
                           mock_html_nolinks):
    """A linkless page must produce one request per configured user agent."""
    firefox = factories.BatchUserAgentFactory.build(ua_string='Firefox / 11.0')
    chrome = factories.BatchUserAgentFactory.build(ua_string='Chrome / 20.0')
    spider.batch_user_agents = [firefox, chrome]

    # hand-built response wrapping the linkless HTML fixture
    page = Response('http://test:12345', body=mock_html_nolinks)
    page.request = scrape_request
    page.headers = html_headers
    page.status = 200
    page.encoding = u'utf-8'
    page.flags = []

    spider_output = spider.parse(page)

    # collect the user agent string attached to every emitted request
    seen_ua_strings = set()
    for produced in spider_output:
        if not isinstance(produced, Request):
            # the spider is expected to yield Requests only
            assert False
        else:
            seen_ua_strings.add(produced.meta['user_agent'].ua_string)

    assert seen_ua_strings == set([u'Firefox / 11.0', u'Chrome / 20.0'])
# Example no. 26
# 0
def fake_response_from_file(file_name, url=None):
    """
    Create a Scrapy fake HTTP response from a HTML file
    @param file_name: The relative filename from the responses directory,
                      but absolute paths are also accepted.
    @param url: The URL of the response. Falls back to
                "http://www.example.com" when omitted or empty.
    returns: A scrapy HTTP response which can be used for unittesting.
    """
    if not url:
        url = "http://www.example.com"

    request = Request(url=url)
    # startswith() instead of indexing so an empty name does not raise
    # IndexError before we even try to open the file
    if file_name.startswith("/"):
        file_path = file_name
    else:
        # Relative names are resolved against the directory of this module
        responses_dir = os.path.dirname(os.path.realpath(__file__))
        file_path = os.path.join(responses_dir, file_name)

    # Read through a context manager so the handle is closed right away
    # instead of being left for the garbage collector
    with open(file_path, "r") as file_handle:
        file_content = file_handle.read()

    response = Response(url=url, request=request, body=file_content)
    response.encoding = "utf-8"
    return response
Esempio n. 27
0
    def test_get_cached_beautifulsoup(self):
        """Check that the soup for a response is built once and then reused."""
        response = Response('http://www.example.com', body='')

        first_soup = get_cached_beautifulsoup(response)
        second_soup = get_cached_beautifulsoup(response)

        for soup in (first_soup, second_soup):
            assert isinstance(soup, BeautifulSoup)
        # make sure it's cached
        assert first_soup is second_soup

        # when body is None, an empty soup should be returned
        bodyless = Response('http://www.example.com')
        assert bodyless.body == ""
        assert isinstance(get_cached_beautifulsoup(bodyless), BeautifulSoup)

        # a copied response must get its own soup, while the source response
        # keeps returning the one it already had
        source = Response('http://www.example.com', body='')
        soup_before_copy = get_cached_beautifulsoup(source)
        duplicate = source.copy()
        soup_after_copy = get_cached_beautifulsoup(source)
        soup_of_copy = get_cached_beautifulsoup(duplicate)

        assert soup_before_copy is soup_after_copy
        assert soup_before_copy is not soup_of_copy
 def test_cached_and_stale(self):
     """Stale cached responses must be re-fetched, or revalidated via 304."""
     stale_cases = [
         (200, {"Date": self.today, "Expires": self.yesterday}),
         (200, {"Date": self.today, "Expires": self.yesterday, "Last-Modified": self.yesterday}),
         (200, {"Expires": self.yesterday}),
         (200, {"Expires": self.yesterday, "ETag": "foo"}),
         (200, {"Expires": self.yesterday, "Last-Modified": self.yesterday}),
         (200, {"Expires": self.tomorrow, "Age": "86405"}),
         (200, {"Cache-Control": "max-age=86400", "Age": "86405"}),
         # no-cache forces expiration, also revalidation if validators exists
         (200, {"Cache-Control": "no-cache"}),
         (200, {"Cache-Control": "no-cache", "ETag": "foo"}),
         (200, {"Cache-Control": "no-cache", "Last-Modified": self.yesterday}),
     ]
     with self._middleware() as mw:
         for idx, (status, headers) in enumerate(stale_cases):
             request = Request("http://example-%d.com" % idx)
             stale = Response(request.url, status=status, headers=headers)
             # cache expired response
             first = self._process_requestresponse(mw, request, stale)
             self.assertEqualResponse(first, stale)
             assert "cached" not in first.flags
             # Same request but as cached response is stale a new response must
             # be returned
             refreshed = stale.replace(body="bar")
             second = self._process_requestresponse(mw, request, refreshed)
             self.assertEqualResponse(second, refreshed)
             assert "cached" not in second.flags
             # Previous response expired too, subsequent request to same
             # resource must revalidate and succeed on 304 if validators
             # are present
             if not ("ETag" in headers or "Last-Modified" in headers):
                 continue
             not_modified = refreshed.replace(status=304)
             third = self._process_requestresponse(mw, request, not_modified)
             self.assertEqualResponse(third, refreshed)
             assert "cached" in third.flags
Esempio n. 29
0
    def test(self):
        """Check the policy class chosen by RefererMiddleware per scenario.

        Only the entries of ``self.params`` from index 3 onwards are exercised.
        """
        origin = 'http://www.scrapy.org'
        target = 'http://www.example.com'

        for scenario in self.params[3:]:
            (settings, response_headers, request_meta, policy_class,
             check_warning) = scenario
            spider = Spider('foo')
            mw = RefererMiddleware(Settings(settings))

            response = Response(origin, headers=response_headers)
            request = Request(target, meta=request_meta)

            with warnings.catch_warnings(record=True) as caught:
                policy = mw.policy(response, request)
                self.assertIsInstance(policy, policy_class)

                if not check_warning:
                    continue
                # Scenarios flagged with check_warning must emit exactly one
                # RuntimeWarning while the policy is resolved
                self.assertEqual(len(caught), 1)
                self.assertEqual(caught[0].category, RuntimeWarning,
                                 caught[0].message)
Esempio n. 30
0
    def test(self):
        """Check the Referer header before and after a chain of redirects."""
        for scenario in self.scenarii:
            parent, target, redirections, init_referrer, final_referrer = scenario
            response = self.get_response(parent)
            request = self.get_request(target)

            # Referer set by the spider middleware on the initial request
            emitted = list(
                self.referrermw.process_spider_output(response, [request],
                                                      self.spider))
            self.assertEqual(emitted[0].headers.get('Referer'), init_referrer)

            # Feed every redirect response through the redirect middleware
            # and notify the referrer middleware about the resulting request
            for status, location in redirections:
                response = Response(request.url,
                                    headers={'Location': location},
                                    status=status)
                request = self.redirectmw.process_response(
                    request, response, self.spider)
                self.referrermw.request_scheduled(request, self.spider)

            assert isinstance(request, Request)
            self.assertEqual(request.headers.get('Referer'), final_referrer)
Esempio n. 31
0
 def test_result_succeed(self):
     """A request carrying a response in its meta yields a (True, rsp) result."""
     response = Response("http://url1")
     request = Request(
         "http://url1",
         meta={"response": response},
         callback=self._callback,
         errback=self._errback,
     )
     processed = yield self.pipe.process_item({"requests": request}, self.spider)
     self.assertEqual(processed["results"], [(True, response)])
     # Hooks are expected to have been recorded in exactly this order
     expected_calls = [
         "get_media_requests",
         "media_to_download",
         "media_downloaded",
         "request_callback",
         "item_completed",
     ]
     self.assertEqual(self.pipe._mockcalled, expected_calls)
 def test_thumbnail_name(self):
     """Check the thumbnail paths produced by thumb_path for several URLs."""
     thumb_path = self.pipeline.thumb_path
     thumb_id = '50'
     # (request URL, expected thumbnail path) pairs, checked in order
     expectations = [
         ("file:///tmp/foo.jpg",
          'thumbs/50/38a86208c36e59d4404db9e37ce04be863ef0335.jpg'),
         ("file://foo.png",
          'thumbs/50/e55b765eba0ec7348e50a1df496040449071b96a.jpg'),
         ("file:///tmp/foo",
          'thumbs/50/0329ad83ebb8e93ea7c7906d46e9ed55f7349a50.jpg'),
         ("file:///tmp/some.name/foo",
          'thumbs/50/850233df65a5b83361798f532f1fc549cd13cbe9.jpg'),
     ]
     for url, expected in expectations:
         self.assertEqual(thumb_path(Request(url), thumb_id), expected)
     # Same URL as the last pair, this time with response and info supplied;
     # the expected path is unchanged
     self.assertEqual(
         thumb_path(Request("file:///tmp/some.name/foo"),
                    thumb_id,
                    response=Response("file:///tmp/some.name/foo"),
                    info=object()),
         'thumbs/50/850233df65a5b83361798f532f1fc549cd13cbe9.jpg')
Esempio n. 33
0
    def parse_info(self, response: Response):
        """Yield a SegItem and a follow-up Request for every chapter link."""
        # Chapter names
        book_id = response.meta['book_id']

        volume_blocks = response.xpath('//div[@class="volume-wrap"]/div')
        for link in volume_blocks.css('.cf li>a'):
            # link -> Selector
            item = SegItem()
            item['seg_id'] = uuid.uuid4().hex
            item['book_id'] = book_id
            item['title'] = link.css('::text').get()
            item['url'] = 'https:' + link.xpath('./@href').get()

            yield item

            # Download the chapter content
            chapter_request = Request(item['url'],
                                      callback=self.parse_seg,
                                      priority=1,
                                      meta={'seg_id': item['seg_id']})
            yield chapter_request
Esempio n. 34
0
    def test_process_spider_output(self):
        """Only on-site requests (or ones with dont_filter) pass the middleware."""
        res = Response('http://scrapytest.org')

        # Expected to pass through; the last one is off-site but is marked
        # dont_filter=True
        onsite_reqs = [
            Request('http://scrapytest.org/1'),
            Request('http://scrapy.org/1'),
            Request('http://sub.scrapy.org/1'),
            Request('http://offsite.tld/letmepass', dont_filter=True)
        ]
        # Expected to be dropped, including look-alike hosts and URLs that
        # merely contain an allowed domain in their path
        offsite_reqs = [
            Request('http://scrapy2.org'),
            Request('http://offsite.tld/'),
            Request('http://offsite.tld/scrapytest.org'),
            Request('http://offsite.tld/rogue.scrapytest.org'),
            Request('http://rogue.scrapytest.org.haha.com'),
            Request('http://roguescrapytest.org')
        ]
        reqs = onsite_reqs + offsite_reqs

        out = list(self.mw.process_spider_output(res, reqs, self.spider))
        # assertEqual: the assertEquals alias is deprecated and no longer
        # exists on Python 3.12+
        self.assertEqual(out, onsite_reqs)
Esempio n. 35
0
    def process_response(self, request, response, spider):
        """Drop responses whose body hash has already been seen five times.

        Bodies that contain none of ``<html``, ``<meta`` or ``<body`` are
        passed through untouched. Otherwise the value of
        ``self.getHash(body)`` is counted in ``self.treehash``; once a hash
        has been counted five times, further responses with that hash are
        logged and replaced by ``Response("")``.

        :param request: the request that produced ``response`` (only logged)
        :param response: the downloaded response
        :param spider: the running spider (only logged)
        :return: ``response`` itself, or ``Response("")`` when filtered
        """
        data = response.body

        # Bodies without any of these markers are not counted at all
        if all(data.find(marker) == -1
               for marker in ("<html", "<meta", "<body")):
            return response

        h = self.getHash(data)
        # Membership via .get() works on both Python 2 and 3; dict.has_key()
        # only exists on Python 2
        seen = self.treehash.get(h, 0)
        if seen >= 5:
            log.msg(format="Filtered dom tree repeat %(request)s",
                    level=log.DEBUG,
                    spider=spider,
                    request=request)
            return Response("")

        self.treehash[h] = seen + 1
        return response
    def test_setting_enabled_cookies_debug(self):
        """With COOKIES_DEBUG on, received and sent cookies are logged at DEBUG."""
        logger_name = 'scrapy.downloadermiddlewares.cookies'
        crawler = get_crawler(settings_dict={'COOKIES_DEBUG': True})
        mw = CookiesMiddleware.from_crawler(crawler)
        with LogCapture(logger_name,
                        propagate=False,
                        level=logging.DEBUG) as captured:
            # A response that sets a cookie ...
            first_request = Request('http://scrapytest.org/')
            cookie_response = Response(
                'http://scrapytest.org/',
                headers={'Set-Cookie': 'C1=value1; path=/'})
            mw.process_response(first_request, cookie_response, crawler.spider)
            # ... followed by a request that should send it back
            second_request = Request('http://scrapytest.org/sub1/')
            mw.process_request(second_request, crawler.spider)

            received_record = (
                'scrapy.downloadermiddlewares.cookies', 'DEBUG',
                'Received cookies from: <200 http://scrapytest.org/>\n'
                'Set-Cookie: C1=value1; path=/\n')
            sent_record = (
                'scrapy.downloadermiddlewares.cookies', 'DEBUG',
                'Sending cookies to: <GET http://scrapytest.org/sub1/>\n'
                'Cookie: C1=value1\n')
            captured.check(received_record, sent_record)
Esempio n. 37
0
    def test_503(self):
        """A 503 response is retried twice and then handed back unchanged."""
        req = Request('http://www.scrapytest.org/503')
        rsp = Response('http://www.scrapytest.org/503', body=b'', status=503)

        # first and second retry: a new Request with an increasing counter
        for expected_retry_times in (1, 2):
            req = self.mw.process_response(req, rsp, self.spider)
            assert isinstance(req, Request)
            self.assertEqual(req.meta['retry_times'], expected_retry_times)

        # discard it
        assert self.mw.process_response(req, rsp, self.spider) is rsp

        stats = self.crawler.stats
        assert stats.get_value('retry/max_reached') == 1
        assert stats.get_value(
            'retry/reason_count/503 Service Unavailable') == 2
        assert stats.get_value('retry/count') == 2
Esempio n. 38
0
class TestHelper(unittest.TestCase):
    """Tests for _body_or_str with bytes, text and response inputs."""

    # The same payload as bytes, as text, and wrapped in two response types
    bbody = b'utf8-body'
    ubody = bbody.decode('utf8')
    txtresponse = TextResponse(url='http://example.org/', body=bbody, encoding='utf-8')
    response = Response(url='http://example.org/', body=bbody)

    def test_body_or_str(self):
        """Default and unicode=True give text; unicode=False gives bytes."""
        inputs = (self.bbody, self.ubody, self.txtresponse, self.response)
        for obj in inputs:
            default_result = _body_or_str(obj)
            self._assert_type_and_value(default_result, self.ubody, obj)
            text_result = _body_or_str(obj, unicode=True)
            self._assert_type_and_value(text_result, self.ubody, obj)
            bytes_result = _body_or_str(obj, unicode=False)
            self._assert_type_and_value(bytes_result, self.bbody, obj)
            self.assertIs(type(default_result), type(text_result))
            self.assertIsNot(type(default_result), type(bytes_result))

    def _assert_type_and_value(self, a, b, obj):
        """Assert that ``a`` has exactly the type and value of ``b``."""
        failure_note = 'Got {}, expected {} for {!r}'.format(type(a), type(b), obj)
        self.assertIs(type(a), type(b), failure_note)
        self.assertEqual(a, b)
Esempio n. 39
0
    def setUp(self):
        """Create the spider and a fake PDF response shared by the tests.

        Opens ``TEST_PDF`` in binary mode, builds a ``BaseSpider`` with
        project settings and a ``Crawler``, and stores in
        ``self.pdf_response`` a response whose body is the file content.
        """
        self.test_file = open(TEST_PDF, 'rb')
        # Make sure the fixture handle is closed after every test, even a
        # failing one; closing an already closed file is harmless
        self.addCleanup(self.test_file.close)
        self.spider = BaseSpider()
        self.spider.settings = get_project_settings()
        self.spider.crawler = Crawler()

        meta = {
            'data_dict': {
                'title': 'foo',
            }
        }
        headers = {
            'content-type': b'application/pdf'
        }
        request = Request('http://foo.bar', meta=meta)
        self.pdf_response = Response(
            'http://foo.bar',
            body=self.test_file.read(),
            request=request,
            headers=headers
        )
Esempio n. 40
0
    def get_contact(self, response: Response) -> dict:
        """
        Gets the contact information.

        Only 'website' (the response URL) and, when one is found, 'phone' are
        filled in; 'email' and 'meet' are always returned as empty strings.

        :param response: the response object
        :return: a dict with the keys 'email', 'phone', 'website' and 'meet'
        """
        contact = {
            'email': '',
            'phone': '',
            'website': response.url,
            'meet': ''
        }
        # The phone number is searched for in the text of the last paragraph
        # of the body field
        text = response.xpath(
            "string(//div[contains(@class,'field field-name-body field-type-text-with-summary')]/div/div/p[last()])"
        ).get()
        phone = extract_phone(text)
        # Keep only the first match; an empty (or missing) result leaves ''
        if phone:
            contact['phone'] = phone[0]
        return contact
Esempio n. 41
0
    def get_meta(self, response: Response) -> dict:
        """
        Collect the paragraphs of the body field, grouped by heading.

        Every <h2> starts a new section named after its text; every <p> is
        appended, preceded by a newline, to the current section. Paragraphs
        before the first <h2> go under 'Abstract', except for the very first
        paragraph, which is skipped. Other elements are ignored.

        :return dict(str, str)
        """
        sections = {}
        current_title = 'Abstract'
        first_paragraph_pending = True
        body_children = response.xpath(
            "//div[contains(@class,'field field-name-body field-type-text-with-summary')]/div/div/*"
        )
        for node in body_children:
            tag = node.xpath("name()").get()
            if tag == 'h2':
                current_title = node.xpath("text()").get()
                continue
            if tag != 'p':
                continue
            if first_paragraph_pending:
                first_paragraph_pending = False
                continue
            previous_text = sections.get(current_title, '')
            sections[current_title] = (
                previous_text + '\n' + node.xpath("string()").get())
        return sections
Esempio n. 42
0
    def parse_search_page(self, response: Response):
        """Parse the top-level page.

        Every option of the browse form, apart from the "-1" heading entry,
        becomes a chapter appended to ``self.oar["chapters"]`` and a Request
        for the chapter URL that carries the chapter's list index in its meta.
        """
        for option in response.css("#browseForm option"):
            db_id: Any = option.xpath("@value").get()
            if db_id == "-1":  # Ignore the heading
                continue

            # The option text has the form "<number> - <name>"; split on the
            # first dash only
            label = option.xpath("text()").get()
            number, name = (part.strip() for part in label.split("-", 1))
            chapter = new_chapter(db_id, number, name)

            chapters = self.oar["chapters"]
            new_chapter_index = len(chapters)
            chapters.append(chapter)

            request = Request(chapter["url"], callback=self.parse_chapter_page)
            request.meta["chapter_index"] = new_chapter_index
            yield request
    def process_exception(self, request, exception, spider):
        """
        Handle connection exceptions caused by using a proxy.

        For exceptions listed in ``self.DONT_RETRY_ERRORS`` the failing proxy
        is either invalidated or simply rotated, and the request is returned
        for another attempt. After more than ``self.max_exception_url_count``
        failures the URL is pushed to redis and an empty 200 Response is
        returned instead. Any other exception is logged and ``None`` is
        returned.
        """
        meta = request.meta
        print("%s" % self.proxys[meta["proxy_index"]].get_proxy())
        print("%s" % exception)
        failed_proxy_index = meta["proxy_index"]

        if not isinstance(exception, self.DONT_RETRY_ERRORS):
            logger.error("this middleware can not handle the exception")
            return None

        # Only compare when proxy_index > fixed_proxy - 1; this guarantees
        # that at least the local direct connection always remains.
        # WARNING: on a direct-connection timeout, switch proxy or retry?
        # That is a policy question.
        if failed_proxy_index > self.fixed_proxy - 1 and self.invalid_proxy_flag:
            self.invalid_proxy(failed_proxy_index)
        elif meta["proxy_index"] == self.proxy_index:
            # Simply switch to the next proxy without disabling this one
            self.inc_proxy_index()

        meta["ex_count"] = meta.get("ex_count", 0) + 1

        if meta["ex_count"] <= self.max_exception_url_count:
            request.dont_filter = True
            return request

        # Skip requests that have raised exceptions too many times
        logger.info(
            "beyond max exception url count, url: %s, request next url"
            % request.url)
        r = redis_factory.get_instance()
        r.rpush(REDIS_KEY_URL_EXCEPTION, request.url)
        meta["ex_count"] = 0
        meta["change_url"] = True
        return Response(status=200,
                        request=request,
                        url=request.url)
 def test_save_response(self):
     """save_response writes only text responses whose body is small enough."""
     writer = mock.MagicMock()
     self.instance._writer = writer
     writer.maxitemsize = 10
     # wrong response type
     plain_response = Response('http://resp', request=Request('http://req'))
     self.instance.save_response(plain_response, self.spider)
     assert not self.instance._writer.write.called
     # get request with large body
     oversized = TextResponse('http://resp1',
                              request=Request('http://req1'),
                              body='looong loong body',
                              encoding='cp1251')
     self.instance.save_response(oversized, self.spider)
     assert not self.instance._writer.write.called
     # get request with ok-body
     self.instance.hsref = mock.Mock()
     self.instance.hsref.job.key = '123/45/67'
     accepted = TextResponse(
         'http://resp2',
         request=Request('http://req2'),
         body='body',
         encoding='cp1251',
         headers={'Set-Cookie': [b'coo1=test;abc=1', b'coo2=tes1;cbd=2']})
     self.instance.save_response(accepted, self.spider)
     expected_record = {
         'body': u'body',
         '_encoding': 'cp1251',
         '_type': '_pageitem',
         '_key': 'bad42100b1d34e29973a79e512aabb4db885b712',
         'cookies': ['coo1=test', 'coo2=tes1'],
         'url': 'http://resp2',
         '_jobid': '123/45/67',
     }
     self.instance._writer.write.assert_called_with(expected_record)
Esempio n. 45
0
def test_js_item_emission(spider, linked_js_request, js_headers, mock_js):
    """JS items are emitted correctly"""
    # Build a fake response that serves the JS fixture
    js_url = 'http://test:12345/default.js'
    js_response = Response(js_url,
                           body=mock_js)
    js_response.request = linked_js_request
    js_response.headers = js_headers
    js_response.status = 200
    js_response.encoding = u'ascii'
    js_response.flags = []

    # Fake urlscan record used when comparing items
    sitescan = linked_js_request.meta['sitescan']
    fake_urlscan = model.URLScan.objects.create(
        site_scan=sitescan,
        page_url_hash=sha256("http://test:12345/").hexdigest(),
        page_url=js_response.url,
        timestamp=spider.get_now_time())

    # Hand the fake response to the spider
    emitted_items = spider.parse(js_response)

    # Build the item we expect to come back
    wanted = MarkupItem()
    wanted['content_type'] = spider.get_content_type(js_headers)
    wanted['filename'] = os.path.basename(urlparse(js_url).path)
    wanted['headers'] = unicode(js_headers)
    wanted['meta'] = js_response.meta
    wanted['raw_content'] = js_response.body
    wanted['sitescan'] = linked_js_request.meta['sitescan']
    wanted['urlscan'] = fake_urlscan
    wanted['url'] = js_response.url
    wanted['user_agent'] = js_response.meta['user_agent']

    # Anything other than a MarkupItem is a failure; the last one wins
    collected = None
    for emitted in emitted_items:
        if isinstance(emitted, MarkupItem):
            collected = emitted
            continue
        assert False

    assert wanted == collected
Esempio n. 46
0
    def parse_sku(self, response: Response):
        """Parse a product page and yield a single SKU.

        Extracts name, code, CNY price (when present), sizes, description and
        gallery image URLs. The name and code selectors are still required:
        if either matches nothing, ``.get()`` returns None and the string
        operations on it raise.

        :param response: the product detail page
        :return: generator yielding one ``SKU``
        """
        attrs = []
        name_elements = response.css(
            'div.bul-showcase-push-name span::text').get()
        name = name_elements + " " + response.css(
            'div.product-name h1::text').get()
        code = response.css('div[itemprop="sku"]::text').get().strip()
        price = {}
        price_cny = response.css('span.price::text').get()
        if price_cny:
            price_cny = price_cny.strip('¥').replace(',', '').strip()
            price = {
                'cny': float(price_cny),
            }

        sizes = [
            size.strip().strip('尺寸:')
            for size in response.css(
                'div.bul-size-select-list ul li::text').getall()
        ]
        if sizes:
            attrs.append({'name': '尺寸', 'value': ', '.join(sizes)})

        # .get() returns None when nothing matches, so normalise before
        # stripping; calling .strip() first raised AttributeError and made
        # the fallback below unreachable for a missing node
        description = (response.css(
            'div.bul-edito-texts div.data p::text').get() or '').strip()
        if not description:
            description = response.css(
                'div[itemprop="description"]::text').get()

        # Image URLs are read from the JSON embedded in the gallery's
        # x-magento-init script tag
        page_data_str = response.css(
            'div.product.media > script[type="text/x-magento-init"]::text'
        ).get()
        page_data = json.loads(page_data_str)
        image_data = page_data['[data-gallery-role=gallery-placeholder]'][
            'mage/gallery/gallery']['data']
        image_urls = [img['full'] for img in image_data]
        sku = SKU(self.brand_name, '', '', code, name, response.url, price,
                  description, image_urls, attrs)
        yield sku
Esempio n. 47
0
    def get_meta(self, response: Response) -> dict:
        """
        Read the 'Project Details' table into a dict keyed by row heading.

        The text of each row's <th> is the key. The value is the text of the
        child elements of <td> joined by newlines (<ul> lists are rendered as
        '  - ' bullet lines), or the plain <td> text when it has no child
        elements. Rows whose heading contains 'Tags' store the list of link
        texts instead; a row whose heading contains 'Abstract' additionally
        stores ``self.get_pictures(...)`` under 'banner'.

        :return dict(str, object)
        """
        meta = {}
        # Note: if running with JS, the data can be found in //div[@id='dynamic_content']/table[2]/tbody/tr
        rows = response.xpath("//table[@summary='Project Details']/table[@summary='Project Details']/tr")
        for row in rows:
            try:
                heading = row.xpath("string(th)").get()
            except Exception as e:
                self.log('Fail to find title for meta', level=logging.WARN)
                continue
            if len(heading) < 1:
                continue

            text = ''
            cells = row.xpath('td/*')
            for cell in cells:
                tag = cell.xpath('name()').get()
                # Pieces are separated by a newline once there is some text
                if len(text) > 0:
                    text += '\n'
                if tag.startswith('ul'):
                    # it is a list, keep it in markdown format
                    bullet_texts = cell.xpath("li").xpath('string()').getall()
                    text += '  - ' + '\n  - '.join(bullet_texts)
                else:
                    # anything else, e.g., a paragraph
                    text += cell.xpath('string()').get()
            if len(cells) < 1:
                # No child elements: fall back to the cell's own text
                text += row.xpath('string(td)').get()
            meta[heading] = text

            if 'Tags' in heading:
                meta[heading] = row.xpath('td/a/text()').getall()
            elif 'Abstract' in heading:
                meta['banner'] = self.get_pictures(row.xpath('td'))
        return meta
Esempio n. 48
0
    def test_from_response_formname_exists(self):
        respbody = """
<form action="post.php" method="POST">
<input type="hidden" name="one" value="1">
<input type="hidden" name="two" value="2">
</form>
<form name="form2" action="post.php" method="POST">
<input type="hidden" name="three" value="3">
<input type="hidden" name="four" value="4">
</form>
        """
        response = Response("http://www.example.com/formname.html",
                            body=respbody)
        r1 = self.request_class.from_response(response,
                                              formname="form2",
                                              callback=lambda x: x)
        self.assertEqual(r1.method, 'POST')
        fs = cgi.FieldStorage(StringIO(r1.body),
                              r1.headers,
                              environ={"REQUEST_METHOD": "POST"})
        self.assertEqual(fs['three'].value, "3")
        self.assertEqual(fs['four'].value, "4")
Esempio n. 49
0
    def test_from_response_extra_headers(self):
        respbody = """
<form action="post.php" method="POST">
<input type="hidden" name="test" value="val1">
<input type="hidden" name="test" value="val2">
<input type="hidden" name="test2" value="xxx">
</form>
        """
        headers = {"Accept-Encoding": "gzip,deflate"}
        response = Response("http://www.example.com/this/list.html",
                            body=respbody)
        r1 = self.request_class.from_response(response,
                                              formdata={
                                                  'one': ['two', 'three'],
                                                  'six': 'seven'
                                              },
                                              headers=headers,
                                              callback=lambda x: x)
        self.assertEqual(r1.method, 'POST')
        self.assertEqual(r1.headers['Content-type'],
                         'application/x-www-form-urlencoded')
        self.assertEqual(r1.headers['Accept-Encoding'], 'gzip,deflate')
Esempio n. 50
0
 def parse(self, response: Response):
     """
     This is an override of a spider method

     Submits the items extracted from the page to the loader (unless this is
     a dry run) and then yields a request for the "Next page" link.

     :param response:
     :return:
     :raises IndexError: when the page has no "Next page" link
     """
     print("Extracting...")
     extracted = self.post_transform_avro(response)
     if not self.dry_run:
         for record in extracted:
             self.loader.submit(record)

     next_hrefs = response.xpath('//a[@title="Next page"]/@href').extract()
     if not next_hrefs:
         log.debug(str(next_hrefs))
         raise IndexError("No next page for thread!")

     # The href is relative, so prefix it with the forum base URL
     url = 'http://forums.somethingawful.com/' + next_hrefs[0]
     log.debug(str(url))
     sleep(0.2)
     # log.debug("Iterating in parse: " + str(url))
     yield scrapy.Request(url, callback=self.parse)
Esempio n. 51
0
    def parse_info(self, response: Response):
        """Get the chapter information."""
        book_id = response.meta['book_id']
        volume_blocks = response.xpath('//div[@class="volume-wrap"]/div')
        for link in volume_blocks.css('.cf  li>a'):
            item = SegItem()
            item['seg_id'] = uuid.uuid4().hex
            item['book_id'] = book_id
            item['seg_title'] = link.css('::text').get()
            item['url'] = 'https:' + link.css('::attr("href")').get()

            # Download the chapter content
            request_meta = {
                'book_id': book_id,
                'seg_id': item['seg_id']
            }
            yield Request(item['url'],
                          callback=self.parse_seg,
                          priority=10,
                          meta=request_meta)

            yield item
    def test_process_response_encoding_inside_body(self):
        """A gzipped HTML body is decompressed and its meta charset is honoured."""
        headers = {
            'Content-Type': 'text/html',
            'Content-Encoding': 'gzip',
        }
        plainbody = (
            b'<html><head><title>Some page</title>'
            b'<meta http-equiv="Content-Type" content="text/html; charset=gb2312">'
        )
        f = BytesIO()
        # The context manager finalises the gzip stream even if write() fails
        with GzipFile(fileobj=f, mode='wb') as zf:
            zf.write(plainbody)
        # Was "http;//www.example.com/" -- a typo for the URL used by the
        # request below
        response = Response("http://www.example.com/",
                            headers=headers,
                            body=f.getvalue())
        request = Request("http://www.example.com/")

        newresponse = self.mw.process_response(request, response, self.spider)
        assert isinstance(newresponse, HtmlResponse)
        self.assertEqual(newresponse.body, plainbody)
        self.assertEqual(newresponse.encoding, resolve_encoding('gb2312'))
Esempio n. 53
0
 def test_file_path(self):
     """Check the storage paths produced by the pipeline's file_path.

     Every expected value has the form 'full/<40 hex characters>' followed by
     the extension the pipeline is expected to keep, if any.
     """
     file_path = self.pipeline.file_path
     # Plain file names keep their extension
     self.assertEqual(
         file_path(
             Request("https://dev.mydeco.com/mydeco.pdf")),
         'full/c9b564df929f4bc635bdd19fde4f3d4847c757c5.pdf')
     self.assertEqual(
         file_path(
             Request("http://www.maddiebrown.co.uk///catalogue-items//image_54642_12175_95307.txt")),
         'full/4ce274dd83db0368bafd7e406f382ae088e39219.txt')
     self.assertEqual(
         file_path(
             Request("https://dev.mydeco.com/two/dirs/with%20spaces%2Bsigns.doc")),
         'full/94ccc495a17b9ac5d40e3eabf3afcb8c2c9b9e1a.doc')
     # The extension is expected even when it only appears in the query string
     self.assertEqual(
         file_path(
             Request("http://www.dfsonline.co.uk/get_prod_image.php?img=status_0907_mdm.jpg")),
         'full/4507be485f38b0da8a0be9eb2e1dfab8a19223f2.jpg')
     # URLs without a file extension produce a bare hash
     self.assertEqual(
         file_path(
             Request("http://www.dorma.co.uk/images/product_details/2532/")),
         'full/97ee6f8a46cbbb418ea91502fd24176865cf39b2')
     self.assertEqual(
         file_path(
             Request("http://www.dorma.co.uk/images/product_details/2532")),
         'full/244e0dd7d96a3b7b01f54eded250c9e272577aa1')
     # Same URL as above with response and info supplied: same expected path
     self.assertEqual(
         file_path(
             Request("http://www.dorma.co.uk/images/product_details/2532"),
             response=Response("http://www.dorma.co.uk/images/product_details/2532"),
             info=object()),
         'full/244e0dd7d96a3b7b01f54eded250c9e272577aa1')
     # The trailing ".bohaha" is not expected to be kept as an extension
     self.assertEqual(
         file_path(
             Request("http://www.dfsonline.co.uk/get_prod_image.php?img=status_0907_mdm.jpg.bohaha")),
         'full/76c00cef2ef669ae65052661f68d451162829507')
     # NOTE(review): the URL literal below starts with a backslash line
     # continuation, so the leading whitespace of the next line is part of
     # the string; it looks like the start of the URL was lost -- confirm
     # against the upstream test before relying on this case.
     self.assertEqual(file_path(Request("\
                                 //+F0tzCwMK76ZKQ21AMqr7oAAC96JvD5aWM2kvZ78J0N7fmAAC46Y4Ap7y")),
                      'full/178059cbeba2e34120a67f2dc1afc3ecc09b61cb.png')
Esempio n. 54
0
 def test_process_spider_output(self):
     """Outputs keep their type and order; dict items get a _cached_page_id."""
     source_response = mock.Mock()
     source_response.request = Request('http://source-request')
     expected_outputs = sorted([Request('ftp://req1'), Request('https://req2'),
                                Response('http://source-request'), DictItem()])
     produced = self.instance.process_spider_output(
         source_response, expected_outputs, self.spider)
     assert isinstance(produced, types.GeneratorType)
     for output in produced:
         # Each output must have the type of the input at the same position
         assert isinstance(output, type(expected_outputs.pop(0)))
         if isinstance(output, DictItem):
             self.assertEqual(
                 output["_cached_page_id"],
                 request_fingerprint(source_response.request))
     # An item whose _values is None must raise TypeError and trigger
     # process_spider_exception
     broken_item = DictItem()
     broken_item._values = None
     self.instance.process_spider_exception = mock.Mock()
     with self.assertRaises(TypeError):
         for _ in self.instance.process_spider_output(
                 source_response, [broken_item], self.spider):
             pass
     assert self.instance.process_spider_exception.called
    def _get_crawler(self):
        """Build a mock crawler whose engine serves a fixed robots.txt.

        Also asserts that RobotsTxtMiddleware raises NotConfigured while
        ROBOTSTXT_OBEY is not yet set. The returned crawler has
        ROBOTSTXT_OBEY enabled and an ``engine.download`` mock returning a
        Deferred that is fired with the robots.txt response.
        """
        crawler = mock.MagicMock()
        crawler.settings = Settings()
        crawler.settings.set('USER_AGENT', 'CustomAgent')
        self.assertRaises(NotConfigured, RobotsTxtMiddleware, crawler)
        crawler.settings.set('ROBOTSTXT_OBEY', True)
        crawler.engine.download = mock.MagicMock()
        # Strip the indentation of every line. The inline (?m) flag has to
        # come first in the pattern: a global flag placed anywhere else is an
        # error on Python 3.11+.
        ROBOTS = re.sub(
            r'(?m)^\s+', '', '''
        User-Agent: *
        Disallow: /admin/
        Disallow: /static/
        ''')
        response = Response('http://site.local/robots.txt', body=ROBOTS)

        def return_response(request, spider):
            # Fire the deferred from the reactor thread with the canned response
            deferred = Deferred()
            reactor.callFromThread(deferred.callback, response)
            return deferred

        crawler.engine.download.side_effect = return_response
        return crawler
Esempio n. 56
0
def get_response_for_testing(callback: Callable) -> Response:
    """
    Build a canned 200 response for ``http://example.com``.

    The body is a small HTML product page encoded as UTF-8, and the attached
    request carries ``callback``. Handy when testing providers.
    """
    page_url = "http://example.com"
    markup = """
        <html>
            <body>
                <div class="breadcrumbs">
                    <a href="/food">Food</a> /
                    <a href="/food/sweets">Sweets</a>
                </div>
                <h1 class="name">Chocolate</h1>
                <p>Price: <span class="price">22€</span></p>
                <p class="description">The best chocolate ever</p>
            </body>
        </html>
        """
    payload = markup.encode("utf-8")
    fake_request = Request(page_url, callback=callback)
    return Response(
        page_url,
        status=200,
        headers=None,
        body=payload,
        request=fake_request,
    )
Esempio n. 57
0
    def parse_external_link(
        response: Response,
        link_text: str,
        resource_language_service: IResourceLanguageService,
        spider_name: str,
        language: Optional[Language] = None,
    ) -> ExternalResource:
        """Yield one ``ExternalResource`` describing a fetched external link.

        This is a generator. Nothing is yielded unless ``response.status`` is
        200. For an ``HtmlResponse`` the title comes from ``TITLE_SELECTOR``
        and the page's lang attribute values are resolved to resource language
        ids; for any other response the link text doubles as the title and no
        resource languages are set.
        """
        if response.status != 200:
            return

        # Both language-derived fields fall back to None without a language.
        if language is None:
            iso_code = None
            language_id = None
        else:
            iso_code = language.id
            language_id = language.airtable_id

        if not isinstance(response, HtmlResponse):
            # Non-HTML content: there is no markup to read a title from.
            yield ExternalResource(
                title=link_text,
                link_text=link_text,
                url=response.url,
                iso_code=iso_code,
                language_id=language_id,
                spider_name=spider_name,
            )
            return

        raw_langs = LangAttributeParser.get_lang_values(response)
        language_ids = resource_language_service.get_resource_language_ids(
            raw_langs)

        yield ExternalResource(
            title=response.css(TITLE_SELECTOR).get(),
            link_text=link_text,
            url=response.url,
            iso_code=iso_code,
            language_id=language_id,
            spider_name=spider_name,
            resource_languages=language_ids,
            resource_languages_raw=raw_langs,
        )
Esempio n. 58
0
    def parse_sku(self, response: Response):
        """Scrape one product page and yield a single ``SKU``.

        Reads the name, CNY price, colour, description lines, size options
        and the image URLs containing ``'Large'`` from the page, and derives
        the product code from the last URL path segment.

        Args:
            response: The product detail page.

        Yields:
            One ``SKU`` built from ``self.brand_name`` and the scraped data.
        """
        attrs = []

        name = response.css(
            'p.product-details-section-name__description::text').get().strip()

        price_cny = response.css(
            'div.product-details-section-price__sell-price::text').get().strip(
            ).strip('¥ ').replace(',', '')

        price = {'cny': float(price_cny)}

        color = response.css(
            'span.product-details-section-color__checked-attribute-value::text'
        ).get()
        attrs.append({
            'name': '颜色',
            'value': color,
        })

        composition = response.css(
            'div.product-details-section-description__text::text').getall()
        for s in composition:
            attrs.append({'name': '参数', 'value': s})

        # Remove only a trailing ".html" extension. str.strip('.html') treats
        # its argument as a set of characters and strips them from BOTH ends,
        # which corrupts codes that start or end with '.', 'h', 't', 'm', 'l'.
        code = response.url.split('/')[-1]
        if code.endswith('.html'):
            code = code[:-len('.html')]
        code = code.upper()

        page_data = response.xpath('/html/body/script[2]//text()').get()

        # Every double-quoted string in the script text that mentions 'Large'
        # is taken as an image URL.
        image_urls = [
            item for item in re.findall(r'"(.+?)"', page_data)
            if 'Large' in item
        ]
        image_urls = utils.list_unique(image_urls)
        # Decode backslash escape sequences embedded in the script text.
        image_urls = [
            url.encode('utf-8').decode('unicode_escape') for url in image_urls
        ]

        sizes = response.css('div.component-size-option::text').getall()
        sizes = [size.strip() for size in sizes]
        if sizes:
            attrs.append({'name': '尺寸', 'value': ', '.join(sizes)})

        sku = SKU(self.brand_name, '', '', code, name, response.url, price, '',
                  image_urls, attrs)
        yield sku
Esempio n. 59
0
def test_css_item_emission(spider, linked_css_request, css_headers, mock_css):
    """Parsing a CSS response emits only MarkupItems matching the expectation.

    Any emitted object that is not a ``MarkupItem`` fails the test; the last
    ``MarkupItem`` seen must equal the expected item.
    """
    # A single user agent keeps the emitted items easy to count
    only_agent = factories.BatchUserAgentFactory(ua_string='Firefox / 11.0')
    spider.user_agents = [only_agent]

    # Fake the response the spider would receive for the stylesheet
    css_url = 'http://test:12345/default.css'
    css_response = Response(css_url, body=mock_css)
    css_response.request = linked_css_request
    css_response.headers = css_headers
    css_response.status = 200
    css_response.encoding = u'ascii'
    css_response.flags = []

    # URL scan record referenced by the expected item
    expected_urlscan = model.URLScan.objects.create(
        site_scan=linked_css_request.meta['sitescan'],
        page_url_hash=sha256("http://test:12345/").hexdigest(),
        page_url=css_response.url,
        timestamp=spider.get_now_time())

    # Hand the fake response to the spider
    emitted_items = spider.parse(css_response)

    # Describe the item the spider is expected to produce
    expected = MarkupItem()
    expected['content_type'] = spider.get_content_type(css_headers)
    expected['filename'] = os.path.basename(urlparse(css_url).path)
    expected['headers'] = unicode(css_headers)
    expected['meta'] = css_response.meta
    expected['raw_content'] = css_response.body
    expected['sitescan'] = linked_css_request.meta['sitescan']
    expected['urlscan'] = expected_urlscan
    expected['url'] = css_response.url
    expected['user_agent'] = css_response.meta['user_agent']

    collected = None
    for emitted in emitted_items:
        if not isinstance(emitted, MarkupItem):
            assert False
        else:
            collected = emitted

    assert expected == collected
Esempio n. 60
0
def test_js_item_emission(spider, linked_js_request, js_headers, mock_js):
    """Parsing a JS response emits exactly one item equal to the expectation."""
    # Fake the response the spider would receive for the script
    js_url = 'http://test:12345/default.js'
    js_response = Response(js_url, body=mock_js)
    js_response.request = linked_js_request
    js_response.headers = js_headers
    js_response.status = 200
    js_response.encoding = u'ascii'
    js_response.flags = []

    # URL scan record referenced by the expected item
    expected_urlscan = model.URLScan.objects.create(
        site_scan=linked_js_request.meta['sitescan'],
        page_url_hash=sha256("http://test:12345/").hexdigest(),
        page_url=js_response.url,
        timestamp=spider.get_now_time())

    # Hand the fake response to the spider
    emitted_items = spider.parse(js_response)

    # Describe the item the spider is expected to produce
    expected = MarkupItem()
    expected['content_type'] = spider.get_content_type(js_headers)
    expected['filename'] = os.path.basename(urlparse(js_url).path)
    expected['headers'] = unicode(js_headers)
    expected['meta'] = js_response.meta
    expected['raw_content'] = js_response.body
    expected['sitescan'] = linked_js_request.meta['sitescan']
    expected['urlscan'] = expected_urlscan
    expected['url'] = js_response.url
    expected['user_agent'] = js_response.meta['user_agent']
    expected['redirected_from'] = ''

    # Drain the generator only now, once the expectation is fully built
    produced = list(emitted_items)
    assert produced == [expected]