Beispiel #1
0
def test_magic_response2():
    """Check base64 'body' handling and the dict-style 'headers' format."""
    middleware = _get_mw()
    request = PrerenderRequest(
        'http://example.com/',
        magic_response=True,
        headers={'foo': 'bar'},
        dont_send_headers=True,
    )
    request = middleware.process_request(request, None)
    # With dont_send_headers=True the headers must not end up in the args.
    prerender_args = request.meta['prerender']['args']
    assert 'headers' not in prerender_args

    encoded_body = base64.b64encode(b"binary data").decode('ascii')
    payload = {
        'body': encoded_body,
        'headers': {'Content-Type': 'text/plain'},
    }
    raw_response = TextResponse(
        "http://myprerender.example.com/execute",
        headers={b'Content-Type': b'application/json'},
        body=json.dumps(payload).encode('utf8'),
    )
    processed = middleware.process_response(request, raw_response, None)

    # The payload is exposed unchanged; body and headers come from its fields.
    assert processed.data == payload
    assert processed.body == b'binary data'
    assert processed.headers == {b'Content-Type': [b'text/plain']}
    # Headers of the HTTP response itself are expected on a separate attribute.
    expected_transport_headers = {b'Content-Type': [b'application/json']}
    assert processed.prerender_response_headers == expected_transport_headers
    assert processed.status == processed.prerender_response_status == 200
    assert processed.url == "http://example.com/"
Beispiel #2
0
def test_magic_response_http_error():
    """Check that an 'http404' script error yields response status 404."""
    middleware = _get_mw()
    request = PrerenderRequest('http://example.com/foo')
    request = middleware.process_request(request, None)

    error_info = {
        "error": "http404",
        "message":
        "Lua error: [string \"function main(prerender)\r...\"]:3: http404",
        "line_number": 3,
        "type": "LUA_ERROR",
        "source": "[string \"function main(prerender)\r...\"]"
    }
    error_payload = {
        "info": error_info,
        "description": "Error happened while executing Lua script",
        "error": 400,
        "type": "ScriptError"
    }
    encoded_payload = json.dumps(error_payload).encode('utf8')
    raw_response = TextResponse(
        "http://myprerender.example.com/execute",
        status=400,
        headers={b'Content-Type': b'application/json'},
        body=encoded_payload,
    )
    processed = middleware.process_response(request, raw_response, None)

    assert processed.data == error_payload
    # The status is taken from the error code; the transport status is kept.
    assert processed.status == 404
    assert processed.prerender_response_status == 400
    assert processed.url == "http://example.com/foo"
Beispiel #3
0
 def _get_req():
     """Build a magic-response 'execute' request with an empty Lua main()."""
     script_args = {'lua_source': 'function main(prerender) end'}
     request = PrerenderRequest(
         url="http://example.com",
         endpoint='execute',
         magic_response=True,
         args=script_args,
     )
     return request
def requests():
    """Return a fixed list of Prerender requests followed by Scrapy requests.

    The trailing ``# N`` comments give each entry's index in the returned
    list: 0-11 are ``PrerenderRequest`` objects, 12-15 are ``scrapy.Request``
    objects.
    """
    url_xy = "http://example.com/foo?x=1&y=2"
    url_yx = "http://example.com/foo?y=2&x=1"
    url_xyz = "http://example.com/foo?x=1&y=2&z=3"
    url_fragment = "http://example.com/foo?x=1&y=2#id2"
    url_hashbang = "http://example.com/foo?x=1&y=2#!id2"

    prerender_kwargs = (
        {'url': url_xy},  # 0
        {'url': url_xy, 'method': 'POST'},  # 1
        {'url': url_xy, 'endpoint': 'render.har'},  # 2
        {'url': url_yx},  # 3
        {'url': url_xy, 'args': {'wait': 0.5}},  # 4
        {'url': url_yx, 'args': {'wait': 0.5}},  # 5
        {'url': url_xyz},  # 6
        {'url': url_yx, 'method': 'POST'},  # 7
        {'args': {'wait': 0.5}},  # 8
        {'args': {'wait': 0.5}},  # 9
        {'args': {'wait': 0.7}},  # 10
        {'url': url_fragment},  # 11
    )
    plain_urls = (
        url_xy,  # 12
        url_yx,  # 13
        url_fragment,  # 14
        url_hashbang,  # 15
    )

    combined = [PrerenderRequest(**kw) for kw in prerender_kwargs]
    combined.extend(scrapy.Request(url=target) for target in plain_urls)
    return combined
Beispiel #5
0
 def parse_3(self, response):
     """Emit the response item, then request '#bar' with the BOMB cookie.

     Prerender (Twisted) drops requests with huge HTTP headers, but this
     one should work, as cookies are not sent to Prerender itself.
     """
     item = {'response': response}
     yield item

     target = self.url + "#bar"
     follow_up = PrerenderRequest(
         target,
         self.parse_4,
         endpoint='execute',
         args={'lua_source': DEFAULT_SCRIPT},
         cookies={'bomb': BOMB},
     )
     yield follow_up
Beispiel #6
0
 def request_with_cookies(cookies):
     """Build an 'execute' request with *cookies* and run both middlewares.

     Each middleware may return a replacement request; when it returns a
     falsy value the current request is kept.
     """
     request = PrerenderRequest(
         'http://example.com/foo',
         endpoint='execute',
         args={'lua_source': 'function main() end'},
         magic_response=True,
         cookies=cookies,
     )
     # Cookie middleware first, then the Prerender middleware.
     for middleware in (cookie_mw, mw):
         request = middleware.process_request(request, None) or request
     return request
Beispiel #7
0
 def parse(self, response):
     """Yield a 'render.json' request (HAR + HTML) for every extracted link."""
     extractor = LinkExtractor()
     for found in extractor.extract_links(response):
         # A fresh args dict is built for every request.
         render_args = {'har': 1, 'html': 1}
         yield PrerenderRequest(
             found.url,
             self.parse_link,
             endpoint='render.json',
             args=render_args,
         )
Beispiel #8
0
def test_dont_process_response():
    """With dont_process_response=True the response must pass through as-is."""
    middleware = _get_mw()
    original_request = PrerenderRequest(
        "http://example.com/",
        endpoint="render",
        dont_process_response=True,
    )
    prepared_request = middleware.process_request(original_request, None)

    incoming = Response("http://example.com/")
    outgoing = middleware.process_response(prepared_request, incoming, None)

    # Neither the class nor the object identity may change.
    assert outgoing.__class__ is Response
    assert outgoing is incoming
Beispiel #9
0
def test_change_response_class_to_text():
    """A decodable body with a binary content type stays a TextResponse.

    Such a response can come when downloading a file, or returning
    prerender:html(): the headers say it's binary, but it can be decoded,
    so it becomes a TextResponse.
    """
    middleware = _get_mw()
    request = middleware.process_request(
        PrerenderRequest('http://example.com/', magic_response=True),
        None,
    )
    pdf_headers = {b'Content-Type': b'application/pdf'}
    raw_response = TextResponse(
        'http://myprerender.example.com/execute',
        headers=pdf_headers,
        body=b'ascii binary data',
        encoding='utf-8',
    )
    processed = middleware.process_response(request, raw_response, None)

    assert isinstance(processed, TextResponse)
    assert processed.url == 'http://example.com/'
    assert processed.headers == {b'Content-Type': [b'application/pdf']}
    assert processed.body == b'ascii binary data'
Beispiel #10
0
def test_unicode_url():
    """A request built from a unicode URL gets that URL on the response."""
    middleware = _get_mw()
    # note unicode URL
    unicode_url = u"http://example.com/"
    request = PrerenderRequest(unicode_url, endpoint='execute')
    prepared = middleware.process_request(request, None)

    payload = {'html': '<html><body>Hello</body></html>'}
    encoded_payload = json.dumps(payload).encode('utf8')
    # Scrapy doesn't pass the request to the constructor, so neither do we
    # (i.e. no ``request=prepared`` here).
    raw_response = TextResponse(
        "http://myprerender.example.com/execute",
        headers={b'Content-Type': b'application/json'},
        body=encoded_payload,
    )
    processed = middleware.process_response(prepared, raw_response, None)
    assert processed.url == "http://example.com/"
Beispiel #11
0
def test_prerender_request_no_url():
    """A request built only from meta targets 'about:blank' on /execute."""
    middleware = _get_mw()
    script = "function main(prerender) return {result='ok'} end"
    prerender_meta = {
        'args': {'lua_source': script},
        'endpoint': 'execute',
    }
    bare_request = PrerenderRequest(meta={'prerender': prerender_meta})
    prepared = middleware.process_request(bare_request, None)

    assert prepared.url == 'http://127.0.0.1:8050/execute'
    sent_body = json.loads(to_native_str(prepared.body))
    assert sent_body == {
        'url': 'about:blank',
        'lua_source': script
    }
Beispiel #12
0
def test_prerender_request():
    """Run a default PrerenderRequest through both middlewares and back."""
    prerender_mw = _get_mw()
    cookies_mw = _get_cookie_mw()

    original = PrerenderRequest("http://example.com?foo=bar&url=1&wait=100")
    assert repr(original) == "<GET http://example.com?foo=bar&url=1&wait=100>"

    # --- request preprocessing -------------------------------------------
    prepared = cookies_mw.process_request(original, None) or original
    prepared = prerender_mw.process_request(prepared, None) or prepared
    assert prepared is not None
    assert prepared is not original
    assert prepared.url == "http://127.0.0.1:8050/render"
    assert prepared.headers == {b'Content-Type': [b'application/json']}
    assert prepared.method == 'POST'
    assert isinstance(prepared, PrerenderRequest)
    expected_repr = (
        "<GET http://example.com?foo=bar&url=1&wait=100 "
        "via http://127.0.0.1:8050/render>"
    )
    assert repr(prepared) == expected_repr

    # The JSON body sent to Prerender only carries the original URL.
    sent_body = json.loads(to_native_str(prepared.body))
    assert sent_body == {'url': original.url}

    # --- response post-processing ----------------------------------------
    html = b"<html><body>Hello</body></html>"
    # Scrapy doesn't pass the request to the constructor, so neither do we
    # (i.e. no ``request=prepared`` here).
    raw_response = TextResponse(
        "http://127.0.0.1:8050/render",
        headers={b'Content-Type': b'text/html'},
        body=html,
    )
    processed = prerender_mw.process_response(prepared, raw_response, None)
    processed = cookies_mw.process_response(prepared, processed, None)
    assert isinstance(processed, scrapy_prerender.PrerenderTextResponse)
    assert processed is not raw_response
    assert processed.real_url == prepared.url
    assert processed.url == original.url
    assert processed.body == b"<html><body>Hello</body></html>"
    assert processed.css("body").extract_first() == "<body>Hello</body>"
    assert processed.headers == {b'Content-Type': [b'text/html']}

    # --- .replace() keeps the class and the untouched attributes ---------
    replaced = processed.replace(status=404)
    assert replaced.status == 404
    assert isinstance(replaced, scrapy_prerender.PrerenderTextResponse)
    for name in ('url', 'real_url', 'headers', 'body'):
        assert getattr(replaced, name) == getattr(processed, name)
Beispiel #13
0
def test_change_response_class_to_json_binary():
    """Non-decodable bytes labelled as JSON pass through unchanged.

    magic_response is set to False, because this is not the kind of data we
    would expect from prerender: we just return binary data. With
    magic_response set to True the middleware will fail, but this is ok
    because magic_response presumes we are expecting a valid prerender json
    response.
    """
    middleware = _get_mw()
    request = PrerenderRequest('http://example.com/', magic_response=False)
    request = middleware.process_request(request, None)

    undecodable = b'non-decodable data: \x98\x11\xe7\x17\x8f'
    raw_response = Response(
        'http://myprerender.example.com/execute',
        headers={b'Content-Type': b'application/json'},
        body=undecodable,
    )
    processed = middleware.process_response(request, raw_response, None)

    assert isinstance(processed, Response)
    assert processed.url == 'http://example.com/'
    assert processed.headers == {b'Content-Type': [b'application/json']}
    assert processed.body == b'non-decodable data: \x98\x11\xe7\x17\x8f'
Beispiel #14
0
def test_prerender_request_meta():
    """User meta is kept on the request and the caller's dict is not mutated."""
    caller_meta = {'foo': 'bar'}
    request = PrerenderRequest('http://example.com', meta=caller_meta)

    request_meta = request.meta
    assert 'prerender' in request_meta
    assert request_meta['foo'] == 'bar'
    # The dict handed in by the caller must not have gained a 'prerender' key.
    assert caller_meta == {'foo': 'bar'}
Beispiel #15
0
def test_magic_response():
    """Exercise magic responses together with the cookie middleware.

    Two requests are sent one after the other through the same middleware
    instances. The first round checks that body, headers, status, URL and
    cookies of the final response are taken from the JSON payload; the
    second round checks the cookies seen after a follow-up request.
    """
    mw = _get_mw()
    cookie_mw = _get_cookie_mw()

    # First request: carries a single cookie, given in list-of-dicts form.
    req = PrerenderRequest('http://example.com/',
                           endpoint='execute',
                           args={'lua_source': 'function main() end'},
                           magic_response=True,
                           cookies=[{
                               'name': 'foo',
                               'value': 'bar'
                           }])
    # A middleware may return a falsy value; in that case keep the request.
    req = cookie_mw.process_request(req, None) or req
    req = mw.process_request(req, None) or req

    # JSON payload of the first response.
    # NOTE(review): 'exmaple.com' looks like a typo for 'example.com', but
    # the URL assertion further down expects the very same string, so the
    # test is self-consistent as written.
    resp_data = {
        'url':
        "http://exmaple.com/#id42",
        'html':
        '<html><body>Hello 404</body></html>',
        'http_status':
        404,
        'headers': [
            {
                'name': 'Content-Type',
                'value': "text/html"
            },
            {
                'name': 'X-My-Header',
                'value': "foo"
            },
            {
                'name': 'Set-Cookie',
                'value': "bar=baz"
            },
        ],
        'cookies': [
            {
                'name': 'foo',
                'value': 'bar'
            },
            {
                'name': 'bar',
                'value': 'baz',
                'domain': '.example.com'
            },
            {
                'name': 'session',
                'value': '12345',
                'path': '/',
                'expires': '2055-07-24T19:20:30Z'
            },
        ],
    }
    resp = TextResponse("http://myprerender.example.com/execute",
                        headers={b'Content-Type': b'application/json'},
                        body=json.dumps(resp_data).encode('utf8'))
    resp2 = mw.process_response(req, resp, None)
    resp2 = cookie_mw.process_response(req, resp2, None)
    assert isinstance(resp2, scrapy_prerender.PrerenderJsonResponse)
    assert resp2.data == resp_data
    # Body and text are expected to come from the payload's 'html' field.
    assert resp2.body == b'<html><body>Hello 404</body></html>'
    assert resp2.text == '<html><body>Hello 404</body></html>'
    # Headers are expected to come from the payload's 'headers' list ...
    assert resp2.headers == {
        b'Content-Type': [b'text/html'],
        b'X-My-Header': [b'foo'],
        b'Set-Cookie': [b'bar=baz'],
    }
    # ... while the headers of the HTTP response itself are kept separately.
    assert resp2.prerender_response_headers == {
        b'Content-Type': [b'application/json']
    }
    # Status 404 matches the payload's 'http_status'; 200 is the status of
    # the TextResponse built above (no explicit status was passed).
    assert resp2.status == 404
    assert resp2.prerender_response_status == 200
    assert resp2.url == "http://exmaple.com/#id42"
    # All three cookies listed in the payload must be in the cookiejar.
    assert len(resp2.cookiejar) == 3
    cookies = [c for c in resp2.cookiejar]
    assert {(c.name, c.value)
            for c in cookies} == {('bar', 'baz'), ('foo', 'bar'),
                                  ('session', '12345')}

    # send second request using the same session and check the resulting cookies
    req = PrerenderRequest('http://example.com/foo',
                           endpoint='execute',
                           args={'lua_source': 'function main() end'},
                           magic_response=True,
                           cookies={'spam': 'ham'})
    req = cookie_mw.process_request(req, None) or req
    req = mw.process_request(req, None) or req

    # Payload of the second response: no 'url' or 'http_status' this time,
    # and the cookie list no longer contains 'foo'.
    resp_data = {
        'html':
        '<html><body>Hello</body></html>',
        'headers': [
            {
                'name': 'Content-Type',
                'value': "text/html"
            },
            {
                'name': 'X-My-Header',
                'value': "foo"
            },
            {
                'name': 'Set-Cookie',
                'value': "bar=baz"
            },
        ],
        'cookies': [
            {
                'name': 'spam',
                'value': 'ham'
            },
            {
                'name': 'egg',
                'value': 'spam'
            },
            {
                'name': 'bar',
                'value': 'baz',
                'domain': '.example.com'
            },
            #{'name': 'foo', 'value': ''},  -- this won't be in response
            {
                'name': 'session',
                'value': '12345',
                'path': '/',
                'expires': '2056-07-24T19:20:30Z'
            },
        ],
    }
    resp = TextResponse("http://myprerender.example.com/execute",
                        headers={b'Content-Type': b'application/json'},
                        body=json.dumps(resp_data).encode('utf8'))
    resp2 = mw.process_response(req, resp, None)
    resp2 = cookie_mw.process_response(req, resp2, None)
    assert isinstance(resp2, scrapy_prerender.PrerenderJsonResponse)
    assert resp2.data == resp_data
    cookies = [c for c in resp2.cookiejar]
    # 'foo' from the first round must be gone; the other four remain.
    assert {c.name for c in cookies} == {'session', 'egg', 'bar', 'spam'}
    for c in cookies:
        if c.name == 'session':
            # 2731692030 is 2056-07-24T19:20:30Z as a Unix timestamp, i.e.
            # the expiry from the second payload replaced the 2055 one.
            assert c.expires == 2731692030
        if c.name == 'spam':
            assert c.value == 'ham'
Beispiel #16
0
 def parse(self, response):
     """Emit the received response as an item, then request the '#foo' URL."""
     item = {'response': response}
     yield item

     next_url = self.url + '#foo'
     yield PrerenderRequest(next_url)
Beispiel #17
0
 def start_requests(self):
     """Yield a single default PrerenderRequest for ``self.url``."""
     initial_request = PrerenderRequest(self.url)
     yield initial_request
Beispiel #18
0
 def parse_1(self, response):
     """Emit the response item, then request '#foo' via the 'execute' endpoint."""
     item = {'response': response}
     yield item

     target = self.url + "#foo"
     follow_up = PrerenderRequest(
         target,
         self.parse_2,
         endpoint='execute',
         args={'lua_source': DEFAULT_SCRIPT},
     )
     yield follow_up
Beispiel #19
0
 def _request(self, url):
     """Build an 'execute' request for *url* with 'lua_source' in cache_args."""
     script_args = {'lua_source': DEFAULT_SCRIPT, 'x': 'yy'}
     request = PrerenderRequest(
         url,
         endpoint='execute',
         args=script_args,
         cache_args=['lua_source'],
     )
     return request
Beispiel #20
0
 def start_requests(self):
     """Yield one 'execute' request for ``self.url`` running DEFAULT_SCRIPT."""
     script_args = {'lua_source': DEFAULT_SCRIPT}
     initial_request = PrerenderRequest(
         self.url,
         endpoint='execute',
         args=script_args,
     )
     yield initial_request
Beispiel #21
0
 def start_requests(self):
     """Yield one 'execute' request for the '#foo' URL with an extra argument."""
     target = self.url + "#foo"
     script_args = {'lua_source': DEFAULT_SCRIPT, 'foo': 'bar'}
     initial_request = PrerenderRequest(
         target,
         endpoint='execute',
         args=script_args,
     )
     yield initial_request
Beispiel #22
0
def test_prerender_request_parameters():
    """Check that explicit PrerenderRequest options reach meta and headers.

    The second half feeds a JSON response back through both middlewares and
    checks the attributes of the resulting PrerenderJsonResponse.
    """
    mw = _get_mw()
    cookie_mw = _get_cookie_mw()

    def cb():
        pass

    script_args = {
        "lua_source": "function main() end",
        "myarg": 3.0,
    }
    original = PrerenderRequest(
        "http://example.com/#!start",
        cb,
        'POST',
        body="foo=bar",
        prerender_url="http://myprerender.example.com",
        slot_policy=SlotPolicy.SINGLE_SLOT,
        endpoint="execute",
        prerender_headers={'X-My-Header': 'value'},
        args=script_args,
        magic_response=False,
        headers={'X-My-Header': 'value'},
    )
    prepared = cookie_mw.process_request(original, None) or original
    prepared = mw.process_request(prepared, None)

    # --- request preprocessing -------------------------------------------
    expected_args = {
        'url': "http://example.com/#!start",
        'http_method': 'POST',
        'body': 'foo=bar',
        'cookies': [],
        'lua_source': 'function main() end',
        'myarg': 3.0,
        'headers': {
            'X-My-Header': 'value',
        }
    }
    expected_meta = {
        'endpoint': 'execute',
        'prerender_url': "http://myprerender.example.com",
        'slot_policy': SlotPolicy.SINGLE_SLOT,
        'prerender_headers': {
            'X-My-Header': 'value'
        },
        'magic_response': False,
        'session_id': 'default',
        'http_status_from_error_code': True,
        'args': expected_args,
    }
    assert prepared.meta['prerender'] == expected_meta
    assert prepared.callback == cb
    assert prepared.headers == {
        b'Content-Type': [b'application/json'],
        b'X-My-Header': [b'value'],
    }

    # --- response post-processing ----------------------------------------
    payload = {
        'html': '<html><body>Hello</body></html>',
        'num_divs': 0.0,
    }
    payload_text = json.dumps(payload)
    # Scrapy doesn't pass the request to the constructor, so neither do we
    # (i.e. no ``request=prepared`` here).
    raw_response = TextResponse(
        "http://myprerender.example.com/execute",
        headers={b'Content-Type': b'application/json'},
        body=payload_text.encode('utf8'),
    )
    processed = mw.process_response(prepared, raw_response, None)
    processed = cookie_mw.process_response(prepared, processed, None)

    assert isinstance(processed, scrapy_prerender.PrerenderJsonResponse)
    assert processed is not raw_response
    assert processed.real_url == prepared.url
    assert processed.url == original.meta['prerender']['args']['url']
    assert processed.data == payload
    # With magic_response=False the body stays the raw JSON text.
    assert processed.body == payload_text.encode('utf8')
    assert processed.text == processed.body_as_unicode() == payload_text
    assert processed.encoding == 'utf8'
    assert processed.headers == {b'Content-Type': [b'application/json']}
    assert processed.prerender_response_headers == processed.headers
    assert processed.status == processed.prerender_response_status == 200
Beispiel #23
0
def test_cache_args():
    """Walk the cache_args flow through the dedupe and Prerender middlewares.

    Four stages are covered, in order:

    1. a first request, expected to send the argument with ``save_args``;
    2. a second request, expected to send ``load_args`` instead of the value;
    3. a third request that is answered with a 498 "ExpiredArguments" error;
    4. the request returned for that error, expected to use ``save_args``
       again.

    The arrow comments (``<----`` / ``---->``) mark which component the
    request is conceptually travelling to or from at that point.
    """
    spider = scrapy.Spider(name='foo')
    mw = _get_mw()
    mw.crawler.spider = spider
    mw.spider_opened(spider)
    dedupe_mw = PrerenderDeduplicateArgsMiddleware()

    # ========= Send first request - it should use save_args:
    lua_source = 'function main(prerender) end'
    req = PrerenderRequest('http://example.com/foo',
                           endpoint='execute',
                           args={'lua_source': lua_source},
                           cache_args=['lua_source'])

    # Straight after construction the argument still holds the real value.
    assert req.meta['prerender']['args']['lua_source'] == lua_source
    # <---- spider
    req, = list(dedupe_mw.process_start_requests([req], spider))
    # ----> scheduler
    # The dedupe middleware replaced the value; the replacement is the key
    # under which the original value is stored in mw._argument_values.
    assert req.meta['prerender']['args']['lua_source'] != lua_source
    assert list(mw._argument_values.values()) == [lua_source]
    assert list(mw._argument_values.keys()) == [
        req.meta['prerender']['args']['lua_source']
    ]
    # <---- scheduler
    # process request before sending it to the downloader
    req = mw.process_request(req, spider) or req
    # -----> downloader
    # The real value is back and the request asks for it to be saved.
    assert req.meta['prerender']['args']['lua_source'] == lua_source
    assert req.meta['prerender']['args']['save_args'] == ['lua_source']
    assert 'load_args' not in req.meta['prerender']['args']
    assert req.meta['prerender']['_local_arg_fingerprints'] == {
        'lua_source': list(mw._argument_values.keys())[0]
    }
    # <---- downloader
    # The response header reports the key under which the argument was saved.
    resp_body = b'{}'
    resp = TextResponse(
        "http://example.com",
        headers={
            b'Content-Type':
            b'application/json',
            b'X-Prerender-Saved-Arguments':
            b'lua_source=ba001160ef96fe2a3f938fea9e6762e204a562b3'
        },
        body=resp_body)
    resp = mw.process_response(req, resp, None)

    # ============ Send second request - it should use load_args
    req2 = PrerenderRequest('http://example.com/bar',
                            endpoint='execute',
                            args={'lua_source': lua_source},
                            cache_args=['lua_source'])
    # Non-request items in the spider output must pass through untouched.
    req2, item = list(
        dedupe_mw.process_spider_output(resp, [req2, {
            'key': 'value'
        }], spider))
    assert item == {'key': 'value'}
    # ----> scheduler
    assert req2.meta['prerender']['args']['lua_source'] != lua_source
    # <---- scheduler
    # process request before sending it to the downloader
    req2 = mw.process_request(req2, spider) or req2
    # -----> downloader
    # The value itself is not sent; only the key from the earlier
    # X-Prerender-Saved-Arguments header is.
    assert req2.meta['prerender']['args']['load_args'] == {
        "lua_source": "ba001160ef96fe2a3f938fea9e6762e204a562b3"
    }
    assert "lua_source" not in req2.meta['prerender']['args']
    assert "save_args" not in req2.meta['prerender']['args']
    assert json.loads(req2.body.decode('utf8')) == {
        'load_args': {
            'lua_source': 'ba001160ef96fe2a3f938fea9e6762e204a562b3'
        },
        'url': 'http://example.com/bar'
    }
    # <---- downloader
    resp = TextResponse("http://example.com/bar",
                        headers={b'Content-Type': b'application/json'},
                        body=b'{}')
    # NOTE(review): this passes `req` (the first request) although the
    # response belongs to `req2` - possibly a copy-paste slip; confirm
    # before changing, since the remaining assertions pass through it.
    resp = mw.process_response(req, resp, spider)

    # =========== Third request is dispatched to another server where
    # =========== arguments are expired:
    req3 = PrerenderRequest('http://example.com/baz',
                            endpoint='execute',
                            args={'lua_source': lua_source},
                            cache_args=['lua_source'])
    req3, = list(dedupe_mw.process_spider_output(resp, [req3], spider))
    # ----> scheduler
    assert req3.meta['prerender']['args']['lua_source'] != lua_source
    # <---- scheduler
    req3 = mw.process_request(req3, spider) or req3
    # -----> downloader
    # Like the second request, this one only sends load_args.
    assert json.loads(req3.body.decode('utf8')) == {
        'load_args': {
            'lua_source': 'ba001160ef96fe2a3f938fea9e6762e204a562b3'
        },
        'url': 'http://example.com/baz'
    }
    # <---- downloader

    # Error payload for the 498 response.
    # NOTE(review): "expired" lists "html" although the cached argument in
    # this test is "lua_source" - confirm whether the middleware looks at
    # this list at all.
    resp_body = json.dumps({
        "type": "ExpiredArguments",
        "description": "Arguments stored with ``save_args`` are expired",
        "info": {
            "expired": ["html"]
        },
        "error": 498
    })
    resp = TextResponse("127.0.0.1:8050",
                        headers={b'Content-Type': b'application/json'},
                        status=498,
                        body=resp_body.encode('utf8'))
    # For the 498 response a request (not a response) is expected back.
    req4 = mw.process_response(req3, resp, spider)
    assert isinstance(req4, PrerenderRequest)

    # process this request again
    req4, = list(dedupe_mw.process_spider_output(resp, [req4], spider))
    req4 = mw.process_request(req4, spider) or req4

    # it should become save_args request after all middlewares
    assert json.loads(req4.body.decode('utf8')) == {
        'lua_source': 'function main(prerender) end',
        'save_args': ['lua_source'],
        'url': 'http://example.com/baz'
    }
    # No remote keys may be left in the middleware at this point.
    assert mw._remote_keys == {}
Beispiel #24
0
def test_meta_None():
    """Passing meta=None must give the same meta as omitting the argument."""
    url = 'http://example.com'
    without_meta = PrerenderRequest(url)
    with_none_meta = PrerenderRequest(url, meta=None)
    assert without_meta.meta == with_none_meta.meta
Beispiel #25
0
def test_prerender_request_url_with_fragment():
    """The URL fragment must be kept in the JSON body sent to Prerender."""
    middleware = _get_mw()
    fragment_url = "http://example.com#id1"
    prepared = middleware.process_request(PrerenderRequest(fragment_url), None)

    sent_body = json.loads(to_native_str(prepared.body))
    assert sent_body == {'url': fragment_url}
Beispiel #26
0
 def parse(self, response):
     """Yield an 'execute' request for '#egg' with the 'x-set-prerender' cookie."""
     target = self.url + "#egg"
     next_request = PrerenderRequest(
         target,
         self.parse_1,
         endpoint='execute',
         args={'lua_source': DEFAULT_SCRIPT},
         cookies={'x-set-prerender': '1'},
     )
     yield next_request