Esempio n. 1
0
    def test_delete_and_contains(self):
        """A header set via __setitem__ is visible to __contains__, and
        __delitem__ removes it again."""
        headers = Headers()
        headers['Content-Type'] = 'text/html'
        assert 'Content-Type' in headers
        del headers['Content-Type']
        assert 'Content-Type' not in headers
Esempio n. 2
0
    def test_multivalue(self):
        """Multi-valued headers: item access returns the last value,
        while getlist returns an equal but distinct copy of the list."""
        headers = Headers()
        values = ['ip1', 'ip2']
        headers['X-Forwarded-For'] = values
        self.assertEqual(headers['X-Forwarded-For'], 'ip2')
        self.assertEqual(headers.get('X-Forwarded-For'), 'ip2')
        self.assertEqual(headers.getlist('X-Forwarded-For'), values)
        # getlist must hand back a copy, never the caller's own list.
        assert headers.getlist('X-Forwarded-For') is not values
Esempio n. 3
0
 def test_update(self):
     """update() accepts a mapping of scalar or list values; stored
     values come back from getlist() as bytes."""
     headers = Headers()
     payload = {
         "Content-Type": "text/html",
         "X-Forwarded-For": ["ip1", "ip2"],
     }
     headers.update(payload)
     self.assertEqual(headers.getlist("Content-Type"), [b"text/html"])
     self.assertEqual(headers.getlist("X-Forwarded-For"), [b"ip1", b"ip2"])
Esempio n. 4
0
 def test_update(self):
     """update() with scalar and list values; getlist() returns them
     as native strings (pre-bytes Headers behaviour)."""
     headers = Headers()
     payload = {
         'Content-Type': 'text/html',
         'X-Forwarded-For': ['ip1', 'ip2']
     }
     headers.update(payload)
     self.assertEqual(headers.getlist('Content-Type'), ['text/html'])
     self.assertEqual(headers.getlist('X-Forwarded-For'), ['ip1', 'ip2'])
    def process_request(self, request, spider):
        """
        The request will be passed to the AutoExtract server only if the request
        is explicitly enabled with `{'autoextract': {'enabled': True}}` meta.
        The page type value must be also present, either in the
        AUTOEXTRACT_PAGE_TYPE option, or in `{'autoextract': {'pageType': '...'}}` meta.
        """
        # Requests that did not opt in via the 'autoextract' meta pass
        # through untouched (returning None continues the chain).
        if not self._is_enabled_for_request(request):
            return

        # If the request was already processed by AutoExtract
        if request.meta.get(AUTOEXTRACT_META_KEY):
            return

        # The middleware replaces the request with an API POST, so only
        # original GET requests are supported.
        if request.method != 'GET':
            raise AutoExtractError('Only GET requests are supported by AutoExtract')

        # Record the original URL and a start timestamp so later stages
        # can restore the URL and compute timing metrics.
        request.meta[AUTOEXTRACT_META_KEY] = {
            'original_url': request.url,
            'timing': {
                'start_ts': time.time()
            },
        }

        # Maybe over-write the page type value from the request
        page_type = self._check_page_type(request)
        logger.debug('Process AutoExtract request for %s URL %s',
                     page_type,
                     request,
                     extra={'spider': spider})

        # Define request timeout
        request.meta['download_timeout'] = self.timeout

        # Define concurrency settings
        self._set_download_slot(request, request.meta)

        payload = {'url': request.url, 'pageType': page_type}

        # Add the extra payload, if available
        extra_payload = self._get_extra_payload(request)
        if extra_payload:
            payload.update(extra_payload)

        headers = Headers({
            'Content-Type': 'application/json',
            'Authorization': basic_auth_header(self._api_user, self._api_pass)
        })

        # Replace the original request with a POST to the AutoExtract API.
        # The body is a JSON *list* of query objects, hence [payload].
        new_request = request.replace(
            url=self._api_url,
            method='POST',
            headers=headers,
            body=json.dumps([payload], sort_keys=True),
        )

        self.inc_metric('autoextract/request_count')
        return new_request
Esempio n. 6
0
 def test_int_value(self):
     """Integer values are coerced to their bytes representation by the
     constructor, __setitem__, setdefault and setlist alike."""
     headers = Headers({'hey': 5})
     headers['foo'] = 1
     headers.setdefault('bar', 2)
     headers.setlist('buz', [1, 'dos', 3])
     for name, expected in (
         ('foo', [b'1']),
         ('bar', [b'2']),
         ('buz', [b'1', b'dos', b'3']),
         ('hey', [b'5']),
     ):
         self.assertEqual(headers.getlist(name), expected)
Esempio n. 7
0
 def _build_response(self, body, request):
     """Assemble the final Response from the status/headers collected on
     this handler instance plus the downloaded *body*."""
     # download_latency: seconds from request start until headers arrived.
     request.meta['download_latency'] = self.headers_time - self.start_time
     status = int(self.status)
     headers = Headers(self.response_headers)
     # Let responsetypes pick the most specific Response subclass.
     respcls = responsetypes.from_args(headers=headers, url=self._url)
     return respcls(url=self._url,
                    status=status,
                    headers=headers,
                    body=body)
Esempio n. 8
0
 def test_int_value(self):
     """Non-string (int) header values are stored as bytes, whichever
     setter was used to store them."""
     headers = Headers({"hey": 5})
     headers["foo"] = 1
     headers.setdefault("bar", 2)
     headers.setlist("buz", [1, "dos", 3])
     self.assertEqual(headers.getlist("foo"), [b"1"])
     self.assertEqual(headers.getlist("bar"), [b"2"])
     self.assertEqual(headers.getlist("buz"), [b"1", b"dos", b"3"])
     self.assertEqual(headers.getlist("hey"), [b"5"])
Esempio n. 9
0
 def close_used_contexts(self, spider):
     """Ask the rendering service to close every browser context this
     spider used, via a POST to its /close_context endpoint."""
     contexts = list(self.used_contexts[id(spider)])
     if contexts:
         # meta={'proxy': None}: talk to the service directly, bypassing
         # any proxy configured for regular crawling requests.
         request = Request(urljoin(self.service_base_url, '/close_context'),
                           method='POST',
                           headers=Headers({'Content-Type': 'application/json'}),
                           meta={"proxy": None},
                           body=json.dumps(contexts))
         # Returns the downloader's Deferred; None when nothing to close.
         return self.crawler.engine.downloader.fetch(request, None)
Esempio n. 10
0
 def _cb_bodydone(self, result, request, url):
     """Twisted callback: build the final Response once the body of the
     twisted response has been fully received."""
     txresponse, body, flags = result
     status = int(txresponse.code)
     # getAllRawHeaders() yields (name, [values]) pairs with original
     # wire casing preserved.
     headers = Headers(txresponse.headers.getAllRawHeaders())
     # Pick the most specific Response subclass for headers/url/body.
     respcls = responsetypes.from_args(headers=headers, url=url, body=body)
     return respcls(url=url,
                    status=status,
                    headers=headers,
                    body=body,
                    flags=flags)
Esempio n. 11
0
def headers_to_scrapy(headers):
    """Build a scrapy.http.Headers object from *headers*.

    Accepted formats:

    * a ``{name: value}`` dict (any falsy value is treated as empty);
    * a ``[(name, value), ...]`` list of pairs;
    * a HAR-style ``[{'name': ..., 'value': ...}, ...]`` list.
    """
    if isinstance(headers or {}, dict):
        return Headers(headers or {})

    if isinstance(headers[0], dict):
        # HAR format: a missing 'value' key defaults to the empty string.
        return Headers([(entry['name'], entry.get('value', ''))
                        for entry in headers])

    return Headers(headers)
Esempio n. 12
0
 def test_from_headers(self):
     """from_headers() picks the response class implied by Content-Type,
     Content-Disposition and Content-Encoding."""
     cases = [
         ({'Content-Type': ['text/html; charset=utf-8']}, HtmlResponse),
         ({'Content-Type': ['application/octet-stream'], 'Content-Disposition': ['attachment; filename=data.txt']}, TextResponse),
         ({'Content-Type': ['text/html; charset=utf-8'], 'Content-Encoding': ['gzip']}, Response),
     ]
     for raw, expected in cases:
         headers = Headers(raw)
         retcls = responsetypes.from_headers(headers)
         assert retcls is expected, "%s ==> %s != %s" % (headers, retcls, expected)
Esempio n. 13
0
    def test_iterables(self):
        # NOTE(review): these assertions compare exact list order, which
        # depends on the Headers implementation's internal dict ordering
        # (pre-3.7 style) — confirm before porting.
        idict = {'Content-Type': 'text/html', 'X-Forwarded-For': ['ip1', 'ip2']}

        h = Headers(idict)
        self.assertEqual(dict(h), {'Content-Type': ['text/html'], 'X-Forwarded-For': ['ip1', 'ip2']})
        self.assertEqual(h.keys(), ['X-Forwarded-For', 'Content-Type'])
        self.assertEqual(h.items(), [('X-Forwarded-For', ['ip1', 'ip2']), ('Content-Type', ['text/html'])])
        self.assertEqual(list(h.iteritems()),
                [('X-Forwarded-For', ['ip1', 'ip2']), ('Content-Type', ['text/html'])])

        # values() yields only the last value of each multi-valued header.
        self.assertEqual(h.values(), ['ip2', 'text/html'])
Esempio n. 14
0
 def retrieve_response(self, spider, request):
     """Rebuild the cached Response for *request*, or return None on a
     cache miss."""
     data = self._read_data(spider, request)
     if data is None:
         return  # not cached
     url = data['url']
     headers = Headers(data['headers'])
     # responsetypes chooses the Response subclass from headers/URL.
     respcls = responsetypes.from_args(headers=headers, url=url)
     return respcls(url=url,
                    headers=headers,
                    status=data['status'],
                    body=data['body'])
Esempio n. 15
0
def response_from_dict(response, spider=None, **kwargs):
    """Build a Response object from a dict previously dumped from a
    spider response.

    The previous docstring claimed the opposite direction ("returns a
    dict based on a response") — this function *consumes* a dict and
    returns a Response. The status is hard-coded to "200" because the
    dump format does not preserve it.
    """
    url = response.get("url")
    status = "200"
    # Header values are lists; coerce every value to str so Headers
    # normalises them consistently.
    headers = Headers([(name, list(map(str, values)))
                       for name, values in response.get("headers").items()])
    body = response.get("body")

    respcls = responsetypes.from_args(headers=headers, url=url)
    return respcls(url=url, headers=headers, status=status, body=body)
Esempio n. 16
0
 def retrieve_response(self, spider, request):
     """Look the request URL up in the MongoDB collection and rebuild a
     Response, or return None when the URL is not cached."""
     doc = self.col.find_one({'url': request.url})
     if doc is None:
         return  # not cached
     url = doc['url']
     headers = Headers(doc['headers'])
     # Body was stored as text; re-encode it for the Response.
     body = doc['body'].encode('utf-8')
     respcls = responsetypes.from_args(headers=headers, url=url)
     return respcls(url=url, headers=headers, status=200, body=body)
Esempio n. 17
0
 def retrieve_response(self, spider, request):
     """Return the cached Response for *request*, or None on a miss.

     Fix: the original used Python-2-only ``iteritems()`` and a bare
     ``map(str, y)`` — on Python 3 there is no ``iteritems`` and ``map``
     returns a one-shot iterator, so Headers would receive exhausted
     lazy objects as values. ``items()``/``list(map(...))`` behave the
     same on Python 2 and work on Python 3.
     """
     key = self._request_key(spider, request)
     gf = self._get_file(spider, key)
     if gf is None:
         return  # not cached
     url = str(gf.url)
     status = str(gf.status)
     # NOTE(review): assumes gf.headers is dict-like with items() —
     # confirm against the storage backend.
     headers = Headers([(x, list(map(str, y))) for x, y in gf.headers.items()])
     body = gf.read()
     respcls = responsetypes.from_args(headers=headers, url=url)
     response = respcls(url=url, headers=headers, status=status, body=body)
     return response
Esempio n. 18
0
    def test_basics(self):
        """Dict construction, KeyError on missing keys, and the default
        arguments of get()/getlist()."""
        h = Headers({'Content-Type': 'text/html', 'Content-Length': 1234})
        for present in ('Content-Type', 'Content-Length'):
            assert h[present]

        # Missing key: __getitem__ raises, get/getlist fall back.
        self.assertRaises(KeyError, h.__getitem__, 'Accept')
        self.assertEqual(h.get('Accept'), None)
        self.assertEqual(h.getlist('Accept'), [])

        # Explicit defaults are returned as given.
        self.assertEqual(h.get('Accept', '*/*'), '*/*')
        self.assertEqual(h.getlist('Accept', '*/*'), ['*/*'])
        expected = ['text/html', 'images/jpeg']
        self.assertEqual(h.getlist('Accept', ['text/html', 'images/jpeg']), expected)
Esempio n. 19
0
    def connectionMade(self):
        """Twisted callback: the TCP connection is up, so write out the
        full HTTP request (command line, headers, then optional body)."""
        self.headers = Headers() # bucket for response headers

        # Method command
        self.sendCommand(self.factory.method, self.factory.path)
        # Headers
        for key, values in self.factory.headers.items():
            # Multi-valued headers are emitted as one line per value.
            for value in values:
                self.sendHeader(key, value)
        self.endHeaders()
        # Body
        if self.factory.body is not None:
            self.transport.write(self.factory.body)
Esempio n. 20
0
    def test_iterables(self):
        """dict(), keys(), items() and values() all expose bytes keys and
        values even though the input used native strings."""
        source = {'Content-Type': 'text/html', 'X-Forwarded-For': ['ip1', 'ip2']}
        h = Headers(source)

        self.assertDictEqual(dict(h),
                             {b'Content-Type': [b'text/html'],
                              b'X-Forwarded-For': [b'ip1', b'ip2']})
        self.assertSortedEqual(h.keys(),
                               [b'X-Forwarded-For', b'Content-Type'])
        self.assertSortedEqual(h.items(),
                               [(b'X-Forwarded-For', [b'ip1', b'ip2']),
                                (b'Content-Type', [b'text/html'])])
        # values() keeps only the last value of a multi-valued header.
        self.assertSortedEqual(h.values(), [b'ip2', b'text/html'])
Esempio n. 21
0
 def _cb_bodydone(self, result, request, url):
     """Twisted callback: build the final Response once the body has
     been fully downloaded.

     *result* is a dict carrying the twisted response object plus the
     collected body, flags, certificate and peer IP address.
     """
     # Raw headers keep the original wire casing.
     headers = Headers(result["txresponse"].headers.getAllRawHeaders())
     # Let responsetypes pick the most specific Response subclass.
     respcls = responsetypes.from_args(headers=headers,
                                       url=url,
                                       body=result["body"])
     return respcls(
         url=url,
         status=int(result["txresponse"].code),
         headers=headers,
         body=result["body"],
         flags=result["flags"],
         certificate=result["certificate"],
         ip_address=result["ip_address"],
     )
Esempio n. 22
0
    def test_basics(self):
        """Missing keys raise or fall back to defaults; str defaults are
        normalised to bytes by get()/getlist()."""
        h = Headers({"Content-Type": "text/html", "Content-Length": 1234})
        for present in ("Content-Type", "Content-Length"):
            assert h[present]

        self.assertRaises(KeyError, h.__getitem__, "Accept")
        self.assertEqual(h.get("Accept"), None)
        self.assertEqual(h.getlist("Accept"), [])

        # Defaults get the same bytes coercion as stored values.
        self.assertEqual(h.get("Accept", "*/*"), b"*/*")
        self.assertEqual(h.getlist("Accept", "*/*"), [b"*/*"])
        defaults = ["text/html", "images/jpeg"]
        self.assertEqual(h.getlist("Accept", defaults),
                         [b"text/html", b"images/jpeg"])
Esempio n. 23
0
    def test_replace(self):
        """Response.replace() returns a new response with the given
        attributes overridden; the original is left untouched."""
        hdrs = Headers({"key": "value"})
        original = self.response_class("http://www.example.com")
        replaced = original.replace(status=301, body=b"New body", headers=hdrs)
        assert original.body == b''
        self.assertEqual(original.url, replaced.url)
        self.assertEqual((original.status, replaced.status), (200, 301))
        self.assertEqual((original.body, replaced.body), (b'', b"New body"))
        self.assertEqual((original.headers, replaced.headers), ({}, hdrs))

        # Empty attributes (which may fail if not compared properly)
        flagged = self.response_class("http://www.example.com", flags=['cached'])
        cleared = flagged.replace(body=b'', flags=[])
        self.assertEqual(cleared.body, b'')
        self.assertEqual(cleared.flags, [])
Esempio n. 24
0
 def retrieve_response(self, spider, request):
     """Return response if present in cache, or None otherwise."""
     metadata = self._read_meta(spider, request)
     if metadata is None:
         return  # not cached
     rpath = self._get_request_path(spider, request)
     # Body and raw headers live in separate files under the cache dir.
     with self._open(os.path.join(rpath, 'response_body'), 'rb') as f:
         body = f.read()
     with self._open(os.path.join(rpath, 'response_headers'), 'rb') as f:
         rawheaders = f.read()
     url = metadata.get('response_url')
     headers = Headers(headers_raw_to_dict(rawheaders))
     respcls = responsetypes.from_args(headers=headers, url=url)
     return respcls(url=url, headers=headers,
                    status=metadata['status'], body=body)
Esempio n. 25
0
 def retrieve_response(self, spider, request):
     """Fetch the cached response for *request* from a Redis hash keyed
     by request fingerprint, or return None on a miss."""
     key = request_fingerprint(request)
     value = self.conn.hget(self.name, key)
     if not value:
         return
     # Stored format appears to be: json-metadata <sep> ??? <sep> body.
     # NOTE(review): value_arr[1] is never used — presumably raw headers;
     # confirm against the matching store_response() implementation.
     value_arr = value.split(self.separator)
     stored_data = json.loads(value_arr[0])
     metadata = stored_data['metadata']
     body = str(value_arr[2])
     rawheaders = stored_data['response_headers']
     url = str(metadata['response_url'])
     status = str(metadata['status'])
     headers = Headers(rawheaders)
     respcls = responsetypes.from_args(headers=headers, url=url)
     response = respcls(url=url, headers=headers, status=status, body=body)
     return response
Esempio n. 26
0
    def process_request(self, request, spider):
        """Attach browser-like default headers and a rotating User-Agent
        to every outgoing request.

        Fixes: the UA was sent under ``'User_Agent'`` (underscore), which
        is not the standard ``User-Agent`` header, and Connection was
        ``'keep - alive'`` with stray spaces instead of ``'keep-alive'``.
        The leftover debug ``print`` was removed.
        """
        def get_ua():
            # self.ua_type names the fake-useragent attribute to use
            # (e.g. 'random') — TODO confirm against middleware config.
            return getattr(self.ua, self.ua_type)

        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language':
            'zh-CN,zh;q=0.9',
            'Connection':
            'keep-alive',
            'referer':
            'https://search.jd.com/Search?keyword=%E5%9B%BE%E4%B9%A6&enc=utf-8&wq=%E5%9B%BE%E4%B9%A6&page=1',
            'User-Agent': get_ua(),
        }
        request.headers = Headers(headers)
Esempio n. 27
0
    def test_replace(self):
        """Request.replace() returns a new request with the given
        attributes overridden and the rest carried over."""
        base = self.request_class("http://www.example.com", method='GET')
        hdrs = Headers(dict(base.headers, key='value'))
        modified = base.replace(method="POST", body="New body", headers=hdrs)
        self.assertEqual(base.url, modified.url)
        self.assertEqual((base.method, modified.method), ("GET", "POST"))
        self.assertEqual((base.body, modified.body), ('', "New body"))
        self.assertEqual((base.headers, modified.headers), (self.default_headers, hdrs))

        # Empty attributes (which may fail if not compared properly)
        filtered = self.request_class("http://www.example.com", meta={'a': 1}, dont_filter=True)
        reset = filtered.replace(url="http://www.example.com/2", body='', meta={}, dont_filter=False)
        self.assertEqual(reset.url, "http://www.example.com/2")
        self.assertEqual(reset.body, '')
        self.assertEqual(reset.meta, {})
        assert reset.dont_filter is False
Esempio n. 28
0
def retrieve_response(rpath):
    """Rebuild the cached Response stored under *rpath*, or return None
    when nothing is cached there."""
    metadata = read_meta(rpath)
    if metadata is None:
        return  # not cached
    # Body and raw headers are stored as separate files alongside the meta.
    with open(join(rpath, 'response_body'), 'rb') as f:
        body = f.read()
    with open(join(rpath, 'response_headers'), 'rb') as f:
        rawheaders = f.read()
    headers = Headers(headers_raw_to_dict(rawheaders))
    url = metadata.get('response_url')
    respcls = responsetypes.from_args(headers=headers, url=url)
    return respcls(url=url, headers=headers,
                   status=metadata['status'], body=body)
Esempio n. 29
0
 def retrieve_response(self, spider, request):
     """Return response if present in cache, or None otherwise."""
     metadata = self._read_meta(spider, request)
     if metadata is None:
         return  # not cached
     rpath = self._get_request_path(spider, request)
     with open(join(rpath, 'response_body'), 'rb') as f:
         body = f.read()
     with open(join(rpath, 'response_headers'), 'rb') as f:
         rawheaders = f.read()
     # Fall back to metadata['url'] to support old generated caches.
     # TODO: remove for Scrapy 0.11
     url = metadata.get('response_url') or metadata['url']
     headers = Headers(headers_raw_to_dict(rawheaders))
     respcls = responsetypes.from_args(headers=headers, url=url)
     return respcls(url=url, headers=headers,
                    status=metadata['status'], body=body)
Esempio n. 30
0
    def test_headers(self):
        # Different ways of setting headers attribute
        url = 'http://www.scrapy.org'
        headers = {b'Accept': 'gzip', b'Custom-Header': 'nothing to tell you'}
        r = self.request_class(url=url, headers=headers)
        p = self.request_class(url=url, headers=r.headers)

        self.assertEqual(r.headers, p.headers)
        # The request must copy the headers, never keep a reference to
        # the caller's dict or another request's Headers object.
        self.assertFalse(r.headers is headers)
        self.assertFalse(p.headers is r.headers)

        # headers must not be unicode
        h = Headers({'key1': u'val1', u'key2': 'val2'})
        h[u'newkey'] = u'newval'
        # Python-2-era test: iteritems()/assert_ are py2/legacy-unittest
        # APIs. Keys and every stored value must be bytes.
        for k, v in h.iteritems():
            self.assert_(isinstance(k, bytes))
            for s in v:
                self.assert_(isinstance(s, bytes))