Example #1
0
 def test_referer_header(self):
     """Referer header is set by RefererMiddleware unless it is already set"""
     req0 = Request(self.mockserver.url('/echo?headers=1&body=0'), dont_filter=1)
     req1 = req0.replace()
     req2 = req0.replace(headers={'Referer': None})
     req3 = req0.replace(headers={'Referer': 'http://example.com'})
     # Chain the requests: SingleRequestSpider follows meta['next'], so the
     # responses arrive (and are stored) in the order req0, req1, req2, req3.
     req0.meta['next'] = req1
     req1.meta['next'] = req2
     req2.meta['next'] = req3
     crawler = self.runner.create_crawler(SingleRequestSpider)
     yield crawler.crawl(seed=req0, mockserver=self.mockserver)
     # basic asserts in case of weird communication errors
     self.assertIn('responses', crawler.spider.meta)
     self.assertNotIn('failures', crawler.spider.meta)
     # start requests doesn't set Referer header
     # FIX: the start request's echo is responses[0]; the original read
     # responses[2], which re-checked req2's echo instead of req0's.
     echo0 = json.loads(to_unicode(crawler.spider.meta['responses'][0].body))
     self.assertNotIn('Referer', echo0['headers'])
     # following request sets Referer to start request url
     echo1 = json.loads(to_unicode(crawler.spider.meta['responses'][1].body))
     self.assertEqual(echo1['headers'].get('Referer'), [req0.url])
     # next request avoids Referer header
     echo2 = json.loads(to_unicode(crawler.spider.meta['responses'][2].body))
     self.assertNotIn('Referer', echo2['headers'])
     # last request explicitly sets a Referer header
     echo3 = json.loads(to_unicode(crawler.spider.meta['responses'][3].body))
     self.assertEqual(echo3['headers'].get('Referer'), ['http://example.com'])
Example #2
0
 def to_unicode_dict(self):
     """Return these headers as a CaselessDict mapping unicode keys to
     unicode values; multiple values for a key are joined with ','.
     """
     decoded = CaselessDict()
     for raw_key, raw_values in self.items():
         key = to_unicode(raw_key, encoding=self.encoding)
         decoded[key] = to_unicode(b','.join(raw_values),
                                   encoding=self.encoding)
     return decoded
Example #3
0
 def render_GET(self, request):
     """Twisted resource handler: echo the request's headers and body back
     to the client as a JSON-encoded bytes payload.
     """
     output = {
         'headers': dict(
             (to_unicode(k), [to_unicode(v) for v in vs])
             for k, vs in request.requestHeaders.getAllRawHeaders()),
         'body': to_unicode(request.content.read()),
     }
     return to_bytes(json.dumps(output))
Example #4
0
def scrapy_headers_to_unicode_dict(headers):
    """
    Convert scrapy.http.Headers instance to a dictionary
    suitable for JSON encoding.
    """
    result = {}
    for raw_key, raw_values in headers.items():
        # Header values are lists of bytes; join them with ',' like HTTP does.
        result[to_unicode(raw_key)] = to_unicode(b','.join(raw_values))
    return result
Example #5
0
 def _connect(self, factory):
     """Open a TCP connection for *factory*, using SSL when its scheme is https."""
     host, port = to_unicode(factory.host), factory.port
     if factory.scheme == b'https':
         return reactor.connectSSL(host, port, factory,
                                   self.ClientContextFactory())
     else:
         return reactor.connectTCP(host, port, factory)
Example #6
0
 def assertTwoItemsExported(self, item):
     """Export *item* twice and assert the JSON output contains it twice."""
     self.ie.start_exporting()
     self.ie.export_item(item)
     self.ie.export_item(item)
     self.ie.finish_exporting()
     exported = json.loads(to_unicode(self.output.getvalue()))
     self.assertEqual(exported, [dict(item), dict(item)])
Example #7
0
 def _clientfactory(url, *args, **kwargs):
     """Build a ScrapyHTTPClientFactory for *url*.

     NOTE(review): ``response_transform`` is a free variable from the
     enclosing scope; when falsy the raw response body is returned.
     """
     url = to_unicode(url)
     timeout = kwargs.pop('timeout', 0)
     f = client.ScrapyHTTPClientFactory(
         Request(url, *args, **kwargs), timeout=timeout)
     f.deferred.addCallback(response_transform or (lambda r: r.body))
     return f
Example #8
0
def parse_url(url, encoding=None):
    """Return urlparsed url from the given argument (which could be an already
    parsed url)
    """
    if isinstance(url, ParseResult):
        # Already parsed: hand it back untouched.
        return url
    text = to_unicode(url, encoding)
    return urlparse(text)
Example #9
0
def request_to_dict(request, spider=None):
    """Convert Request object to a dict.

    If a spider is given, it will try to find out the name of the spider method
    used in the callback and store that as the callback.
    """
    # Serialize bound callbacks/errbacks to spider method names so the
    # resulting dict is picklable/JSON-able.
    cb = request.callback
    if callable(cb):
        cb = _find_method(spider, cb)
    eb = request.errback
    if callable(eb):
        eb = _find_method(spider, eb)
    d = {
        'url': to_unicode(request.url),  # urls should be safe (safe_string_url)
        'callback': cb,
        'errback': eb,
        'method': request.method,
        'headers': dict(request.headers),
        'body': request.body,
        'cookies': request.cookies,
        'meta': request.meta,
        '_encoding': request._encoding,
        'priority': request.priority,
        'dont_filter': request.dont_filter,
    }
    return d
Example #10
0
 def to_native_str(text, encoding=None, errors='strict'):
     """ Return str representation of `text`
     (bytes in Python 2.x and unicode in Python 3.x). """
     if six.PY2:
         # On Python 2 the native str type is bytes.
         return to_bytes(text, encoding, errors)
     else:
         return to_unicode(text, encoding, errors)
Example #11
0
 def from_content_disposition(self, content_disposition):
     """Guess the response class from the filename in a Content-Disposition
     header; fall back to the base Response class when none can be parsed.
     """
     try:
         filename = to_unicode(
             content_disposition, encoding='latin-1',
             errors='replace').split(';')[1].split('=')[1].strip('"\'')
         return self.from_filename(filename)
     except IndexError:
         # Header carried no "filename=" part.
         return Response
Example #12
0
 def _debug_set_cookie(self, response, spider):
     """If cookie debugging is on, log all Set-Cookie headers of *response*."""
     if self.debug:
         cl = [to_unicode(c, errors='replace')
               for c in response.headers.getlist('Set-Cookie')]
         if cl:
             cookies = "\n".join(f"Set-Cookie: {c}\n" for c in cl)
             msg = f"Received cookies from: {response}\n{cookies}"
             logger.debug(msg, extra={'spider': spider})
Example #13
0
 def _debug_cookie(self, request, spider):
     """If cookie debugging is on, log all Cookie headers sent with *request*."""
     if self.debug:
         cl = [to_unicode(c, errors='replace')
               for c in request.headers.getlist('Cookie')]
         if cl:
             cookies = "\n".join(f"Cookie: {c}\n" for c in cl)
             msg = f"Sending cookies to: {request}\n{cookies}"
             logger.debug(msg, extra={'spider': spider})
Example #14
0
 def test_nonstring_types_item(self):
     """Non-string field types survive a JSON export round-trip."""
     item = self._get_nonstring_types_item()
     self.ie.start_exporting()
     self.ie.export_item(item)
     self.ie.finish_exporting()
     exported = json.loads(to_unicode(self.output.getvalue()))
     # datetimes come back as strings from JSON
     item['time'] = str(item['time'])
     self.assertEqual(exported, [item])
Example #15
0
 def test_nonstring_types_item(self):
     """Non-string field types survive a JSON export round-trip."""
     item = self._get_nonstring_types_item()
     self.ie.start_exporting()
     self.ie.export_item(item)
     self.ie.finish_exporting()
     exported = json.loads(to_unicode(self.output.getvalue()))
     # datetimes come back as strings from JSON
     item['time'] = str(item['time'])
     self.assertEqual(exported, [item])
Example #16
0
 def _body_inferred_encoding(self):
     """Infer the body encoding from the Content-Type header and the body
     itself, caching both the detected encoding and the decoded body.
     """
     if self._cached_benc is None:
         content_type = to_unicode(self.headers.get(b'Content-Type', b''))
         benc, ubody = html_to_unicode(content_type, self.body,
                                       auto_detect_fun=self._auto_detect_fun,
                                       default_encoding=self._DEFAULT_ENCODING)
         # Cache the decoded body alongside the encoding: both were computed.
         self._cached_benc = benc
         self._cached_ubody = ubody
     return self._cached_benc
Example #17
0
def replace_chars(text,
                  which_ones=('\n', '\t', '\x85', '\x97'),
                  replace_by=u'',
                  encoding=None):
    """Remove escape characters.

    `which_ones` is a tuple of which escape characters we want to remove.
    By default removes ``\\n``, ``\\t``, ``\\x85``, ``\\x97``.

    `replace_by` is the string to replace the escape characters by.
    It defaults to ``''``, meaning the escape characters are removed.

    """

    text = to_unicode(text, encoding)
    # Hoisted out of the loop: the replacement string is loop-invariant, so
    # decode it once instead of once per escape character.
    replacement = to_unicode(replace_by, encoding)
    for ec in which_ones:
        text = text.replace(ec, replacement)
    return text
Example #18
0
 def _connect(self, factory):
     """Open a TCP connection for *factory*; for https, build the client TLS
     context via create_instance so settings-based customization applies.
     """
     host, port = to_unicode(factory.host), factory.port
     if factory.scheme == b'https':
         client_context_factory = create_instance(
             self.ClientContextFactory, settings=self._settings, crawler=None)
         return reactor.connectSSL(
             host, port, factory, client_context_factory)
     else:
         return reactor.connectTCP(host, port, factory)
Example #19
0
 def test_nested_item(self):
     """Items nested inside dicts inside items are exported recursively."""
     i1 = TestItem(name=u'Joseph', age='22')
     i2 = dict(name=u'Maria', age=i1)
     i3 = TestItem(name=u'Jesus', age=i2)
     self.ie.start_exporting()
     self.ie.export_item(i3)
     self.ie.finish_exporting()
     exported = json.loads(to_unicode(self.output.getvalue()))
     self.assertEqual(exported, self._expected_nested)
Example #20
0
 def test_nested_item(self):
     """Items nested inside dicts inside items are exported recursively."""
     i1 = self.item_class(name="Joseph", age="22")
     i2 = dict(name="Maria", age=i1)
     i3 = self.item_class(name="Jesus", age=i2)
     self.ie.start_exporting()
     self.ie.export_item(i3)
     self.ie.finish_exporting()
     exported = json.loads(to_unicode(self.output.getvalue()))
     self.assertEqual(exported, self._expected_nested)
Example #21
0
 def assertTwoItemsExported(self, item):
     """Export *item* twice and assert the JSON output contains it twice
     (compared via ItemAdapter.asdict()).
     """
     self.ie.start_exporting()
     self.ie.export_item(item)
     self.ie.export_item(item)
     self.ie.finish_exporting()
     exported = json.loads(to_unicode(self.output.getvalue()))
     self.assertEqual(
         exported, [ItemAdapter(item).asdict(),
                    ItemAdapter(item).asdict()])
Example #22
0
 def test_nested_item(self):
     """Items nested inside dicts inside items are exported recursively."""
     i1 = TestItem(name=u'Joseph', age='22')
     i2 = dict(name=u'Maria', age=i1)
     i3 = TestItem(name=u'Jesus', age=i2)
     self.ie.start_exporting()
     self.ie.export_item(i3)
     self.ie.finish_exporting()
     exported = json.loads(to_unicode(self.output.getvalue()))
     self.assertEqual(exported, self._expected_nested)
Example #23
0
 def _serialize_dict(self, value, pre=None, field_filter=None):
     """Yield (unicode key, serialized value) pairs for the dict *value*.

     Keys whose joined path (``pre`` + key) appears in *field_filter*
     are skipped.
     NOTE(review): filtering only happens when *pre* is not None, so
     top-level keys are never filtered — confirm this is intended.
     """
     for key, val in value.items():
         k = None
         if field_filter:
             if pre is not None:
                 k = pre_join(pre, key)
                 if k in field_filter:
                     continue
         yield to_unicode(key), self._serialize_value(
             val, pre=k, field_filter=field_filter)
Example #24
0
    def test_unicode_url(self):
        """Unicode response URLs are stored as str, encoded per the response
        encoding (explicit argument or Content-Type charset header).
        """
        # instantiate with unicode url without encoding (should set default encoding)
        resp = self.response_class(u"http://www.example.com/")
        self._assert_response_encoding(resp, self.response_class._DEFAULT_ENCODING)

        # make sure urls are converted to str
        resp = self.response_class(url=u"http://www.example.com/", encoding='utf-8')
        assert isinstance(resp.url, str)

        resp = self.response_class(url=u"http://www.example.com/price/\xa3", encoding='utf-8')
        self.assertEqual(resp.url, to_unicode(b'http://www.example.com/price/\xc2\xa3'))
        resp = self.response_class(url=u"http://www.example.com/price/\xa3", encoding='latin-1')
        self.assertEqual(resp.url, 'http://www.example.com/price/\xa3')
        resp = self.response_class(u"http://www.example.com/price/\xa3",
                                   headers={"Content-type": ["text/html; charset=utf-8"]})
        self.assertEqual(resp.url, to_unicode(b'http://www.example.com/price/\xc2\xa3'))
        resp = self.response_class(u"http://www.example.com/price/\xa3",
                                   headers={"Content-type": ["text/html; charset=iso-8859-1"]})
        self.assertEqual(resp.url, 'http://www.example.com/price/\xa3')
Example #25
0
 def _debug_cookie(self, request, spider):
     """If cookie debugging is on, log all Cookie headers sent with *request*."""
     if self.debug:
         cl = [
             to_unicode(c, errors='replace')
             for c in request.headers.getlist('Cookie')
         ]
         if cl:
             cookies = "\n".join("Cookie: {}\n".format(c) for c in cl)
             msg = "Sending cookies to: {}\n{}".format(request, cookies)
             logger.debug(msg, extra={'spider': spider})
Example #26
0
 def test_nested_dict_item(self):
     """Dicts nested inside items inside dicts are exported recursively."""
     i1 = dict(name="Joseph\xa3", age="22")
     i2 = self.item_class(name="Maria", age=i1)
     i3 = dict(name="Jesus", age=i2)
     self.ie.start_exporting()
     self.ie.export_item(i3)
     self.ie.finish_exporting()
     exported = json.loads(to_unicode(self.output.getvalue()))
     expected = {"name": "Jesus", "age": {"name": "Maria", "age": i1}}
     self.assertEqual(exported, [expected])
Example #27
0
 def test_nested_dict_item(self):
     """Dicts nested inside items inside dicts are exported recursively."""
     i1 = dict(name=u"Joseph\xa3", age="22")
     i2 = TestItem(name=u"Maria", age=i1)
     i3 = dict(name=u"Jesus", age=i2)
     self.ie.start_exporting()
     self.ie.export_item(i3)
     self.ie.finish_exporting()
     exported = json.loads(to_unicode(self.output.getvalue()))
     expected = {"name": u"Jesus", "age": {"name": "Maria", "age": i1}}
     self.assertEqual(exported, [expected])
Example #28
0
 def test_nested_item(self):
     """Items nested three levels deep are exported recursively
     (compared via ItemAdapter.asdict()).
     """
     i1 = self.item_class(name='Joseph\xa3', age='22')
     i2 = self.item_class(name='Maria', age=i1)
     i3 = self.item_class(name='Jesus', age=i2)
     self.ie.start_exporting()
     self.ie.export_item(i3)
     self.ie.finish_exporting()
     exported = json.loads(to_unicode(self.output.getvalue()))
     expected = {'name': 'Jesus', 'age': {'name': 'Maria', 'age': ItemAdapter(i1).asdict()}}
     self.assertEqual(exported, [expected])
Example #29
0
    def proc(self, *new_args, **popen_kwargs):
        """Run ``python -m scrapy.cmdline <new_args>`` as a subprocess.

        A 15-second watchdog kills the process (failing the test) if it
        hangs. Returns (process, stdout, stderr) with the streams decoded
        to unicode.
        """
        args = (sys.executable, '-m', 'scrapy.cmdline') + new_args
        p = subprocess.Popen(args, cwd=self.cwd, env=self.env,
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                             **popen_kwargs)

        def kill_proc():
            p.kill()
            p.communicate()
            assert False, 'Command took too much time to complete'

        timer = Timer(15, kill_proc)
        try:
            timer.start()
            stdout, stderr = p.communicate()
        finally:
            timer.cancel()

        return p, to_unicode(stdout), to_unicode(stderr)
Example #30
0
 def response_to_dict(response):
     """Convert a Response object to a plain dict (the request is serialized
     via request_to_dict).
     """
     d = {
         'url': to_unicode(response.url),
         'status': int(response.status),
         'headers': dict(response.headers),
         'body': response.body,
         'flags': list(response.flags),
         'request': request_to_dict(response.request)
     }
     return d
Example #31
0
 def _build_response(self, body, request):
     """Assemble a Response of the appropriate class from the parsed HTTP
     status, headers and body, recording the download latency in meta.
     """
     request.meta['download_latency'] = self.headers_time - self.start_time
     status = int(self.status)
     headers = Headers(self.response_headers)
     # Pick the Response subclass based on headers/URL (html, text, ...).
     respcls = responsetypes.from_args(headers=headers, url=self._url)
     return respcls(url=self._url,
                    status=status,
                    headers=headers,
                    body=body,
                    protocol=to_unicode(self.version))
Example #32
0
 def test_nested_dict_item(self):
     """Dicts nested inside items inside dicts are exported recursively."""
     i1 = dict(name=u'Joseph\xa3', age='22')
     i2 = TestItem(name=u'Maria', age=i1)
     i3 = dict(name=u'Jesus', age=i2)
     self.ie.start_exporting()
     self.ie.export_item(i3)
     self.ie.finish_exporting()
     exported = json.loads(to_unicode(self.output.getvalue()))
     expected = {'name': u'Jesus', 'age': {'name': 'Maria', 'age': i1}}
     self.assertEqual(exported, [expected])
Example #33
0
 def test_nested_dict_item(self):
     """Dicts nested inside items inside dicts are exported recursively."""
     i1 = dict(name=u'Joseph\xa3', age='22')
     i2 = TestItem(name=u'Maria', age=i1)
     i3 = dict(name=u'Jesus', age=i2)
     self.ie.start_exporting()
     self.ie.export_item(i3)
     self.ie.finish_exporting()
     exported = json.loads(to_unicode(self.output.getvalue()))
     expected = {'name': u'Jesus', 'age': {'name': 'Maria', 'age': i1}}
     self.assertEqual(exported, [expected])
Example #34
0
 def assertExportedJsonLines(self, items, rows, settings=None):
     """Export *items* via a JSON-lines feed and compare against *rows*
     (empty values are dropped from the expectation first).
     """
     settings = settings or {}
     settings.update({
         'FEEDS': {
             self._random_temp_filename(): {'format': 'jl'},
         },
     })
     data = yield self.exported_data(items, settings)
     parsed = [json.loads(to_unicode(line)) for line in data['jl'].splitlines()]
     rows = [{k: v for k, v in row.items() if v} for row in rows]
     self.assertEqual(rows, parsed)
Example #35
0
 def _serialize_value(self, value):
     """Recursively serialize *value*: items are re-exported, dicts and
     list-likes recurse, and scalars are coerced to bytes or unicode
     depending on the exporter's binary mode.
     """
     if isinstance(value, BaseItem):
         return self.export_item(value)
     if isinstance(value, dict):
         return dict(self._serialize_dict(value))
     if is_listlike(value):
         return [self._serialize_value(v) for v in value]
     if self.binary:
         return to_bytes(value, encoding=self.encoding)
     else:
         return to_unicode(value, encoding=self.encoding)
Example #36
0
 def push(self, request):
     """Serialize *request* (url plus callback/errback method names) and
     RPUSH it onto the Redis list at self.key.
     """
     url = request.url
     cb = request.callback
     if callable(cb):
         cb = _find_method(self.spider, cb)
     eb = request.errback
     if callable(eb):
         eb = _find_method(self.spider, eb)
     d = {'url': to_unicode(url), 'callback': cb, 'errback': eb}
     d = self.serializer.dumps(d)
     self.server.rpush(self.key, d)
Example #37
0
def parse_data(data):
    """Recursively convert *data* into JSON-serializable builtins.

    dicts/Items recurse over their entries, lists recurse per element,
    bytes are decoded to unicode, datetimes become ISO-format strings,
    ints/floats pass through, and anything else is stringified.
    """
    if isinstance(data, (dict, scrapy.Item)):
        return {parse_data(k): parse_data(v) for k, v in data.items()}
    elif isinstance(data, list):
        return [parse_data(x) for x in data]
    elif isinstance(data, bytes):
        return to_unicode(data)
    elif isinstance(data, datetime):
        return data.isoformat()
    elif isinstance(data, (int, float)):
        return data
    return str(data)
Example #38
0
 def _connect(self, factory):
     """Open a TCP connection for *factory*; for https, build the client TLS
     context via create_instance so crawler/settings customization applies.
     """
     from twisted.internet import reactor
     host, port = to_unicode(factory.host), factory.port
     if factory.scheme == b'https':
         client_context_factory = create_instance(
             objcls=self.ClientContextFactory,
             settings=self._settings,
             crawler=self._crawler,
         )
         return reactor.connectSSL(host, port, factory, client_context_factory)
     else:
         return reactor.connectTCP(host, port, factory)
 def _serialize_value(self, value):
     """Serialize *value*, falling back to base64 for bytes that are not
     valid text (only when self.ensure_base64 is enabled).
     """
     try:
         if isinstance(value, dict):
             return dict(self._serialize_dict(value))
         value = super(TextDictKeyPythonItemExporter,
                       self)._serialize_value(value)
     except UnicodeDecodeError as e:
         # Undecodable bytes: emit their base64 representation instead.
         if self.ensure_base64 and isinstance(value, bytes):
             value = to_unicode(base64.encodebytes(value))
         else:
             raise e
     return value
Example #40
0
    def _get_agent(self, request, timeout):
        """Return the Twisted Agent to use for *request*.

        With a proxy set in request.meta: an HTTPS request (unless the
        deprecated '?noconnect' mode is requested) goes through a
        CONNECT-tunneling agent; everything else goes through a plain
        proxy agent. Without a proxy, a direct agent is returned.
        """
        from twisted.internet import reactor
        bindaddress = request.meta.get('bindaddress') or self._bindAddress
        proxy = request.meta.get('proxy')
        if proxy:
            proxyScheme, proxyNetloc, proxyHost, proxyPort, proxyParams = _parse(
                proxy)
            scheme = _parse(request.url)[0]
            proxyHost = to_unicode(proxyHost)
            omitConnectTunnel = b'noconnect' in proxyParams
            if omitConnectTunnel:
                warnings.warn(
                    "Using HTTPS proxies in the noconnect mode is deprecated. "
                    "If you use Zyte Smart Proxy Manager, it doesn't require "
                    "this mode anymore, so you should update scrapy-crawlera "
                    "to scrapy-zyte-smartproxy and remove '?noconnect' "
                    "from the Zyte Smart Proxy Manager URL.",
                    ScrapyDeprecationWarning,
                )
            if scheme == b'https' and not omitConnectTunnel:
                # HTTPS through a proxy: tunnel via CONNECT, forwarding any
                # Proxy-Authorization credentials.
                proxyAuth = request.headers.get(b'Proxy-Authorization', None)
                proxyConf = (proxyHost, proxyPort, proxyAuth)
                return self._TunnelingAgent(
                    reactor=reactor,
                    proxyConf=proxyConf,
                    contextFactory=self._contextFactory,
                    connectTimeout=timeout,
                    bindAddress=bindaddress,
                    pool=self._pool,
                )
            else:
                # Plain proxying: rebuild the proxy URI without credentials.
                proxyScheme = proxyScheme or b'http'
                proxyHost = to_bytes(proxyHost, encoding='ascii')
                proxyPort = to_bytes(str(proxyPort), encoding='ascii')
                proxyURI = urlunparse(
                    (proxyScheme, proxyNetloc, proxyParams, '', '', ''))
                return self._ProxyAgent(
                    reactor=reactor,
                    proxyURI=to_bytes(proxyURI, encoding='ascii'),
                    connectTimeout=timeout,
                    bindAddress=bindaddress,
                    pool=self._pool,
                )

        # No proxy configured: connect directly.
        return self._Agent(
            reactor=reactor,
            contextFactory=self._contextFactory,
            connectTimeout=timeout,
            bindAddress=bindaddress,
            pool=self._pool,
        )
Example #41
0
    def export_item(self, item):
        """Write *item* as an HTML table row, emitting the header row first
        on the initial call.
        """
        if self._headers_not_written:
            self._headers_not_written = False
            self._write_headers_and_set_fields_to_export(item)

        # Empty fields are included so every row has the same column count.
        fields = self._get_serialized_fields(
            item,
            default_value='',
            include_empty=True)

        values = list(self._build_row(x for _, x in fields))

        self.stream.write(to_unicode(
            serialize_html_table_row(values),
            self.encoding))
Example #42
0
    def test_jsonrpc_client_call_request(self):
        """jsonrpc_client_call sends a well-formed JSON-RPC 2.0 request via
        urllib (urlopen is patched to capture the outgoing call).
        """
        sentcall = {}

        def _urlopen(url, data):
            sentcall['url'] = url
            sentcall['data'] = data
            return _umock(1)

        with patch.object(urllib.request, 'urlopen', _urlopen):
            jsonrpc_client_call('url', 'test', 'one', 2)
            req = json.loads(to_unicode(sentcall['data']))
            assert 'id' in req
            self.assertEqual(sentcall['url'], 'url')
            self.assertEqual(req['jsonrpc'], '2.0')
            self.assertEqual(req['method'], 'test')
            self.assertEqual(req['params'], ['one', 2])
Example #43
0
def decode_robotstxt(robotstxt_body, spider, to_native_str_type=False):
    """Decode a robots.txt body to unicode.

    Non-UTF-8 (or otherwise undecodable) content is treated as an empty
    file — i.e. 'allow all' — with a warning logged.
    """
    try:
        if to_native_str_type:
            robotstxt_body = to_unicode(robotstxt_body)
        else:
            robotstxt_body = robotstxt_body.decode('utf-8')
    except UnicodeDecodeError:
        # If we found garbage or robots.txt in an encoding other than UTF-8, disregard it.
        # Switch to 'allow all' state.
        logger.warning(
            "Failure while parsing robots.txt. "
            "File either contains garbage or is in an encoding other than UTF-8, treating it as an empty file.",
            exc_info=sys.exc_info(),
            extra={'spider': spider})
        robotstxt_body = ''
    return robotstxt_body
Example #44
0
 def assertExportedMultiple(self, items, rows, settings=None):
     """Export *items* to XML and JSON feeds simultaneously and check both
     outputs against *rows* (empty values dropped first).
     """
     settings = settings or {}
     settings.update({
         'FEEDS': {
             self._random_temp_filename(): {'format': 'xml'},
             self._random_temp_filename(): {'format': 'json'},
         },
     })
     data = yield self.exported_data(items, settings)
     rows = [{k: v for k, v in row.items() if v} for row in rows]
     # XML
     root = lxml.etree.fromstring(data['xml'])
     xml_rows = [{e.tag: e.text for e in it} for it in root.findall('item')]
     self.assertEqual(rows, xml_rows)
     # JSON
     json_rows = json.loads(to_unicode(data['json']))
     self.assertEqual(rows, json_rows)
Example #45
0
    def assertExportedCsv(self, items, header, rows, settings=None, ordered=True):
        """Export *items* via a CSV feed and compare header and rows;
        when *ordered* is False, header column order is ignored.
        """
        settings = settings or {}
        settings.update({
            'FEEDS': {
                self._random_temp_filename(): {'format': 'csv'},
            },
        })
        data = yield self.exported_data(items, settings)

        reader = csv.DictReader(to_unicode(data['csv']).splitlines())
        got_rows = list(reader)
        if ordered:
            self.assertEqual(reader.fieldnames, header)
        else:
            self.assertEqual(set(reader.fieldnames), set(header))

        self.assertEqual(rows, got_rows)
Example #46
0
    def _extract_links(self, response_text, response_url, response_encoding, base_url=None):
        """ Do the real extraction work """
        # NOTE(review): uses the Python 2 `unicode` builtin — this code path
        # is Python-2 only.
        self.reset()
        self.feed(response_text)
        self.close()

        ret = []
        if base_url is None:
            base_url = urljoin(response_url, self.base_url) if self.base_url else response_url
        for link in self.links:
            if isinstance(link.url, unicode):
                link.url = link.url.encode(response_encoding)
            # Resolve relative URLs against the page's base and sanitize.
            link.url = urljoin(base_url, link.url)
            link.url = safe_url_string(link.url, response_encoding)
            link.text = to_unicode(link.text, response_encoding, errors='replace').strip()
            ret.append(link)

        return ret
Example #47
0
    def _get_agent(self, request, timeout):
        """Return the Twisted Agent for *request*: a CONNECT-tunneling agent
        for HTTPS through a proxy (unless 'noconnect' is set), a plain proxy
        agent otherwise, or a direct agent when no proxy is configured.
        """
        bindaddress = request.meta.get('bindaddress') or self._bindAddress
        proxy = request.meta.get('proxy')
        if proxy:
            _, _, proxyHost, proxyPort, proxyParams = _parse(proxy)
            scheme = _parse(request.url)[0]
            proxyHost = to_unicode(proxyHost)
            omitConnectTunnel = b'noconnect' in proxyParams
            if  scheme == b'https' and not omitConnectTunnel:
                proxyConf = (proxyHost, proxyPort,
                             request.headers.get(b'Proxy-Authorization', None))
                return self._TunnelingAgent(reactor, proxyConf,
                    contextFactory=self._contextFactory, connectTimeout=timeout,
                    bindAddress=bindaddress, pool=self._pool)
            else:
                return self._ProxyAgent(reactor, proxyURI=to_bytes(proxy, encoding='ascii'),
                    connectTimeout=timeout, bindAddress=bindaddress, pool=self._pool)

        return self._Agent(reactor, contextFactory=self._contextFactory,
            connectTimeout=timeout, bindAddress=bindaddress, pool=self._pool)
Example #48
0
def parse_x_splash_saved_arguments_header(value):
    """
    Parse X-Splash-Saved-Arguments header value.

    >>> value = u"name1=9a6747fc6259aa374ab4e1bb03074b6ec672cf99;name2=ba001160ef96fe2a3f938fea9e6762e204a562b3"
    >>> dct = parse_x_splash_saved_arguments_header(value)
    >>> sorted(list(dct.keys()))
    ['name1', 'name2']
    >>> dct['name1']
    '9a6747fc6259aa374ab4e1bb03074b6ec672cf99'
    >>> dct['name2']
    'ba001160ef96fe2a3f938fea9e6762e204a562b3'

    Binary header values are also supported:
    >>> dct2 = parse_x_splash_saved_arguments_header(value.encode('utf8'))
    >>> dct2 == dct
    True
    """
    value = to_unicode(value)
    # Each ';'-separated entry is "name=sha1"; split on the first '=' only.
    pairs = (entry.split('=', 1) for entry in value.split(";"))
    return dict(pairs)
Example #49
0
 def request_to_dict(self, request):
     '''
     Convert Request object to a dict.
     modified from scrapy.utils.reqser
     '''
     req_dict = {
         # urls should be safe (safe_string_url)
         'url': to_unicode(request.url),
         'method': request.method,
         'headers': dict(request.headers),
         'body': request.body,
         'cookies': request.cookies,
         'meta': request.meta,
         '_encoding': request._encoding,
         'priority': request.priority,
         'dont_filter': request.dont_filter,
          #  callback/errback are assumed to be a bound instance of the spider
         'callback': None if request.callback is None else request.callback.__name__,
         'errback': None if request.errback is None else request.errback.__name__,
     }
     return req_dict
Example #50
0
def extract_regex(regex, text, encoding="utf-8"):
    """Extract a list of unicode strings from the given text/encoding using the following policies:

    * if the regex contains a named group called "extract" that will be returned
    * if the regex contains multiple numbered groups, all those will be returned (flattened)
    * if the regex doesn't contain any group the entire regex matching is returned
    """

    if isinstance(regex, six.string_types):
        regex = re.compile(regex, re.UNICODE)

    try:
        strings = [regex.search(text).group("extract")]  # named group
    except (AttributeError, IndexError):
        # AttributeError: no match at all (search() returned None).
        # IndexError: the pattern has no group named "extract".
        # FIX: the original bare `except:` also swallowed KeyboardInterrupt,
        # SystemExit and genuine bugs.
        strings = regex.findall(text)  # full regex or numbered groups
    strings = flatten(strings)

    if isinstance(text, six.text_type):
        return [replace_entities(s, keep=["lt", "amp"]) for s in strings]
    else:
        return [replace_entities(to_unicode(s, encoding), keep=["lt", "amp"]) for s in strings]
Example #51
0
 def _getrow(csv_r):
     # Pull the next row from the csv reader and decode every cell.
     # ("encoding" is a free variable from the enclosing scope.)
     return [to_unicode(field, encoding) for field in next(csv_r)]
Example #52
0
 def assertCsvEqual(self, first, second, msg=None):
     """Assert two CSV documents are equal, ignoring column order within
     each line.
     """
     first = to_unicode(first)
     second = to_unicode(second)

     # FIX: was a lambda bound to a name (PEP 8 E731) whose parameter
     # `csv` shadowed the csv module; use a local def with a clear name.
     def csvsplit(text):
         # Split each line on commas/whitespace and sort the tokens so
         # column order does not affect the comparison.
         return [sorted(re.split(r'(,|\s+)', line))
                 for line in text.splitlines(True)]

     return self.assertEqual(csvsplit(first), csvsplit(second), msg)
Example #53
0
 def _check_output(self):
     # Exported CSV should have the headers and keep the unicode pound sign.
     self.assertCsvEqual(to_unicode(self.output.getvalue()), u'age,name\r\n22,John\xa3\r\n')
Example #54
0
 def test_errors_argument(self):
     """errors='replace' maps undecodable bytes to U+FFFD."""
     self.assertEqual(
         to_unicode(b'a\xedb', 'utf-8', errors='replace'),
         u'a\ufffdb'
     )
Example #55
0
 def _assert_expected_item(self, exported_dict):
     """Decode every exported value to unicode, then compare with self.i."""
     for k, v in exported_dict.items():
         exported_dict[k] = to_unicode(v)
     self.assertEqual(self.i, exported_dict)
Example #56
0
 def _check_output(self):
     # Exported JSON should round-trip to a one-element list with the item.
     exported = json.loads(to_unicode(self.output.getvalue().strip()))
     self.assertEqual(exported, [dict(self.i)])
Example #57
0
def rfc1123_to_epoch(date_str):
    """Parse an RFC 1123 date (str or bytes) into a Unix timestamp,
    returning None when the value is missing or malformed.
    """
    try:
        date_str = to_unicode(date_str, encoding='ascii')
        return mktime_tz(parsedate_tz(date_str))
    except Exception:
        # Deliberately broad: any undecodable/unparsable date yields None.
        return None
Example #58
0
 def _check_Encoding(self, response, original_body):
     """Verify the Content-Encoding header and that the body decodes back
     to the original text with that encoding.
     """
     content_encoding = to_unicode(response.headers[b'Content-Encoding'])
     # FIX: assertEquals is a deprecated alias removed in Python 3.12;
     # use assertEqual.
     self.assertEqual(content_encoding, EncodingResource.out_encoding)
     self.assertEqual(
         response.body.decode(content_encoding), to_unicode(original_body))
Example #59
0
 def testFactoryInfo(self):
     """Connect the client factory to the test server and verify its
     metadata via the _cbFactoryInfo callback.
     """
     url = self.getURL('file')
     _, _, host, port, _ = client._parse(url)
     factory = client.ScrapyHTTPClientFactory(Request(url))
     reactor.connectTCP(to_unicode(host), port, factory)
     return factory.deferred.addCallback(self._cbFactoryInfo, factory)
Example #60
0
 def render(self, request):
     """Echo the request body back, re-encoded as self.out_encoding and
     advertised via the content-encoding header.
     """
     body = to_unicode(request.content.read())
     request.setHeader(b'content-encoding', self.out_encoding)
     return body.encode(self.out_encoding)