def uri_to_iri(uri, charset='utf-8', errors='replace'):
    r"""Converts a URI in a given charset to a IRI.

    Examples for URI versus IRI

    >>> uri_to_iri('http://xn--n3h.net/')
    u'http://\u2603.net/'

    >>> uri_to_iri('http://%C3%BCser:p%C3%A4ssword@xn--n3h.net/p%C3%A5th')
    u'http://\xfcser:p\xe4ssword@\u2603.net/p\xe5th'

    Query strings are left unchanged:

    >>> uri_to_iri('/?foo=24&x=%26%2f')
    u'/?foo=24&x=%26%2f'

    .. versionadded:: 0.6

    :param uri: the URI to convert
    :param charset: the charset of the URI
    :param errors: the error handling on decode
    """
    # Normalize the URI first so splitting sees well-formed percent escapes.
    uri = url_fix(str(uri), charset)
    scheme, auth, hostname, port, path, query, fragment = _uri_split(uri)
    # The scheme is ASCII by definition; decode it with the caller's
    # error handling.
    scheme = _decode_unicode(scheme, 'ascii', errors)
    try:
        hostname = hostname.decode('idna')
    except UnicodeError:
        # dammit, that codec raised an error.  Because it does not support
        # any error handling we have to fake it....  badly
        if errors not in ('ignore', 'replace'):
            raise
        hostname = hostname.decode('ascii', errors)
    if auth:
        # The userinfo component may carry a ``user:password`` pair;
        # decode the two halves separately so the ':' separator survives.
        if ':' in auth:
            auth, password = auth.split(':', 1)
        else:
            password = None
        auth = _decode_unicode(_unquote(auth), charset, errors)
        if password:
            auth += u':' + _decode_unicode(_unquote(password),
                                           charset, errors)
        hostname = auth + u'@' + hostname
    if port:
        # port should be numeric, but you never know...
        hostname += u':' + port.decode(charset, errors)
    # Unquote path and query but keep their structural delimiters
    # ('/', ';', '?', and the query sub-delimiters) percent-encoded
    # boundaries intact.
    path = _decode_unicode(_unquote(path, '/;?'), charset, errors)
    query = _decode_unicode(_unquote(query, ';/?:@&=+,$'),
                            charset, errors)
    return urlparse.urlunsplit([scheme, hostname, path, query, fragment])
def from_file(cls, file, charset='utf-8', errors='strict',
              unicode_mode=True):
    """Load a template from a file.

    .. versionchanged:: 0.5
       The encoding parameter was renamed to charset.

    :param file: a filename or file object to load the template from.
    :param charset: the charset of the template to load.
    :param errors: the error behavior of the charset decoding.
    :param unicode_mode: set to `False` to disable unicode mode.
    :return: a template
    """
    if isinstance(file, basestring):
        # We opened the file ourselves, so we are responsible for
        # closing it again.
        f = open(file, 'r')
        try:
            data = _decode_unicode(f.read(), charset, errors)
        finally:
            f.close()
    else:
        # A file-like object was passed in; leave closing to the caller.
        f = file
        data = _decode_unicode(f.read(), charset, errors)
    name = getattr(f, 'name', '<template>')
    return cls(data, name, charset, errors, unicode_mode)
def start_file_streaming(self, filename, headers, total_content_length):
    """Set up a writable container for an uploaded file part.

    Decodes and sanitizes the filename, then asks the stream factory
    for a stream the part's payload can be written into.

    :param filename: the raw filename from the part headers.
    :param headers: the (lower-cased, hyphenated) part headers.
    :param total_content_length: the content length of the whole request.
    :return: a ``(filename, container)`` tuple where `container` is the
             writable stream produced by :attr:`stream_factory`.
    """
    filename = _decode_unicode(filename, self.charset, self.errors)
    filename = self._fix_ie_filename(filename)
    # BUG FIX: headers are keyed with hyphens ('content-length',
    # 'content-disposition' elsewhere in this file); the previous
    # 'content_type' lookup always returned None.
    content_type = headers.get('content-type')
    try:
        content_length = int(headers['content-length'])
    except (KeyError, ValueError):
        # missing or malformed length: fall back to zero rather than fail
        content_length = 0
    container = self.stream_factory(total_content_length, content_type,
                                    filename, content_length)
    return filename, container
def __init__(self, source, filename='<template>', charset='utf-8',
             errors='strict', unicode_mode=True):
    """Compile *source* into a template.

    :param source: the template source as a byte or unicode string.
    :param filename: the name used in error messages and code objects.
    :param charset: charset used to decode byte-string sources.
    :param errors: error handling for the charset decoding.
    :param unicode_mode: set to `False` to disable unicode mode.
    """
    # Byte strings are decoded first; filenames must be byte strings
    # for the code generator.
    if isinstance(source, str):
        source = _decode_unicode(source, charset, errors)
    if isinstance(filename, unicode):
        filename = filename.encode('utf-8')
    # Normalize all line endings to '\n' before tokenizing.
    normalized = u'\n'.join(source.splitlines())
    node = Parser(tokenize(normalized, filename), filename).parse()
    self.code = TemplateCodeGenerator(node, filename).getCode()
    self.filename = filename
    self.charset = charset
    self.errors = errors
    self.unicode_mode = unicode_mode
def url_unquote(s, charset='utf-8', errors='replace'):
    """URL decode a single string with a given decoding.

    Per default encoding errors are ignored.  If you want a different
    behavior you can set `errors` to ``'replace'`` or ``'strict'``.
    In strict mode a `HTTPUnicodeError` is raised.

    :param s: the string to unquote.
    :param charset: the charset to be used.
    :param errors: the error handling for the charset decoding.
    """
    # Work on bytes: encode unicode input first so percent escapes
    # resolve to raw bytes before decoding.
    if isinstance(s, unicode):
        s = s.encode(charset)
    unquoted = _unquote(s)
    return _decode_unicode(unquoted, charset, errors)
def _url_decode_impl(pair_iter, charset, decode_keys, include_empty, errors):
    """Yield decoded ``(key, value)`` pairs from raw query-string pairs.

    Pairs without an ``'='`` are skipped unless `include_empty` is true,
    in which case they yield an empty value.  Keys are only decoded to
    unicode when `decode_keys` is set.
    """
    for raw in pair_iter:
        if not raw:
            continue
        key, eq, value = raw.partition('=')
        if not eq:
            # no '=' at all: the whole pair is the key
            if not include_empty:
                continue
            value = ''
        key = _unquote_plus(key)
        if decode_keys:
            key = _decode_unicode(key, charset, errors)
        yield key, url_unquote_plus(value, charset, errors)
def parse_cookie(header, charset='utf-8', errors='replace', cls=None):
    """Parse a cookie.  Either from a string or WSGI environ.

    Per default encoding errors are ignored.  If you want a different
    behavior you can set `errors` to ``'replace'`` or ``'strict'``.
    In strict mode a :exc:`HTTPUnicodeError` is raised.

    .. versionchanged:: 0.5
       This function now returns a :class:`TypeConversionDict` instead
       of a regular dict.  The `cls` parameter was added.

    :param header: the header to be used to parse the cookie.
                   Alternatively this can be a WSGI environment.
    :param charset: the charset for the cookie values.
    :param errors: the error behavior for the charset decoding.
    :param cls: an optional dict class to use.  If this is not specified
                or `None` the default :class:`TypeConversionDict` is used.
    """
    # A dict is treated as a WSGI environ and the raw header extracted.
    if isinstance(header, dict):
        header = header.get('HTTP_COOKIE', '')
    if cls is None:
        cls = TypeConversionDict
    jar = _ExtendedCookie()
    jar.load(header)
    # The extended morsel/cookie classes turn CookieErrors into `None`
    # values; skip those broken items while decoding the rest.
    decoded = {}
    for key, morsel in jar.iteritems():
        if morsel.value is None:
            continue
        decoded[key] = _decode_unicode(
            unquote_header_value(morsel.value), charset, errors)
    return cls(decoded)
def parse(self, file, boundary, content_length):
    # Parse a multipart body: returns ``(form, files)`` mappings built
    # from the parts delimited by *boundary*.
    next_part = '--' + boundary
    last_part = next_part + '--'
    form = []
    files = []
    in_memory = 0

    # Append a sentinel empty-string iterator so running off the end of
    # the input is detectable inside the loop below.
    iterator = chain(make_line_iter(file, limit=content_length,
                                    buffer_size=self.buffer_size),
                     _empty_string_iter)

    terminator = self._find_terminator(iterator)
    if terminator != next_part:
        self.fail('Expected boundary at start of multipart data')

    while terminator != last_part:
        headers = parse_multipart_headers(iterator)
        disposition = headers.get('content-disposition')
        if disposition is None:
            self.fail('Missing Content-Disposition header')
        disposition, extra = parse_options_header(disposition)
        transfer_encoding = self.get_part_encoding(headers)
        name = extra.get('name')
        filename = extra.get('filename')
        part_charset = self.get_part_charset(headers)

        # if no content type is given we stream into memory.  A list is
        # used as a temporary container.
        if filename is None:
            is_file = False
            container = []
            _write = container.append
            guard_memory = self.max_form_memory_size is not None

        # otherwise we parse the rest of the headers and ask the stream
        # factory for something we can write in.
        else:
            is_file = True
            guard_memory = False
            filename, container = self.start_file_streaming(
                filename, headers, content_length)
            _write = container.write

        buf = ''
        for line in iterator:
            if not line:
                # hit the sentinel: the input ended before a boundary
                self.fail('unexpected end of stream')

            if line[:2] == '--':
                terminator = line.rstrip()
                if terminator in (next_part, last_part):
                    break

            if transfer_encoding is not None:
                try:
                    line = line.decode(transfer_encoding)
                except Exception:
                    self.fail('could not decode transfer encoded chunk')

            # we have something in the buffer from the last iteration.
            # this is usually a newline delimiter.
            if buf:
                _write(buf)
                buf = ''

            # If the line ends with windows CRLF we write everything except
            # the last two bytes.  In all other cases however we write
            # everything except the last byte.  If it was a newline, that's
            # fine, otherwise it does not matter because we will write it
            # the next iteration.  this ensures we do not write the
            # final newline into the stream.  That way we do not have to
            # truncate the stream.  However we do have to make sure that
            # if something else than a newline is in there we write it
            # out.
            if line[-2:] == '\r\n':
                buf = '\r\n'
                cutoff = -2
            else:
                buf = line[-1]
                cutoff = -1
            _write(line[:cutoff])

            # if we write into memory and there is a memory size limit we
            # count the number of bytes in memory and raise an exception if
            # there is too much data in memory.
            if guard_memory:
                in_memory += len(line)
                if in_memory > self.max_form_memory_size:
                    self.in_memory_threshold_reached(in_memory)
        else:  # pragma: no cover
            # the for-loop ran off the iterator without ever breaking on
            # a boundary line
            raise ValueError('unexpected end of part')

        # if we have a leftover in the buffer that is not a newline
        # character we have to flush it, otherwise we will chop of
        # certain values.
        if buf not in ('', '\r', '\n', '\r\n'):
            _write(buf)

        if is_file:
            container.seek(0)
            files.append((name, FileStorage(container, filename, name,
                                            headers=headers)))
        else:
            form.append((name, _decode_unicode(''.join(container),
                                               part_charset, self.errors)))

    return self.cls(form), self.cls(files)
def to_unicode(self, value):
    """Coerce *value* to unicode, decoding byte strings with the
    configured charset and error handling."""
    if not isinstance(value, str):
        return unicode(value)
    return _decode_unicode(value, self.charset, self.errors)