def get_output_message_id(self, source, created_timestamp,
                          output_data_body, **processed_data):
    """
    Compute the id (aka `rid`) of the output message.

    Kwargs:
        `source`:
            The source specification string (based on pattern:
            '<source label>.<source channel>').
        `created_timestamp`:
            Message creation timestamp as an int number.
        `output_data_body`:
            The output AMQP message body (bytes) as returned by
            the get_output_data_body() method.
        <some keyword arguments>:
            Processed data (as returned by the process_input_data()
            method) passed as keyword arguments (this default
            implementation ignores them).

    Returns:
        The output message id (a string): the hexadecimal MD5 digest
        of `source`, the decimal form of `created_timestamp` and
        `output_data_body` -- joined (as bytes) with NUL bytes.

    Typically, this method is used indirectly -- being called in
    get_output_prop_kwargs() (which is called in
    get_output_components()).

    The default implementation of this method should be sufficient
    in most cases.
    """
    source_part = as_bytes(source)
    timestamp_part = as_bytes('{0:d}'.format(created_timestamp))
    hashed_bytes = b'\0'.join((source_part, timestamp_part, output_data_body))
    # MD5 serves here only as an identifier-making function (not as
    # a security measure) -- hence `usedforsecurity=False`.
    return hashlib.md5(hashed_bytes, usedforsecurity=False).hexdigest()
def generate_secret_key_qr_code_url(secret_key, login, issuer_name):
    # type: (String, String, String) -> String
    """
    Make the provisioning URI for the given secret key.

    Args:
        `secret_key`:
            The secret key, passed to make_totp_handler().
        `login`:
            The account name; coerced with as_unicode().
        `issuer_name`:
            The issuer name; coerced with as_unicode().

    Returns:
        The result of calling provisioning_uri() on the handler
        obtained from make_totp_handler().

    When run under Python 2, both names are additionally converted
    with as_bytes() before being passed on.
    """
    account_name = as_unicode(login)
    issuer = as_unicode(issuer_name)
    if sys.version_info[0] < 3:                                          #3--
        account_name = as_bytes(account_name)                            #3--
        issuer = as_bytes(issuer)                                        #3--
    totp_handler = make_totp_handler(secret_key)
    return totp_handler.provisioning_uri(name=account_name,
                                         issuer_name=issuer)
def _deterministic_conv_to_bytes(self, value):
    """
    Convert `value` to `bytes` in a deterministic way.

    Args:
        `value`:
            One of:
            * a dict whose keys are `str`/`bytes` objects and whose
              values are either `int` objects or instances of the
              types listed in `CONVERTIBLE_TO_BYTES_TYPES`;
            * an `int`;
            * an instance of any of the types listed in
              `CONVERTIBLE_TO_BYTES_TYPES`.

    Returns:
        A `bytes` object:
        * for a dict -- b'{<key>: <val>, ...}', where items are
          sorted (after conversion) and each key, as well as each
          non-int value, is rendered with
          _py2_bytestring_like_repr();
        * for an int -- its decimal representation;
        * otherwise -- the result of as_bytes().

    Raises:
        TypeError -- for a dict key or a value of an illegal type.
    """
    # NOTE: this method takes part in generating ids of output
    # messages -- presumably its results need to stay byte-for-byte
    # stable for data that are already stored (verify against the
    # callers before changing anything here).
    CONVERTIBLE_TO_BYTES_TYPES = str, bytes, bytearray, memoryview, SupportsBytes
    if isinstance(value, dict):
        # Keyed by the *converted* key: if two keys convert to the
        # same bytes, the item processed later replaces the earlier.
        converted_key_to_val = {}
        for k, v in value.items():
            if not isinstance(k, (str, bytes)):
                raise TypeError(
                    'dict {!a} contains a non-str-or-bytes key ({!a})'.
                    format(value, k))
            k = self._py2_bytestring_like_repr(k).encode('ascii')
            if isinstance(v, int):
                # Ints are rendered as plain decimal digits (no
                # repr-like quoting).
                v = b'%d' % v
            else:
                if not isinstance(v, CONVERTIBLE_TO_BYTES_TYPES):
                    raise TypeError('dict {!a} contains a value ({!a}) '
                                    'whose type ({!a}) is illegal'.format(
                                        value, v, type(v)))
                v = self._py2_bytestring_like_repr(v).encode('ascii')
            assert isinstance(k, bytes)
            assert isinstance(v, bytes)
            converted_key_to_val[k] = v
        # Sorting makes the result independent of the dict's
        # iteration order.
        value = b'{%b}' % b', '.join(
            b'%b: %b' % (k, v)
            for k, v in sorted(converted_key_to_val.items()))
    elif isinstance(value, int):
        value = b'%d' % value
    else:
        if not isinstance(value, CONVERTIBLE_TO_BYTES_TYPES):
            # NOTE(review): the message below ends with a stray `)`;
            # left as is, because it is a runtime string.
            raise TypeError('encountered a value ({!a}) '
                            'whose type ({!a}) is illegal)'.format(
                                value, type(value)))
        value = as_bytes(value)
    assert isinstance(value, bytes)
    return value
def get_output_message_id(self, parsed): """ Make the id of the output message (aka `id`). Args: `parsed` (dict): As yielded by parse(). Returns: A string being the output message id. Typically, this method is used indirectly -- being called in get_output_bodies(). """ # Be careful when modifying this method or any method that this # method does call: after any code changes it should generate # the same ids for already stored data! (That's why this code # may already seem weird a bit...) assert isinstance(parsed, RecordDict) components = [] for k, v in sorted(self.iter_output_id_base_items(parsed)): if not isinstance(k, str): raise TypeError('encountered a non-str key ({!a})'.format(k)) k = as_bytes(k) if isinstance(v, (list, tuple)): v = b','.join(sorted(map(self._deterministic_conv_to_bytes, v))) else: v = self._deterministic_conv_to_bytes(v) assert isinstance(k, bytes) assert isinstance(v, bytes) components.append(b'%b,%b' % (k, v)) hashed_bytes = b'\n'.join(components) return hashlib.md5(hashed_bytes, usedforsecurity=False).hexdigest()
def render_content(self, data, **kwargs):
    """
    Render a single rule (as bytes) for the given `data`.

    Args:
        `data`:
            The data to be rendered; passed to the filter_*()
            methods and to parse_data().

    Kwargs:
        Any keyword arguments -- passed on to parse_data().

    Returns:
        * b'' -- if filter_renderer_specific() or filter_common()
          gives a true result for `data`;
        * otherwise: the `RULE_TEMPLATE` string formatted with the
          items returned by parse_data(), converted with as_bytes().

    Raises:
        NotImplementedError -- if `RULE_TEMPLATE` is None.
    """
    if self.RULE_TEMPLATE is None:
        raise NotImplementedError
    # Note: filter_common() is consulted only if the
    # renderer-specific filter does not reject the data.
    is_filtered_out = (self.filter_renderer_specific(data)
                       or self.filter_common(data))
    if is_filtered_out:
        return b''
    template_fields = self.parse_data(data, **kwargs)
    rendered_rule = self.RULE_TEMPLATE.format(**template_fields)
    return as_bytes(rendered_rule)
def _get_path(match, scheme, epslash):
    """
    Get the (possibly normalized) *path* component of a URL.

    Args:
        `match`:
            A regex match object providing the 'path' group.
        `scheme`:
            The URL's scheme (str or bytes).
        `epslash`:
            Whether an empty path should be replaced with '/'
            if the scheme is one of: http, https, ftp.

    Returns:
        The matched path if it is non-empty; otherwise an empty
        string or -- if `epslash` is true and the scheme is one of
        the above -- a slash (each made with the converter obtained
        from _proper_conv()).
    """
    conv = _proper_conv(match)
    path = match.group('path') or conv('')
    if path or not epslash:
        # Either there is a real path to be left intact, or the
        # empty-path-to-slash replacement has not been requested.
        return path
    if as_bytes(scheme) in (b'http', b'https', b'ftp'):
        return conv('/')
    return path
def before_content(self, **kwargs):
    """
    Make the CSV header row (as bytes).

    The header consists of the names from `EVENT_FIELDS`, comma
    separated, each of them quoted with '"'.

    Returns:
        The header line converted with as_bytes().
    """
    # `newline=''` -- so that the line terminators written by the
    # `csv` machinery are stored untranslated.
    buffer = StringIO(newline='')
    csv.DictWriter(
        buffer,
        fieldnames=self.EVENT_FIELDS,
        extrasaction='ignore',
        delimiter=',',
        quotechar='"',
        quoting=csv.QUOTE_ALL,
    ).writeheader()
    header_text = buffer.getvalue()
    buffer.close()
    return as_bytes(header_text)
def iter_unzip_from_bytes(zipped,
                          #*,                                            #3: uncomment this line
                          password=None,
                          filenames=None,
                          yielding_with_dirs=False):
    """
    Extract files from a ZIP archive.

    Args:
        `zipped` (typically a `bytes`/`bytearray`; *cannot* be a `str`):
            The ZIP archive as a *bytes-like* object.

    Kwargs:
        `password` (optional; if given, typically a `str`/`bytes`):
            The password to extract encrypted files.  If given (and
            not `None`), it will be, firstly, coerced to `bytes`
            using the `as_bytes()` helper from `n6lib.common_helpers`
            (by performing an `as_bytes(password, 'strict')` call).
        `filenames` (optional; if given, typically a list of `str`/`bytes`):
            A container (e.g., a sequence or a set) of the filenames
            (without dir parts) we are interested in.  If given (and
            not `None`) then only the specified files will be
            extracted, ignoring non-existent ones.  Each filename
            will be, firstly, coerced to `str` using the
            `as_unicode()` helper from `n6lib.common_helpers`.
            # maybe TODO: add support for Py3's *path*/*path-like* objects...
        `yielding_with_dirs` (default: False):
            If False -- dir names will be stripped off from yielded
            file names.  If True -- file names will be yielded as
            found in the archive (including dir parts).

    Yields:
        Pairs: `(<file name (a str obj)>, <file content (a bytes obj)>)`.

    Raises:
        zipfile.BadZipfile, EOFError:
            as zipfile.ZipFile can raise it for invalid input.
        RuntimeError (or subclasses, in particular NotImplementedError):
            as zipfile.ZipFile can raise it for unsupported input
            features, as well as for unspecified or incorrect
            password.

    Note that this is a generator, so nothing is done (in particular,
    no exception is raised) until the iteration starts.
    """
    if password is not None:
        password = as_bytes(password, 'strict')
    if filenames is not None:
        if sys.version_info[0] < 3:                                      #3--
            filenames = frozenset(filenames)                             #3--
        else:                                                            #3--
            filenames = frozenset(map(as_unicode, filenames))
    # The archive object is managed with `with`, so that it is always
    # closed -- when the iteration is exhausted, when an error occurs,
    # or when the generator is closed before being exhausted.
    with zipfile.ZipFile(io.BytesIO(zipped)) as zfile:
        for fullname in zfile.namelist():
            #assert isinstance(fullname, str)                            #3: uncomment this line
            basename = (os.path.basename(fullname) if fullname
                        else fullname)
            #assert isinstance(basename, str)                            #3: uncomment this line
            # NOTE(review): an archive member whose name ends with '/'
            # gets an empty `basename` here -- presumably directory
            # entries are not expected in the input; confirm.
            if filenames is None or basename in filenames:
                content = zfile.read(fullname, pwd=password)
                yield (fullname if yielding_with_dirs else basename), content
def _get_sha256(self):
    """
    Get a value for the 'sha256' attribute.

    Returns:
        * a random element of `self._params['sha256']` -- if
          _attr_in_params() gives a true result for 'sha256';
        * otherwise, if _include_in_event() gives a true result for
          'sha256': the hexadecimal SHA-256 digest of a random
          64-character string of ASCII letters and digits;
        * otherwise: None.
    """
    key = 'sha256'
    if self._attr_in_params(key):
        return random.choice(self._params[key])
    if not self._include_in_event(key):
        return None
    alphabet = string.ascii_letters + string.digits
    random_chars = [random.choice(alphabet) for _ in range(64)]
    return hashlib.sha256(as_bytes(''.join(random_chars))).hexdigest()
def _prepare_url_data_items(self, item_prototype, custom_items):
    """
    Set the URL-related items, based on the '_url_data_ready' item
    of `self` (doing nothing if that item is missing or None).

    Args:
        `item_prototype` (a dict-like object; modified in place):
            Its 'url' item is set to the result of calling
            make_provisional_url_search_key() with the decoded
            (from URL-safe Base64) 'url_orig' value.
        `custom_items` (a dict-like object; modified in place):
            Its 'url_data' item is set to the whole url data
            object (the key is asserted to be absent beforehand).

    Returns:
        None.
    """
    # NOTE(review): the `#3--` and `#3: ...` trailing comments look
    # like Py2-to-Py3 migration markers referring to the exact lines
    # they are placed on -- so the layout of those lines is kept.
    url_data = self.get('_url_data_ready')
    if url_data is not None:
        assert 'url_data' not in custom_items
        # NOTE(review): `basestring` is a Python-2-only builtin;
        # presumably this line is to be removed for Python 3 (see
        # its marker) -- confirm.
        str = basestring                                                 #3--
        assert isinstance(url_data.get('url_orig'), str)
        url_orig = base64.urlsafe_b64decode(as_bytes(
            url_data['url_orig']))                                       #3: `as_bytes(`-- `)--
        item_prototype['url'] = make_provisional_url_search_key(
            url_orig)  # [sic]
        custom_items['url_data'] = url_data
def render_content(self, data, **kwargs):
    """
    Render a single CSV row (as bytes) for the given `data`.

    Args:
        `data`:
            The event data; firstly, it is converted with
            _dict_to_csv_ready().

    Returns:
        One CSV line -- with the columns specified by `EVENT_FIELDS`
        (any other keys are ignored), comma separated, each value
        quoted with '"' -- converted with as_bytes().
    """
    row = self._dict_to_csv_ready(data)
    # `newline=''` -- so that the line terminators written by the
    # `csv` machinery are stored untranslated.
    buffer = StringIO(newline='')
    row_writer = csv.DictWriter(
        buffer,
        fieldnames=self.EVENT_FIELDS,
        extrasaction='ignore',
        delimiter=',',
        quotechar='"',
        quoting=csv.QUOTE_ALL,
    )
    row_writer.writerow(row)
    csv_line = buffer.getvalue()
    buffer.close()
    return as_bytes(csv_line)
def _urlsafe_b64decode(self, value): value = value.rstrip( '\r\n') # some encoders like to append a newline... try: # `base64.urlsafe_b64decode()` just ignores illegal # characters *but* we want to be *more strict* if not self._URLSAFE_B64_VALID_CHARACTERS.issuperset(value): raise ValueError # `base64.urlsafe_b64decode()` (contrary to `base64.standard_b64decode()`) #3-- # does *not* accept unicode strings (even not pure-ASCII ones) :-/ #3-- value = as_bytes(value) #3-- value = base64.urlsafe_b64decode(value) except ( ValueError, TypeError ): # (TypeError is raised on incorrect Base64 padding) # <- TODO: check if it is still true in Py3 raise FieldValueError(public_message=( '"{}" is not a valid URL-safe-Base64-encoded string ' '[see: RFC 4648, section 5]'.format(ascii_str(value)))) return value
def normalize_url(url, transcode1st=False, epslash=False, rmzone=False):
    r"""
    Apply to the given string (or binary data blob) as much of the basic
    URL/IRI normalization as possible, provided that no semantic changes
    are made (i.e., the intent is that the resultant URL/IRI is
    semantically equivalent to the given one).

    Args (required):
        `url` (str or bytes/bytearray):
            The URL (or URI, or IRI) to be normalized.

    Kwargs (optional):
        `transcode1st` (bool; default: False):
            Whether, before the actual URL normalization (see the
            description in the steps 1-18 below...), the given `url`
            should be:
              * if given as a bytes/bytearray instance: decoded using
                the 'utf-8' codec with our custom error handler:
                'utf8_surrogatepass_and_surrogateescape';
              * otherwise (assuming a str instance): "transcoded" using
                `try_to_normalize_surrogate_pairs_to_proper_codepoints()`
                (to ensure that representation of non-BMP characters is
                consistent...).
        `epslash` (bool; default: False):
            Whether the *path* component of the given URL should be
            replaced with `/` if the `url`'s *scheme* is `http`, `https`
            or `ftp` *and* the *path* is empty (note that, generally,
            this normalization step does not change the URL semantics,
            with the exception of an URL being the request target of an
            `OPTIONS` HTTP request; see RFC 7230, section 2.7.3).
        `rmzone` (bool; default: False):
            Whether the IPv6 zone identifier being a part of an IPv6
            address in the `url`'s *host* component should be removed
            (note that, generally, IPv6 zone identifier has no meaning
            outside the local system it is related to; see RFC 6874,
            section 1).

    Returns:
        A `str` object (if a `str` was given) or a `bytes` object (if a
        `bytes` or `bytearray` object was given *and* `transcode1st` was
        false) representing the URL after a *best effort* but *keeping
        semantic equivalence* normalization (see below: the description
        of the algorithm).

    Raises:
        `TypeError` if `url` is not a str or bytes/bytearray instance.

    The algorithm of normalization consists of the following steps [the
    `+` operator in this description means *string concatenation*]:

    0. Optional `url` transcoding (see the above description of the
       `transcode1st` argument).

    1. Try to split the `url` into two parts: the `scheme` component
       (matching the `scheme` group of the regular expression
       `URL_SCHEME_AND_REST_REGEX`) and `rest` (the rest of the URL).

    2. If no `scheme` could be singled out in step 1 then stop here --
       returning the whole `url`; otherwise proceed to step 3.

    3. Convert `scheme` to *lowercase*.

    4. Try to split `rest` into the following parts:
       * `before host` (i.e., the "://" separator, optionally followed
         by any number of non-"/?#@" characters which, if present, are
         obligatorily followed by exactly one "@"),
       * `host` (see below: steps 6 to 13...),
       * optional `port` (i.e., ":<decimal number>" or just ":"),
       * optional `path` (i.e., "/" + optionally any number of
         non-"?#" characters),
       * optional `after path` (that is: "?" or "#", optionally
         followed by any number of any characters).

    5. If `rest` could not be split in step 4 then stop here --
       returning `scheme` + `rest`; otherwise proceed to step 6.

    6. If `host` consists of "[" + `ipv6` + optional `ipv6 zone` + "]"
       -- where `ipv6` (consisting of hexadecimal digits and ":"
       characters, with optional suffix in the IPv4 four-octets format)
       is a supposed IPv6 address (see RFC 3986) and `ipv6 zone`
       (consisting, if present, of one "%" character followed by some
       non-"/?#[]" characters) is a supposed IPv6 zone identifier (see
       RFC 6874) -- then proceed to step 7, otherwise skip to step 12.

    7. Convert `ipv6` to the normalized IPv6 format which:
       * uses only *lowercase* hexadecimal digits, and `:` characters
         as separators (in particular, the last 32 bits of the address
         are *not* represented using the IPv4 four-octets format),
       * is *condensed*, i.e., non-zero hexadecimal segments are
         formatted without leading zeros, and the `::` marker (if
         applicable) is used to replace the leftmost of the longest
         sequences of '0' segments (see RFC 5952, Section 4.2).

    8. If normalization in step 7 was impossible because of syntactic
       incorrectness (i.e., `ipv6` could not be parsed as a valid IPv6
       address) then leave `ipv6` intact.

    9. If `ipv6 zone` is *not* present, or the `rmzone` argument is
       true, then set `ipv6 zone` to an empty string and skip to step
       11; otherwise proceed to step 10.

    10. If `ipv6 zone` consists only of ASCII characters then convert
        it to *lowercase*; otherwise leave it intact.

    11. Set `host` to "[" + `ipv6` + `ipv6 zone` + "]"; then skip to
        step 14.

    12. Split `host` (consisting of some non-":/?#" characters,
        presumably representing some hostname or IPv4/IPv[Future]
        address; see RFC 3986...) into *labels*, using dot characters
        defined by the `DOMAIN_LABEL_SEPARATOR_..._REGEX` constants as
        the delimiter (in such a way that *labels* do not include
        delimiter dots); for each such a `label` do the following: if
        `label` consists only of ASCII characters then convert it to
        *lowercase*, otherwise leave it intact.

    13. Set `host` to the result of concatenation of the *labels* from
        step 12 (each of them converted to *lowercase* if ASCII-only)
        interleaved with ".".

    14. If `port` is *not* present, or `port` is ":", or ":" followed
        by the known *default port number* for the particular `scheme`
        (according to the mapping `URL_SCHEME_TO_DEFAULT_PORT`; e.g.,
        80 for the "http" value of `scheme`), then set `port` to an
        empty string; otherwise leave `port` intact.

    15. If `path` is present then leave it intact and skip to step 17;
        otherwise proceed to step 16.

    16. If the `epslash` argument is true and `scheme` is one of:
        "http", "https", "ftp" -- then set `path` to "/"; otherwise set
        `path` to an empty string.

    17. If `after path` is *not* present then set it to an empty
        string.

    18. Stop here -- returning `scheme` + `before host` + `host` +
        `port` + `path` + `after path`.


    Ad 0:

    >>> normalize_url('\xf4\x8f\xbf\xbf')
    '\xf4\x8f\xbf\xbf'
    >>> normalize_url('\xf4\x8f\xbf\xbf', transcode1st=True)
    u'\U0010ffff'
    >>> normalize_url(u'\udbff\udfff')  # look at this!
    u'\udbff\udfff'
    >>> normalize_url(u'\udbff\udfff', transcode1st=True)
    u'\U0010ffff'
    >>> normalize_url(u'\U0010ffff')
    u'\U0010ffff'
    >>> normalize_url(u'\U0010ffff', transcode1st=True)
    u'\U0010ffff'


    Ad 0-2:

    >>> normalize_url('Blabla-bla!@#$ %^&\xc4\x85\xcc')
    'Blabla-bla!@#$ %^&\xc4\x85\xcc'
    >>> normalize_url('Blabla-bla!@#$ %^&\xc4\x85\xcc', transcode1st=True)
    u'Blabla-bla!@#$ %^&\u0105\udccc'
    >>> normalize_url(u'Blabla-bla!@#$ %^&\u0105\udccc')
    u'Blabla-bla!@#$ %^&\u0105\udccc'


    Ad 0-1 + 3 + 5:

    >>> normalize_url('SOME-scheme:Blabla-bla!@#$ %^&\xc4\x85\xcc')
    'some-scheme:Blabla-bla!@#$ %^&\xc4\x85\xcc'
    >>> normalize_url('SOME-scheme:Blabla-bla!@#$ %^&\xc4\x85\xcc', transcode1st=True)
    u'some-scheme:Blabla-bla!@#$ %^&\u0105\udccc'
    >>> normalize_url(u'somE-sCHEmE:Blabla-bla!@#$ %^&\u0105\udccc')
    u'some-scheme:Blabla-bla!@#$ %^&\u0105\udccc'


    Ad 0-1 + 3-4 + 6-11 + 14-18:

    >>> normalize_url('HtTP://[2001:0DB8:85A3:0000:0000:8A2E:0370:7334]')
    'http://[2001:db8:85a3::8a2e:370:7334]'
    >>> normalize_url('HtTP://[2001:0DB8:85A3:0000:0000:8A2E:0370:7334FAB]')
    'http://[2001:0DB8:85A3:0000:0000:8A2E:0370:7334FAB]'
    >>> normalize_url('HtTP://[2001:0DB8:85A3:0000:0000:8A2E:3.112.115.52%25en1]')
    'http://[2001:db8:85a3::8a2e:370:7334%25en1]'
    >>> normalize_url('HtTP://[2001:0DB8:85A3::8A2E:0370:7334]/fooBAR',
    ...               epslash=True)
    'http://[2001:db8:85a3::8a2e:370:7334]/fooBAR'
    >>> normalize_url('HtTP://[2001:0DB8:85A3:0000:0000:8A2E:3.112.115.52]:80')
    'http://[2001:db8:85a3::8a2e:370:7334]'
    >>> normalize_url('HtTP://[2001:0DB8:85A3:0000:0000:8A2E:0370:7334%25en1]:80',
    ...               epslash=True)
    'http://[2001:db8:85a3::8a2e:370:7334%25en1]/'
    >>> normalize_url('HtTP://[2001:DB8:85A3::8A2E:3.112.115.52]',
    ...               rmzone=True)
    'http://[2001:db8:85a3::8a2e:370:7334]'
    >>> normalize_url('HtTP://[2001:0db8:85a3:0000:0000:8a2e:0370:7334%25EN1]',
    ...               rmzone=True)
    'http://[2001:db8:85a3::8a2e:370:7334]'
    >>> normalize_url('HtTP://[2001:0DB8:85A3:0000:0000:8A2E:3.112.115.52%25en1]',
    ...               rmzone=True, epslash=True)
    'http://[2001:db8:85a3::8a2e:370:7334]/'
    >>> normalize_url('HtTP://[2001:0DB8:85A3::8A2E:0370:7334%25en1]:80',
    ...               rmzone=True)
    'http://[2001:db8:85a3::8a2e:370:7334]'
    >>> normalize_url('HtTP://[2001:DB8:85A3:0000:0000:8A2E:3.112.115.52%25en1]:80',
    ...               rmzone=True, epslash=True)
    'http://[2001:db8:85a3::8a2e:370:7334]/'
    >>> normalize_url(u'HtTP://[2001:0DB8:85A3:0000:0000:8A2E:3.112.115.52]')
    u'http://[2001:db8:85a3::8a2e:370:7334]'
    >>> normalize_url(u'HtTP://[2001:0db8:85a3::8a2e:370:7334%25EN1]')
    u'http://[2001:db8:85a3::8a2e:370:7334%25en1]'
    >>> normalize_url(u'HtTP://[2001:0DB8:85A3:0000:0000:8A2E:0370:7334FAB%25eN1]',
    ...               epslash=True)
    u'http://[2001:0DB8:85A3:0000:0000:8A2E:0370:7334FAB%25en1]/'
    >>> normalize_url(u'HtTP://[2001:0DB8:85A3:0000:0000:8a2e:3.112.115.52]',
    ...               epslash=True)
    u'http://[2001:db8:85a3::8a2e:370:7334]/'
    >>> normalize_url(u'HtTP://[2001:0DB8:85A3:0000:0000:8A2E:0370:7334]:80')
    u'http://[2001:db8:85a3::8a2e:370:7334]'
    >>> normalize_url(u'HtTP://[2001:0DB8:85A3::8A2E:3.112.115.52%25en1]:80',
    ...               epslash=True)
    u'http://[2001:db8:85a3::8a2e:370:7334%25en1]/'
    >>> normalize_url(u'HtTP://[2001:db8:85a3:0000:0000:8A2E:0370:7334]',
    ...               rmzone=True)
    u'http://[2001:db8:85a3::8a2e:370:7334]'
    >>> normalize_url(u'HtTP://[2001:0DB8:85A3:0000:0000:8A2E:3.112.115.52%25en1]/fooBAR',
    ...               rmzone=True)
    u'http://[2001:db8:85a3::8a2e:370:7334]/fooBAR'
    >>> normalize_url(u'HtTP://[2001:0DB8:85A3::8A2E:0370:7334%25en1]',
    ...               rmzone=True, epslash=True)
    u'http://[2001:db8:85a3::8a2e:370:7334]/'
    >>> normalize_url(u'HtTP://[2001:0DB8:85A3:0000:0000:8A2E:3.112.115.52%25en1]:80',
    ...               rmzone=True)
    u'http://[2001:db8:85a3::8a2e:370:7334]'
    >>> normalize_url(u'HtTP://[2001:0DB8:85A3:0000:0000:8A2E:0370:7334%25en1]:80',
    ...               rmzone=True, epslash=True)
    u'http://[2001:db8:85a3::8a2e:370:7334]/'
    >>> normalize_url('HtTPS://[2001:DB8:85A3:0000:0000:8A2E:3.112.115.52%25En1]:80')
    'https://[2001:db8:85a3::8a2e:370:7334%25en1]:80'
    >>> normalize_url('HtTPS://[2001:DB8:85A3:0000:0000:8A2E:3.112.115.52%25en1]:80',
    ...               rmzone=True)
    'https://[2001:db8:85a3::8a2e:370:7334]:80'
    >>> normalize_url('HtTPS://[2001:0db8:85a3::8a2E:3.112.115.52%25en1]:443',
    ...               rmzone=True)
    'https://[2001:db8:85a3::8a2e:370:7334]'
    >>> normalize_url('HtTPS://[2001:DB8:85A3:0000:0000:8A2E:0370:7334%25eN\xc4\x851]:80',
    ...               epslash=True)
    'https://[2001:db8:85a3::8a2e:370:7334%25eN\xc4\x851]:80/'
    >>> normalize_url(u'HtTPS://[2001:0db8:85a3::8a2E:3.112.115.52%25En1]:443')
    u'https://[2001:db8:85a3::8a2e:370:7334%25en1]'
    >>> normalize_url(u'HtTPS://[2001:0DB8:85A3:0000:0000:8A2E:3.112.115.52%25eN\xc4\x851]:443',
    ...               epslash=True)
    u'https://[2001:db8:85a3::8a2e:370:7334%25eN\xc4\x851]/'
    >>> normalize_url(u'HtTPS://[2001:0DB8:85A3::8A2E:0370:7334%25eN1]:80',
    ...               rmzone=True, epslash=True)
    u'https://[2001:db8:85a3::8a2e:370:7334]:80/'
    >>> normalize_url(u'HtTPS://[2001:0DB8:85A3::8A2E:370:7334%25eN1]:443',
    ...               rmzone=True, epslash=True)
    u'https://[2001:db8:85a3::8a2e:370:7334]/'


    Ad 0-1 + 3-4 + 12-18:

    >>> normalize_url('HTTP://WWW.XyZ-\xc4\x85\xcc.eXamplE.com', epslash=True)
    'http://www.XyZ-\xc4\x85\xcc.example.com/'
    >>> normalize_url('HTTP://WWW.XyZ-\xc4\x85\xcc.eXamplE.com', transcode1st=True)
    u'http://www.XyZ-\u0105\udccc.example.com'
    >>> normalize_url('HTTP://WWW.XyZ-\xc4\x85.eXamplE.com:80/fooBAR')
    'http://www.XyZ-\xc4\x85.example.com/fooBAR'
    >>> normalize_url('HtTP://WWW.XyZ-\xc4\x85.eXamplE.com:80', epslash=True)
    'http://www.XyZ-\xc4\x85.example.com/'
    >>> normalize_url('HtTP://WWW.XyZ-\xc4\x85.eXamplE.com:80/fooBAR', epslash=True)
    'http://www.XyZ-\xc4\x85.example.com/fooBAR'
    >>> normalize_url('HTTP://WWW.XyZ-\xc4\x85\xcc.eXamplE.com', transcode1st=True)
    u'http://www.XyZ-\u0105\udccc.example.com'
    >>> normalize_url(u'HTtp://WWW.XyZ-\u0105\udccc.eXamplE.com:80')
    u'http://www.XyZ-\u0105\udccc.example.com'
    >>> normalize_url(u'HTtp://WWW.XyZ-\u0105.eXamplE.com:80/')
    u'http://www.XyZ-\u0105.example.com/'
    >>> normalize_url(u'hTTP://WWW.XyZ-\u0105.eXamplE.com:80', epslash=True)
    u'http://www.XyZ-\u0105.example.com/'
    >>> normalize_url('HTTPS://WWW.XyZ-\xc4\x85.eXamplE.com:80')
    'https://www.XyZ-\xc4\x85.example.com:80'
    >>> normalize_url('HTTPS://WWW.XyZ-\xc4\x85.eXamplE.com:80/fooBAR')
    'https://www.XyZ-\xc4\x85.example.com:80/fooBAR'
    >>> normalize_url('HTTPs://WWW.XyZ-\xc4\x85.eXamplE.com:443', epslash=True)
    'https://www.XyZ-\xc4\x85.example.com/'
    >>> normalize_url('HTTPs://WWW.XyZ-\xc4\x85.eXamplE.com:443', epslash=True, transcode1st=True)
    u'https://www.XyZ-\u0105.example.com/'
    >>> normalize_url(u'httpS://WWW.XyZ-\u0105.eXamplE.com:80', epslash=True)
    u'https://www.XyZ-\u0105.example.com:80/'
    >>> normalize_url(u'httpS://WWW.XyZ-\u0105.eXamplE.com:80/fooBAR', epslash=True)
    u'https://www.XyZ-\u0105.example.com:80/fooBAR'
    >>> normalize_url(u'hTtpS://WWW.XyZ-\u0105.eXamplE.com:443')
    u'https://www.XyZ-\u0105.example.com'
    >>> normalize_url(u'httpS://WWW.XyZ-\u0105.eXamplE.com:80/fooBAR', epslash=True,
    ...               transcode1st=True)
    u'https://www.XyZ-\u0105.example.com:80/fooBAR'
    """
    # NOTE(review): the expected outputs in the above doctests use
    # Py2-style reprs (e.g., the `u` prefix) -- confirm whether these
    # doctests are still being run.

    # (a bytearray is handled just as if it was a bytes object)
    if isinstance(url, bytearray):
        url = as_bytes(url)

    # Step 0
    if transcode1st:
        url = _transcode(url)

    # Steps 1-3
    scheme = _get_scheme(url)
    if scheme is None:
        # does not look like a URL at all
        # -> no normalization
        return url
    remainder = url[len(scheme):]

    # Steps 4-5 (the regex variant has to fit the type of `url`)
    if isinstance(url, bytes):
        components_regex = _AFTER_SCHEME_COMPONENTS_OF_URL_WITH_AUTHORITY_BYTES_REGEX
    else:
        components_regex = _AFTER_SCHEME_COMPONENTS_OF_URL_WITH_AUTHORITY_REGEX
    found = components_regex.search(remainder)
    if found is None:
        # probably a URL without the *authority* component
        # -> the only normalized component is *scheme*
        return scheme + remainder

    # Steps 6-18 (all components are obtained first; only then
    # they are concatenated -- in the order they appear in a URL)
    following_components = (
        _get_before_host(found),
        _get_host(found, rmzone),
        _get_port(found, scheme),
        _get_path(found, scheme, epslash),
        _get_after_path(found),
    )
    normalized = scheme
    for component in following_components:
        normalized = normalized + component
    return normalized
def _py2_bytestring_like_repr(self, obj):
    """
    Get the ASCII-only repr of `obj` converted to bytes -- without
    the leading `b` prefix.

    Args:
        `obj`:
            An object that as_bytes() is able to convert.

    Returns:
        A `str` being the result of `ascii(as_bytes(obj))` with the
        first character (`b`) stripped off, i.e., a quoted literal
        (judging by this method's name: one resembling what Python 2
        would produce as the repr of a byte string).
    """
    bytes_literal = ascii(as_bytes(obj))
    assert bytes_literal.startswith(("b'", 'b"'))
    prefix_length = len('b')
    return bytes_literal[prefix_length:]
def _get_value_for_md5_attr(self):
    """
    Get a random value for the 'md5' attribute.

    Returns:
        A string: the hexadecimal MD5 digest of a random
        32-character string of ASCII letters and digits.
    """
    random_str = ''.join(
        random.choice(string.ascii_letters + string.digits)
        for _ in range(32))
    # The digest is just an arbitrary hash-like value (nothing
    # security-related), so the hash constructor is called with
    # `usedforsecurity=False` -- consistently with other MD5 uses
    # in the code, and so that it also works with Python builds
    # that restrict MD5 (e.g., FIPS-compliant ones).
    return hashlib.md5(as_bytes(random_str), usedforsecurity=False).hexdigest()
def _get_hex_hash_of_names(names):
    """
    Get the hexadecimal SHA-256 digest of the given names.

    Args:
        `names`:
            An iterable of strings.

    Returns:
        A string: the hexadecimal SHA-256 digest of the names
        joined with '-' (and converted with as_bytes()).
    """
    joined_names = '-'.join(names)
    return hashlib.sha256(as_bytes(joined_names)).hexdigest()
def before_content(self, **kwargs):
    """
    Make the leading comment line (as bytes) specifying the category.

    Returns:
        * b'# <category>\\n' -- if the request parameters include
          'category' (its value is coerced with str() and then
          converted with as_bytes());
        * otherwise: b''.
    """
    if 'category' not in self.request.params:
        return b''
    category = str(self.request.params.get('category'))
    return b'# ' + as_bytes(category) + b'\n'