def test_std3(self):
    """Check how ``idna.uts46_remap`` treats ``'A_'`` with and without STD3 rules."""
    # STD3 rules off: the call succeeds and the test expects 'a_' back.
    remapped = idna.uts46_remap('A_', std3_rules=False)
    self.assertEqual(remapped, 'a_')

    # STD3 rules on: the very same input is expected to be rejected.
    with self.assertRaises(idna.InvalidCodepoint):
        idna.uts46_remap('A_', std3_rules=True)
Example #2
0
def punycode_special_host(url):
    """Punycode-encode ``url.host`` in place for URLs with a special scheme.

    Nothing happens when ``url.host`` is empty or ``url.scheme`` is not in
    ``urlcanon.SPECIAL_SCHEMES``.  Otherwise the host is decoded as UTF-8 and
    passed to ``idna.encode`` with UTS-46 preprocessing.  If that fails, the
    host is UTS-46 remapped and each dot-separated label is encoded with
    ``idna2003.ToASCII``.  If the fallback fails as well, the host is left
    unchanged: this function is best-effort and never raises for a host it
    cannot encode.

    :param url: object exposing ``host`` (a bytes-like value with
        ``.decode``) and ``scheme``; ``host`` is overwritten on success.
    :returns: ``None``.
    """
    if not url.host or url.scheme not in urlcanon.SPECIAL_SCHEMES:
        return

    # https://github.com/kjd/idna/issues/40#issuecomment-285496926
    try:
        url.host = idna.encode(url.host.decode('utf-8'), uts46=True)
    except Exception:
        # Deliberately broad (best effort), but not a bare ``except:`` so
        # that KeyboardInterrupt and SystemExit still propagate.
        try:
            remapped = idna.uts46_remap(url.host.decode('utf-8'))
            labels = remapped.split('.')
            url.host = b'.'.join(
                idna2003.ToASCII(label) for label in labels)
        except Exception:
            # Neither encoding worked; keep the host as it was.
            pass
Example #3
0
def _domain_to_ascii(domain, strict=False):
    """Encode *domain* to ASCII bytes, trying IDNA 2008 before IDNA 2003.

    ``idna.encode`` (UTS-46, non-transitional) is attempted first.  When it
    raises ``idna.IDNAError`` the domain is UTS-46 remapped, split into
    labels and each label is encoded with ``idna2003.ToASCII``.

    With ``strict`` false the labels are split using ``IDNA_DOTS_REGEX`` and
    a label rejected by ``ToASCII`` is kept as its UTF-8 encoding; with
    ``strict`` true the split is on ``"."`` only and the ``UnicodeError``
    propagates.

    Raises ``idna.IDNAError`` for an empty domain, an empty encoded label,
    or a result that fails ``idna.valid_string_length``.
    """
    try:
        return idna.encode(
            domain,
            strict=strict,
            std3_rules=strict,
            uts46=True,
            transitional=False,
        )
    except idna.IDNAError:
        # IDNA 2008 encoding failed; fall back to per-label IDNA 2003.
        if isinstance(domain, (bytes, bytearray)):
            domain = domain.decode("ascii")
        domain = idna.uts46_remap(
            domain, std3_rules=strict, transitional=False)

        labels = domain.split(".") if strict else IDNA_DOTS_REGEX.split(domain)
        if not labels or labels == [""]:
            raise idna.IDNAError("Empty domain")

        # A final empty label means the name ended with a separator.
        has_trailing_dot = labels[-1] == ""
        if has_trailing_dot:
            labels.pop()

        encoded = []
        for label in labels:
            try:
                ascii_label = idna2003.ToASCII(label)
            except UnicodeError:
                if strict:
                    raise
                # Lenient mode: pass the label through as raw UTF-8.
                encoded.append(label.encode("utf-8"))
                continue
            if not ascii_label:
                raise idna.IDNAError("Empty label")
            encoded.append(ascii_label)

        if has_trailing_dot:
            # Re-create the trailing dot when joining.
            encoded.append(b"")
        joined = b".".join(encoded)
        if not idna.valid_string_length(joined, has_trailing_dot):
            raise idna.IDNAError("Domain too long")
        return joined
Example #4
0
def normalize_name(name: str) -> str:
    """
    Normalize an ENS name according to `EIP-137
    <https://github.com/ethereum/EIPs/blob/master/EIPS/eip-137.md#name-syntax>`_

    Whether ``name`` is a single label or a fully qualified domain is *not*
    checked here. A falsy ``name`` is returned unchanged, and bytes input is
    decoded as UTF-8 before normalization.

    :param str name: the dot-separated ENS name
    :raises InvalidName: if ``name`` has invalid syntax
    """
    if not name:
        return name

    if isinstance(name, (bytes, bytearray)):
        name = name.decode('utf-8')

    try:
        cleaned = idna.uts46_remap(name, std3_rules=True, transitional=False)
    except idna.IDNAError as exc:
        raise InvalidName(f"{name} is an invalid name, because {exc}") from exc
    return cleaned
Example #5
0
def validate_email_domain_part(domain):
    """Validate and normalize the domain part of an email address.

    Returns a dict with ``"ascii_domain"`` (the IDNA ASCII form) and
    ``"domain"`` (the Unicode form obtained from ``idna.decode``).  Every
    failed check raises ``EmailSyntaxError``.
    """
    # Nothing after the @-sign at all?
    if not len(domain):
        raise EmailSyntaxError("There must be something after the @-sign.")

    # UTS-46 normalization: casefolding, NFC normalization, and mapping of
    # every label separator (full stop, fullwidth full stop, ideographic full
    # stop, halfwidth ideographic full stop) to a plain period.  Invalid
    # characters such as "⒈" (which would expand to include a period) make
    # this step raise.
    try:
        domain = idna.uts46_remap(domain, std3_rules=False, transitional=False)
    except idna.IDNAError as err:
        raise EmailSyntaxError(
            "The domain name %s contains invalid characters (%s)." %
            (domain, str(err)))

    # With separators normalized, reject empty labels explicitly (leading
    # dot, trailing dot, or two dots in a row) because the IDNA library does
    # not handle them well.
    if domain.endswith("."):
        raise EmailSyntaxError("An email address cannot end with a period.")
    elif domain.startswith("."):
        raise EmailSyntaxError(
            "An email address cannot have a period immediately after the @-sign."
        )
    elif ".." in domain:
        raise EmailSyntaxError(
            "An email address cannot have two periods in a row.")

    # Always convert to IDNA ASCII, even for ASCII-only domains (for which
    # the transformation does nothing).  If internationalized characters are
    # present, the MTA must either support SMTPUTF8 or the mail client must
    # convert the domain name to IDNA before submission.
    #
    # This step silently drops leading periods and reports "No input" for
    # doubled periods, which is why both were checked above.
    try:
        encoded = idna.encode(domain, uts46=False)
        ascii_domain = encoded.decode("ascii")
    except idna.IDNAError as err:
        if "Domain too long" in str(err):
            # We can't be more specific: after UTS-46 normalization the
            # length check applies to a string that differs from what the
            # user supplied, and it is unclear whether the limit applies to
            # the internationalized form, the IDNA ASCII form, or both.
            raise EmailSyntaxError(
                "The email address is too long after the @-sign.")
        raise EmailSyntaxError(
            "The domain name %s contains invalid characters (%s)." %
            (domain, str(err)))

    # The input may already have been IDNA ASCII.  Decoding verifies that it
    # is real IDNA rather than something that merely looks like it; for
    # ASCII-only domains decoding returns the same text.
    #
    # The decoded value is the canonical internationalized form, which is
    # what the error messages below should show.
    try:
        domain_i18n = idna.decode(ascii_domain.encode('ascii'))
    except idna.IDNAError as err:
        raise EmailSyntaxError("The domain name %s is not valid IDNA (%s)." %
                               (ascii_domain, str(err)))

    # RFC 5321 4.5.3.1.2
    # The limit is checked in bytes (octets) of the IDNA ASCII form, on the
    # assumption that the domain may be transmitted without SMTPUTF8.
    # idna.encode checks this too, so this exception is never reached.
    if len(ascii_domain) > DOMAIN_MAX_LENGTH:
        raise EmailSyntaxError(
            "The email address is too long after the @-sign.")

    # A "dot atom text", per RFC 2822 3.2.4, restricted to the characters
    # allowed in a hostname (see ATEXT_HOSTNAME), anchored at the very end of
    # the string.  Probably redundant with idna.decode, which also checks
    # this format.
    hostname_pattern = (
        ATEXT_HOSTNAME + r'(?:\.' + ATEXT_HOSTNAME + r')*' + "\\Z")
    if re.match(hostname_pattern, ascii_domain) is None:
        raise EmailSyntaxError(
            "The email address contains invalid characters after the @-sign.")

    # Publicly deliverable addresses have a domain with at least one period,
    # and every TLD ends with a letter.
    if "." not in ascii_domain:
        raise EmailSyntaxError(
            "The domain name %s is not valid. It should have a period." %
            domain_i18n)
    if not re.search(r"[A-Za-z]\Z", ascii_domain):
        raise EmailSyntaxError(
            "The domain name %s is not valid. It is not within a valid top-level domain."
            % domain_i18n)

    # Hand back both the IDNA ASCII-encoded form (how the domain would be
    # transmitted on the wire, except possibly with SMTPUTF8) and the
    # canonical Unicode form (better for display).  This should also cover
    # RFC 6532 section 3.1's suggestion to apply Unicode NFC normalization.
    result = {
        "ascii_domain": ascii_domain,
        "domain": domain_i18n,
    }
    return result
Example #6
0
def _validate_email_address(
    value,
    allow_unnormalized,
    allow_smtputf8,
    required,
):
    if value is None:
        if required:
            raise TypeError("required value is None")
        return

    if not isinstance(value, six.text_type):
        raise TypeError(
            ("expected unicode string, but value is of type {cls!r}").format(
                cls=value.__class__.__name__))

    parts = value.split("@")
    if len(parts) < 2:
        raise ValueError("email address is missing an '@' sign")
    if len(parts) > 2:
        raise ValueError("email address contains multiple '@' signs")

    local_part, domain = parts

    # === Validate and normalize the email address' local part ===

    if not local_part:
        raise ValueError("expected local part before '@', but found nothing")

    # RFC 5321 4.5.3.1.1
    # We're checking the number of characters here. If the local part
    # is ASCII-only, then that's the same as bytes (octets). If it's
    # internationalized, then the UTF-8 encoding may be longer, but
    # that may not be relevant. We will check the total address length
    # instead.
    if len(local_part) > _LOCAL_PART_MAX_LENGTH:
        raise ValueError(
            "expected at most 64 characters, "
            "but local part contains {chars}".format(chars=len(local_part)))

    if re.match(_DOT_ATOM_TEXT + "\\Z", local_part):
        # The local part is valid ascii.
        normalized_local_part = local_part
        ascii_local_part = local_part

    else:
        if not re.match(_DOT_ATOM_TEXT_UTF8 + "\\Z", local_part):
            # It's not a valid internationalized address either. Report which
            # characters were not valid.
            bad_chars = ", ".join(
                sorted(
                    set(c for c in local_part if not re.match(
                        u"[" +
                        (_ATEXT if not allow_smtputf8 else _ATEXT_UTF8) + u"]",
                        c,
                    ))))
            raise ValueError(
                "local part contains invalid characters: {bad_chars!r}".format(
                    bad_chars=bad_chars))

        if not allow_smtputf8:
            raise ValueError("invalid non-ascii characters in local part")

        # RFC 6532 section 3.1 also says that Unicode NFC normalization should
        # be applied.
        normalized_local_part = unicodedata.normalize("NFC", local_part)
        ascii_local_part = None

    # === Validate and normalize the email address' domain ===

    if len(domain) == 0:
        raise ValueError("expected domain name after '@', but found nothing")

    # Perform UTS-46 normalization, which includes casefolding, NFC
    # normalization, and converting all label separators (the period/full
    # stop, fullwidth full stop, ideographic full stop, and halfwidth
    # ideographic full stop) to basic periods.
    # It will also raise an exception if there is an invalid character in the
    # input, such as "⒈" which is invalid because it would expand to include
    # a period.
    try:
        domain = idna.uts46_remap(domain, std3_rules=False, transitional=False)
    except idna.IDNAError as e:
        raise ValueError(
            "domain name contains invalid characters: {error}".format(error=e))

    # Now we can perform basic checks on the use of periods (since equivalent
    # symbols have been mapped to periods). These checks are needed because the
    # IDNA library doesn't handle well domains that have empty labels (i.e.
    # initial dot, trailing dot, or two dots in a row).
    if domain.endswith("."):
        raise ValueError("unexpected period at end of domain name")
    if domain.startswith("."):
        raise ValueError("unexpected period at start of domain name")
    if ".." in domain:
        raise ValueError("unexpected consecutive periods in domain name")

    # Regardless of whether international characters are actually used,
    # first convert to IDNA ASCII. For ASCII-only domains, the transformation
    # does nothing. If internationalized characters are present, the MTA
    # must either support SMTPUTF8 or the mail client must convert the
    # domain name to IDNA before submission.
    #
    # Unfortunately this step incorrectly 'fixes' domain names with leading
    # periods by removing them, so we have to check for this above. It also
    # gives a funky error message ("No input") when there are two periods in a
    # row, also checked separately above.
    try:
        ascii_domain = idna.encode(domain, uts46=False).decode("ascii")
    except idna.IDNAError as e:
        if "Domain too long" in str(e):
            # We can't really be more specific because UTS-46 normalization
            # means the length check is applied to a string that is different
            # from the one the user supplied. Also I'm not sure if the length
            # check applies to the internationalized form, the IDNA ASCII
            # form, or even both!
            raise ValueError("domain name is too long")
        raise ValueError(
            "domain name contains invalid characters: {error}".format(error=e))

    # We may have been given an IDNA ASCII domain to begin with. Check that
    # the domain actually conforms to IDNA. It could look like IDNA but not be
    # actual IDNA. For ASCII-only domains, the conversion out of IDNA just
    # gives the same thing back.
    #
    # This gives us the canonical internationalized form of the domain, which
    # we should use in all error messages.
    try:
        normalized_domain = idna.decode(ascii_domain.encode("ascii"))
    except idna.IDNAError as e:
        raise ValueError(
            "domain name is not valid idna: {error}".format(error=e))

    # RFC 5321 4.5.3.1.2
    # We're checking the number of bytes (octets) here, which can be much
    # higher than the number of characters in internationalized domains, on
    # the assumption that the domain may be transmitted without SMTPUTF8 as
    # IDNA ASCII. This is also checked by idna.encode, so this exception is
    # never reached.
    if len(ascii_domain) > _DOMAIN_MAX_LENGTH:
        raise ValueError(
            "expected no more than 255 characters after idna encoding, "
            "but domain expanded to {count}".format(count=len(ascii_domain)))

    # Check the regular expression. This is probably entirely redundant with
    # idna.decode, which also checks this format.
    m = re.match(_DOT_ATOM_TEXT_HOSTNAME + "\\Z", ascii_domain)
    if not m:
        raise ValueError("unexpected characters in address domain")

    # All publicly deliverable addresses have domain named with at least one
    # period. We also know that all TLDs end with a letter.
    if "." not in ascii_domain:
        raise ValueError(
            "expected a subdomain of a tld, but domain is missing a period")
    if not re.search(r"[A-Za-z]\Z", ascii_domain):
        raise ValueError(
            "expected a subdomain of a tld, but tld does not match pattern")

    # === Check bulk properties of the email address ===

    normalized_email = normalized_local_part + "@" + normalized_domain
    if ascii_local_part and ascii_domain:
        ascii_email = ascii_local_part + "@" + ascii_domain
    else:
        ascii_email = None

    # If the email address has an ASCII representation, then we assume it may
    # be transmitted in ASCII (we can't assume SMTPUTF8 will be used on all
    # hops to the destination) and the length limit applies to ASCII
    # characters (which is the same as octets). The number of characters in
    # may be many fewer (because IDNA ASCII is verbose) and could be less than
    # 254 Unicode characters, and of course the number of octets over the
    # limit may not be the number of characters over the limit, so if the
    # email address is internationalized, we can't give any simple information
    # about why the address is too long.
    #
    # In addition, check that the UTF-8 encoding (i.e. not IDNA ASCII and not
    # Unicode characters) is at most 254 octets. If the addres is transmitted
    # using SMTPUTF8, then the length limit probably applies to the UTF-8
    # encoded octets.  If the email address has an ASCII form that differs
    # from its internationalized form, I don't think the internationalized
    # form can be longer, and so the ASCII form length check would be
    # sufficient.  If there is no ASCII form, then we have to check the UTF-8
    # encoding. The UTF-8 encoding could be up to about four times longer than
    # the number of characters.
    if ascii_email and len(ascii_email) > _EMAIL_MAX_LENGTH:
        raise ValueError("email address is too long when isda encoded")
    else:
        if len(normalized_email) > _EMAIL_MAX_LENGTH:
            raise ValueError("email address is too long")

        if len(normalized_email.encode("utf8")) > _EMAIL_MAX_LENGTH:
            raise ValueError("email address is too long when utf-8 encoded")

    if not allow_unnormalized and value != normalized_email:
        raise ValueError("email address is not normalised")
def validate_email_domain_part(domain):
	"""Validate and normalize the domain part of an email address.

	Returns a dict with ``"domain"`` (the IDNA ASCII form) and
	``"domain_i18n"`` (the Unicode form obtained from ``idna.decode``).
	Every failed check raises ``EmailSyntaxError``.
	"""
	# Empty?
	if len(domain) == 0:
		raise EmailSyntaxError("There must be something after the @-sign.")

	# Perform UTS-46 normalization, which includes casefolding, NFC normalization,
	# and converting all label separators (the period/full stop, fullwidth full stop,
	# ideographic full stop, and halfwidth ideographic full stop) to basic periods.
	# It will also raise an exception if there is an invalid character in the input,
	# such as "⒈" which is invalid because it would expand to include a period.
	try:
		domain = idna.uts46_remap(domain, std3_rules=False, transitional=False)
	except idna.IDNAError as e:
		raise EmailSyntaxError("The domain name %s contains invalid characters (%s)." % (domain, str(e)))

	# Now we can perform basic checks on the use of periods (since equivalent
	# symbols have been mapped to periods). These checks are needed because the
	# IDNA library doesn't handle well domains that have empty labels (i.e. initial
	# dot, trailing dot, or two dots in a row).
	if domain.endswith("."):
		raise EmailSyntaxError("An email address cannot end with a period.")
	if domain.startswith("."):
		raise EmailSyntaxError("An email address cannot have a period immediately after the @-sign.")
	if ".." in domain:
		raise EmailSyntaxError("An email address cannot have two periods in a row.")

	# Regardless of whether international characters are actually used,
	# first convert to IDNA ASCII. For ASCII-only domains, the transformation
	# does nothing. If internationalized characters are present, the MTA
	# must either support SMTPUTF8 or the mail client must convert the
	# domain name to IDNA before submission.
	#
	# Unfortunately this step incorrectly 'fixes' domain names with leading
	# periods by removing them, so we have to check for this above. It also gives
	# a funky error message ("No input") when there are two periods in a
	# row, also checked separately above.
	try:
		domain = idna.encode(domain, uts46=False).decode("ascii")
	except idna.IDNAError as e:
		if "Domain too long" in str(e):
			# idna.encode enforces the length limit itself, so the explicit
			# length check further down is not reached for over-long domains.
			# Report the length problem here instead of calling it an
			# "invalid characters" error.
			raise EmailSyntaxError("The email address is too long after the @-sign.")
		raise EmailSyntaxError("The domain name %s contains invalid characters (%s)." % (domain, str(e)))

	# We may have been given an IDNA ASCII domain to begin with. Check
	# that the domain actually conforms to IDNA. It could look like IDNA
	# but not be actual IDNA. For ASCII-only domains, the conversion out
	# of IDNA just gives the same thing back.
	#
	# This gives us the canonical internationalized form of the domain,
	# which we should use in all error messages.
	try:
		domain_i18n = idna.decode(domain.encode('ascii'))
	except idna.IDNAError as e:
		raise EmailSyntaxError("The domain name %s is not valid IDNA (%s)." % (domain, str(e)))

	# RFC 5321 4.5.3.1.2
	if len(domain) > 255:
		raise EmailSyntaxError("The email address is too long after the @-sign.")

	# A "dot atom text", per RFC 2822 3.2.4, but using the restricted
	# characters allowed in a hostname (see ATEXT_HOSTNAME above).
	DOT_ATOM_TEXT = ATEXT_HOSTNAME + r'(?:\.' + ATEXT_HOSTNAME + r')*'

	# Check the regular expression. This is probably entirely redundant
	# with idna.decode, which also checks this format.
	# Anchor with \Z rather than $: "$" also matches just before a trailing
	# newline, which would let "example.com\n" through this check.
	m = re.match(DOT_ATOM_TEXT + "\\Z", domain)
	if not m:
		raise EmailSyntaxError("The email address contains invalid characters after the @-sign.")

	# All publicly deliverable addresses have domain named with at least
	# one period. We also know that all TLDs end with a letter.
	if "." not in domain:
		raise EmailSyntaxError("The domain name %s is not valid. It should have a period." % domain_i18n)
	# \Z (not $) so that a letter followed by a newline does not count.
	if not re.search(r"[A-Za-z]\Z", domain):
		raise EmailSyntaxError("The domain name %s is not valid. It is not within a valid top-level domain." % domain_i18n)

	# Return the IDNA ASCII-encoded form of the domain, which is how it
	# would be transmitted on the wire (except when used with SMTPUTF8
	# possibly), as well as the canonical Unicode form of the domain,
	# which is better for display purposes. This should also take care
	# of RFC 6532 section 3.1's suggestion to apply Unicode NFC
	# normalization to addresses.
	return {
		"domain": domain,
		"domain_i18n": domain_i18n,
	}
    def test_std3(self):
        """Exercise ``idna.uts46_remap`` on ``'A_'`` with STD3 rules off and on."""
        # Without STD3 rules the test expects the remapped value 'a_'.
        lenient_result = idna.uts46_remap('A_', std3_rules=False)
        self.assertEqual(lenient_result, 'a_')

        # With STD3 rules the same input is expected to raise.
        with self.assertRaises(idna.InvalidCodepoint):
            idna.uts46_remap('A_', std3_rules=True)
Example #9
0
def to_bytes(proto, string):
    """Return *string* after UTS-46 remapping, encoded as UTF-8 bytes.

    ``proto`` is accepted but not used by this function.
    """
    remapped = idna.uts46_remap(string)
    return remapped.encode("utf-8")