コード例 #1
0
ファイル: handlers.py プロジェクト: brettgoss/datazoomer
    def _calc_checksum(self, secret):
        """Calculate the checksum of ``secret`` via its mysql323 digest.

        The secret is first reduced to its 16-hex-char mysql323 hash; that
        hash is run through SHA-256, the digest is base64-encoded, and the
        result is passed to ``self._calc_checksum_backend`` as the key.

        :arg secret: password as unicode or bytes (unicode is encoded as utf-8)
        :returns: whatever ``self._calc_checksum_backend`` returns for the key
        """
        # NOTE: this bypasses bcrypt's _calc_checksum,
        #       so it has to take care of all its issues, such as secret encoding.
        if isinstance(secret, unicode):
            secret = secret.encode("utf-8")

        # generate the mysql323 hash first (as it would be in the db)
        MASK_32 = 0xffffffff
        MASK_31 = 0x7fffffff
        WHITE = b' \t'

        nr1 = 0x50305735
        nr2 = 0x12345671
        add = 7
        for c in secret:
            # spaces and tabs do not contribute to the hash
            if c in WHITE:
                continue
            tmp = byte_elem_value(c)
            nr1 ^= ((((nr1 & 63)+add)*tmp) + (nr1 << 8)) & MASK_32
            nr2 = (nr2+((nr2 << 8) ^ nr1)) & MASK_32
            add = (add+tmp) & MASK_32
        mysql323_hash = u("%08x%08x") % (nr1 & MASK_31, nr2 & MASK_31)

        # hashlib refuses unicode input on Python 3. the hex string is pure
        # ASCII, so encoding it produces the same bytes that Python 2 would
        # have hashed through its implicit unicode->str coercion.
        if isinstance(mysql323_hash, unicode):
            mysql323_hash = mysql323_hash.encode("ascii")

        # NOTE: can't use digest directly, since bcrypt stops at first NULL.
        # NOTE: bcrypt doesn't fully mix entropy for bytes 55-72 of password
        #       (XXX: citation needed), so we don't want key to be > 55 bytes.
        #       thus, have to use base64 (44 bytes) rather than hex (64 bytes).
        key = b64encode(sha256(mysql323_hash).digest())
        return self._calc_checksum_backend(key)
コード例 #2
0
ファイル: des_crypt.py プロジェクト: cutso/passlib
def _crypt_secret_to_key(secret):
    """Pack the leading bytes of ``secret`` into an integer DES key.

    At most the first 8 bytes of the secret are consumed. Each byte has its
    high (8th) bit masked off, and the remaining 7 bits are shifted so that
    the lowest bit of every output octet stays zero (a null parity bit).

    :arg secret: secret as a byte sequence
    :returns: key as an integer
    """
    # NOTE: the parity bits are left null instead of being computed; per the
    #       original author, des_encrypt_int_block() would ignore them anyway.
    key = 0
    for position, elem in enumerate(secret[:8]):
        septet = byte_elem_value(elem) & 0x7f
        # first byte lands in bits 57-63, each later byte 8 bits lower
        key += septet << (57 - 8 * position)
    return key
コード例 #3
0
def _crypt_secret_to_key(secret):
    """Build an integer DES key from the start of ``secret``.

    Only the first 8 bytes of the secret take part, and the high bit of
    each of those bytes is thrown away. Every 7-bit group is placed one bit
    above an octet boundary, leaving a zero (null parity) bit beneath it.

    :arg secret: secret as a byte sequence
    :returns: key as an integer
    """
    # NOTE: real parity bits are not generated here; the original author
    #       notes that des_encrypt_int_block() would just ignore them.
    total = 0
    shift = 57
    for elem in secret[:8]:
        total += (byte_elem_value(elem) & 0x7f) << shift
        shift -= 8
    return total
コード例 #4
0
ファイル: mysql.py プロジェクト: dragoncsc/HDsite
    def _calc_checksum(self, secret):
        """Return the mysql323 checksum of ``secret`` as 16 hex characters.

        :arg secret: password as unicode or bytes (unicode is encoded as utf-8)
        :returns: two zero-padded 8-digit hex words, joined together
        """
        # FIXME: no idea if mysql has a policy about handling unicode passwords
        if isinstance(secret, unicode):
            secret = secret.encode("utf-8")

        mask32 = 0xffffffff
        mask31 = 0x7fffffff
        skipped = b' \t'

        state1 = 0x50305735
        state2 = 0x12345671
        step = 7
        for elem in secret:
            # spaces and tabs are ignored entirely
            if elem not in skipped:
                value = byte_elem_value(elem)
                state1 ^= (((state1 & 63) + step) * value + (state1 << 8)) & mask32
                state2 = (state2 + ((state2 << 8) ^ state1)) & mask32
                step = (step + value) & mask32

        # only the low 31 bits of each state word make it into the output
        return u("%08x%08x") % (state1 & mask31, state2 & mask31)
コード例 #5
0
    def _calc_checksum(self, secret):
        """Compute the mysql323 hash of ``secret``.

        :arg secret: password as unicode or bytes (unicode is encoded as utf-8)
        :returns: 16 hex characters (two 31-bit words, 8 digits each)
        """
        # FIXME: no idea if mysql has a policy about handling unicode passwords
        if isinstance(secret, unicode):
            secret = secret.encode("utf-8")

        word_mask = 0xffffffff
        out_mask = 0x7fffffff
        blanks = b' \t'

        n1 = 0x50305735
        n2 = 0x12345671
        accum = 7
        for c in secret:
            # whitespace (space / tab) does not affect the hash
            if c in blanks:
                continue
            code = byte_elem_value(c)
            mixed = ((n1 & 63) + accum) * code + (n1 << 8)
            n1 ^= mixed & word_mask
            # n2 is folded with the *updated* n1
            folded = (n2 << 8) ^ n1
            n2 = (n2 + folded) & word_mask
            accum = (accum + code) & word_mask
        return u("%08x%08x") % (n1 & out_mask, n2 & out_mask)
コード例 #6
0
def _raw_sha2_crypt(pwd, salt, rounds, use_512=False):
    """perform raw sha256-crypt / sha512-crypt

    this function provides a pure-python implementation of the internals
    for the SHA256-Crypt and SHA512-Crypt algorithms; it doesn't
    handle any of the parsing/validation of the hash strings themselves.

    :arg pwd: password chars/bytes to hash (unicode is encoded as utf-8)
    :arg salt: salt chars to use (unicode, ascii-encodable, at most 16 chars)
    :arg rounds: linear rounds cost (asserted to lie in 1000..999999999)
    :arg use_512: use sha512-crypt instead of sha256-crypt mode

    :raises uh.exc.NullPasswordError: if the password contains ``_BNULL``

    :returns:
        encoded checksum chars
    """
    # ===================================================================
    # init & validate inputs
    # ===================================================================

    # NOTE: the setup portion of this algorithm scales ~linearly in time
    #       with the size of the password, making it vulnerable to a DOS from
    #       unreasonably large inputs. the following code has some optimizations
    #       which would make things even worse, using O(pwd_len**2) memory
    #       when calculating digest P.
    #
    #       to mitigate these two issues: 1) this code switches to a
    #       O(pwd_len)-memory algorithm for passwords that are much larger
    #       than average, and 2) Passlib enforces a library-wide max limit on
    #       the size of passwords it will allow, to prevent this algorithm and
    #       others from being DOSed in this way (see passlib.exc.PasswordSizeError
    #       for details).

    # validate secret
    if isinstance(pwd, unicode):
        # XXX: not sure what official unicode policy is, using this as default
        pwd = pwd.encode("utf-8")
    assert isinstance(pwd, bytes)
    if _BNULL in pwd:
        raise uh.exc.NullPasswordError(
            sha512_crypt if use_512 else sha256_crypt)
    pwd_len = len(pwd)

    # validate rounds
    assert 1000 <= rounds <= 999999999, "invalid rounds"
    # NOTE: spec says out-of-range rounds should be clipped, instead of
    # causing an error. this function assumes that's been taken care of
    # by the handler class.

    # validate salt
    assert isinstance(salt, unicode), "salt not unicode"
    salt = salt.encode("ascii")
    salt_len = len(salt)
    assert salt_len < 17, "salt too large"
    # NOTE: spec says salts larger than 16 bytes should be truncated,
    # instead of causing an error. this function assumes that's been
    # taken care of by the handler class.

    # load sha256/512 specific constants
    if use_512:
        hash_const = hashlib.sha512
        transpose_map = _512_transpose_map
    else:
        hash_const = hashlib.sha256
        transpose_map = _256_transpose_map

    # ===================================================================
    # digest B - used as subinput to digest A
    # ===================================================================
    db = hash_const(pwd + salt + pwd).digest()

    # ===================================================================
    # digest A - used to initialize first round of digest C
    # ===================================================================
    # start out with pwd + salt
    a_ctx = hash_const(pwd + salt)
    # bind the update method to a local name, since it's called repeatedly
    a_ctx_update = a_ctx.update

    # add pwd_len bytes of b, repeating b as many times as needed.
    a_ctx_update(repeat_string(db, pwd_len))

    # for each bit in pwd_len: add b if it's 1, or pwd if it's 0
    # (bits are consumed from least significant to most significant)
    i = pwd_len
    while i:
        a_ctx_update(db if i & 1 else pwd)
        i >>= 1

    # finish A
    da = a_ctx.digest()

    # ===================================================================
    # digest P from password - used instead of password itself
    #                          when calculating digest C.
    # ===================================================================
    if pwd_len < 96:
        # this method is faster under python, but uses O(pwd_len**2) memory;
        # so we don't use it for larger passwords to avoid a potential DOS.
        dp = repeat_string(hash_const(pwd * pwd_len).digest(), pwd_len)
    else:
        # this method is slower under python, but uses a fixed amount of memory.
        # (the context starts with one copy of pwd, so only pwd_len-1 more
        # updates are needed to feed pwd in pwd_len times)
        tmp_ctx = hash_const(pwd)
        tmp_ctx_update = tmp_ctx.update
        i = pwd_len - 1
        while i:
            tmp_ctx_update(pwd)
            i -= 1
        dp = repeat_string(tmp_ctx.digest(), pwd_len)
    assert len(dp) == pwd_len

    # ===================================================================
    # digest S  - used instead of salt itself when calculating digest C
    # ===================================================================
    # the salt is repeated (16 + first byte of digest A) times before hashing,
    # and the resulting digest is truncated to the salt's length.
    ds = hash_const(salt * (16 + byte_elem_value(da[0]))).digest()[:salt_len]
    assert len(ds) == salt_len, "salt_len somehow > hash_len!"

    # ===================================================================
    # digest C - for a variable number of rounds, combine A, S, and P
    #            digests in various ways; in order to burn CPU time.
    # ===================================================================

    # NOTE: the original SHA256/512-Crypt specification performs the C digest
    # calculation using the following loop:
    #
    ##dc = da
    ##i = 0
    ##while i < rounds:
    ##    tmp_ctx = hash_const(dp if i & 1 else dc)
    ##    if i % 3:
    ##        tmp_ctx.update(ds)
    ##    if i % 7:
    ##        tmp_ctx.update(dp)
    ##    tmp_ctx.update(dc if i & 1 else dp)
    ##    dc = tmp_ctx.digest()
    ##    i += 1
    #
    # The code Passlib uses (below) implements an equivalent algorithm,
    # it's just been heavily optimized to pre-calculate a large number
    # of things beforehand. It works off of a couple of observations
    # about the original algorithm:
    #
    # 1. each round is a combination of 'dc', 'ds', and 'dp'; determined
    #    by whether 'i' is a multiple of 2, 3, and/or 7.
    # 2. since lcm(2,3,7)==42, the series of combinations will repeat
    #    every 42 rounds.
    # 3. even rounds 0-40 consist of 'hash(dc + round-specific-constant)';
    #    while odd rounds 1-41 consist of hash(round-specific-constant + dc)
    #
    # Using these observations, the following code...
    # * calculates the round-specific combination of ds & dp for each round 0-41
    # * runs through as many 42-round blocks as possible
    # * runs through as many pairs of rounds as possible for remaining rounds
    # * performs one last round if the total rounds should be odd.
    #
    # this cuts out a lot of the control overhead incurred when running the
    # original loop 40,000+ times in python, resulting in ~20% increase in
    # speed under CPython (though still 2x slower than glibc crypt)

    # prepare the 6 combinations of ds & dp which are needed
    # (order of 'perms' must match how _c_digest_offsets was generated)
    dp_dp = dp + dp
    dp_ds = dp + ds
    perms = [dp, dp_dp, dp_ds, dp_ds + dp, ds + dp, ds + dp_dp]

    # build up list of even-round & odd-round constants,
    # and store in 21-element list as (even,odd) pairs.
    data = [(perms[even], perms[odd]) for even, odd in _c_digest_offsets]

    # perform as many full 42-round blocks as possible
    # (each pass of the inner loop body performs an even round, then an odd one)
    dc = da
    blocks, tail = divmod(rounds, 42)
    while blocks:
        for even, odd in data:
            dc = hash_const(odd + hash_const(dc + even).digest()).digest()
        blocks -= 1

    # perform any leftover rounds
    if tail:
        # perform any pairs of rounds
        pairs = tail >> 1
        for even, odd in data[:pairs]:
            dc = hash_const(odd + hash_const(dc + even).digest()).digest()

        # if rounds was odd, do one last round (since we started at 0,
        # last round will be an even-numbered round)
        if tail & 1:
            dc = hash_const(dc + data[pairs][0]).digest()

    # ===================================================================
    # encode digest using appropriate transpose map
    # ===================================================================
    return h64.encode_transposed_bytes(dc, transpose_map).decode("ascii")
コード例 #7
0
ファイル: sun_md5_crypt.py プロジェクト: marta90/Projekt
def raw_sun_md5_crypt(secret, rounds, salt):
    """Compute the encoded sun-md5-crypt checksum for a secret and salt.

    :arg secret: password, as bytes
    :arg rounds: extra rounds on top of the fixed 4096; values at or below
                 zero are treated as zero
    :arg salt: salt, as bytes (per the original notes, the algorithm's 'salt'
               is the full config string, including the trailing "$")
    :returns: result of ``h64.encode_transposed_bytes`` for the final digest
    """
    assert isinstance(secret, bytes)
    assert isinstance(salt, bytes)

    # validate rounds
    # NOTE: spec seems to imply max 'rounds' is 2**32-1
    extra_rounds = rounds if rounds > 0 else 0
    total_rounds = 4096 + extra_rounds

    # generate initial digest to start off round 0.
    digest = md5(secret + salt).digest()
    assert len(digest) == 16

    # NOTE: much of this function is hand-inlined for speed, so it barely
    #       resembles the algorithm as described in the docs. in particular:
    #
    #       * every access to a single bit uses the formula
    #         bitval(bit) = (byte_at((bit>>3) & 15) >> (bit & 7)) & 1
    #
    #       * the coinflip values R and V are computed inline; the conditional
    #         division of V is expressed as a right shift by 0 or 1.
    #
    #       * the i, i+3, etc iterations come from precalculated tables, and
    #         the round-dependent division of x & y is handled by picking the
    #         matching table, so only the bits actually used are calculated.
    #
    x_when_0, x_when_1, y_when_0, y_when_1 = _XY_ROUNDS

    # NOTE: % appears to be *slightly* slower than &, so & is preferred

    rnd = 0
    while rnd < total_rounds:
        # expose the previous digest as a byte-int lookup function
        byte_at = [byte_elem_value(elem) for elem in digest].__getitem__
        low_bits = rnd & 7

        # choose the X table from bit <rnd> of the digest, then build X
        if (byte_at((rnd >> 3) & 15) >> low_bits) & 1:
            x_table = x_when_1
        else:
            x_table = x_when_0
        x = 0
        for pos, ia, ib in x_table:
            a = byte_at(ia)
            b = byte_at(ib)
            v = byte_at((a >> (b % 5)) & 15) >> ((b >> (a & 7)) & 1)
            x |= ((byte_at((v >> 3) & 15) >> (v & 7)) & 1) << pos

        # choose the Y table from bit <rnd+64> of the digest, then build Y
        if (byte_at(((rnd + 64) >> 3) & 15) >> low_bits) & 1:
            y_table = y_when_1
        else:
            y_table = y_when_0
        y = 0
        for pos, ia, ib in y_table:
            a = byte_at(ia)
            b = byte_at(ib)
            v = byte_at((a >> (b % 5)) & 15) >> ((b >> (a & 7)) & 1)
            y |= ((byte_at((v >> 3) & 15) >> (v & 7)) & 1) << pos

        # xor the x'th and y'th digest bits together to get the "coin flip"
        x_bit = byte_at(x >> 3) >> (x & 7)
        y_bit = byte_at(y >> 3) >> (y & 7)
        coin = (x_bit ^ y_bit) & 1

        # hash previous digest (+ magic text on heads) + decimal round number
        ctx = md5(digest)
        if coin:
            ctx.update(MAGIC_HAMLET)
        ctx.update(unicode(rnd).encode("ascii"))
        digest = ctx.digest()

        rnd += 1

    # encode output
    return h64.encode_transposed_bytes(digest, _chk_offsets)
コード例 #8
0
def raw_sun_md5_crypt(secret, rounds, salt):
    """Return the encoded sun-md5-crypt checksum for ``secret`` and ``salt``.

    :arg secret: password, as bytes
    :arg rounds: additional rounds beyond the built-in 4096; zero or negative
                 values add nothing
    :arg salt: salt, as bytes (the original notes say this is the full config
               string with its trailing "$")
    :returns: the final digest passed through ``h64.encode_transposed_bytes``
    """
    assert isinstance(secret, bytes)
    assert isinstance(salt, bytes)

    def _gather_bits(lookup, table):
        # assemble an integer bit-by-bit; each table entry names the output
        # bit position and the two digest bytes that select the source bit.
        bits = 0
        for pos, ia, ib in table:
            a = lookup(ia)
            b = lookup(ib)
            v = lookup((a >> (b % 5)) & 15) >> ((b >> (a & 7)) & 1)
            bits |= ((lookup((v >> 3) & 15) >> (v & 7)) & 1) << pos
        return bits

    # validate rounds
    # NOTE: spec seems to imply max 'rounds' is 2**32-1
    if rounds <= 0:
        rounds = 0
    stop = 4096 + rounds

    # generate initial digest to start off round 0.
    state = md5(secret + salt).digest()
    assert len(state) == 16

    # NOTE: single-bit accesses use the formula
    #           bitval(bit) = (lookup((bit>>3) & 15) >> (bit & 7)) & 1
    #       and the round-based choice between the two X tables / two Y
    #       tables stands in for the conditional division described in the
    #       algorithm's docs, so only the needed bits get calculated.
    x_tables = _XY_ROUNDS[0:2]
    y_tables = _XY_ROUNDS[2:4]
    X_ROUNDS_0, X_ROUNDS_1 = x_tables
    Y_ROUNDS_0, Y_ROUNDS_1 = y_tables

    counter = 0
    while counter < stop:
        # view of the last digest as a list of byte-ints
        lookup = [byte_elem_value(c) for c in state].__getitem__
        offset = counter & 7

        # build up X, using the table selected by bit <counter>
        x_flag = (lookup((counter >> 3) & 15) >> offset) & 1
        x = _gather_bits(lookup, X_ROUNDS_1 if x_flag else X_ROUNDS_0)

        # build up Y, using the table selected by bit <counter+64>
        y_flag = (lookup(((counter + 64) >> 3) & 15) >> offset) & 1
        y = _gather_bits(lookup, Y_ROUNDS_1 if y_flag else Y_ROUNDS_0)

        # extract x'th and y'th bit, xoring them together to yield "coin flip"
        coin = ((lookup(x >> 3) >> (x & 7)) ^ (lookup(y >> 3) >> (y & 7))) & 1

        # construct hash for this round
        hasher = md5(state)
        if coin:
            hasher.update(MAGIC_HAMLET)
        hasher.update(unicode(counter).encode("ascii"))
        state = hasher.digest()

        counter += 1

    # encode output
    return h64.encode_transposed_bytes(state, _chk_offsets)
コード例 #9
0
ファイル: sha2_crypt.py プロジェクト: cutso/passlib
def _raw_sha2_crypt(pwd, salt, rounds, use_512=False):
    """perform raw sha256-crypt / sha512-crypt

    this function provides a pure-python implementation of the internals
    for the SHA256-Crypt and SHA512-Crypt algorithms; it doesn't
    handle any of the parsing/validation of the hash strings themselves.

    :arg pwd: password chars/bytes to hash (unicode is encoded as utf-8)
    :arg salt: salt chars to use (unicode, ascii-encodable, at most 16 chars)
    :arg rounds: linear rounds cost (asserted to lie in 1000..999999999)
    :arg use_512: use sha512-crypt instead of sha256-crypt mode

    :raises uh.exc.NullPasswordError: if the password contains ``_BNULL``

    :returns:
        encoded checksum chars
    """
    #===================================================================
    # init & validate inputs
    #===================================================================

    # NOTE: the setup portion of this algorithm scales ~linearly in time
    #       with the size of the password, making it vulnerable to a DOS from
    #       unreasonably large inputs. the following code has some optimizations
    #       which would make things even worse, using O(pwd_len**2) memory
    #       when calculating digest P.
    #
    #       to mitigate these two issues: 1) this code switches to a
    #       O(pwd_len)-memory algorithm for passwords that are much larger
    #       than average, and 2) Passlib enforces a library-wide max limit on
    #       the size of passwords it will allow, to prevent this algorithm and
    #       others from being DOSed in this way (see passlib.exc.PasswordSizeError
    #       for details).

    # validate secret
    if isinstance(pwd, unicode):
        # XXX: not sure what official unicode policy is, using this as default
        pwd = pwd.encode("utf-8")
    assert isinstance(pwd, bytes)
    if _BNULL in pwd:
        raise uh.exc.NullPasswordError(sha512_crypt if use_512 else sha256_crypt)
    pwd_len = len(pwd)

    # validate rounds
    # NOTE: spec says out-of-range rounds should be clipped, instead of
    #       causing an error. this function assumes that's been taken care of
    #       by the handler class.
    assert 1000 <= rounds <= 999999999, "invalid rounds"

    # validate salt
    # NOTE: spec says salts larger than 16 bytes should be truncated,
    #       instead of causing an error. this function assumes that's been
    #       taken care of by the handler class.
    assert isinstance(salt, unicode), "salt not unicode"
    salt = salt.encode("ascii")
    salt_len = len(salt)
    assert salt_len < 17, "salt too large"

    # load sha256/512 specific constants
    if use_512:
        hash_const = hashlib.sha512
        transpose_map = _512_transpose_map
    else:
        hash_const = hashlib.sha256
        transpose_map = _256_transpose_map

    #===================================================================
    # digest B - used as subinput to digest A
    #===================================================================
    db = hash_const(pwd + salt + pwd).digest()

    #===================================================================
    # digest A - used to initialize first round of digest C
    #===================================================================
    # start out with pwd + salt
    a_ctx = hash_const(pwd + salt)
    # bind the update method to a local name, since it's called repeatedly
    a_ctx_update = a_ctx.update

    # add pwd_len bytes of b, repeating b as many times as needed.
    a_ctx_update(repeat_string(db, pwd_len))

    # for each bit in pwd_len: add b if it's 1, or pwd if it's 0
    # (bits are consumed from least significant to most significant)
    i = pwd_len
    while i:
        a_ctx_update(db if i & 1 else pwd)
        i >>= 1

    # finish A
    da = a_ctx.digest()

    #===================================================================
    # digest P from password - used instead of password itself
    #                          when calculating digest C.
    #===================================================================
    if pwd_len < 96:
        # this method is faster under python, but uses O(pwd_len**2) memory;
        # so we don't use it for larger passwords to avoid a potential DOS.
        dp = repeat_string(hash_const(pwd * pwd_len).digest(), pwd_len)
    else:
        # this method is slower under python, but uses a fixed amount of memory.
        # (the context starts with one copy of pwd, so only pwd_len-1 more
        # updates are needed to feed pwd in pwd_len times)
        tmp_ctx = hash_const(pwd)
        tmp_ctx_update = tmp_ctx.update
        i = pwd_len - 1
        while i:
            tmp_ctx_update(pwd)
            i -= 1
        dp = repeat_string(tmp_ctx.digest(), pwd_len)
    assert len(dp) == pwd_len

    #===================================================================
    # digest S  - used instead of salt itself when calculating digest C
    #===================================================================
    # the salt is repeated (16 + first byte of digest A) times before hashing,
    # and the resulting digest is truncated to the salt's length.
    ds = hash_const(salt * (16 + byte_elem_value(da[0]))).digest()[:salt_len]
    assert len(ds) == salt_len, "salt_len somehow > hash_len!"

    #===================================================================
    # digest C - for a variable number of rounds, combine A, S, and P
    #            digests in various ways; in order to burn CPU time.
    #===================================================================

    # NOTE: the original SHA256/512-Crypt specification performs the C digest
    # calculation using the following loop:
    #
    ##dc = da
    ##i = 0
    ##while i < rounds:
    ##    tmp_ctx = hash_const(dp if i & 1 else dc)
    ##    if i % 3:
    ##        tmp_ctx.update(ds)
    ##    if i % 7:
    ##        tmp_ctx.update(dp)
    ##    tmp_ctx.update(dc if i & 1 else dp)
    ##    dc = tmp_ctx.digest()
    ##    i += 1
    #
    # The code Passlib uses (below) implements an equivalent algorithm,
    # it's just been heavily optimized to pre-calculate a large number
    # of things beforehand. It works off of a couple of observations
    # about the original algorithm:
    #
    # 1. each round is a combination of 'dc', 'ds', and 'dp'; determined
    #    by whether 'i' is a multiple of 2, 3, and/or 7.
    # 2. since lcm(2,3,7)==42, the series of combinations will repeat
    #    every 42 rounds.
    # 3. even rounds 0-40 consist of 'hash(dc + round-specific-constant)';
    #    while odd rounds 1-41 consist of hash(round-specific-constant + dc)
    #
    # Using these observations, the following code...
    # * calculates the round-specific combination of ds & dp for each round 0-41
    # * runs through as many 42-round blocks as possible
    # * runs through as many pairs of rounds as possible for remaining rounds
    # * performs one last round if the total rounds should be odd.
    #
    # this cuts out a lot of the control overhead incurred when running the
    # original loop 40,000+ times in python, resulting in ~20% increase in
    # speed under CPython (though still 2x slower than glibc crypt)

    # prepare the 6 combinations of ds & dp which are needed
    # (order of 'perms' must match how _c_digest_offsets was generated)
    dp_dp = dp + dp
    dp_ds = dp + ds
    perms = [dp, dp_dp, dp_ds, dp_ds + dp, ds + dp, ds + dp_dp]

    # build up list of even-round & odd-round constants,
    # and store in 21-element list as (even,odd) pairs.
    data = [(perms[even], perms[odd]) for even, odd in _c_digest_offsets]

    # perform as many full 42-round blocks as possible
    # (each pass of the inner loop body performs an even round, then an odd one)
    dc = da
    blocks, tail = divmod(rounds, 42)
    while blocks:
        for even, odd in data:
            dc = hash_const(odd + hash_const(dc + even).digest()).digest()
        blocks -= 1

    # perform any leftover rounds
    if tail:
        # perform any pairs of rounds
        pairs = tail >> 1
        for even, odd in data[:pairs]:
            dc = hash_const(odd + hash_const(dc + even).digest()).digest()

        # if rounds was odd, do one last round (since we started at 0,
        # last round will be an even-numbered round)
        if tail & 1:
            dc = hash_const(dc + data[pairs][0]).digest()

    #===================================================================
    # encode digest using appropriate transpose map
    #===================================================================
    return h64.encode_transposed_bytes(dc, transpose_map).decode("ascii")
コード例 #10
0
def utf8_truncate(source, index):
    """
    Truncate a UTF8 byte string at the nearest character boundary ON OR AFTER <index>.

    The returned prefix is at least <index> bytes long (or the whole string,
    if it is shorter than that). Starting at <index>, the cut point is moved
    forward past UTF8 continuation bytes (128 - 191 inclusive), stopping at
    the first byte that is not one, at ``index+3``, or at the end of the
    string -- whichever comes first. A negative <index> counts back from the
    end of the string, and is clamped to 0.

    :param bytes source:
    :param int index:
    :rtype: bytes
    :raises ExpectedTypeError: if source is not a bytes instance
    """
    # general approach:
    #
    # * the high two bits (mask 0xC0) of a UTF8 byte classify it:
    #   00 / 01 -- ascii char
    #   10      -- continuation of multibyte char
    #   11      -- start of multibyte char
    #   so it is safe to cut in front of any byte whose high bits aren't "10".
    #
    # * a UTF8 character has at most 3 continuation bytes, so at most 3 bytes
    #   need inspecting; capping the scan also keeps the result predictable
    #   for malformed / non-UTF8 input.

    if not isinstance(source, bytes):
        raise ExpectedTypeError(source, bytes, "source")

    # normalize index; nothing to trim if it's at or past the end
    size = len(source)
    if index < 0:
        index = max(0, index + size)
    if index >= size:
        return source

    # advance the cut point over continuation bytes, within the scan limit
    limit = min(index + 3, size)
    cut = index
    while cut < limit and byte_elem_value(source[cut]) & 0xC0 == 0x80:
        cut += 1

    prefix = source[:cut]

    def cut_is_on_boundary():
        # a source that isn't valid utf8 only gets the byte-level guarantee
        try:
            decoded = source.decode("utf-8")
        except UnicodeDecodeError:
            return True

        # otherwise the prefix must decode to a prefix of the decoded source
        assert decoded.startswith(prefix.decode("utf-8"))
        return True

    assert cut_is_on_boundary()

    return prefix