Ejemplo n.º 1
0
 def test_pack_bitvector(self):
     """Round-trip every integer below 1 << 10 through unpack and pack.

     Each value is expanded with ``unpack_bitvector`` and rebuilt with
     ``bitvector``; the rebuilt value must equal the starting integer.
     """
     for value in range(1024):
         bits = unpack_bitvector(value)
         repacked = bitvector(bits)
         # Emit the value next to its unpacked form.
         print(value, bits)
         self.assertEqual(value, repacked)
Ejemplo n.º 2
0
 def test_unpack_bitvector(self):
     """Check that a value with one bit set unpacks to just that bit's index."""
     for position in range(32):
         unpacked = unpack_bitvector(1 << position)
         # Exactly one entry is expected, and it must be the bit position.
         self.assertEqual(len(unpacked), 1)
         self.assertEqual(unpacked[0], position)
Ejemplo n.º 3
0
def simplify_charclass(matching_codes, ignorecase=False):
    """Given a sequence of ordinals, return a (seq, negated) tuple.

    `ignorecase` is whether the regex flags include re.IGNORECASE.

    Every subset of the category keys in CATS is tried, both for the class
    itself and for its negation.  A subset is usable when its combined bits
    fit entirely inside the target bits; whatever target bits remain are
    turned into ranges by build_ranges.  Each usable candidate is scored with
    charclass_score and the lowest-scoring one is returned.

    Args:
        matching_codes: sequence of character ordinals matched by the class.
            It is consumed more than once (max(), bitvector(), membership
            tests), so it must be re-iterable.
        ignorecase: when true, the codes are passed through lowercase_code
            and INSENSITIVE_ASCII is used as the universe instead of ASCII.

    Returns:
        A tuple (seq, negated).  `seq` is the list of chosen category keys
        followed by the items build_ranges produced for the remaining codes;
        `negated` is 0 or 1.

    Raises:
        WontOptimize: with a basic reason string ('Unicode', 'Hex digit' or
            'Alphanumeric without _') if the class shouldn't be optimized.
        ValueError: from max() if `matching_codes` is empty.
    """
    if max(matching_codes) > 255:
        raise WontOptimize('Unicode')

    # HACK: Don't simplify something that looks fairly like a hex digit pattern.
    # They look arguably prettier as '0-9a-f' than '\da-f'
    bv = bitvector(matching_codes)
    if (bv & HEX) == HEX and ord('g') not in matching_codes:
        raise WontOptimize('Hex digit')
    if (bv & ALNUM) == ALNUM and ord('_') not in matching_codes:
        raise WontOptimize('Alphanumeric without _')

    if ignorecase:
        bv = bitvector(map(lowercase_code, matching_codes))
        base = INSENSITIVE_ASCII
    else:
        base = ASCII

    # Tries all possibilities of categories first.
    keys = sorted(CATS.keys(), reverse=True)
    # Strategy: since we have a small number of categories, try each of them to
    # see if it's legal; add in remaining ranges; score.
    # Per the original author's note: when negated=0 there are 64 (=2**6)
    # combinations to check, and when negated=1 only 8 (=2**3) survive the
    # double-negative filter below.
    possibilities = []
    for negated in (0, 1):
        # The target only depends on `negated`, so compute it once per pass:
        # the class's own bits, or everything in `base` that is not in it.
        target = base ^ (base & bv) if negated else bv

        for i in range(2**len(keys)):
            # Bit b of i selects keys[b].
            chosen_keys = [k for b, k in enumerate(keys) if i & (1 << b)]
            # Humans are terrible at double-negatives.  If this involves a
            # negation of the charclass as well as the category, tough cookies.
            # This will cause suggested _expansion_ of any such uses already in
            # the codebase, which should be ignored by the caller.
            if negated and any(k[1].isupper() for k in chosen_keys):
                continue

            chosen = 0
            for k in chosen_keys:
                chosen |= CATS[k]
            chosen &= base

            # Only usable iff. the chosen categories fit entirely in the target.
            if chosen & target != chosen:
                continue

            # Whatever the categories do not cover is spelled out as ranges,
            # with the category keys placed in front.
            r = build_ranges(unpack_bitvector(target ^ chosen))
            r[:0] = chosen_keys
            discount = 1 if chosen_keys == ['\\w', '\\W'] else 0

            if r:
                possibilities.append((charclass_score(r, negated) - discount,
                                      r, negated))

    # There will always be one, since we include no-categories above, and it's
    # not on the WontOptimize list.
    # Rank on the score alone.  Comparing whole tuples would fall through to
    # the `r` lists on a score tie, and those mix category strings with
    # whatever build_ranges returns, which is not guaranteed to be orderable.
    # list.sort() is stable, so tied candidates keep their generation order.
    possibilities.sort(key=lambda entry: entry[0])
    best = possibilities[0]
    return (best[1], best[2])
Ejemplo n.º 4
0
 def test_pack_bitvector(self):
     """Verify that ``bitvector`` inverts ``unpack_bitvector`` for 0..1023."""
     for number in range(2 ** 10):
         unpacked = unpack_bitvector(number)
         rebuilt = bitvector(unpacked)
         # Print the integer together with its unpacked representation.
         print(number, unpacked)
         self.assertEqual(number, rebuilt)
Ejemplo n.º 5
0
 def test_unpack_bitvector(self):
     """Unpack each single-bit value for bits 0..31 and expect one index back."""
     for bit_index in range(32):
         single_bit = 1 << bit_index
         indices = unpack_bitvector(single_bit)
         # One set bit in, one index out: the index of that bit.
         self.assertEqual(len(indices), 1)
         self.assertEqual(indices[0], bit_index)
Ejemplo n.º 6
0
def simplify_charclass(matching_codes, ignorecase=False):
    """Given a sequence of ordinals, return a (seq, negated) tuple.

    `ignorecase` is whether the regex flags include re.IGNORECASE.

    Every subset of the category keys in CATS is tried, both for the class
    itself and for its negation.  A subset is usable when its combined bits
    fit entirely inside the target bits; whatever target bits remain are
    turned into ranges by build_ranges.  Each usable candidate is scored with
    charclass_score and the lowest-scoring one is returned; ties keep the
    order in which candidates were generated.

    Args:
        matching_codes: sequence of character ordinals matched by the class.
            It is consumed more than once (max(), bitvector(), membership
            tests), so it must be re-iterable.
        ignorecase: when true, the codes are passed through lowercase_code
            and INSENSITIVE_ASCII is used as the universe instead of ASCII.

    Returns:
        A tuple (seq, negated).  `seq` is the list of chosen category keys
        followed by the items build_ranges produced for the remaining codes;
        `negated` is 0 or 1.

    Raises:
        WontOptimize: with a basic reason string ('Unicode', 'Hex digit' or
            'Alphanumeric without _') if the class shouldn't be optimized.
        ValueError: from max() if `matching_codes` is empty.
    """
    if max(matching_codes) > 255:
        raise WontOptimize('Unicode')

    # HACK: Don't simplify something that looks fairly like a hex digit pattern.
    # They look arguably prettier as '0-9a-f' than '\da-f'
    bv = bitvector(matching_codes)
    if (bv & HEX) == HEX and ord('g') not in matching_codes:
        raise WontOptimize('Hex digit')
    if (bv & ALNUM) == ALNUM and ord('_') not in matching_codes:
        raise WontOptimize('Alphanumeric without _')

    if ignorecase:
        bv = bitvector(map(lowercase_code, matching_codes))
        base = INSENSITIVE_ASCII
    else:
        base = ASCII

    # Tries all possibilities of categories first.
    keys = sorted(CATS.keys(), reverse=True)
    # Strategy: since we have a small number of categories, try each of them to
    # see if it's legal; add in remaining ranges; score.
    # Per the original author's note: when negated=0 there are 64 (=2**6)
    # combinations to check, and when negated=1 only 8 (=2**3) survive the
    # double-negative filter below.
    possibilities = []
    for negated in (0, 1):
        #  target is the set of all characters we want to match, and none of the
        #  ones we don't (note: for case-insensitive, we mask `chosen' before
        #  comparing later).
        if not negated:
            target = bv
        elif ignorecase:
            # Unpack once into a set: the original re-ran unpack_bitvector(bv)
            # and scanned its result for each of the 256 candidate codes.
            present = set(unpack_bitvector(bv))
            # NOTE(review): codes missing from bv are folded through
            # lowercase_code; presumably an upper-case code absent from bv
            # then maps onto its lower-case bit even if that bit is in bv --
            # confirm this is intended.
            target = bitvector(
                map(lowercase_code,
                    [code for code in range(256) if code not in present]))
        else:
            target = base ^ (base & bv)

        for i in range(2**len(keys)):
            # Bit b of i selects keys[b].
            chosen_keys = [k for b, k in enumerate(keys) if i & (1 << b)]
            # Humans are terrible at double-negatives.  If this involves a
            # negation of the charclass as well as the category, tough cookies.
            # This will cause suggested _expansion_ of any such uses already in
            # the codebase, which should be ignored by the caller.
            if negated and any(k[1].isupper() for k in chosen_keys):
                continue

            chosen = 0
            for k in chosen_keys:
                chosen |= CATS[k]
            # N.b. don't need to conditionally lowercase_code here because all
            # our categories contain lower if they contain upper.
            chosen &= base

            # Only usable iff. the chosen categories fit entirely in the target.
            if chosen & target != chosen:
                continue

            # Whatever the categories do not cover is spelled out as ranges,
            # with the category keys placed in front.
            r = build_ranges(unpack_bitvector(target ^ chosen))
            r[:0] = chosen_keys
            discount = 1 if chosen_keys == ['\\w', '\\W'] else 0

            if r:
                possibilities.append(
                    (charclass_score(r, negated) - discount, r, negated))

    # There will always be one, since we include no-categories above, and it's
    # not on the WontOptimize list.
    # Rank on the score alone; list.sort() is stable, so tied candidates keep
    # their generation order.
    possibilities.sort(key=lambda entry: entry[0])
    best = possibilities[0]
    return (best[1], best[2])