Exemple #1
0
def simplify_charclass(matching_codes, ignorecase=False):
    """Given a sequence of ordinals, return a (seq, negated) tuple.

    `ignorecase` is whether the regex flags include re.IGNORECASE.

    Every subset of the shorthand categories in CATS is tried, once against
    the class as written and once against its negation.  A subset is usable
    when all of its (masked) members lie inside the target; whatever is left
    of the target is handed to build_ranges, the candidate is scored with
    charclass_score, and the lowest-scoring candidate is returned.

    In the returned tuple, `seq` is the list of chosen category strings
    followed by the build_ranges output, and `negated` is 0 or 1.

    If the class shouldn't be optimized, raises WontOptimize with a basic reason
    string.
    """
    if max(matching_codes) > 255:
        raise WontOptimize('Unicode')

    # HACK: Don't simplify something that looks fairly like a hex digit pattern.
    # They look arguably prettier as '0-9a-f' than '\da-f'
    bv = bitvector(matching_codes)
    if (bv & HEX) == HEX and ord('g') not in matching_codes:
        raise WontOptimize('Hex digit')
    if (bv & ALNUM) == ALNUM and ord('_') not in matching_codes:
        raise WontOptimize('Alphanumeric without _')

    if ignorecase:
        bv = bitvector(map(lowercase_code, matching_codes))
        base = INSENSITIVE_ASCII
    else:
        base = ASCII

    # Strategy: since we have a small number of categories, try each subset of
    # them to see if it's legal; add in remaining ranges; score.
    # 2**len(keys) subsets are enumerated per pass; in the negated pass every
    # subset containing an upper-case category is skipped.
    keys = sorted(CATS.keys(), reverse=True)
    possibilities = []
    for negated in (0, 1):
        # The target does not depend on the subset, so compute it once per
        # pass: the class itself, or its complement within `base`.
        if negated:
            target = base ^ (base & bv)
        else:
            target = bv

        for i in range(2**len(keys)):
            chosen_keys = [k for b, k in enumerate(keys) if i & (1 << b)]
            # Humans are terrible at double-negatives.  If this involves a
            # negation of the charclass as well as the category, tough cookies.
            # This will cause suggested _expansion_ of any such uses already in
            # the codebase, which should be ignored by the caller.
            if negated and any(k[1].isupper() for k in chosen_keys):
                continue

            chosen = 0
            for k in chosen_keys:
                chosen |= CATS[k]
            chosen &= base

            # True iff. the chosen categories fit entirely in the target.
            if (chosen & target) == chosen:
                # Codes not covered by a category are spelled out as ranges.
                r = build_ranges(unpack_bitvector(target ^ chosen))
                r[:0] = chosen_keys
                discount = 1 if chosen_keys == ['\\w', '\\W'] else 0

                if r:
                    possibilities.append((charclass_score(r, negated) - discount,
                                          r, negated))

    # There will always be one, since we include no-categories above, and it's
    # not on the WontOptimize list.
    # Order by score only: on a tie, comparing whole tuples would fall through
    # to comparing the `r` lists, whose items are not all of one type.  The
    # sort is stable, so the earliest best-scoring candidate wins.
    possibilities.sort(key=lambda candidate: candidate[0])
    return (possibilities[0][1], possibilities[0][2])
 def test_pack_bitvector(self):
     """Round-trip each integer below 1 << 10 through unpack/pack."""
     upper_bound = 1 << 10
     for value in range(upper_bound):
         unpacked = unpack_bitvector(value)
         repacked = bitvector(unpacked)
         # Printed so the unpacked form is visible alongside each value.
         print(value, unpacked)
         self.assertEqual(value, repacked)
Exemple #3
0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from regexlint.parser import WHITESPACE, DIGITS, WORD, CharClass
from regexlint.util import build_ranges, esc, lowercase_code
from regexlint.bitvector import bitvector, unpack_bitvector

__all__ = ['simplify_charclass', 'charclass_score', 'build_output',
           'WontOptimize']

# Bitvector for each regex shorthand category.  The lower-case entries are
# built from the parser's character sets; each upper-case entry holds the
# codes in range(256) whose character is absent from the corresponding set.
CATS = {
    '\\s': bitvector(map(ord, WHITESPACE)),
    '\\d': bitvector(map(ord, DIGITS)),
    '\\w': bitvector(map(ord, WORD)),
    '\\S': bitvector(
        [code for code in range(256) if chr(code) not in WHITESPACE]),
    '\\D': bitvector(
        [code for code in range(256) if chr(code) not in DIGITS]),
    '\\W': bitvector(
        [code for code in range(256) if chr(code) not in WORD]),
}

# Lower-case hexadecimal digits.
HEX = bitvector(map(ord, '0123456789abcdef'))
# Lower-case letters a-z together with the decimal digits.
ALNUM = (
    bitvector(range(ord('a'), ord('z') + 1))
    | bitvector(map(ord, '0123456789'))
)
# Mask with bits 0 through 255 set.
ASCII = 2 ** 256 - 1
# Bitvector of the images of codes 0-255 under lowercase_code.
INSENSITIVE_ASCII = bitvector(map(lowercase_code, range(256)))

class WontOptimize(Exception):
    """Signals that a character class should not be optimized.

    The exception argument is a basic reason string.
    """
Exemple #4
0
 def test_pack_bitvector(self):
     """Check that bitvector() inverts unpack_bitvector() below 1 << 10."""
     for number in range(2 ** 10):
         members = unpack_bitvector(number)
         packed = bitvector(members)
         # Printed so the unpacked form is visible alongside each number.
         print(number, members)
         self.assertEqual(number, packed)
Exemple #5
0
def simplify_charclass(matching_codes, ignorecase=False):
    """Given a sequence of ordinals, return a (seq, negated) tuple.

    `ignorecase` is whether the regex flags include re.IGNORECASE.

    Every subset of the shorthand categories in CATS is tried, once against
    the class as written and once against its negation.  A subset is usable
    when all of its (masked) members lie inside the target; whatever is left
    of the target is handed to build_ranges, the candidate is scored with
    charclass_score, and the lowest-scoring candidate is returned.

    In the returned tuple, `seq` is the list of chosen category strings
    followed by the build_ranges output, and `negated` is 0 or 1.

    If the class shouldn't be optimized, raises WontOptimize with a basic reason
    string.
    """
    if max(matching_codes) > 255:
        raise WontOptimize('Unicode')

    # HACK: Don't simplify something that looks fairly like a hex digit pattern.
    # They look arguably prettier as '0-9a-f' than '\da-f'
    bv = bitvector(matching_codes)
    if (bv & HEX) == HEX and ord('g') not in matching_codes:
        raise WontOptimize('Hex digit')
    if (bv & ALNUM) == ALNUM and ord('_') not in matching_codes:
        raise WontOptimize('Alphanumeric without _')

    if ignorecase:
        bv = bitvector(map(lowercase_code, matching_codes))
        base = INSENSITIVE_ASCII
    else:
        base = ASCII

    # Strategy: since we have a small number of categories, try each subset of
    # them to see if it's legal; add in remaining ranges; score.
    # 2**len(keys) subsets are enumerated per pass; in the negated pass every
    # subset containing an upper-case category is skipped.
    keys = sorted(CATS.keys(), reverse=True)
    possibilities = []
    for negated in (0, 1):
        #  target is the set of all characters we want to match, and none of the
        #  ones we don't (note: for case-insensitive, we mask `chosen' before
        #  comparing later).
        if not negated:
            target = bv
        elif ignorecase:
            # Unpack once; the membership test below runs for all 256 codes.
            present = set(unpack_bitvector(bv))
            target = bitvector(
                map(lowercase_code,
                    [code for code in range(256) if code not in present]))
        else:
            target = base ^ (base & bv)

        for i in range(2**len(keys)):
            chosen_keys = [k for b, k in enumerate(keys) if i & (1 << b)]
            # Humans are terrible at double-negatives.  If this involves a
            # negation of the charclass as well as the category, tough cookies.
            # This will cause suggested _expansion_ of any such uses already in
            # the codebase, which should be ignored by the caller.
            if negated and any(k[1].isupper() for k in chosen_keys):
                continue

            chosen = 0
            for k in chosen_keys:
                chosen |= CATS[k]
            # N.b. don't need to conditionally lowercase_code here because all
            # our categories contain lower if they contain upper.
            chosen &= base

            # True iff. the chosen categories fit entirely in the target.
            if (chosen & target) == chosen:
                # Codes not covered by a category are spelled out as ranges.
                r = build_ranges(unpack_bitvector(target ^ chosen))
                r[:0] = chosen_keys
                discount = 1 if chosen_keys == ['\\w', '\\W'] else 0

                if r:
                    possibilities.append(
                        (charclass_score(r, negated) - discount, r, negated))

    # There will always be one, since we include no-categories above, and it's
    # not on the WontOptimize list.
    # Sort on the score alone; the sort is stable, so the earliest
    # best-scoring candidate wins.
    possibilities.sort(key=lambda candidate: candidate[0])
    return (possibilities[0][1], possibilities[0][2])
Exemple #6
0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from regexlint.parser import WHITESPACE, DIGITS, WORD, CharClass
from regexlint.util import build_ranges, esc, lowercase_code
from regexlint.bitvector import bitvector, unpack_bitvector

__all__ = [
    'simplify_charclass', 'charclass_score', 'build_output', 'WontOptimize'
]

# Bitvector for each regex shorthand category.  The lower-case entries are
# built from the parser's character sets; each upper-case entry holds the
# codes in range(256) whose character is absent from the corresponding set.
CATS = {
    '\\s': bitvector(map(ord, WHITESPACE)),
    # '\\d' is deliberately left out: it's not more easily readable.
    #'\\d': bitvector(map(ord, DIGITS)),
    '\\w': bitvector(map(ord, WORD)),
    '\\S': bitvector(
        [code for code in range(256) if chr(code) not in WHITESPACE]),
    '\\D': bitvector(
        [code for code in range(256) if chr(code) not in DIGITS]),
    '\\W': bitvector(
        [code for code in range(256) if chr(code) not in WORD]),
}

# Lower-case hexadecimal digits.
HEX = bitvector(map(ord, '0123456789abcdef'))
# Lower-case letters a-z together with the decimal digits.
ALNUM = (
    bitvector(range(ord('a'), ord('z') + 1))
    | bitvector(map(ord, '0123456789'))
)
# Mask with bits 0 through 255 set.
ASCII = 2 ** 256 - 1
# Bitvector of the images of codes 0-255 under lowercase_code.
INSENSITIVE_ASCII = bitvector(map(lowercase_code, range(256)))

Exemple #7
0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from regexlint.bitvector import bitvector, unpack_bitvector
from regexlint.parser import DIGITS, WHITESPACE, WORD, CharClass
from regexlint.util import build_ranges, esc, lowercase_code

__all__ = [
    "simplify_charclass", "charclass_score", "build_output", "WontOptimize"
]

# Bitvector for each regex shorthand category.  The lower-case entries are
# built from the parser's character sets; each upper-case entry holds the
# codes in range(256) whose character is absent from the corresponding set.
CATS = {
    "\\s": bitvector(map(ord, WHITESPACE)),
    # '\\d' is deliberately left out: it's not more easily readable.
    # '\\d': bitvector(map(ord, DIGITS)),
    "\\w": bitvector(map(ord, WORD)),
    "\\S": bitvector(
        [code for code in range(256) if chr(code) not in WHITESPACE]),
    "\\D": bitvector(
        [code for code in range(256) if chr(code) not in DIGITS]),
    "\\W": bitvector(
        [code for code in range(256) if chr(code) not in WORD]),
}

# Lower-case hexadecimal digits.
HEX = bitvector(map(ord, "0123456789abcdef"))
# Lower-case letters a-z together with the decimal digits.
ALNUM = (
    bitvector(range(ord("a"), ord("z") + 1))
    | bitvector(map(ord, "0123456789"))
)
# Mask with bits 0 through 255 set.
ASCII = 2 ** 256 - 1
# Bitvector of the images of codes 0-255 under lowercase_code.
INSENSITIVE_ASCII = bitvector(map(lowercase_code, range(256)))