Example #1
0
def test_unescape():
    r"""Compare ``unescape`` with Python's own decoding of ``\xHH`` escapes.

    The input is the concatenation of every two-digit hex escape whose
    digits are drawn from both upper- and lower-case hex characters; the
    expected value is what the interpreter produces for the same text
    written as a single-quoted string literal.
    """
    from pypy.rlib.parsing.regexparse import unescape

    first_digits = "0123456789abcdefABCDEF"
    second_digits = "0123456789ABCDEFabcdef"

    escapes = []
    for a in first_digits:
        for b in second_digits:
            escapes.append("\\x%s%s" % (a, b))
    s = "".join(escapes)

    actual = unescape(s)
    # The literal is built entirely from the fixed digits above, so eval
    # only ever sees text generated by this test.
    expected = eval("'" + s + "'")
    assert actual == expected
Example #2
0
def create_result_iterator(tests, results):
    r"""Gets the expected return sets for each regular expression.

    Generator yielding ``(test, result)`` pairs.  Each round obtains one
    subject line from ``get_simult_lines(tests, results)`` and one result
    line popped from the front of ``results``; ``results`` is therefore
    consumed in place.

    * ``test`` is the subject line with its four-space prefix removed, a
      single trailing backslash chopped off, and then passed through
      ``unescape``.
    * ``result`` is the result line with ``\xHH`` escapes replaced by the
      corresponding character.  It is either ``'No match'`` or a line
      starting with ``' 0:'``; any following sub-result lines (those with
      ``':'`` as their third character) are discarded.

    The generator finishes when ``get_simult_lines`` returns a false value.

    Raises:
        Exception: if the subject line does not start with four spaces, or
            the result line is neither ``'No match'`` nor a ``' 0:'`` line.
        AssertionError: if the subject line ends in two backslashes.
    """
    # Loop-invariant: result lines are ONLY escaped by \x__ types.
    hex_escape = re.compile(r'\\x([0-9a-fA-F]{2})')

    def decode_hex(m):
        return chr(int(m.group(1), 16))

    while 1:
        # Second line is the test to run against the regex
        # '    TEXT'
        test = get_simult_lines(tests, results)
        if not test:
            # A plain return ends a generator.  Raising StopIteration here
            # is converted into RuntimeError by PEP 479 (Python 3.7+).
            return
        if not test.startswith('    '):
            raise Exception("Input & output match, but I don't understand. (Got %r)" % test)
        if test.endswith('\\'):  # Tests that end in \ expect the \ to be chopped off
            assert not test.endswith('\\\\')    # make sure there are no \\ at end
            test = test[:-1]
        test = unescape(test[4:])

        # Third line in the OUTPUT is the result, either:
        # ' 0: ...' for a match (but this is ONLY escaped by \x__ types)
        # 'No match' for no match
        result = hex_escape.sub(decode_hex, results.pop(0))
        if result != 'No match':
            if not result.startswith(' 0:'):
                raise Exception("Lost sync in output.")
            # Now we need to eat any further lines like:
            # ' 1: ....' a subgroup match
            # Guard against running off the end of ``results`` and use a
            # slice so a line shorter than three characters simply stops
            # the scan instead of raising IndexError.
            while results and results[0]:
                if results[0][2:3] == ':':
                    results.pop(0)
                else:
                    break
        yield test, result
Example #3
0
def create_result_iterator(tests, results):
    r"""Gets the expected return sets for each regular expression.

    Generator yielding ``(test, result)`` pairs.  Each round obtains one
    subject line from ``get_simult_lines(tests, results)`` and one result
    line popped from the front of ``results``; ``results`` is therefore
    consumed in place.

    * ``test`` is the subject line with its four-space prefix removed, a
      single trailing backslash chopped off, and then passed through
      ``unescape``.
    * ``result`` is the result line with ``\xHH`` escapes replaced by the
      corresponding character.  It is either ``'No match'`` or a line
      starting with ``' 0:'``; any following sub-result lines (those with
      ``':'`` as their third character) are discarded.

    The generator finishes when ``get_simult_lines`` returns a false value.

    Raises:
        Exception: if the subject line does not start with four spaces, or
            the result line is neither ``'No match'`` nor a ``' 0:'`` line.
        AssertionError: if the subject line ends in two backslashes.
    """
    # Loop-invariant: result lines are ONLY escaped by \x__ types.
    hex_escape = re.compile(r'\\x([0-9a-fA-F]{2})')

    def decode_hex(m):
        return chr(int(m.group(1), 16))

    while 1:
        # Second line is the test to run against the regex
        # '    TEXT'
        test = get_simult_lines(tests, results)
        if not test:
            # A plain return ends a generator.  Raising StopIteration here
            # is converted into RuntimeError by PEP 479 (Python 3.7+).
            return
        if not test.startswith('    '):
            raise Exception(
                "Input & output match, but I don't understand. (Got %r)" %
                test)
        if test.endswith('\\'):
            # Tests that end in \ expect the \ to be chopped off
            assert not test.endswith('\\\\')  # make sure there are no \\ at end
            test = test[:-1]
        test = unescape(test[4:])

        # Third line in the OUTPUT is the result, either:
        # ' 0: ...' for a match (but this is ONLY escaped by \x__ types)
        # 'No match' for no match
        result = hex_escape.sub(decode_hex, results.pop(0))
        if result != 'No match':
            if not result.startswith(' 0:'):
                raise Exception("Lost sync in output.")
            # Now we need to eat any further lines like:
            # ' 1: ....' a subgroup match
            # Guard against running off the end of ``results`` and use a
            # slice so a line shorter than three characters simply stops
            # the scan instead of raising IndexError.
            while results and results[0]:
                if results[0][2:3] == ':':
                    results.pop(0)
                else:
                    break
        yield test, result
Example #4
0
def create_pcre_pickle(file, dumper):
    r"""Create a filtered PCRE test file for the test.

    Reads all lines from ``file`` and parses them as a sequence of groups:
    a delimited regex (possibly spanning several lines), followed by subject
    lines each with their expected-output lines, ended by a blank line.
    Regexes and subjects using features screened out below are dropped, and
    the remaining suite is passed to ``dumper.dump``.

    The dumped value has the shape::

        [
            [regex, flags, [(test, result), (test, result), ...]],
            ...
        ]

    ``result`` is the text following ``' 0: '`` for a match, or ``None`` for
    ``No match`` / ``Error`` / ``Partial`` output.  Because any regex with
    flags is suppressed, ``flags`` is always the empty string in the dumped
    entries.  Entries left without any test are removed.

    Args:
        file: object whose ``readlines()`` returns the input lines.
        dumper: object with a ``dump(obj)`` method; it is called exactly
            once with the finished suite.

    Raises:
        AssertionError: if the delimiter is not a printable non-alphanumeric
            character, a regex block does not yield exactly one match, or a
            subject line ends in three backslashes.
        Exception: if a blank line is found where a result line was
            expected, or the input ends inside an unterminated regex.
    """
    def pad_short_hex(m):
        # Perl allows single-digit hex escapes. Change \x0 -> \x00, for example
        return r'\x0' + m.group(1)

    def decode_hex(m):
        return chr(int(m.group(1), 16))

    # Work on a private copy: lines are consumed by popping from the front.
    lines = list(file.readlines())

    # Look for things to skip...
    no_escape = r'(^|[^\\])(\\\\)*'  # Make sure there's no escaping \
    greedy_ops = re.compile(no_escape + r'[*?+}\(]\?')  # Look for *? +? }? (?
    back_refs = re.compile(no_escape + r'\(.*' + no_escape + r'\\1')  # find a \1
    caret_in_middle = re.compile(no_escape + r'[^\[\\]\^')
    posix_char_classes = re.compile(
        no_escape + r'\[[^]]*\[:[^]]+:\][^]]*\]')  # like [[:digit:]]
    # PCRE allows \Q.....\E to quote substrings, we dont.
    bad_backslashes = re.compile(
        no_escape +
        r'(\\Q|\\E|\\G|\\P|\\8|\\9|\\A|\\Z|\\F|\\R|\\B|\\b|\\h|\\H|\\v|\\V|\\z|\\N)'
    )

    expand_perl_hex = re.compile(r'\\x([0-9a-fA-F]{1})(?=[^0-9a-fA-F]|$)')
    # Output lines are ONLY escaped by two-digit \x__ sequences.
    hex_escape = re.compile(r'\\x([0-9a-fA-F]{2})')

    # string.printable is ASCII-only, so subtracting ascii_letters gives the
    # same set as the Python-2-only string.letters while working everywhere.
    delimiter_chars = (set(string.printable) - set(string.ascii_letters) -
                       set(string.digits))

    suite = []
    while lines:
        delim = None
        regex = ''
        matches = None
        # A regex is marked by a start-delimiter and an end-delimiter.
        # The delimiter is non-alphanumeric.
        # If a backslash follows the delimiter, then the backslash should
        #   be appended to the end. (Otherwise, \ + delim would not be a
        #   delim anymore!)
        while lines:
            regex += lines.pop(0)
            if delim is None:
                if not regex.strip():  # Suppress blank lines before delim
                    regex = ''
                    continue
                delim = regex.strip()[0]
                assert delim in delimiter_chars
                # Escape the delimiter so regex metacharacters such as '|'
                # can be used as delimiters without corrupting the pattern.
                test_re = re.compile(
                    r'%(delim)s(([^%(delim)s]|\\%(delim)s)*([^\\]))%(delim)s(\\?)([^\n\r]*)'
                    % {'delim': re.escape(delim)})
                # last two groups are an optional backslash and optional flags

            matches = test_re.findall(regex)
            if matches:
                break

        if not matches:
            if delim is None:
                break  # only blank lines were left: clean end of input
            raise Exception("Unterminated regex at end of input: %r" % regex)

        assert len(matches) == 1  # check to make sure we matched right

        regex = matches[0][0]
        regex += matches[0][-2]  # Add the backslash, if we gotta
        flags = matches[0][-1]  # Get the flags for the regex

        # Gotta tolerate Perl's short hexes
        regex = expand_perl_hex.sub(pad_short_hex, regex)

        tests = []
        # Suppress complex features we can't do, any test that requires PCRE
        # flags, and the other screened constructs.  Only regexes passing
        # every check are added; the rest are still parsed below so their
        # lines get consumed, but their tests are thrown away.
        unsupported = (greedy_ops.search(regex) or back_refs.search(regex)
                       or flags
                       or posix_char_classes.search(regex)
                       or caret_in_middle.search(regex)
                       or bad_backslashes.search(regex))
        if not unsupported:
            suite.append([regex, flags, tests])

        # Now find the test and expected result
        while lines:
            test = lines.pop(0).strip()
            if not test:
                break  # blank line ends the set
            if test.endswith('\\'):
                # Tests that end in \ expect the \ to be chopped off
                # Make sure not three \'s. otherwise this check will get ridiculous
                assert not test.endswith('\\\\\\')
                if not test.endswith('\\\\'):  # Two \'s means a real \
                    test = test[:-1]
            test = expand_perl_hex.sub(pad_short_hex, test)

            disqualify_test = bad_backslashes.search(test)

            try:
                test = unescape(test)
            except Exception:
                # unescape's failure modes are not visible from here, so the
                # catch stays broad; the subject is reported and skipped.
                disqualify_test = True
                print("Warning: could not unescape %r" % test)

            # Third line in the OUTPUT is the result, either:
            # ' 0: ...' for a match (but this is ONLY escaped by \x__ types)
            # 'No match' for no match
            # (other kinds exist, but we ignore them)
            match = None
            found_result = False
            while lines:
                match = hex_escape.sub(decode_hex, lines.pop(0).rstrip('\r\n'))
                if match.startswith(('No match', 'Error', 'Partial')):
                    match = None
                    found_result = True
                    break
                elif match.startswith(' 0:'):
                    # Now we need to eat any further lines like:
                    # ' 1: ....' a subgroup match
                    match = match[4:]
                    # Stop at end of input, and slice so that a short line
                    # ends the scan instead of raising IndexError.
                    while lines and lines[0].strip():
                        # ' 0+ ...' is also possible here
                        if lines[0][2:3] in (':', '+'):
                            lines.pop(0)
                        else:
                            break
                    found_result = True
                    break
                elif not match:
                    print(" *** %r ***" % match)
                    raise Exception("Lost sync in output.")
            # A subject whose result never showed up before the input ended
            # has nothing to be checked against, so it is not recorded.
            if found_result and not disqualify_test:
                tests.append((test, match))

    # Last step, if there are regex's that dont have any tests,
    # might as well strip them out
    suite = [entry for entry in suite if entry[2]]

    dumper.dump(suite)
Example #5
0
def test_unescape():
    r"""Compare ``unescape`` with Python's own decoding of ``\xHH`` escapes.

    The input is the concatenation of every two-digit hex escape whose
    digits are drawn from both upper- and lower-case hex characters; the
    expected value is what the interpreter produces for the same text
    written as a single-quoted string literal.
    """
    from pypy.rlib.parsing.regexparse import unescape

    first_digits = "0123456789abcdefABCDEF"
    second_digits = "0123456789ABCDEFabcdef"

    escapes = []
    for a in first_digits:
        for b in second_digits:
            escapes.append("\\x%s%s" % (a, b))
    s = "".join(escapes)

    actual = unescape(s)
    # The literal is built entirely from the fixed digits above, so eval
    # only ever sees text generated by this test.
    expected = eval("'" + s + "'")
    assert actual == expected
Example #6
0
def create_pcre_pickle(file, dumper):
    r"""Create a filtered PCRE test file for the test.

    Reads all lines from ``file`` and parses them as a sequence of groups:
    a delimited regex (possibly spanning several lines), followed by subject
    lines each with their expected-output lines, ended by a blank line.
    Regexes and subjects using features screened out below are dropped, and
    the remaining suite is passed to ``dumper.dump``.

    The dumped value has the shape::

        [
            [regex, flags, [(test, result), (test, result), ...]],
            ...
        ]

    ``result`` is the text following ``' 0: '`` for a match, or ``None`` for
    ``No match`` / ``Error`` / ``Partial`` output.  Because any regex with
    flags is suppressed, ``flags`` is always the empty string in the dumped
    entries.  Entries left without any test are removed.

    Args:
        file: object whose ``readlines()`` returns the input lines.
        dumper: object with a ``dump(obj)`` method; it is called exactly
            once with the finished suite.

    Raises:
        AssertionError: if the delimiter is not a printable non-alphanumeric
            character, a regex block does not yield exactly one match, or a
            subject line ends in three backslashes.
        Exception: if a blank line is found where a result line was
            expected, or the input ends inside an unterminated regex.
    """
    def pad_short_hex(m):
        # Perl allows single-digit hex escapes. Change \x0 -> \x00, for example
        return r'\x0' + m.group(1)

    def decode_hex(m):
        return chr(int(m.group(1), 16))

    # Work on a private copy: lines are consumed by popping from the front.
    lines = list(file.readlines())

    # Look for things to skip...
    no_escape = r'(^|[^\\])(\\\\)*'  # Make sure there's no escaping \
    greedy_ops = re.compile(no_escape + r'[*?+}\(]\?')  # Look for *? +? }? (?
    back_refs = re.compile(no_escape + r'\(.*' + no_escape + r'\\1')  # find a \1
    caret_in_middle = re.compile(no_escape + r'[^\[\\]\^')
    posix_char_classes = re.compile(
        no_escape + r'\[[^]]*\[:[^]]+:\][^]]*\]')  # like [[:digit:]]
    # PCRE allows \Q.....\E to quote substrings, we dont.
    bad_backslashes = re.compile(
        no_escape +
        r'(\\Q|\\E|\\G|\\P|\\8|\\9|\\A|\\Z|\\F|\\R|\\B|\\b|\\h|\\H|\\v|\\V|\\z|\\N)'
    )

    expand_perl_hex = re.compile(r'\\x([0-9a-fA-F]{1})(?=[^0-9a-fA-F]|$)')
    # Output lines are ONLY escaped by two-digit \x__ sequences.
    hex_escape = re.compile(r'\\x([0-9a-fA-F]{2})')

    # string.printable is ASCII-only, so subtracting ascii_letters gives the
    # same set as the Python-2-only string.letters while working everywhere.
    delimiter_chars = (set(string.printable) - set(string.ascii_letters) -
                       set(string.digits))

    suite = []
    while lines:
        delim = None
        regex = ''
        matches = None
        # A regex is marked by a start-delimiter and an end-delimiter.
        # The delimiter is non-alphanumeric.
        # If a backslash follows the delimiter, then the backslash should
        #   be appended to the end. (Otherwise, \ + delim would not be a
        #   delim anymore!)
        while lines:
            regex += lines.pop(0)
            if delim is None:
                if not regex.strip():  # Suppress blank lines before delim
                    regex = ''
                    continue
                delim = regex.strip()[0]
                assert delim in delimiter_chars
                # Escape the delimiter so regex metacharacters such as '|'
                # can be used as delimiters without corrupting the pattern.
                test_re = re.compile(
                    r'%(delim)s(([^%(delim)s]|\\%(delim)s)*([^\\]))%(delim)s(\\?)([^\n\r]*)'
                    % {'delim': re.escape(delim)})
                # last two groups are an optional backslash and optional flags

            matches = test_re.findall(regex)
            if matches:
                break

        if not matches:
            if delim is None:
                break  # only blank lines were left: clean end of input
            raise Exception("Unterminated regex at end of input: %r" % regex)

        assert len(matches) == 1  # check to make sure we matched right

        regex = matches[0][0]
        regex += matches[0][-2]  # Add the backslash, if we gotta
        flags = matches[0][-1]  # Get the flags for the regex

        # Gotta tolerate Perl's short hexes
        regex = expand_perl_hex.sub(pad_short_hex, regex)

        tests = []
        # Suppress complex features we can't do, any test that requires PCRE
        # flags, and the other screened constructs.  Only regexes passing
        # every check are added; the rest are still parsed below so their
        # lines get consumed, but their tests are thrown away.
        unsupported = (greedy_ops.search(regex) or back_refs.search(regex)
                       or flags
                       or posix_char_classes.search(regex)
                       or caret_in_middle.search(regex)
                       or bad_backslashes.search(regex))
        if not unsupported:
            suite.append([regex, flags, tests])

        # Now find the test and expected result
        while lines:
            test = lines.pop(0).strip()
            if not test:
                break  # blank line ends the set
            if test.endswith('\\'):
                # Tests that end in \ expect the \ to be chopped off
                # Make sure not three \'s. otherwise this check will get ridiculous
                assert not test.endswith('\\\\\\')
                if not test.endswith('\\\\'):  # Two \'s means a real \
                    test = test[:-1]
            test = expand_perl_hex.sub(pad_short_hex, test)

            disqualify_test = bad_backslashes.search(test)

            try:
                test = unescape(test)
            except Exception:
                # unescape's failure modes are not visible from here, so the
                # catch stays broad; the subject is reported and skipped.
                disqualify_test = True
                print("Warning: could not unescape %r" % test)

            # Third line in the OUTPUT is the result, either:
            # ' 0: ...' for a match (but this is ONLY escaped by \x__ types)
            # 'No match' for no match
            # (other kinds exist, but we ignore them)
            match = None
            found_result = False
            while lines:
                match = hex_escape.sub(decode_hex, lines.pop(0).rstrip('\r\n'))
                if match.startswith(('No match', 'Error', 'Partial')):
                    match = None
                    found_result = True
                    break
                elif match.startswith(' 0:'):
                    # Now we need to eat any further lines like:
                    # ' 1: ....' a subgroup match
                    match = match[4:]
                    # Stop at end of input, and slice so that a short line
                    # ends the scan instead of raising IndexError.
                    while lines and lines[0].strip():
                        # ' 0+ ...' is also possible here
                        if lines[0][2:3] in (':', '+'):
                            lines.pop(0)
                        else:
                            break
                    found_result = True
                    break
                elif not match:
                    print(" *** %r ***" % match)
                    raise Exception("Lost sync in output.")
            # A subject whose result never showed up before the input ended
            # has nothing to be checked against, so it is not recorded.
            if found_result and not disqualify_test:
                tests.append((test, match))

    # Last step, if there are regex's that dont have any tests,
    # might as well strip them out
    suite = [entry for entry in suite if entry[2]]

    dumper.dump(suite)