Example #1
0
def replace_impl(interp, pce, replace_obj, subject, limit=-1):
    """Substitute matches of the compiled pattern ``pce`` in ``subject``.

    Text between matches is copied verbatim into the result; what gets
    emitted for each match is delegated to ``replace_obj.next_replace()``.
    The loop stops on ``limit`` only when the countdown reaches exactly
    zero, so the default of -1 keeps going until no further match is found.

    Returns a ``(w_result, replaced)`` pair: the new string wrapped with
    ``space.newstr()`` and the number of matches handed to ``replace_obj``.
    When ``pcre_exec`` returns a negative code other than
    PCRE_ERROR_NOMATCH, ``handle_exec_error()`` is called and
    ``(None, -1)`` is returned instead.
    """
    replace_obj.setup(interp, pce)
    space = interp.space
    rffi.setintfield(pce.extra, 'c_match_limit', interp.regexp_backtrack_limit)
    rffi.setintfield(pce.extra, 'c_match_limit_recursion',
                     interp.regexp_recursion_limit)

    # Three ints per group, the whole match counting as one more group.
    ovec_len = (pce.capturecount + 1) * 3
    subject_len = len(subject)
    out = StringBuilder(subject_len)

    # Raw buffers handed to PCRE; both are released in the `finally` below.
    c_subject = rffi.str2charp(subject)
    ovector = lltype.malloc(rffi.INTP.TO, ovec_len, flavor='raw')
    try:
        exec_flags = 0
        retry_flags = 0
        pos = 0
        remaining = limit
        interp.regexp_error_code = PREG_NO_ERROR

        while True:
            if remaining == 0:
                # Replacement budget used up: copy the tail untouched.
                out.append_slice(subject, pos, subject_len)
                break

            rc = _pcre.pcre_exec(pce.re, pce.extra, c_subject, subject_len,
                                 pos, exec_flags | retry_flags, ovector,
                                 ovec_len)

            # The subject has been through pcre_exec once by now, so later
            # calls are told to skip the UTF-8 validity check.
            exec_flags |= _pcre.PCRE_NO_UTF8_CHECK

            if rc == 0:
                # Matched, but the offsets buffer could not hold every group.
                interp.notice("Matched, but too many substrings")
                rc = ovec_len // 3

            if rc > 0:
                # Emit the text preceding the match, then let replace_obj
                # append whatever stands in for the match itself.
                match_start = rffi.cast(lltype.Signed, ovector[0])
                out.append_slice(subject, pos, match_start)
                replace_obj.next_replace(out, subject, rc, ovector)
                remaining -= 1

            elif rc == _pcre.PCRE_ERROR_NOMATCH:
                if retry_flags == 0 or pos >= subject_len:
                    # A genuine "no more matches": flush the tail and stop.
                    out.append_slice(subject, pos, subject_len)
                    break
                # The failure was only the non-empty retry after a null
                # match.  Copy one character through and fake a match
                # covering it so the common code below moves past it.
                char_end = pos + pce.utf8size(subject, pos)
                out.append_slice(subject, pos, char_end)
                ovector[0] = rffi.cast(rffi.INT, pos)
                ovector[1] = rffi.cast(rffi.INT, char_end)

            else:
                handle_exec_error(interp, rc)
                return None, -1

            # Perl-style /g handling of empty matches: after a null match,
            # retry at the same position with PCRE_NOTEMPTY | PCRE_ANCHORED;
            # if that fails, the branch above advances by one character.
            span_start = rffi.cast(lltype.Signed, ovector[0])
            span_end = rffi.cast(lltype.Signed, ovector[1])
            if span_end == span_start:
                retry_flags = _pcre.PCRE_NOTEMPTY | _pcre.PCRE_ANCHORED
            else:
                retry_flags = 0

            # Resume right after the (real or faked) match.
            pos = span_end

    finally:
        lltype.free(ovector, flavor='raw')
        rffi.free_charp(c_subject)

    return space.newstr(out.build()), limit - remaining
Example #2
0
def replace_impl(interp, pce, replace_obj, subject, limit=-1):
    """Build a copy of ``subject`` with matches of ``pce`` replaced.

    ``replace_obj`` is prepared once with ``setup()`` and then asked, via
    ``next_replace()``, to append the output for every match; the
    unmatched stretches of ``subject`` are copied through unchanged.
    Replacing stops once ``limit`` matches have been handled (the
    countdown must hit exactly zero, so -1 never stops on the limit).

    Returns ``(w_string, n_replaced)``.  If ``pcre_exec`` fails with a
    code other than PCRE_ERROR_NOMATCH, the error is passed to
    ``handle_exec_error()`` and ``(None, -1)`` is returned.
    """
    replace_obj.setup(interp, pce)
    space = interp.space
    rffi.setintfield(pce.extra, 'c_match_limit', interp.regexp_backtrack_limit)
    rffi.setintfield(pce.extra, 'c_match_limit_recursion',
                     interp.regexp_recursion_limit)

    # Offsets buffer: three ints for each group plus the whole match.
    group_total = pce.capturecount + 1
    ovec_size = group_total * 3

    result = StringBuilder(len(subject))

    # Raw allocations; the `finally` clause frees both on every exit path.
    raw_subject = rffi.str2charp(subject)
    ovec = lltype.malloc(rffi.INTP.TO, ovec_size, flavor='raw')
    try:
        base_opts = 0
        empty_retry = 0
        cursor = 0
        budget = limit
        interp.regexp_error_code = PREG_NO_ERROR

        while budget != 0:
            rc = _pcre.pcre_exec(pce.re, pce.extra, raw_subject,
                                 len(subject), cursor,
                                 base_opts | empty_retry,
                                 ovec, ovec_size)

            # From the second call on, skip the UTF-8 validity check.
            base_opts |= _pcre.PCRE_NO_UTF8_CHECK

            if rc == 0:
                # A match whose groups did not all fit in the buffer.
                interp.notice("Matched, but too many substrings")
                rc = ovec_size // 3

            if rc > 0:
                # Copy what precedes the match, then append the replacement.
                before_match = rffi.cast(lltype.Signed, ovec[0])
                result.append_slice(subject, cursor, before_match)
                replace_obj.next_replace(result, subject, rc, ovec)
                budget -= 1

            elif rc != _pcre.PCRE_ERROR_NOMATCH:
                handle_exec_error(interp, rc)
                return None, -1

            elif empty_retry != 0 and cursor < len(subject):
                # Only the non-empty retry after a null match failed, so
                # this is not the end yet.  Pass one character through and
                # pretend it was the match, which makes the code below
                # advance over it.
                after_char = cursor + pce.utf8size(subject, cursor)
                result.append_slice(subject, cursor, after_char)
                ovec[0] = rffi.cast(rffi.INT, cursor)
                ovec[1] = rffi.cast(rffi.INT, after_char)

            else:
                # No further match: copy the remainder and finish.
                result.append_slice(subject, cursor, len(subject))
                break

            # Mimic Perl's /g on empty matches: retry once at the same
            # spot with PCRE_NOTEMPTY | PCRE_ANCHORED before stepping on.
            lo = rffi.cast(lltype.Signed, ovec[0])
            hi = rffi.cast(lltype.Signed, ovec[1])
            empty_retry = 0
            if hi == lo:
                empty_retry = _pcre.PCRE_NOTEMPTY | _pcre.PCRE_ANCHORED

            # Continue from the end of the last (real or faked) match.
            cursor = hi

        else:
            # budget reached 0: copy the end of the string
            result.append_slice(subject, cursor, len(subject))

    finally:
        lltype.free(ovec, flavor='raw')
        rffi.free_charp(raw_subject)

    return space.newstr(result.build()), limit - budget
Example #3
0
def match_impl(interp, pce, subject, w_matches, start_offset, mode, limit,
               flags):
    """Run the compiled pattern ``pce`` over ``subject`` and collect results.

    One loop serves every collection strategy; ``mode`` selects which:

    * MODE_MATCH: each group of a match is added straight to the
      result array.
    * MODE_PATTERN_ORDER: one array per group, each receiving that
      group's value from every match.
    * MODE_SET_ORDER: one array per match, appended to the result.
    * MODE_SPLIT: the pieces of ``subject`` between matches are
      collected (plus the captured groups when
      PREG_SPLIT_DELIM_CAPTURE is set in ``flags``).

    Parameters:
        interp: interpreter; provides ``space``, the backtrack and
            recursion limits, ``notice()`` and ``regexp_error_code``.
        pce: compiled pattern; ``re``, ``extra``, ``capturecount``,
            ``subpat_names`` and ``utf8size()`` are used here.
        subject: the string to search.
        w_matches: target that receives the result array through
            ``store()``.  When falsy nothing is collected and ``mode``
            is forced to MODE_NO_SUBPAT.
        start_offset: where to start; a negative value counts from the
            end of ``subject`` and is clamped to 0.
        mode: one of the MODE_* constants above.
        limit: the loop stops once the running count ``matched`` equals
            this value.
        flags: forwarded to ``_add_result`` / ``_add_result_range``; the
            PREG_SPLIT_* bits are also inspected in MODE_SPLIT.

    Returns ``space.newint(matched)`` when ``interp.regexp_error_code``
    is still PREG_NO_ERROR at the end, ``space.w_False`` otherwise.
    """
    space = interp.space

    # Negative offset counts from the end of the string.
    if start_offset < 0:
        start_offset = len(subject) + start_offset
        if start_offset < 0:
            start_offset = 0

    rffi.setintfield(pce.extra, 'c_match_limit', interp.regexp_backtrack_limit)
    rffi.setintfield(pce.extra, 'c_match_limit_recursion',
                     interp.regexp_recursion_limit)

    # Calculate the size of the offsets array
    # (three ints per group, the whole match counting as group 0).
    num_subpats = pce.capturecount + 1
    size_offsets = num_subpats * 3

    # Allocate match sets array and initialize the values.
    if mode == MODE_PATTERN_ORDER:
        match_sets = [
            space.new_array_from_list([]) for i in range(num_subpats)
        ]
    else:
        match_sets = None

    # Allocate some more raw stuff
    # (both buffers are released in the `finally` clause below).
    rawsubject = rffi.str2charp(subject)
    offsets = lltype.malloc(rffi.INTP.TO, size_offsets, flavor='raw')
    if w_matches:
        subpats = space.new_array_from_list([])
    else:
        # Nowhere to store results: switch every collecting branch off.
        mode = MODE_NO_SUBPAT
        subpats = None
    try:
        exoptions = 0
        g_notempty = 0
        matched = 0
        # End offset of the previous match; only MODE_SPLIT uses it.
        last_match = 0
        interp.regexp_error_code = PREG_NO_ERROR
        while matched != limit:
            # Execute the regular expression.
            count = _pcre.pcre_exec(pce.re, pce.extra, rawsubject,
                                    len(subject), start_offset,
                                    exoptions | g_notempty, offsets,
                                    size_offsets)

            # the string was already proved to be valid UTF-8
            exoptions |= _pcre.PCRE_NO_UTF8_CHECK

            # Check for too many substrings condition.
            if count == 0:
                interp.notice("Matched, but too many substrings")
                count = size_offsets // 3

            # If something has matched
            if count > 0:
                matched += 1

                # If subpatterns array has been passed, fill it in with values.
                if mode == MODE_MATCH:
                    # Single pattern matching
                    # For each subpattern, insert it into the
                    # subpatterns array.
                    for i in range(count):
                        subpats = _add_result(space, subpats, subject, offsets,
                                              i, flags, pce.subpat_names[i])
                elif mode == MODE_PATTERN_ORDER:
                    # Global pattern matching, pattern order
                    # For each subpattern, insert it into the
                    # appropriate array.
                    for i in range(count):
                        match_sets[i] = _add_result(space, match_sets[i],
                                                    subject, offsets, i, flags)
                    # If the number of captured subpatterns on this
                    # run is less than the total possible number,
                    # pad the result arrays with empty strings.
                    for i in range(count, num_subpats):
                        match_sets[i] = _add_result_range(
                            space, match_sets[i], subject, 0, 0,
                            flags=0)  # xxx why????

                elif mode == MODE_SET_ORDER:
                    # Global pattern matching, set order
                    sub1 = space.new_array_from_list([])
                    for i in range(count):
                        sub1 = _add_result(space, sub1, subject, offsets, i,
                                           flags, pce.subpat_names[i])
                    subpats = space.appenditem_maybe_inplace(subpats, sub1)

                elif mode == MODE_SPLIT:
                    # The piece to emit runs from the end of the previous
                    # match up to the start of this one.
                    next_head = rffi.cast(lltype.Signed, offsets[0])
                    no_empty = (flags & PREG_SPLIT_NO_EMPTY) != 0
                    if no_empty and next_head == last_match:
                        # Empty piece suppressed: undo the increment above
                        # so it does not count towards `limit`.
                        matched -= 1
                    else:
                        subpats = _add_result_range(space, subpats, subject,
                                                    last_match, next_head,
                                                    flags)
                    last_match = rffi.cast(lltype.Signed, offsets[1])

                    if flags & PREG_SPLIT_DELIM_CAPTURE:
                        # Also emit the captured groups; group i occupies
                        # offsets[2*i] and offsets[2*i + 1].
                        for i in range(1, count):
                            start = rffi.cast(lltype.Signed, offsets[i << 1])
                            stop = rffi.cast(lltype.Signed,
                                             offsets[(i << 1) + 1])
                            if no_empty and start == stop:
                                continue
                            subpats = _add_result_range(
                                space, subpats, subject, start, stop, flags)

            elif count == _pcre.PCRE_ERROR_NOMATCH:
                # If we previously set PCRE_NOTEMPTY after a null match,
                # this is not necessarily the end. We need to advance
                # the start offset, and continue. Fudge the offset
                # values to achieve this, unless we're already at the
                # end of the string.
                if g_notempty != 0 and start_offset < len(subject):
                    offsets[0] = rffi.cast(rffi.INT, start_offset)
                    start_offset += pce.utf8size(subject, start_offset)
                    offsets[1] = rffi.cast(rffi.INT, start_offset)
                else:
                    break

            else:
                # Any other negative code is an error.
                # NOTE(review): presumably handle_exec_error() records it in
                # interp.regexp_error_code, which decides the return value
                # below -- confirm against its definition.
                handle_exec_error(interp, count)
                break

            # If we have matched an empty string, mimic what Perl's /g
            # options does.  This turns out to be rather cunning. First
            # we set PCRE_NOTEMPTY and try the match again at the same
            # point. If this fails (picked up above) we advance to the
            # next character.
            g_notempty = (_pcre.PCRE_NOTEMPTY | _pcre.PCRE_ANCHORED if
                          (rffi.cast(lltype.Signed, offsets[1]) == rffi.cast(
                              lltype.Signed, offsets[0])) else 0)

            # Advance to the position right after the last full match
            start_offset = rffi.cast(lltype.Signed, offsets[1])

    finally:
        lltype.free(offsets, flavor='raw')
        rffi.free_charp(rawsubject)

    # subpats is None when w_matches was falsy.
    # NOTE(review): in MODE_SPLIT the array can still be empty here, so this
    # test relies on an empty array object being truthy -- confirm.
    if subpats:
        if mode == MODE_PATTERN_ORDER:
            # Add the match sets to the output array
            # (a named group is stored under its name as well as appended).
            for i in range(num_subpats):
                match_set = match_sets[i]
                subpat_name = pce.subpat_names[i]
                if subpat_name is not None:
                    w_key = space.newstr(subpat_name)
                    subpats = space.packitem_maybe_inplace(
                        subpats, w_key, match_set)
                subpats = space.appenditem_maybe_inplace(subpats, match_set)

        elif mode == MODE_SPLIT:
            # the offset might have been incremented,
            # but without further successful matches
            start_offset = last_match

            # Emit the trailing piece after the last match, unless it is
            # empty and empty pieces are being dropped.
            no_empty = (flags & PREG_SPLIT_NO_EMPTY) != 0
            if no_empty and start_offset >= len(subject):
                pass
            else:
                subpats = _add_result_range(space, subpats, subject,
                                            start_offset, len(subject), flags)

        w_matches.store(subpats, unique=True)

    # Did we encounter an error?
    if interp.regexp_error_code == PREG_NO_ERROR:
        return space.newint(matched)
    else:
        return space.w_False
Example #4
0
def match_impl(interp, pce, subject, w_matches, start_offset,
               mode, limit, flags):
    """Match ``pce`` against ``subject`` repeatedly, collecting per ``mode``.

    The same loop backs several collection strategies:

    * MODE_MATCH: the groups of a match go directly into the result
      array.
    * MODE_PATTERN_ORDER: results are grouped per subpattern (one array
      for each group, filled across all matches).
    * MODE_SET_ORDER: results are grouped per match (one array for each
      match).
    * MODE_SPLIT: the stretches of ``subject`` between matches are
      collected, optionally with the captured groups
      (PREG_SPLIT_DELIM_CAPTURE) and without empty pieces
      (PREG_SPLIT_NO_EMPTY).

    Parameters:
        interp: interpreter; ``space``, ``regexp_backtrack_limit``,
            ``regexp_recursion_limit``, ``notice()`` and
            ``regexp_error_code`` are used.
        pce: compiled pattern (``re``, ``extra``, ``capturecount``,
            ``subpat_names``, ``utf8size()``).
        subject: string being searched.
        w_matches: receives the collected array via ``store()``; if it
            is falsy, ``mode`` becomes MODE_NO_SUBPAT and nothing is
            collected.
        start_offset: starting position; negative values are taken
            relative to the end of ``subject`` and clamped to 0.
        mode: one of the MODE_* constants listed above.
        limit: matching stops when the counter ``matched`` equals it.
        flags: passed on to ``_add_result`` / ``_add_result_range`` and
            checked for the PREG_SPLIT_* bits in MODE_SPLIT.

    Returns ``space.newint(matched)`` if ``interp.regexp_error_code`` is
    PREG_NO_ERROR when the loop is done, else ``space.w_False``.
    """
    space = interp.space

    # Negative offset counts from the end of the string.
    if start_offset < 0:
        start_offset = len(subject) + start_offset
        if start_offset < 0:
            start_offset = 0

    rffi.setintfield(pce.extra, 'c_match_limit', interp.regexp_backtrack_limit)
    rffi.setintfield(pce.extra, 'c_match_limit_recursion',
                     interp.regexp_recursion_limit)

    # Calculate the size of the offsets array
    # (three ints for every group, the whole match included).
    num_subpats = pce.capturecount + 1
    size_offsets = num_subpats * 3

    # Allocate match sets array and initialize the values.
    if mode == MODE_PATTERN_ORDER:
        match_sets = [space.new_array_from_list([])
                      for i in range(num_subpats)]
    else:
        match_sets = None

    # Allocate some more raw stuff
    # (freed again in the `finally` clause at the end of the loop).
    rawsubject = rffi.str2charp(subject)
    offsets = lltype.malloc(rffi.INTP.TO, size_offsets, flavor='raw')
    if w_matches:
        subpats = space.new_array_from_list([])
    else:
        # No output target: disable all the collecting branches below.
        mode = MODE_NO_SUBPAT
        subpats = None
    try:
        exoptions = 0
        g_notempty = 0
        matched = 0
        # Where the previous match ended; consulted only in MODE_SPLIT.
        last_match = 0
        interp.regexp_error_code = PREG_NO_ERROR
        while matched != limit:
            # Execute the regular expression.
            count = _pcre.pcre_exec(pce.re, pce.extra, rawsubject,
                                    len(subject), start_offset,
                                    exoptions|g_notempty,
                                    offsets, size_offsets)

            # the string was already proved to be valid UTF-8
            exoptions |= _pcre.PCRE_NO_UTF8_CHECK

            # Check for too many substrings condition.
            if count == 0:
                interp.notice("Matched, but too many substrings")
                count = size_offsets // 3

            # If something has matched
            if count > 0:
                matched += 1

                # If subpatterns array has been passed, fill it in with values.
                if mode == MODE_MATCH:
                    # Single pattern matching
                    # For each subpattern, insert it into the
                    # subpatterns array.
                    for i in range(count):
                        subpats = _add_result(space, subpats, subject,
                                              offsets, i, flags,
                                              pce.subpat_names[i])
                elif mode == MODE_PATTERN_ORDER:
                    # Global pattern matching, pattern order
                    # For each subpattern, insert it into the
                    # appropriate array.
                    for i in range(count):
                        match_sets[i] = _add_result(
                            space, match_sets[i], subject,
                            offsets, i, flags)
                    # If the number of captured subpatterns on this
                    # run is less than the total possible number,
                    # pad the result arrays with empty strings.
                    for i in range(count, num_subpats):
                        match_sets[i] = _add_result_range(
                            space, match_sets[i], subject, 0, 0,
                            flags=0)   # xxx why????

                elif mode == MODE_SET_ORDER:
                    # Global pattern matching, set order
                    sub1 = space.new_array_from_list([])
                    for i in range(count):
                        sub1 = _add_result(space, sub1, subject,
                                           offsets, i, flags,
                                           pce.subpat_names[i])
                    subpats = space.appenditem_maybe_inplace(subpats, sub1)

                elif mode == MODE_SPLIT:
                    # Emit the text lying between the previous match and
                    # the start of the current one.
                    next_head = rffi.cast(lltype.Signed, offsets[0])
                    no_empty = (flags & PREG_SPLIT_NO_EMPTY) != 0
                    if no_empty and next_head == last_match:
                        # Skipped empty piece: take back the increment so
                        # it is not counted against `limit`.
                        matched -= 1
                    else:
                        subpats = _add_result_range(space, subpats,
                                                    subject, last_match,
                                                    next_head, flags)
                    last_match = rffi.cast(lltype.Signed, offsets[1])

                    if flags & PREG_SPLIT_DELIM_CAPTURE:
                        # Captured groups are emitted too; the pair for
                        # group i sits at offsets[2*i], offsets[2*i + 1].
                        for i in range(1, count):
                            start = rffi.cast(lltype.Signed, offsets[i<<1])
                            stop = rffi.cast(lltype.Signed,
                                             offsets[(i<<1)+1])
                            if no_empty and start == stop:
                                continue
                            subpats = _add_result_range(space, subpats,
                                                        subject, start,
                                                        stop, flags)

            elif count == _pcre.PCRE_ERROR_NOMATCH:
                # If we previously set PCRE_NOTEMPTY after a null match,
                # this is not necessarily the end. We need to advance
                # the start offset, and continue. Fudge the offset
                # values to achieve this, unless we're already at the
                # end of the string.
                if g_notempty != 0 and start_offset < len(subject):
                    offsets[0] = rffi.cast(rffi.INT, start_offset)
                    start_offset += pce.utf8size(subject, start_offset)
                    offsets[1] = rffi.cast(rffi.INT, start_offset)
                else:
                    break

            else:
                # Every other negative return code is treated as an error.
                # NOTE(review): handle_exec_error() presumably stores it in
                # interp.regexp_error_code, which the final return checks --
                # confirm against its definition.
                handle_exec_error(interp, count)
                break

            # If we have matched an empty string, mimic what Perl's /g
            # options does.  This turns out to be rather cunning. First
            # we set PCRE_NOTEMPTY and try the match again at the same
            # point. If this fails (picked up above) we advance to the
            # next character.
            g_notempty = (_pcre.PCRE_NOTEMPTY | _pcre.PCRE_ANCHORED
                          if (rffi.cast(lltype.Signed, offsets[1]) ==
                              rffi.cast(lltype.Signed, offsets[0]))
                          else 0)

            # Advance to the position right after the last full match
            start_offset = rffi.cast(lltype.Signed, offsets[1])

    finally:
        lltype.free(offsets, flavor='raw')
        rffi.free_charp(rawsubject)

    # subpats is None if w_matches was falsy.
    # NOTE(review): MODE_SPLIT may reach this point with an empty array, so
    # the test depends on an empty array object still being truthy -- confirm.
    if subpats:
        if mode == MODE_PATTERN_ORDER:
            # Add the match sets to the output array
            # (named groups are stored under their name and also appended).
            for i in range(num_subpats):
                match_set = match_sets[i]
                subpat_name = pce.subpat_names[i]
                if subpat_name is not None:
                    w_key = space.newstr(subpat_name)
                    subpats = space.packitem_maybe_inplace(subpats, w_key,
                                                           match_set)
                subpats = space.appenditem_maybe_inplace(subpats, match_set)

        elif mode == MODE_SPLIT:
            # the offset might have been incremented,
            # but without further successful matches
            start_offset = last_match

            # Add the remainder after the last match, except when it is
            # empty and PREG_SPLIT_NO_EMPTY asks for empty pieces to go.
            no_empty = (flags & PREG_SPLIT_NO_EMPTY) != 0
            if no_empty and start_offset >= len(subject):
                pass
            else:
                subpats = _add_result_range(space, subpats,
                                            subject, start_offset,
                                            len(subject), flags)

        w_matches.store(subpats, unique=True)

    # Did we encounter an error?
    if interp.regexp_error_code == PREG_NO_ERROR:
        return space.newint(matched)
    else:
        return space.w_False