def sre_at(ctx, atcode, ptr):
    """Evaluate the zero-width position test 'atcode' at offset 'ptr'.

    The string/line anchors are decided right here from 'ptr', 'ctx.end'
    and the neighbouring character; every *_BOUNDARY flavour is handed
    over to the matching at_*boundary() helper.  An 'atcode' that is not
    listed below gives False.
    """
    # --- start anchors ---
    if atcode == AT_BEGINNING or atcode == AT_BEGINNING_STRING:
        return ptr == 0

    if atcode == AT_BEGINNING_LINE:
        # at offset 0 there is no previous character to inspect
        if ptr <= 0:
            return True
        return rsre_char.is_linebreak(ctx.str(ptr - 1))

    # --- plain word boundaries ---
    if atcode == AT_BOUNDARY:
        return at_boundary(ctx, ptr)

    if atcode == AT_NON_BOUNDARY:
        return at_non_boundary(ctx, ptr)

    # --- end anchors ---
    if atcode == AT_END:
        # holds at the very end, or when exactly one character is left
        # and that character is a line break
        left = ctx.end - ptr
        if left <= 0:
            return True
        if left != 1:
            return False
        return rsre_char.is_linebreak(ctx.str(ptr))

    if atcode == AT_END_LINE:
        if ptr == ctx.end:
            return True
        return rsre_char.is_linebreak(ctx.str(ptr))

    if atcode == AT_END_STRING:
        return ptr == ctx.end

    # --- locale and unicode word boundaries ---
    if atcode == AT_LOC_BOUNDARY:
        return at_loc_boundary(ctx, ptr)

    if atcode == AT_LOC_NON_BOUNDARY:
        return at_loc_non_boundary(ctx, ptr)

    if atcode == AT_UNI_BOUNDARY:
        return at_uni_boundary(ctx, ptr)

    if atcode == AT_UNI_NON_BOUNDARY:
        return at_uni_non_boundary(ctx, ptr)

    # unrecognised code: the assertion never holds
    return False
def match_ANY(ctx, ptr, ppos):
    """Dot wildcard: true unless the character at 'ptr' is a line break.

    'ppos' is part of the signature but is not consulted here.
    """
    if rsre_char.is_linebreak(ctx.str(ptr)):
        return False
    return True
def sre_match(ctx, ppos, ptr, marks):
    """Returns either None or a MatchResult object.  Usually we only need
    the first result, but there is the case of REPEAT...UNTIL where we
    need all results; in that case we use the method move_to_next_result()
    of the MatchResult.

    This is the opcode interpreter loop: it reads one opcode at a time
    with ctx.pat(ppos) and either advances 'ppos'/'ptr' and keeps looping,
    or returns.

    Arguments:
      ctx   -- match context; this function uses ctx.pat(), ctx.str(),
               ctx.lowstr(), ctx.pattern and ctx.end, and on success
               writes ctx.match_end and ctx.match_marks.
      ppos  -- index into the pattern code of the next opcode to run.
      ptr   -- current index into the subject string.
      marks -- current chain of group marks (extended with Mark() on
               OPCODE_MARK, read with get_group_ref()).

    Returns:
      None (bare 'return') as soon as an opcode fails to match;
      MATCHED_OK when SUCCESS / MAX_UNTIL / MIN_UNTIL is reached;
      otherwise whatever find_first_result(ctx) gives for the BRANCH,
      REPEAT, REPEAT_ONE and MIN_REPEAT_ONE opcodes.

    Raises:
      Error -- if a REPEAT is not closed by MAX_UNTIL or MIN_UNTIL, or if
               the opcode is not one of those handled below.
    """
    while True:
        # fetch the opcode and step past it; from here on 'ppos' points
        # at the first operand of 'op'
        op = ctx.pat(ppos)
        ppos += 1

        #jit.jit_debug("sre_match", op, ppos, ptr)
        #
        # When using the JIT, calls to sre_match() must always have a constant
        # (green) argument for 'ppos'.  If not, the following assert fails.
        jit.assert_green(op)

        if op == OPCODE_FAILURE:
            return

        if (op == OPCODE_SUCCESS or
            op == OPCODE_MAX_UNTIL or
            op == OPCODE_MIN_UNTIL):
            # record where the match ended and which marks were set, so
            # that the caller can read them back from 'ctx'
            ctx.match_end = ptr
            ctx.match_marks = marks
            return MATCHED_OK

        elif op == OPCODE_ANY:
            # match anything (except a newline)
            # <ANY>
            if ptr >= ctx.end or rsre_char.is_linebreak(ctx.str(ptr)):
                return
            ptr += 1

        elif op == OPCODE_ANY_ALL:
            # match anything
            # <ANY_ALL>
            if ptr >= ctx.end:
                return
            ptr += 1

        elif op == OPCODE_ASSERT:
            # assert subpattern
            # <ASSERT> <0=skip> <1=back> <pattern>
            # the subpattern is tried 'back' characters before 'ptr';
            # 'ptr' itself is not moved by the assertion
            ptr1 = ptr - ctx.pat(ppos+1)
            if ptr1 < 0 or sre_match(ctx, ppos + 2, ptr1, marks) is None:
                return
            # continue with the marks recorded by the successful sub-match
            marks = ctx.match_marks
            ppos += ctx.pat(ppos)

        elif op == OPCODE_ASSERT_NOT:
            # assert not subpattern
            # <ASSERT_NOT> <0=skip> <1=back> <pattern>
            # unlike ASSERT, a start position before the beginning of the
            # string (ptr1 < 0) lets the negative assertion succeed
            ptr1 = ptr - ctx.pat(ppos+1)
            if ptr1 >= 0 and sre_match(ctx, ppos + 2, ptr1, marks) is not None:
                return
            ppos += ctx.pat(ppos)

        elif op == OPCODE_AT:
            # match at given position (e.g. at beginning, at boundary, etc.)
            # <AT> <code>
            if not sre_at(ctx, ctx.pat(ppos), ptr):
                return
            ppos += 1

        elif op == OPCODE_BRANCH:
            # alternation
            # <BRANCH> <0=skip> code <JUMP> ... <NULL>
            result = BranchMatchResult(ppos, ptr, marks)
            return result.find_first_result(ctx)

        elif op == OPCODE_CATEGORY:
            # seems to be never produced, but used by some tests from
            # pypy/module/_sre/test
            # <CATEGORY> <category>
            if (ptr == ctx.end or
                not rsre_char.category_dispatch(ctx.pat(ppos), ctx.str(ptr))):
                return
            ptr += 1
            ppos += 1

        elif op == OPCODE_GROUPREF:
            # match backreference
            # <GROUPREF> <groupnum>
            startptr, length = get_group_ref(marks, ctx.pat(ppos))
            if length < 0:
                return     # group was not previously defined
            if not match_repeated(ctx, ptr, startptr, length):
                return     # no match
            ptr += length
            ppos += 1

        elif op == OPCODE_GROUPREF_IGNORE:
            # match backreference (compared with match_repeated_ignore)
            # <GROUPREF_IGNORE> <groupnum>
            startptr, length = get_group_ref(marks, ctx.pat(ppos))
            if length < 0:
                return     # group was not previously defined
            if not match_repeated_ignore(ctx, ptr, startptr, length):
                return     # no match
            ptr += length
            ppos += 1

        elif op == OPCODE_GROUPREF_EXISTS:
            # conditional match depending on the existence of a group
            # <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ...
            _, length = get_group_ref(marks, ctx.pat(ppos))
            if length >= 0:
                ppos += 2                  # jump to 'codeyes'
            else:
                ppos += ctx.pat(ppos+1)    # jump to 'codeno'

        elif op == OPCODE_IN:
            # match set member (or non_member)
            # <IN> <skip> <set>
            if ptr >= ctx.end or not rsre_char.check_charset(ctx.pattern,
                                                             ppos+1,
                                                             ctx.str(ptr)):
                return
            ppos += ctx.pat(ppos)
            ptr += 1

        elif op == OPCODE_IN_IGNORE:
            # match set member (or non_member), ignoring case
            # <IN_IGNORE> <skip> <set>
            if ptr >= ctx.end or not rsre_char.check_charset(ctx.pattern,
                                                             ppos+1,
                                                             ctx.lowstr(ptr)):
                return
            ppos += ctx.pat(ppos)
            ptr += 1

        elif op == OPCODE_INFO:
            # optimization info block
            # <INFO> <0=skip> <1=flags> <2=min> ...
            # fail early if fewer than 'min' characters remain
            if (ctx.end - ptr) < ctx.pat(ppos+2):
                return
            ppos += ctx.pat(ppos)

        elif op == OPCODE_JUMP:
            # <JUMP> <skip>: relative jump inside the pattern code
            ppos += ctx.pat(ppos)

        elif op == OPCODE_LITERAL:
            # match literal string
            # <LITERAL> <code>
            if ptr >= ctx.end or ctx.str(ptr) != ctx.pat(ppos):
                return
            ppos += 1
            ptr += 1

        elif op == OPCODE_LITERAL_IGNORE:
            # match literal string, ignoring case
            # <LITERAL_IGNORE> <code>
            if ptr >= ctx.end or ctx.lowstr(ptr) != ctx.pat(ppos):
                return
            ppos += 1
            ptr += 1

        elif op == OPCODE_MARK:
            # set mark
            # <MARK> <gid>
            gid = ctx.pat(ppos)
            marks = Mark(gid, ptr, marks)
            ppos += 1

        elif op == OPCODE_NOT_LITERAL:
            # match if it's not a literal string
            # <NOT_LITERAL> <code>
            if ptr >= ctx.end or ctx.str(ptr) == ctx.pat(ppos):
                return
            ppos += 1
            ptr += 1

        elif op == OPCODE_NOT_LITERAL_IGNORE:
            # match if it's not a literal string, ignoring case
            # <NOT_LITERAL_IGNORE> <code>
            if ptr >= ctx.end or ctx.lowstr(ptr) == ctx.pat(ppos):
                return
            ppos += 1
            ptr += 1

        elif op == OPCODE_REPEAT:
            # general repeat.  in this version of the re module, all the work
            # is done here, and not on the later UNTIL operator.
            # <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail
            # FIXME: we probably need to deal with zero-width matches in here..

            # decode the later UNTIL operator to see if it is actually
            # a MAX_UNTIL or MIN_UNTIL
            untilppos = ppos + ctx.pat(ppos)
            tailppos = untilppos + 1
            # note: 'op' is rebound here to the UNTIL opcode; every path
            # below returns or raises, so the loop never sees this value
            op = ctx.pat(untilppos)
            if op == OPCODE_MAX_UNTIL:
                # the hard case: we have to match as many repetitions as
                # possible, followed by the 'tail'.  we do this by
                # remembering each state for each possible number of
                # 'item' matching.
                result = MaxUntilMatchResult(ppos, tailppos, ptr, marks)
                return result.find_first_result(ctx)

            elif op == OPCODE_MIN_UNTIL:
                # first try to match the 'tail', and if it fails, try
                # to match one more 'item' and try again
                result = MinUntilMatchResult(ppos, tailppos, ptr, marks)
                return result.find_first_result(ctx)

            else:
                raise Error("missing UNTIL after REPEAT")

        elif op == OPCODE_REPEAT_ONE:
            # match repeated sequence (maximizing regexp).
            # this operator only works if the repeated item is
            # exactly one character wide, and we're not already
            # collecting backtracking points.  for other cases,
            # use the MAX_REPEAT operator.
            # <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail
            start = ptr
            minptr = start + ctx.pat(ppos+1)
            if minptr > ctx.end:
                return    # cannot match
            ptr = find_repetition_end(ctx, ppos+3, start, ctx.pat(ppos+2))
            # when we arrive here, ptr points to the tail of the target
            # string.  check if the rest of the pattern matches,
            # and backtrack if not.
            nextppos = ppos + ctx.pat(ppos)
            result = RepeatOneMatchResult(nextppos, minptr, ptr, marks)
            return result.find_first_result(ctx)

        elif op == OPCODE_MIN_REPEAT_ONE:
            # match repeated sequence (minimizing regexp).
            # this operator only works if the repeated item is
            # exactly one character wide, and we're not already
            # collecting backtracking points.  for other cases,
            # use the MIN_REPEAT operator.
            # <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail
            # NOTE(review): the locals 'min' and 'max' below shadow the
            # builtins of the same name for the rest of this function.
            start = ptr
            min = ctx.pat(ppos+1)
            if min > 0:
                minptr = ptr + min
                if minptr > ctx.end:
                    return   # cannot match
                # count using pattern min as the maximum
                ptr = find_repetition_end(ctx, ppos+3, ptr, min)
                if ptr < minptr:
                    return   # did not match minimum number of times

            # upper limit for 'ptr': the end of the string, lowered to
            # start+max when that is smaller
            maxptr = ctx.end
            max = ctx.pat(ppos+2)
            # NOTE(review): 65535 is presumably the "no upper bound"
            # marker of the pattern compiler -- confirm against it
            if max != 65535:
                maxptr1 = start + max
                if maxptr1 <= maxptr:
                    maxptr = maxptr1
            nextppos = ppos + ctx.pat(ppos)
            result = MinRepeatOneMatchResult(nextppos, ppos+3, maxptr,
                                             ptr, marks)
            return result.find_first_result(ctx)

        else:
            raise Error("bad pattern code %d" % op)