def reMatchStatePairs(re, str):
    """Trace the match of regular expression `re` against `str` as HTML.

    The expression is compiled with compileRE(re, recordSourcePositions=1)
    and the resulting FSA is stepped over `str` one character at a time.
    For every character that is consumed, a (rem, strm, comment) triple is
    appended to the trace:

      rem     -- fsa.label, HTML-quoted, where each character whose 1-based
                 index is in the positions reported for this step is wrapped
                 in <SPAN CLASS="rematchnew">, and each character whose
                 1-based index was reported by an earlier step is wrapped in
                 <SPAN CLASS="rematch">.
      strm    -- `str`, HTML-quoted, with the prefix consumed so far wrapped
                 in <SPAN CLASS="strmatch"> (followed by an always-empty
                 <SPAN CLASS="strmatchnew">).
      comment -- a debugging string with the old/new states, the old/new
                 positions and the character index.

    Returns a tuple (pairs, result).  When every character is consumed,
    `result` is the list of current states that are in fsa.finalStates.
    When some character yields no next states, the function returns early:
    `pairs` holds only the steps before the failure and `result` is the
    string 'expected ...' built from the labels of the transitions leaving
    the current states.
    """
    entities = {'<': '&lt;', '>': '&gt;', '&': '&amp;'}

    def htmlQuote(text):
        # Escape the characters that are significant in HTML text so that
        # the regular expression and the input cannot break the markup.
        return ''.join([entities.get(ch, ch) for ch in text])

    pairs = []
    fsa = compileRE(re, recordSourcePositions=1)
    states = fsa.epsilonClosure(fsa.initialState)
    positions = []  # todo: everything that starts here?
    for i, inputChar in enumerate(str):
        # todo: factor the following block with fsa.nextStateSet
        # updatePositions fills newPositions in place; presumably these are
        # 1-based source positions within fsa.label -- confirm against
        # updatePositions.
        newPositions = []
        newStates = updatePositions(fsa, states, inputChar, newPositions)
        if not newStates:
            # we ran out of matches
            # todo: show in red where the match stopped, as in the textual
            # version
            expected = None
            for state in states:
                for t in fsa.transitionsFrom(state):
                    label = t[2]
                    if expected:
                        expected = expected + label
                    else:
                        expected = label
            return pairs, 'expected %s' % expected

        srcLabel = fsa.label
        # todo: could color newly matched states in a different color
        fragments = []
        for j, srcChar in enumerate(srcLabel):
            quoted = htmlQuote(srcChar)
            if j + 1 in newPositions:
                fragments.append('<SPAN CLASS="rematchnew">%s</SPAN>' % quoted)
            elif j + 1 in positions:
                fragments.append('<SPAN CLASS="rematch">%s</SPAN>' % quoted)
            else:
                fragments.append(quoted)
        rem = ''.join(fragments)

        s0, s1, s2 = htmlQuote(str[:i + 1]), '', htmlQuote(str[i + 1:])
        strm = '<SPAN CLASS="strmatch">%s</SPAN><SPAN CLASS="strmatchnew">%s</SPAN>%s' % (s0, s1, s2)
        # The comment reports `positions` as it was before this step's
        # positions are merged in below.
        comment = "states: %s -> %s; positions: %s -> %s; index = %d" % (states, newStates, positions, newPositions, i)
        pairs.append((rem, strm, comment))
        states = newStates
        positions += newPositions
    return pairs, [s for s in states if s in fsa.finalStates]
def traceREStates(re, str, trace=1):
    """Print a textual trace of matching regular expression `re` against `str`.

    The expression is compiled with compileRE(re, recordSourcePositions=1)
    and the FSA is stepped over `str` one character at a time.

    After each character that is consumed, and only when `trace` is true,
    a line "<label> matches <str with '.' after the consumed prefix>" is
    printed, where <label> is fsaLabelWithCursor(fsa, newStates).

    At the first character that yields no next states, a line
    "<label> stops matching at <str with '.' before that character>
    ; expected <set>" is printed (regardless of `trace`) and the trace
    stops.  <set> is the union of the non-empty labels on the transitions
    leaving the current states.

    Returns None; all output goes to standard output.
    """
    fsa = compileRE(re, recordSourcePositions=1)
    states = fsa.epsilonClosure(fsa.initialState)
    for i, inputChar in enumerate(str):
        newStates = fsa.nextStateSet(states, inputChar)
        if not newStates:
            expected = CharacterSet([])
            for s0 in states:
                for _, _, label in fsa.transitionsFrom(s0):
                    # Skip transitions with an empty/false label.
                    if label:
                        expected = expected.union(label)
            # One pre-formatted argument keeps the output identical under
            # the Python 2 print statement and the Python 3 print function.
            print('%s stops matching at %s ; expected %s' % (
                fsaLabelWithCursor(fsa, states),
                str[:i] + '.' + str[i:],
                expected))
            break
        if trace:
            print('%s matches %s' % (
                fsaLabelWithCursor(fsa, newStates),
                str[:i + 1] + '.' + str[i + 1:]))
        states = newStates
def simplify(str):
    """Return `str` compiled, minimized and decompiled back to a pattern.

    The expression is compiled with compileRE, the resulting FSA is
    minimized, and decompileFSA turns it back into text; every occurrence
    of '?*' in that text is then replaced by '*'.

    NOTE(review): a later definition of simplify in this file rebinds the
    name, so this version (with the workaround) is not the one callers get.
    """
    minimalFSA = compileRE(str).minimized()
    decompiled = decompileFSA(minimalFSA)
    # replace() is workaround for bug in simplify
    return decompiled.replace('?*', '*')
def simplify(str):
    """Return `str` compiled, minimized and decompiled back to a pattern.

    The expression is compiled with compileRE, the resulting FSA is
    minimized, and the result of decompileFSA on it is returned unchanged.

    NOTE(review): this definition replaces an earlier simplify in this file
    that additionally rewrote '?*' to '*' as a workaround; confirm which of
    the two behaviours is intended.
    """
    minimalFSA = compileRE(str).minimized()
    return decompileFSA(minimalFSA)