def match ( self , txt , pnc , nxt ):

    """
    compare a punctuation mark and its context with stored patterns

    arguments:
        self  -
        txt   - list of text chars up to and including punctuation char
        pnc   - punctuation char
        nxt   - single char after punctuation

    returns:
        True on match, False otherwise
    """

    if pnc not in self.lstg:              # any stored patterns for punctuation?
        return False
    lp = self.lstg[pnc]

    # left context is limited to the last self.maxl chars of text
    txl = txt[-self.maxl:] if len(txt) > self.maxl else txt

    # build a real list: map() is lazy in Python 3, which would break
    # the len() and the slicing done on the left context below
    txs = [ x.lower() for x in txl ]      # actual left context for matching
    lt = len(txs)                         # its length

    for p in lp:                          # try matching each pattern

        if p.left is not None:
            n = len(p.left)               # assume each pattern element must match one sequence char
            if n > lt:
                continue                  # fail immediately because of impossibility of match
            t = txs if n == lt else txs[-n:]
            if not ellyWildcard.match(p.left,t,0):
                continue
            if n < lt and ellyChar.isLetterOrDigit(t[0]):
                if ellyChar.isLetterOrDigit(txs[-n-1]):
                    continue              # fail because of no break in text

        if p.right == []:                 # nothing required of right context
            return True

        pcx = p.right[0]                  # only first right pattern element is checked

        if pcx == nxt:                    # check for specific char after possible stop
            return True
        if pcx == ellyWildcard.cCAN:      # check for nonalphanumeric
            if nxt == u'' or not ellyChar.isLetterOrDigit(nxt):
                return True
        if pcx == ellyWildcard.cSPC:      # check for white space
            if nxt == u'' or nxt == u' ' or nxt == u'\n':
                return True
        # the original compared the whole list p.right to u'.', which can
        # never be equal; the pattern element pcx is what has to be tested
        if pcx == u'.':                   # check for any punctuation
            if not ellyChar.isLetterOrDigit(nxt) and not ellyChar.isWhiteSpace(nxt):
                return True

    return False
def match ( self , segm , tree ):

    """
    compare text segment against FSA patterns

    arguments:
        self  -
        segm  - segment to match against
        tree  - parse tree in which to put leaf nodes for final matches

    returns:
        longest text length matched by FSA, 0 if no match
    """

    if len(self.indx) == 0: return 0      # no matches if FSA is empty
    if len(segm) == 0: return 0           # nothing to match in empty segment

    lim = bound(segm)                     # get limit for matching

    mtl  = 0                              # accumulated match length
    mtls = 0                              # saved longest final match length

    state = 0                             # set to mandatory initial state for FSA
    stk = [ ]                             # for tracking multiple possible matches
    ls = self.indx[state]                 # links out of current state
    ix = 0                                # index of next link to try
    sg = segm[:lim]                       # text subsegment for matching
    capd = False if len(sg) == 0 else ellyChar.isUpperCaseLetter(sg[0])

    while True:                           # run FSA to find all possible matches

        nls = len(ls)                     # how many links from current state

        if ix == nls:                     # if none left, then must back up
            if len(stk) == 0: break
            r = stk.pop()                 # restore match status
            state = r[0]                  # FSA state
            ls    = r[1]                  # remaining links to check
            sg    = r[2]                  # input string
            mtl   = r[3]                  # total match length
            ix = 0
            continue

        m = 0

        while ix < nls:
            lk = ls[ix]                   # get next link at current state
            ix += 1                       # and increment link index
            if lk.patn == u'\x00':        # do state change without matching?
                m = 0                     # no match length
            else:
                bds = ellyWildcard.match(lk.patn,sg)
                if bds is None: continue
                m = bds[0]                # get match length, ignore wildcard bindings

            if lk.nxts < 0:               # final state?
                if tree.addLiteralPhraseWithSemantics(lk.catg,lk.synf,lk.semf,lk.bias,
                                                      cap=capd):  # make phrase
                    ml = mtl + m
                    # keep the longest final match; previously any later,
                    # shorter match overwrote the saved length
                    if mtls < ml: mtls = ml
                    tree.lastph.lens = mtls   # save its length

            if ix < nls:                  # any links not yet checked?
                r = [ state , ls[ix:] , sg , mtl ]
                stk.append(r)             # if so, save info for later continuation

            mtl += m                      # update match length
            break                         # leave loop at this state, go to next state
        else:
            continue                      # all patterns exhausted for state

        ix = 0
        sg = sg[m:]                       # move up in text input
        state = lk.nxts                   # next state
        if state < 0:
            ls = [ ]                      # final state has no links, forcing backup
        else:
            ls = self.indx[state]

    return mtls
def match ( self , txt , pnc , nxt ):

    """
    compare a punctuation mark and its context with stored patterns

    arguments:
        self  -
        txt   - list of text chars up to and including punctuation char
        pnc   - punctuation char
        nxt   - single char after punctuation

    returns:
        True on match, False otherwise
    """

    self.noteBracketing(pnc)              # just in case this is bracketing

    if pnc not in self.lstg:              # any stored patterns for punctuation?
        return False
    lp = self.lstg[pnc]

    # left context is limited to the last self.maxl chars of text
    txl = txt[-self.maxl:] if len(txt) > self.maxl else txt

    # build a real list: map() is lazy in Python 3, which would break
    # the len() and the slicing done on the left context below
    txs = [ x.lower() for x in txl ]      # actual left context for matching
    lt = len(txs)                         # its length

    for p in lp:                          # try matching each pattern

        if p.left is not None:
            n = len(p.left)               # assume each pattern element must match one sequence char
            if n > lt:
                continue                  # fail immediately because of impossibility of match
            if n < lt and ellyChar.isLetterOrDigit(txs[-n-1]):
                continue                  # fail because text to match is after alphanumeric
            t = txs if n == lt else txs[-n:]
            if not ellyWildcard.match(p.left,t,0):
                continue

        # in this version the right context pattern is a single string
        if p.right == u'' or p.right == nxt:  # check for specific char after possible stop
            return True
        if p.right == ellyWildcard.cCAN:  # check for nonalphanumeric
            if nxt == u'' or not ellyChar.isLetterOrDigit(nxt):
                return True
        if p.right == ellyWildcard.cSPC:  # check for white space
            if nxt == u'' or nxt == u' ' or nxt == u'\n':
                return True
        if p.right == u'.':               # check for any punctuation
            if not ellyChar.isLetterOrDigit(nxt) and not ellyChar.isWhiteSpace(nxt):
                return True

    return False
def _match ( self , rule ):

    """
    try one macro rule on the input buffer and, on a match, replace the
    matched text by the rule's rewriting (done in a single method because
    the rewriting needs the bindings produced by the match)

    arguments:
        self  -
        rule  - [ pattern , spaces , rewriting ]

    returns:
        True if matched and substituted, False otherwise
    """

    capital = self.isCapital()            # remember capitalization at start

    pattern = rule[0]                     # unpack macro substitution rule
    spaces  = rule[1]
    rewrite = rule[2]

    lim = len(self.buffer)                # limit on any expansion after match
    mbd = ellyWildcard.match(pattern,self.buffer,0,lim,spaces)
    if mbd is None:
        return False                      # no match bindings, nothing to do
    mbl = len(mbd)                        # limit on binding indices

    # build the substitution from the rewriting, where a backslash
    # introduces a single-char binding index
    # NOTE(review): \0 would select mbd[0], which is used below as the
    # matched char count rather than a bind record - confirm never used

    nothing = object()                    # marks end of rewriting
    source = iter(rewrite)
    ob = [ ]                              # for substitution output
    for c in source:
        if c != u'\\':
            ob.append(c)                  # literal char goes straight to output
            continue
        x = next(source,nothing)          # char after backslash, if any
        if x is nothing:
            ob.append(c)                  # trailing backslash is kept as is
            continue
        try:
            k = int(x)
            if k < mbl:
                r = mbd[k]                # bind record
                ob.extend(self.buffer[r[0]:r[1]])   # add bound chars to output
        except ValueError:
            ob.append(ellyChar.SPC)       # non-digit after backslash becomes space

    # put the substitution into the buffer in place of the matched chars

    nm = mbd[0]                           # number of chars matched
    self.buffer = self.buffer[nm:]        # drop them from buffer
    if self.atToken():
        self.prepend(ellyChar.SPC)        # insert space if none already there
    self.prepend(ob)                      # substitution goes at start of buffer

    if capital:                           # restore any capitalization
        self.setCapital()

    return True                           # successful match
def _match(self, rule):
    """
    apply one macro rule to the input buffer: match its pattern and, on
    success, substitute its rewriting for the matched text (both steps are
    in one method because the rewriting uses the match bindings)

    arguments:
        self  -
        rule  - [ pattern , spaces , rewriting ]

    returns:
        True if matched and substituted, False otherwise
    """
    capital = self.isCapital()       # starts with capital?

    pattern = rule[0]                # split up macro substitution rule
    spaces = rule[1]
    rewrite = rule[2]

    lim = len(self.buffer)           # limit on any expansion after match
    mbd = ellyWildcard.match(pattern, self.buffer, 0, lim, spaces)
    if mbd is None:
        return False                 # no match bindings, stop here
    mbl = len(mbd)                   # limit on wildcard bindings from match

    # compile substitution for matched macro; a backslash in the rewriting
    # is followed by a single-char binding index
    # NOTE(review): \0 would select mbd[0], which is used below as the
    # matched char count rather than a bind record - confirm never used
    me = len(rewrite)
    ob = []                          # for substitution output
    skip = False                     # True when next char was already consumed
    for mr, c in enumerate(rewrite):
        if skip:
            skip = False
            continue
        if c != '\\':
            ob.append(c)             # literal char
        elif mr + 1 < me:
            x = rewrite[mr + 1]      # index must be single digit
            try:
                k = int(x)
                if k < mbl:
                    r = mbd[k]       # get bind record
                    ob.extend(self.buffer[r[0]:r[1]])
            except ValueError:
                ob.append(ellyChar.SPC)   # otherwise treat as space
            skip = True              # skip over char after backslash
        else:
            ob.append(c)             # no index char follows, keep backslash

    # copy substitution back
    nm = mbd[0]                      # number of chars matched
    self.buffer = self.buffer[nm:]   # remove them from buffer
    if self.atToken():
        self.prepend(ellyChar.SPC)   # insert space if none already there
    self.prepend(ob)                 # put substitution at start of buffer

    if capital:                      # restore any capitalization
        self.setCapital()

    return True                      # successful match
def match(self, txt, pnc, ctx):
    """
    compare a punctuation mark and its context with a pattern

    arguments:
        self  -
        txt   - list of text chars leading up to punctuation char
        pnc   - punctuation char
        ctx   - next chars after punctuation

    returns:
        True on match, False otherwise
    """
    if matchtoo(txt, pnc, ctx):      # exception by complex match?
        return True

    sep = ctx[0] if len(ctx) > 0 else ''   # char right after punctuation
    if sep == ellyChar.THS:
        return True
    nxt = ctx[1] if len(ctx) > 1 else ''   # char checked against right pattern

    if pnc not in self.lstg:         # get stored patterns for punctuation
        return False
    lp = self.lstg[pnc]

    # count back over trailing chars that are alphanumeric or embedded
    # combining to find the range of text for all pattern matching
    ltx = len(txt)                   # current length of accumulated text so far
    nrg = 1
    while nrg <= ltx:
        c = txt[-nrg]
        if (not ellyChar.isLetterOrDigit(c) and
                not ellyChar.isEmbeddedCombining(c)):
            break
        nrg += 1
    nrg -= 1                         # end of range for all pattern matching

    # NOTE(review): when nrg is 0, txt[-0:] keeps the whole text rather
    # than nothing - confirm this is intended
    txt = txt[-nrg:]                 # reset text to limit for matching
    ltx = len(txt)                   # its new length

    for p in lp:                     # try matching each listed exception pattern
        if p.left is not None and len(p.left) > 0:
            pat = p.left
            star = pat[-1] == ellyWildcard.cALL
            n = len(pat)             # each pattern element matches one sequence char
            if star:                 # except for a final wildcard *
                n -= 1
                if ltx < n:
                    continue         # cannot match pattern properly
                pat = pat[:-1]
                t = txt[:n]          # match rest of pattern at start of text range
            else:
                if ltx < n:
                    continue         # cannot match pattern properly
                t = txt[-n:]         # match pattern at end of text range

            if not ellyWildcard.match(pat, t, 0):
                continue

            k = ltx - n              # extra chars beyond any match
            if not star and k > 0:
                if ellyChar.isLetterOrDigit(txt[-n]):
                    c = txt[-n - 1]  # char preceding matched text
                    if ellyChar.isLetterOrDigit(c) or c == '&':
                        continue     # because break in text is required

        rp = p.right
        if rp == [] or rp[0] == ellyWildcard.cALL:
            return True              # any right context is acceptable
        pcx = rp[0]                  # only first right pattern element is checked
        if pcx == nxt:               # check for specific char after possible stop
            return True
        elif pcx == ellyWildcard.cALF:   # check for alphabetic
            if ellyChar.isLetter(nxt):
                return True
        elif pcx == ellyWildcard.cDIG:   # check for numeric
            if ellyChar.isDigit(nxt):
                return True
        elif pcx == ellyWildcard.cUPR:   # check for upper case
            if ellyChar.isUpperCaseLetter(nxt):
                return True
        elif pcx == ellyWildcard.cLWR:   # check for lower case
            if ellyChar.isLowerCaseLetter(nxt):
                return True
        elif pcx == ellyWildcard.cCAN:   # check for non-alphanumeric
            # this branch previously tested isLetter(nxt), duplicating the
            # alphabetic check and contradicting its own label
            if nxt == '' or not ellyChar.isLetterOrDigit(nxt):
                return True

    return False
def match(self, segm, tree):
    """
    run a text segment through the FSA starting at state 0, adding a phrase
    to the parse tree for each final state reached

    arguments:
        self  -
        segm  - segment to match against
        tree  - parse tree in which to put leaf nodes for final matches

    returns:
        text length matched by FSA
    """
    if len(self.indx) == 0 or len(segm) == 0:
        return 0                     # FSA is empty or string is empty

    lim = bound(segm)                # text limit for matching

    state = 0                        # mandatory initial state for FSA
    links = self.indx[state]         # transitions out of current state
    text = segm[:lim]                # text still to be matched
    capd = len(text) > 0 and ellyChar.isUpperCaseLetter(text[0])

    pending = []                     # saved alternatives for backing up
    total = 0                        # match length accumulated on current path
    best = 0                         # longest final match length saved
    nxt = 0                          # index of next transition to try

    while True:                      # run FSA to find all possible matches
        count = len(links)

        if nxt == count:             # nothing left to try here, must back up
            if not pending:
                break
            state, links, text, total = pending.pop()
            nxt = 0
            continue

        step = 0                     # chars consumed by transition taken
        taken = False
        while nxt < count and not taken:
            lk = links[nxt]          # next link at current state
            nxt += 1

            first = lk.patn[0]
            if first == u'\x00':     # state change without matching
                step = 0
            elif first == ellyWildcard.cEND:
                # solitary $ fails if text continues with a letter, digit,
                # or PRME char
                if len(text) > 0 and (ellyChar.isLetterOrDigit(text[0])
                                      or text[0] == ellyChar.PRME):
                    continue
                step = 0
            else:
                bds = ellyWildcard.match(lk.patn, text)
                if bds is None:
                    continue
                step = bds[0]        # match length only, bindings are ignored

            if lk.nxts < 0:          # final state?
                if lk.nxts == -2:
                    step = 0         # last part of match not counted
                if tree.addLiteralPhraseWithSemantics(
                        lk.catg, lk.synf, lk.semf, lk.bias, cap=capd):
                    best = max(best, total + step)
                    tree.lastph.lens = best   # save its length

            if nxt < count:          # links still untried at this state?
                pending.append([state, links[nxt:], text, total])

            total += step            # update match length
            taken = True             # go on to next state

        if not taken:
            continue                 # all patterns exhausted for state

        nxt = 0
        text = text[step:]           # move up in text input
        state = lk.nxts              # next state
        links = [] if state < 0 else self.indx[state]

    return best
def match ( self , txt , pnc , ctx ):

    """
    compare a punctuation mark and its context with a pattern

    arguments:
        self  -
        txt   - list of text chars leading up to punctuation char
        pnc   - punctuation char
        ctx   - next chars after punctuation

    returns:
        True on match, False otherwise
    """

    if matchtoo(txt,pnc,ctx):             # exception by complex match?
        return True

    sep = ctx[0] if len(ctx) > 0 else ''  # char right after punctuation
    if sep == ellyChar.THS:
        return True
    nxt = ctx[1] if len(ctx) > 1 else ''  # char checked against right pattern

    if pnc not in self.lstg:              # get stored patterns for punctuation
        return False
    lp = self.lstg[pnc]

    # count back over trailing chars that are alphanumeric or embedded
    # combining to find the range of text for all pattern matching

    ltx = len(txt)                        # current length of accumulated text so far
    nrg = 1
    while nrg <= ltx:
        c = txt[-nrg]
        if not ellyChar.isLetterOrDigit(c) and not ellyChar.isEmbeddedCombining(c):
            break
        nrg += 1
    nrg -= 1                              # end of range for all pattern matching

    # NOTE(review): when nrg is 0, txt[-0:] keeps the whole text rather
    # than nothing - confirm this is intended
    txt = txt[-nrg:]                      # reset text to limit for matching
    ltx = len(txt)                        # its new length

    for p in lp:                          # try matching each listed exception pattern

        if p.left is not None and len(p.left) > 0:

            pat = p.left
            star = pat[-1] == ellyWildcard.cALL
            n = len(pat)                  # each pattern element matches one sequence char
            if star:                      # except for a final wildcard *
                n -= 1
                if ltx < n:
                    continue              # cannot match pattern properly
                pat = pat[:-1]
                t = txt[:n]               # match rest of pattern at start of text range
            else:
                if ltx < n:
                    continue              # cannot match pattern properly
                t = txt[-n:]              # match pattern at end of text range

            if not ellyWildcard.match(pat,t,0):
                continue

            k = ltx - n                   # extra chars beyond any match
            if not star and k > 0:
                if ellyChar.isLetterOrDigit(txt[-n]):
                    c = txt[-n-1]         # char preceding matched text
                    if ellyChar.isLetterOrDigit(c) or c == '&':
                        continue          # because break in text is required

        rp = p.right
        if rp == [] or rp[0] == ellyWildcard.cALL:
            return True                   # any right context is acceptable
        pcx = rp[0]                       # only first right pattern element is checked
        if pcx == nxt:                    # check for specific char after possible stop
            return True
        elif pcx == ellyWildcard.cALF:    # check for alphabetic
            if ellyChar.isLetter(nxt):
                return True
        elif pcx == ellyWildcard.cDIG:    # check for numeric
            if ellyChar.isDigit(nxt):
                return True
        elif pcx == ellyWildcard.cUPR:    # check for upper case
            if ellyChar.isUpperCaseLetter(nxt):
                return True
        elif pcx == ellyWildcard.cLWR:    # check for lower case
            if ellyChar.isLowerCaseLetter(nxt):
                return True
        elif pcx == ellyWildcard.cCAN:    # check for non-alphanumeric
            # this branch previously tested isLetter(nxt), duplicating the
            # alphabetic check and contradicting its own label
            if nxt == '' or not ellyChar.isLetterOrDigit(nxt):
                return True

    return False