def finAPO ( ss , sp ):

    """
    handle final apostrophes at the end of a word in a character stream,
    normalizing 'S endings and S' endings in place

    arguments:
        ss  - character stream (a mutable list of chars, modified in place)
        sp  - index of last char of word in stream

    returns:
        None; any effect is the in-place rewriting of one char of ss
    """

    lss = len(ss)
#   print 'finAPO lss=' , lss , ss[sp:]
    if lss > sp + 1:                       # anything at all after the word?
#       print 'ending=' , ss[sp:]
        if ellyChar.isApostrophe(ss[sp+1]):    # apostrophe right after word end?
            if lss > sp + 2:
#               print 'ss=' , ss[sp:]
                if ss[sp+2].lower() == 's':    # 'S ending (e.g. possessive)?
                    if terminate(ss,sp+3,lss): # only when a boundary follows the S
                        sp += 1
#                       print 'sp=' , sp
                        ss[sp] = "'"           # normalize apostrophe variant to ASCII '
                        return
            if ss[sp].lower() == 's' and terminate(ss,sp+2,lss):
                # S' ending (e.g. plural possessive) at a boundary
                sp += 1
                ss[sp] = Ls
                # NOTE(review): Ls is presumably a module-level constant giving
                # the replacement char for the apostrophe of S' — confirm it is
                # defined at file scope; it is not visible in this chunk
def getRules ( self , a ):

    """
    get appropriate macros for text with specified starting char

    arguments:
        self  -
        a     - first letter of current buffer contents (NOT space!)

    returns:
        a list of unpacked macro rules to try out
    """

#   print 'getRules(a=' , a , ')'
    if a == '':
        return [ ]                      # no char, no rules

    if ellyChar.isLetterOrDigit(a):     # indexed by letter or digit
        rules = self.index[ellyChar.toIndex(a)]
#       print 'index a=' , a , 'k=' , ellyChar.toIndex(a)
        wilds = self.letWx if ellyChar.isLetter(a) else self.digWx
        # NOTE(review): uniqueAdd appears to add into rules in place, i.e.
        # into the shared self.index bucket — confirm that is intended
        uniqueAdd(rules,wilds)
        uniqueAdd(rules,self.anyWx)
    elif ellyChar.isApostrophe(a):      # apostrophe has its own rule list
        rules = self.apoWx
    else:                               # everything else shares bucket 0
        rules = self.index[0]
        uniqueAdd(rules,self.anyWx)

#   print len(rules) , ' rules to check'
    return [ rule.unpack() for rule in rules ]
def finAPO(ss, sp):
    """
    handle final apostrophes at the end of a word, normalizing
    'S and S' endings in place

    arguments:
        ss  - character stream (mutable list of chars, modified in place)
        sp  - index of last char of word in stream
    """
    lss = len(ss)
    # nothing to do unless an apostrophe immediately follows the word
    if lss <= sp + 1 or not ellyChar.isApostrophe(ss[sp + 1]):
        return
    if lss > sp + 2 and ss[sp + 2].lower() == 's':
        # 'S ending: normalize the apostrophe char when S ends at a boundary
        if terminate(ss, sp + 3, lss):
            ss[sp + 1] = "'"
            return
    if ss[sp].lower() == 's' and terminate(ss, sp + 2, lss):
        # S' ending at a boundary: rewrite the apostrophe char
        ss[sp + 1] = Ls
def delimitKey ( t ):

    """
    get part of term for vocabulary table indexing that ends in
    alphanumeric or is a single nonalphanumeric
    with special stripping of 'S at the end

    arguments:
        t   - text string to scan

    returns:
        count of chars to put into search key
    """

    ln = len(t)            # number of chars in input text
    if ln == 0:
        return 0

    n = t.find(' ')        # find rough range of key for SQLite in text
    if n < 0: n = ln       # if undivided by spaces, take everything

    n -= 1                 # index of last char in range
    while n > 0:           # scan input text backwards
        c = t[n]           # check char for alphanumeric
        if ellyChar.isLetterOrDigit(c):
#           print 'n=' , n , 'c=' , c
            if n > 1:      # check for 'S as special case!
                if ( c in [ 's' , 'S' ] and
                     ellyChar.isApostrophe(t[n-1]) ):
#                   print 'drop \'S from SQLite key'
                    n -= 1 # skip the S; the outer n -= 1 then skips the apostrophe
                else:
                    break  # key ends at this alphanumeric
            else:
                break      # too close to start for a 'S ending
        n -= 1             # continue scanning backwards

    return n + 1           # to get key length ending in alphanumeric
def getRules(self, a): """ get appropriate macros for text with specified starting char arguments: self - a - first letter of current buffer contents (NOT space!) returns: a list of unpacked macro rules to try out """ # print ( 'getRules(a=' , a , ')' ) if a == '': return [] if ellyChar.isLetterOrDigit(a): k = ellyChar.toIndex(a) ls = self.index[k] # print ( 'index a=' , a , 'k=' , k ) ws = self.letWx if ellyChar.isLetter(a) else self.digWx uniqueAdd(ls, ws) uniqueAdd(ls, self.anyWx) elif ellyChar.isApostrophe(a): ls = self.apoWx else: ls = self.index[0] uniqueAdd(ls, self.anyWx) # print ( len(ls) , ' rules to check' ) return [r.unpack() for r in ls]
def simplify(self, strg):
    """
    apply inflectional stemming to string

    arguments:
        self  -
        strg  - input Unicode string

    returns:
        stemmed Unicode string
    """
    if len(strg) < 4:
        return strg                 # too short to have a removable ending
    # drop a trailing apostrophe + s directly (e.g. possessive)
    # NOTE(review): only lowercase 's' is matched here — confirm uppercase
    # input is already normalized by the caller
    if strg[-1] == "s" and ellyChar.isApostrophe(strg[-2]):
        return strg[:-2]
    # otherwise run the full inflectional stemming logic on a token
    token = ellyToken.EllyToken(strg)
    self.apply(token)
    return token.toUnicode()
def delimitKey ( t ):

    """
    get bounds of vocabulary table key for looking up a term
    starting at the front of a given text string
    with special stripping of 'S at the end

    arguments:
        t   - text string to scan

    returns:
        count of chars to take for search key
    """

    ln = len(t)            # number of chars in input text
    if ln == 0:
        return 0
    if not ellyChar.isLetterOrDigit(t[0]):
        return 1           # a single nonalphanumeric char is its own key
#   print 'delimitKey t=' , t
    k = t.find('-')        # find rough range of SQLite key in text
    n = t.find(' ')        # delimited by either a hyphen or a space
    if n < 0: n = ln       # if space, take everything
    if k > 1 and n > k:
        n = k              # hyphen delimits if it comes first
    n -= 1                 # index of last char of candidate key
#   print 'k=' , k , 'n=' , n
    while n > 0:           # scan input text backwards
        c = t[n]           # check char for alphanumeric
        if ellyChar.isLetterOrDigit(c):
#           print 'n=' , n , 'c=' , c
            if n > 1:      # check for 'S as special case!
                if ( c in [ 's' , 'S' ] and
                     ellyChar.isApostrophe(t[n-1]) ):
#                   print 'drop \'S from SQLite key'
                    n -= 1 # skip the S; the outer n -= 1 then skips the apostrophe
                else:
                    break  # key ends at this alphanumeric
            else:
                break      # too close to start for a 'S ending
        n -= 1             # continue scanning backwards
#   print 'key=' , t[:n+1]
    return n + 1           # to get key length ending in alphanumeric
def delimitKey(t):
    """
    get bounds of vocabulary table key for looking up a term
    starting at the front of a given text string
    with special stripping of 'S at the end

    arguments:
        t   - text string to scan

    returns:
        count of chars to take for search key
    """
    ln = len(t)                     # total chars available
    if ln == 0:
        return 0
    if not ellyChar.isLetterOrDigit(t[0]):
        return 1                    # single nonalphanumeric char is its own key
    hy = t.find('-')                # candidate key runs up to the first
    sp = t.find(' ')                #   hyphen or space, whichever comes first
    end = ln if sp < 0 else sp
    if hy > 1 and end > hy:
        end = hy
    i = end - 1                     # index of last char of candidate key
    while i > 0:                    # scan backwards for final alphanumeric
        c = t[i]
        if ellyChar.isLetterOrDigit(c):
            if i <= 1:
                break               # too close to start for a 'S ending
            if c in ('s', 'S') and ellyChar.isApostrophe(t[i - 1]):
                i -= 1              # strip S; bottom of loop strips apostrophe
            else:
                break               # key ends at this alphanumeric
        i -= 1
    return i + 1                    # key length ending in alphanumeric
def simplify ( self , strg ):

    """
    apply inflectional stemming to string

    arguments:
        self  -
        strg  - input Unicode string

    returns:
        stemmed Unicode string
    """

    if len(strg) < 4:
        return strg                 # too short to have a removable ending
    # NOTE(review): only lowercase 's' is matched here — confirm uppercase
    # input is already normalized by the caller
    if strg[-1] == "s" and ellyChar.isApostrophe(strg[-2]):
        return strg[:-2]            # drop trailing apostrophe + s directly
    else:
        t = ellyToken.EllyToken(strg)
        self.apply(t)               # full inflectional stemming on a token
        return t.toUnicode()
def _scanText(self, k): """ try to match in buffer regardless of word boundaries using Elly vocabulary, pattern, amd template tables an also running Elly entity extractors arguments: self - k - length of first component in buffer returns: match parameters [ text span of match , vocabulary match , suffix removed ] exceptions: ParseOverflow """ # print ( '_scanText k=' , k ) sb = self.sbu.buffer # input buffer tr = self.ptr # parse tree for results # print ( '_scanText sb=' , sb ) # initialize match status nspan = 0 # total span of match vmchs = [] # chars of vocabulary entry matched suffx = '' # any suffix removed in match d = self.rul # grammar rule definitions m = d.ptb.match(sb, tr) # try token by pattern match next # print ( 'pattern m=' , m ) if nspan < m: nspan = m # on longer match, update maximum m = d.ctb.match(sb, tr) # try multi-word template match next # print ( 'template m=' , m ) if nspan < m: nspan = m # on longer match, update maximum m = self.iex.run(sb) # try entity extractors next # print ( 'extractor m=' , m ) if nspan < m: nspan = m # on longer match, update maximum # print ( 'nspan=' , nspan, sb[:nspan] ) lm = len(sb) # scan limit # print ( 'lm=' , lm , 'm=' , m ) capd = ellyChar.isUpperCaseLetter(sb[0]) # print ( 'next component=' , sb[:k] , ', context=' , sb[k:lm] ) if self.vtb != None: # look in external dictionary, if it exists ls = list(sb[:k]) # print ( 'vtb ls 0=' , ls ) ellyChar.toLowerCaseASCII(ls) ss = ''.join(ls) # where to start for vocabulary indexing # print ( 'vtb ls 1=' , ls ) n = vocabularyTable.delimitKey(ss) # get actual indexing # print ( 'delimiting n=' , n , ':' , '<' + ss[:n] + '>' ) # print ( vocabularyTable.listDBKeys(self.vtb.cdb) ) rl = self.vtb.lookUp(sb, n) # get list of the maximum text matches # print ( 'external matches=' , len(rl) ) # print ( 'input text=' , sb ) if len(rl) > 0: # r0 = rl[0] # look at first record # print ( 'r0=' , r0 ) vmln = r0.nspan # should be same for all matches vchs = r0.vem.chs # vsfx = 
r0.suffx # # print ( 'nspan=' , vmln , vsfx ) if (vmln > nspan or vmln == nspan and vsfx == ''): nspan = vmln # keep vocabulary matches vmchs = vchs # suffx = vsfx # for r in rl: ve = r.vem # get vocabulary entry # print ( 've=' , ve ) # if ve.gen != None: print ( 've.gen=' , ve.gen ) if tr.addLiteralPhraseWithSemantics( ve.cat, ve.syf, ve.smf, ve.bia, ve.gen, len(suffx) > 0): tr.lastph.lens = nspan # char length of leaf phrase node # needed for later selection tr.lastph.krnl.cncp = ve.con if capd: tr.lastph.krnl.semf.set(0) # print ( 'vocab phr=' , tr.lastph , 'len=' , tr.lastph.lens ) if suffx != '': if ellyChar.isApostrophe(suffx[1]): tr.lastph.krnl.usen = 0 # print ( 'vocabulary m=' , vmln ) # print ( 'queue after table lookup:' , len(self.ptr.queue) ) # print ( 'vtb sb=' , sb ) # print ( 'maximum match=' , nspan ) # print ( 'next input=' , sb[:nspan] ) if nspan > 0: # any matches at all? tr.requeue() # if so, keep only longest of them # print ( 'queue after scan:' , len(self.ptr.queue) ) # print ( 'returns [' , nspan , ',' , vmchs , ',' , suffx , ']' ) return [nspan, vmchs, suffx]
def match ( patn , text , offs=0 , limt=None , nsps=0 ):

    """
    compare a pattern with wildcards to input text

    arguments:
        patn  - pattern to matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit for any matching
        nsps  - number of spaces to match in pattern

    returns:
        bindings if match is successful, None otherwise
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
            nsps   - saved count of spaces matched
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - of winding
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1
            self.nsps  = 0

        def __str__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self

            returns:
                attributes as array
            """
            return ( '[kd=' + str(self.kind)  +
                     ',ct=' + str(self.count) +
                     ',pa=' + str(self.pats)  +
                     ',tx=' + str(self.txts)  +
                     ',bd=' + str(self.bnds)  +
                     ',ns=' + str(self.nsps)  + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # four private functions using local variables of match() defined just above
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame
        arguments:
            ns  - optional initial span of text for binding
        returns:
            binding frame
        """
#       print ( "binding:",offs,ns )
        os = offs - 1
        if ns == None: ns = 1     # by default, binding is to 1 char
        if mbi == len(mbd):       # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]             # next available record
        bf[0] = os                # set binding to range of chars
        bf[1] = os + ns           #
        return bf

    def _modify ( ):
        """
        set special tag for binding
        arguments:

        """
        mbd[mbi].append(None)

    def _mark ( kind , nsp ):
        """
        set up for backing up pattern match
        arguments:
            kind  - 0=optional 1=* wildcard
            nsp   - number of spaces in pattern still unmatched
        returns:
            unwinding frame
        """
        if unj == len(unw):       # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]             # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp
        uf.txts  = offs
        uf.bnds  = mbi
        uf.nsps  = nsp
        return uf

    def _span ( typw , nsp=0 ):
        """
        count chars available for wildcard match
        arguments:
            typw - wildcard
            nsp  - spaces to be matched in pattern
        returns:
            non-negative count if any match possible, otherwise -1
        """
#       print ( "_span: typw=" , '{:04x}'.format(ord(typw)) , deconvert(typw) )
#       print ( "_span: txt @",offs,"pat @",mp,"nsp=",nsp )
#       print ( "text to span:",text[offs:] )
#       print ( "pat rest=" , patn[mp:] )
        k = minMatch(patn[mp:])   # calculate min char count to match rest of pattern
#       print ( "exclude=",k,"chars from possible span for rest of pattern" )

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findExtendedBreak(text,offs,nsp)
#       print ( mx,"chars available to scan" )
        mx -= k                   # max span reduced by exclusion
        if mx < 0: return -1      # cannot match if max span < 0
        tfn = Matching[typw]      # matchup function for wildcard type
#       print ( "text at",offs,"maximum wildcard match=",mx )
        nm = 0
        for i in range(mx):
            c = text[offs+i]      # next char in text from offset
#           print ( 'span c=' , c )
            if not tfn(c): break  # stop when it fails to match
            nm += 1
#       print ( "maximum wildcard span=",nm )
        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False  # successful pattern match yet?

    if limt == None: limt = len(text)

#   print ( 'starting match, limt=',limt,text[offs:limt],":",patn )
#   print ( 'nsps=' , nsps )

    mp = 0           # pattern index
    ml = len(patn)   # pattern match limit
    last = ''

    while True:

        ## literally match as many next chars as possible

#       print ( '---- loop mp=' , mp , 'ml=' , ml )
        while mp < ml:
            if offs >= limt:
#               print ( "offs=",offs,"limt=",limt )
                last = ''
            elif limt == 0:
                break
            else:
                last = text[offs]
                offs += 1
#           print ( 'patn=' , patn )
            mc = patn[mp]
#           print ( 'matching last=' , last, '(' , '{:04x}'.format(ord(last)) if last != '' else '-', ') at' , offs )
#           print ( 'against ' , mc , '(' , '{:04x}'.format(ord(mc)) , ')' )
            if mc != last:
                if mc != last.lower():
                    # special case: a pattern hyphen may match " - " in text
                    if mc == Hyphn and last == ' ' and limt - offs > 2:
#                       print ( 'hyphen special matching, limt=', limt , 'offs=' , offs )
#                       print ( 'text[offs:]=' , text[offs:] )
                        if text[offs] != Hyphn or text[offs+1] != ' ':
                            break
                        offs += 2
                    else:
#                       print ( 'no special matching of hyphen' )
                        break
#           print ( 'matched @mp=' , mp )
            mp += 1

        ## check whether mismatch is due to special pattern char

#       print ( 'pat @',mp,"<",ml )
#       print ( "txt @",offs,'<',limt,'last=',last )
#       print ( '@',offs,text[offs:] )

        if mp >= ml:        # past end of pattern?
            matched = True  # if so, match is made
            break

        tc = patn[mp]       # otherwise, get unmatched pattern element
        mp += 1             #
#       print ( "tc=",'{:04x}'.format(ord(tc)),deconvert(tc) )

        ## wildcard dispatch: each successful branch continues the outer loop;
        ## falling out of the chain goes to unwinding below

        if tc == cALL:      # a * wildcard?
#           print ( "ALL last=< " + last + " >" )
            if last != '': offs -= 1

            nm = _span(cALL,nsps)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1      # get new binding record
            bf[0] = offs                  # bind from current offset
            offs += nm                    # move offset past end of span
            bf[1] = offs                  # bind to new offset
#           print ( "offs=",offs,'nm=',nm )
            uf = _mark(1,nsps); unj += 1  # get new unwinding record
            uf.count = nm                 # can back up this many times on mismatch
            continue

        elif tc == cEND:    # end specification
#           print ( "END $:",last )
            if last == '':
                continue
            elif last == '-':
                offs -= 1
                continue
            elif last in [ '.' , ',' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]
                if ellyChar.isWhiteSpace(txc) or txc in Trmls or txc in Grk:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' , ellyChar.HYPH ]:
                offs -= 1
                continue
            elif not ellyChar.isText(last):
                offs -= 1
                continue

        elif tc == cANY:    # alphanumeric wildcard?
#           print ( "ANY:",last,offs )
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cCAN:    # nonalphanumeric wildcard?
#           print ( 'at cCAN' )
            if last != ellyChar.AMP:
                if last == '' or not ellyChar.isLetterOrDigit(last):
                    _bind(); mbi += 1
                    continue

        elif tc == cDIG:    # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF:    # letter wildcard?
#           print ( "ALF:",last,offs )
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cUPR:    # uppercase letter wildcard?
#           print ( "UPR:",last,'@',offs )
            if last != '' and ellyChar.isUpperCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cLWR:    # lowercase letter wildcard?
#           print ( "LWR:",last,'@',offs )
            if last != '' and ellyChar.isLowerCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC:    # space wildcard?
#           print ( "SPC:","["+last+"]" )
            if last != '' and ellyChar.isWhiteSpace(last):
                nsps -= 1
                _bind(); _modify(); mbi += 1
                continue
#           print ( 'NO space' )

        elif tc == cAPO:    # apostrophe wildcard?
#           print ( "APO: last=" , last )
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS:    # start of optional subsequence
#           print ( "SOS" )
#           print ( last,'@',offs )
            mf = _bind(0); mbi += 1       # dummy record to block
            mf[0] = -1                    #   later binding consolidation
            if last != '':
                offs -= 1                 # try for rematch
            m = mp                        # find corresponding EOS
            while m < ml:                 #
                if patn[m] == cEOS: break
                m += 1
            else:                         # no EOS?
                m -= 1                    # if so, pretend there is one anyway
            uf = _mark(0,nsps); unj += 1  # for unwinding on any later match failure
            uf.pats = m + 1               # i.e. one char past next EOS
            uf.txts = offs                # start of text before optional match
            continue

        elif tc == cEOS:    # end of optional subsequence
#           print ( "EOS" )
            if last != '':
                offs -= 1                 # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:   # spanning wildcards
#           print ( 'spanning wildcard, offs=' , offs , 'last=(' + last + ')' )
            if last != '':                # still more to match?
                offs -= 1
#               print ( 'nsps=' , nsps )
#               print ( '@' , offs , text )
                nm = _span(tc,nsps)       # maximum match possible
#               print ( 'spanning=' , nm )
                if nm == 0:               # compensate for findExtendedBreak peculiarity
                    if offs + 1 < limt and mp < ml:    # with closing ] or ) to be matched in pattern
                        if patn[mp] in Enc:            # from text input
                            nm += 1
#               print ( 'spanning=' , nm )
                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs          # bind from current offset
                    offs += nm            # move offset past end of span
                    bf[1] = offs          # bind to new offset
                    uf = _mark(1,nsps); unj += 1
                    uf.count = nm - 1     # at least one char must be matched
#                   print ( 'offs=' , offs )
                    last = text[offs] if offs < limt else ''
                    continue
#           print ( 'fail tc=' , deconvert(tc) )

        elif tc == '':
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True            # successful match
                break

        ## match failure: rewind to last match branch
        ##

#       print ( "fail - unwinding" , unj )

        while unj > 0:               # try unwinding, if possible
#           print ( "unw:",unj )
            uf = unw[unj-1]          # get most recent unwinding record
#           print ( uf )
            if uf.count <= 0:        # if available count is used up,
                unj -= 1             # go to next unwinding record
                continue
            uf.count -= 1            # decrement available count
            uf.txts -= uf.kind       # back up one char for scanning text input
            mp = uf.pats             # unwind pattern pointer
            offs = uf.txts           # unwind text input
            mbi = uf.bnds            # restore binding
            mbd[mbi-1][1] -= uf.kind # reduce span of binding if for wildcard
            nsps = uf.nsps           #
            break
        else:
#           print ( "no unwinding" )
            break                    # quit if unwinding is exhausted
#       print ( 'cnt=' , uf.count , 'off=' , offs )

    ##
    ## clean up on match mode or on no match possible
    ##

#   print ( "matched=",matched )

    if not matched: return None      # no bindings

#   print ( text,offs )

    ## consolidate contiguous bindings for subsequent substitutions

#   print ( "BEFORE consolidating consecutive bindings" )
#   print ( "bd:",len(mbd) )
#   print ( mbd[0] )
#   print ( '----' )
#   for b in mbd[1:]:
#       print ( b )

    mbdo = mbd
    lb  = -1               # binding reference
    lbd = [ 0 , -1 ]       # sentinel value, not real binding
    mbd = [ lbd ]          # initialize with new offset after matching
    mbdo.pop(0)            # ignore empty binding
    while len(mbdo) > 0:   #
        bd = mbdo.pop(0)   # get next binding
        if len(bd) > 2:    # tagged binding (space or apostrophe) kept as is
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb = -1
        elif bd[0] < 0:    # check for optional match indicator here
            lb = -1        # if so, drop from new consolidated bindings
        elif lb == bd[0]:  # check for binding continuous with previous
            lb = bd[1]     #
            lbd[1] = lb    # if so, combine with previous binding
        else:              #
            mbd.append(bd) # otherwise, add new binding
            lbd = bd       #
            lb = bd[1]     #

    mbd[0] = offs          # replace start of bindings with text length matched

#   print ( "AFTER" )
#   print ( "bd:",len(mbd) )
#   print ( mbd[0] )
#   print ( '----' )
#   for b in mbd[1:]:
#       print ( b )

    return mbd             # consolidated bindings plus new offset
def doMatchUp ( self , vcs , txs ):

    """
    match current text with vocabulary entry, possibly
    removing final inflection (this method assumes English;
    override for other languages)

    arguments:
        self  -
        vcs   - vocabulary entry chars
        txs   - text chars to be matched

    returns:
        count of txs chars matched, 0 on mismatch
    """

#   print 'match up vcs=' , vcs
    self.endg = ''            # default inflection
    lvc = len(vcs)
    ltx = len(txs)
    if ltx < lvc: return 0    # text too short to contain entry
    nr = icmpr(vcs,txs)       # do match on lists of chars
#   print 'nr=' , nr , 'nt='
#   print 'txs=' , txs , 'ltx=' , ltx
    if nr == 0:               # vocabulary entry fully matched?
        if ltx == lvc:
            return ltx        # if no more text, done
        dnc = ltx - lvc       # otherwise, check text past match
#       print 'dnc=' , dnc
        if ellyChar.isApostrophe(txs[lvc]):              # apostrophe check
            if dnc > 1 and txs[lvc+1] in [ 's' , 'S' ]:  # 'S found?
                if dnc == 2 or _terminate(txs[lvc+2]):
                    self.endg = '-\'s'                   #
                    return lvc + 2                       # if so, remove ending
                return 0
            if txs[lvc-1] in [ 's' , 'S' ]:              # S' found?
                if dnc == 1 or _terminate(txs[lvc+1]):
                    self.endg = '-\'s'                   # put in implied S!
                    return lvc + 1                       # if so, remove ending
                return 0
            return lvc                                   # break at
        if _terminate(txs[lvc]):
            return lvc        # successful match

    # alphanumeric follows match, either full or partial;
    # try inflectional stemming to align a match here

    if self.stm == None: return 0  # if no stemmer, no match possible
    k = lvc - nr + 1          # get extent of text to match against
    while k < ltx:
        if _terminate(txs[k]): break   # find current end of text to match
        k += 1
    n = k - 1
    while n > 0:
        if _terminate(txs[n]):
            n += 1
            break             # find start of stemming
        n -= 1
#   print 'k=' , k , 'n=' , n , 'nr=' , nr
    if k - n < nr:            # check if stemming could help match
        return 0
    tc = txs[k-1]             # last char at end of text to match
#   print 'tc=' , tc
    if tc != 's' and tc != 'd' and tc != 'g':
        return 0              # only -S, -ED, and -ING checked
    tw = ''.join(txs[n:k])    # segment of text for stemming
    sw = self.stm.simplify(tw)     # inflectional stemming
#   print 'sw=' , sw , 'tw=' , tw
    if len(sw) + n != lvc:    # stemmed result should now align
        return 0              #   with vocabulary entry
#   print 'nr=' , nr
    ns = 0 if nr == 0 else icmpr(vcs[-nr:],sw[-nr:])   # continue from previous match
#   print 'ns=' , ns
    if ns == 0:               # mismatch gone?
        self.endg = ( '-s'  if tc == 's' else
                      '-ed' if tc == 'd' else
                      '-ing' )     # indicate ending removed
#       print 'txs=' , txs
#       print 'ltx=' , ltx , 'endg=' , self.endg
        return k              # successful match
    return 0                  # no match by default
def match ( patn , text , offs=0 , limt=None , nsps=0 ):

    """
    compare a pattern with wildcards to input text
    (Python 2 variant of this matcher: note __unicode__ and unicode())

    arguments:
        patn  - pattern to matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit for any matching
        nsps  - number of spaces to match in pattern

    returns:
        bindings if match is successful, None otherwise
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
            nsps   - saved count of spaces matched
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - of winding
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1
            self.nsps  = 0

        def __unicode__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self

            returns:
                attributes as array
            """
            return ( '[kd=' + unicode(self.kind)  +
                     ',ct=' + unicode(self.count) +
                     ',pa=' + unicode(self.pats)  +
                     ',tx=' + unicode(self.txts)  +
                     ',bd=' + unicode(self.bnds)  +
                     ',ns=' + unicode(self.nsps)  + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # four private functions using local variables of match() defined just above
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame
        arguments:
            ns  - optional initial span of text for binding
        returns:
            binding frame
        """
#       print "binding:",offs,ns
        os = offs - 1
        if ns == None: ns = 1     # by default, binding is to 1 char
        if mbi == len(mbd):       # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]             # next available record
        bf[0] = os                # set binding to range of chars
        bf[1] = os + ns           #
        return bf

    def _modify ( ):
        """
        set special tag for binding
        arguments:

        """
        mbd[mbi].append(None)

    def _mark ( kind , nsp ):
        """
        set up for backing up pattern match
        arguments:
            kind  - 0=optional 1=* wildcard
            nsp   - number of spaces in pattern still unmatched
        returns:
            unwinding frame
        """
        if unj == len(unw):       # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]             # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp
        uf.txts  = offs
        uf.bnds  = mbi
        uf.nsps  = nsp
        return uf

    def _span ( typw , nsp=0 ):
        """
        count chars available for wildcard match
        arguments:
            typw - wildcard
            nsp  - spaces to be matched in pattern
        returns:
            non-negative count if any match possible, otherwise -1
        """
#       print "_span: typw=" , '{:04x}'.format(ord(typw)) , deconvert(typw)
#       print "_span: txt @",offs,"pat @",mp,"nsp=",nsp
#       print "text to span:",text[offs:]
#       print "pat rest=" , patn[mp:]
        k = minMatch(patn[mp:])   # calculate min char count to match rest of pattern
#       print "exclude=",k,"chars from possible span for rest of pattern"

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findExtendedBreak(text,offs,nsp)
#       print mx,"chars available to scan"
        mx -= k                   # max span reduced by exclusion
        if mx < 0: return -1      # cannot match if max span < 0
        tfn = Matching[typw]      # matchup function for wildcard type
#       print "text at",offs,"maximum wildcard match=",mx
        nm = 0
        for i in range(mx):
            c = text[offs+i]      # next char in text from offset
#           print 'span c=' , c
            if not tfn(c): break  # stop when it fails to match
            nm += 1
#       print "maximum wildcard span=",nm
        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False  # successful pattern match yet?

    if limt == None: limt = len(text)

#   print 'starting match, limt=',limt,text[offs:limt],":",patn
#   print 'nsps=' , nsps

    mp = 0           # pattern index
    ml = len(patn)   # pattern match limit
    last = ''

    while True:

        ## literally match as many next chars as possible

#       print '---- loop mp=' , mp , 'ml=' , ml
        while mp < ml:
            if offs >= limt:
#               print "offs=",offs,"limt=",limt
                last = ''
            elif limt == 0:
                break
            else:
                last = text[offs]
                offs += 1
#           print 'patn=' , patn
            mc = patn[mp]
#           print 'matching last=' , last, '(' , '{:04x}'.format(ord(last)) if last != '' else '-', ') at' , offs
#           print 'against ' , mc , '(' , '{:04x}'.format(ord(mc)) , ')'
            if mc != last:
                if mc != last.lower():
                    # special case: a pattern hyphen may match " - " in text
                    if mc == Hyphn and last == ' ' and limt - offs > 2:
#                       print 'hyphen special matching, limt=', limt , 'offs=' , offs
#                       print 'text[offs:]=' , text[offs:]
                        if text[offs] != Hyphn or text[offs+1] != ' ':
                            break
                        offs += 2
                    else:
#                       print 'no special matching of hyphen'
                        break
#           print 'matched @mp=' , mp
            mp += 1

        ## check whether mismatch is due to special pattern char

#       print 'pat @',mp,"<",ml
#       print "txt @",offs,'<',limt,'last=',last
#       print '@',offs,text[offs:]

        if mp >= ml:        # past end of pattern?
            matched = True  # if so, match is made
            break

        tc = patn[mp]       # otherwise, get unmatched pattern element
        mp += 1             #
#       print "tc=",'{:04x}'.format(ord(tc)),deconvert(tc)

        ## wildcard dispatch: each successful branch continues the outer loop;
        ## falling out of the chain goes to unwinding below

        if tc == cALL:      # a * wildcard?
#           print "ALL last=< " + last + " >"
            if last != '': offs -= 1

            nm = _span(cALL,nsps)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1      # get new binding record
            bf[0] = offs                  # bind from current offset
            offs += nm                    # move offset past end of span
            bf[1] = offs                  # bind to new offset
#           print "offs=",offs,'nm=',nm
            uf = _mark(1,nsps); unj += 1  # get new unwinding record
            uf.count = nm                 # can back up this many times on mismatch
            continue

        elif tc == cEND:    # end specification
#           print "END $:",last
            if last == '':
                continue
            elif last == '-':
                offs -= 1
                continue
            elif last in [ '.' , ',' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]
                if ellyChar.isWhiteSpace(txc) or txc in Trmls or txc in Grk:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' , ellyChar.HYPH ]:
                offs -= 1
                continue
            # NOTE(review): unlike the other match() variant in this file,
            # this one has no ellyChar.isText fallback branch here

        elif tc == cANY:    # alphanumeric wildcard?
#           print "ANY:",last,offs
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cCAN:    # nonalphanumeric wildcard?
#           print 'at cCAN'
            if last != ellyChar.AMP:
                if last == '' or not ellyChar.isLetterOrDigit(last):
                    _bind(); mbi += 1
                    continue

        elif tc == cDIG:    # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF:    # letter wildcard?
#           print "ALF:",last,offs
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cUPR:    # uppercase letter wildcard?
#           print "UPR:",last,'@',offs
            if last != '' and ellyChar.isUpperCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cLWR:    # lowercase letter wildcard?
#           print "LWR:",last,'@',offs
            if last != '' and ellyChar.isLowerCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC:    # space wildcard?
#           print "SPC:","["+last+"]"
            if last != '' and ellyChar.isWhiteSpace(last):
                nsps -= 1
                _bind(); _modify(); mbi += 1
                continue
#           print 'NO space'

        elif tc == cAPO:    # apostrophe wildcard?
#           print "APO: last=" , last
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS:    # start of optional subsequence
#           print "SOS"
#           print last,'@',offs
            mf = _bind(0); mbi += 1       # dummy record to block
            mf[0] = -1                    #   later binding consolidation
            if last != '':
                offs -= 1                 # try for rematch
            m = mp                        # find corresponding EOS
            while m < ml:                 #
                if patn[m] == cEOS: break
                m += 1
            else:                         # no EOS?
                m -= 1                    # if so, pretend there is one anyway
            uf = _mark(0,nsps); unj += 1  # for unwinding on any later match failure
            uf.pats = m + 1               # i.e. one char past next EOS
            uf.txts = offs                # start of text before optional match
            continue

        elif tc == cEOS:    # end of optional subsequence
#           print "EOS"
            if last != '':
                offs -= 1                 # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:   # spanning wildcards
#           print 'spanning wildcard, offs=' , offs , 'last=(' + last + ')'
            if last != '':                # still more to match?
                offs -= 1
#               print 'nsps=' , nsps
#               print '@' , offs , text
                nm = _span(tc,nsps)       # maximum match possible
#               print 'spanning=' , nm
                if nm == 0:               # compensate for findExtendedBreak peculiarity
                    if offs + 1 < limt and mp < ml:    # with closing ] or ) to be matched in pattern
                        if patn[mp] in Enc:            # from text input
                            nm += 1
#               print 'spanning=' , nm
                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs          # bind from current offset
                    offs += nm            # move offset past end of span
                    bf[1] = offs          # bind to new offset
                    uf = _mark(1,nsps); unj += 1
                    uf.count = nm - 1     # at least one char must be matched
#                   print 'offs=' , offs
                    last = text[offs] if offs < limt else ''
                    continue
#           print 'fail tc=' , deconvert(tc)

        elif tc == '':
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True            # successful match
                break

        ## match failure: rewind to last match branch
        ##

#       print "fail - unwinding" , unj

        while unj > 0:               # try unwinding, if possible
#           print "unw:",unj
            uf = unw[unj-1]          # get most recent unwinding record
#           print uf
            if uf.count <= 0:        # if available count is used up,
                unj -= 1             # go to next unwinding record
                continue
            uf.count -= 1            # decrement available count
            uf.txts -= uf.kind       # back up one char for scanning text input
            mp = uf.pats             # unwind pattern pointer
            offs = uf.txts           # unwind text input
            mbi = uf.bnds            # restore binding
            mbd[mbi-1][1] -= uf.kind # reduce span of binding if for wildcard
            nsps = uf.nsps           #
            break
        else:
#           print "no unwinding"
            break                    # quit if unwinding is exhausted
#       print 'cnt=' , uf.count , 'off=' , offs

    ##
    ## clean up on match mode or on no match possible
    ##

#   print "matched=",matched

    if not matched: return None      # no bindings

#   print text,offs

    ## consolidate contiguous bindings for subsequent substitutions

#   print "BEFORE consolidating consecutive bindings"
#   print "bd:",len(mbd)
#   print mbd[0]
#   print '----'
#   for b in mbd[1:]:
#       print b

    mbdo = mbd
    lb  = -1               # binding reference
    lbd = [ 0 , -1 ]       # sentinel value, not real binding
    mbd = [ lbd ]          # initialize with new offset after matching
    mbdo.pop(0)            # ignore empty binding
    while len(mbdo) > 0:   #
        bd = mbdo.pop(0)   # get next binding
        if len(bd) > 2:    # tagged binding (space or apostrophe) kept as is
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb = -1
        elif bd[0] < 0:    # check for optional match indicator here
            lb = -1        # if so, drop from new consolidated bindings
        elif lb == bd[0]:  # check for binding continuous with previous
            lb = bd[1]     #
            lbd[1] = lb    # if so, combine with previous binding
        else:              #
            mbd.append(bd) # otherwise, add new binding
            lbd = bd       #
            lb = bd[1]     #

    mbd[0] = offs          # replace start of bindings with text length matched

#   print "AFTER"
#   print "bd:",len(mbd)
#   print mbd[0]
#   print '----'
#   for b in mbd[1:]:
#       print b

    return mbd             # consolidated bindings plus new offset
def _lookUpNext(self):
    """
    look up possible next segments in input buffer by various means,
    keeping tokens only for the LONGEST segment

    arguments:
        self

    returns:
        True on successful lookup, False otherwise

    exceptions:
        ParseOverflow
    """

    self.sbu.skipSpaces()            # skip leading spaces
    s = self.sbu.buffer
    if len(s) == 0:                  # check for end of input
        return False                 # if so, done

    if self.trs is not None:         # preanalysis of number expressions
        self.trs.rewriteNumber(s)

    self.sbu.expand()                # apply macro substitutions
    s = self.sbu.buffer
    if len(s) == 0:
        return True                  # macros can empty out buffer

    k = self.sbu.findBreak()         # find extent of first component for lookup
    if k == 0:
        k = 1                        # must have at least one char in token
    kl = len(s)
    if k + 1 < kl and s[k] == '+' and s[k + 1] == ' ':
        k += 1                       # recognize possible prefix

    mr = self._scanText(k)           # text matching in various ways
    mx = mr[0]                       # overall maximum match length
    chs = mr[1]                      # any vocabulary element matched
    suf = mr[2]                      # any suffix removed in matching
    s = self.sbu.buffer

    if (k < mx or
        k == mx and suf != ''):      # next word cannot produce token as long as already seen?
        if len(chs) > 0:             # any vocabulary matches?
            self.sbu.skip(mx)        # if so, they supersede
            if suf != '':            # handle any suffix removal
                self.sbu.prepend(list(suf))
        else:
            chs = self.sbu.extract(mx)
        to = ellyToken.EllyToken(chs)
        self.ctx.addTokenToListing(to)
        if suf != '':
            if not ellyChar.isApostrophe(suf[1]):
                to.dvdd = True       # must note suffix removal for token!
        return True

    wsk = self.sbu.buffer[:k]        # first component as captured above
    cap = ellyChar.isUpperCaseLetter(wsk[0])
    rws = ''.join(wsk)
    found = self.ptr.createPhrasesFromDictionary(rws.lower(), False, cap)
    if not found:
        # retry without a trailing embedded combining char
        if k > mx and k > 1 and ellyChar.isEmbeddedCombining(rws[-1]):
            k -= 1
            rws = rws[:-1]
            found = self.ptr.createPhrasesFromDictionary(
                rws.lower(), False, cap)

    if found or mx > 0:              # match found in dictionary or by text scan
        if not found:
            k = mx                   # if by text scan, must make token longer
            rws = rws[:k]            # if mx > k
        self.sbu.skip(k)
        to = ellyToken.EllyToken(rws[:k])
        if len(suf) > 1:             # change token to show suffix properly
            cs = suf[1]              # first char in suffix after '-'
            rt = to.root             # this is a list!
            lk = -1                  # start at last char in token
            while rt[lk] != cs:
                lk -= 1
            sn = len(rt) + lk        # where to divide suffix from root
            to.root = rt[:sn]        # root without suffix
            self.sbu.prepend(suf)    # restore suffix to input for processing
        else:                        # no suffix
            chx = self.sbu.peek()    # look at next char after match
            if chx == '-':           # if hyphen, need to separate it
                self.sbu.skip()
                if ellyChar.isLetter(self.sbu.peek()):
                    self.sbu.prepend(' ')
                self.sbu.prepend('-')
        self.ctx.addTokenToListing(to)   # add token to listing for sentence
        return True

    chx = rws[0]                     # special hyphen check
    if chx == '-' and k > 1:
        if self.ptr.createPhrasesFromDictionary(chx, False, False):
            to = ellyToken.EllyToken(chx)    # treat hyphen as token
            self.ctx.addTokenToListing(to)   # add it to token list
            self.sbu.skip()                  # remove from input
            return True

    to = self._extractToken(mx)      # single-word matching with analysis and lookup
    if to is None:                   # if no match, we are done and will return
        return False if mx == 0 else True    # still success if _scanText() found something
    if self.ptr.lastph is not None:
        self.ptr.lastph.lens = to.getLength()
    self.ctx.addTokenToListing(to)   # add token to listing for sentence
    return True                      # successful lookup
def _extractToken ( self , mnl ):
    """
    extract next token from input buffer and look up in grammar table

    arguments:
        self  -
        mnl   - minimum length for any previous match

    returns:
        ellyToken on success, otherwise None

    exceptions:
        ParseOverflow
    """

    d = self.rul                     # grammar rule definitions
    tree = self.ptr                  # parse tree
    buff = self.sbu                  # input source

    try:
        w = buff.getNext()           # extract next token
        ws = u''.join(w.root)
    except ellyException.StemmingError as e:
        print >> sys.stderr , 'FATAL error' , e
        sys.exit(1)

    wcapzn = w.isCapitalized()
    wsplit = w.isSplit()

    found = False                    # FIX: must initialize; otherwise unbound below when wl <= mnl
    wl = len(ws)
    if wl > mnl:
        found = self._simpleTableLookUp(ws,tree,wsplit,wcapzn) > 0
    if wl >= mnl:
        if ws in self.rul.gtb.dctn:  # look up internally
            if tree.createPhrasesFromDictionary(ws,wsplit,wcapzn):
                found = True
    if found:                        # if any success, we are done
        return w
    if mnl > 0:
        return None                  # defer to previous lookup

    dvdd = False
    if d.man.analyze(w):             # any analysis possible?
        root = u''.join(w.root)      # if so, get parts of analysis
        tan = w.pres + [ root ] + w.sufs
        if len(w.sufs) > 0:
            sx = w.sufs[-1]
            dvdd = not ellyChar.isApostrophe(sx[1])
        while len(tan) > 0:          # and put back into input
            x = tan.pop()
            buff.prepend(x)
            buff.prepend(' ')
        w = buff.getNext()           # get token again with stemming and macros
        ws = u''.join(w.root)
        if ws[-1] == '+':            # root ends with prefix marker?
            m = d.ptb.match(w.root,tree)
            if m > 0:
                tree.lastph.bias = 2
                found = True

    if len(ws) < mnl:
        return None

    if self._simpleTableLookUp(ws,tree,False,wcapzn):  # external lookup
        found = True
    if ws in self.rul.gtb.dctn:      # internal lookup?
        if tree.createPhrasesFromDictionary(ws,wsplit,wcapzn):
            found = True
    if found:                        # if any success, we are done
        w.dvdd = dvdd
        return w

    lws = len(ws)
    if lws > 1:                      # special handling of + or -
        if ws[0] == '+' and ws[-1] != '+':
            # ws marks root with prefixes removed
            if self._simpleTableLookUp(ws[1:],tree) > 0:
                return w
        if ws[0] == '-':
            w.shortenBy(lws-1)       # -X not recognized as suffix
            cn = buff.peek()         # try processing - separately
            if ellyChar.isLetterOrDigit(cn):
                buff.prepend(' ')
            buff.prepend(ws[1:])     # put back X for further analysis

    if self.pnc.match(w.root):       # check if next token is punctuation
        if tree.addLiteralPhrase(self.pnc.catg,self.pnc.synf):
            tree.lastph.lens = w.getLength()
            tree.lastph.krnl.semf.combine(self.pnc.semf)
    else:
        tree.createUnknownPhrase(w)  # unknown type as last resort
        tree.lastph.lens = len(ws)

    return w
def _scanText ( self , k ):
    """
    try to match in buffer regardless of word boundaries
    using Elly vocabulary, pattern, and template tables and
    also running Elly entity extractors

    arguments:
        self  -
        k     - length of first component in buffer

    returns:
        match parameters [ text span of match , vocabulary match , suffix removed ]

    exceptions:
        ParseOverflow
    """

    sb = self.sbu.buffer             # input buffer
    tr = self.ptr                    # parse tree for results

    # initialize match status
    nspan = 0                        # total span of match
    vmchs = [ ]                      # chars of vocabulary entry matched
    suffx = ''                       # any suffix removed in match

    d = self.rul                     # grammar rule definitions

    m = d.ptb.match(sb,tr)           # try token by pattern match next
    if nspan < m:
        nspan = m                    # on longer match, update maximum

    m = d.ctb.match(sb,tr)           # try multi-word template match next
    if nspan < m:
        nspan = m                    # on longer match, update maximum

    m = self.iex.run(sb)             # try entity extractors next
    if nspan < m:
        nspan = m                    # on longer match, update maximum

    capd = ellyChar.isUpperCaseLetter(sb[0])

    if self.vtb is not None:         # look in external dictionary, if it exists
        ls = list(sb[:k])
        ellyChar.toLowerCaseASCII(ls)
        ss = u''.join(ls)            # where to start for vocabulary indexing
        n = vocabularyTable.delimitKey(ss)   # get actual indexing
        rl = self.vtb.lookUp(sb,n)   # get list of the maximum text matches
        if len(rl) > 0:
            r0 = rl[0]               # look at first record
            vmln = r0.nspan          # should be same for all matches
            vchs = r0.vem.chs
            vsfx = r0.suffx
            if ( vmln > nspan or
                 vmln == nspan and vsfx == '' ):
                nspan = vmln         # keep vocabulary matches
                vmchs = vchs
                suffx = vsfx
                for r in rl:
                    ve = r.vem       # get vocabulary entry
                    if tr.addLiteralPhraseWithSemantics(
                            ve.cat,ve.syf,ve.smf,ve.bia,ve.gen,len(suffx) > 0):
                        tr.lastph.lens = nspan       # char length of leaf phrase node
                                                     # needed for later selection
                        tr.lastph.krnl.cncp = ve.con
                        if capd:
                            tr.lastph.krnl.semf.set(0)
                        if suffx != '':
                            if ellyChar.isApostrophe(suffx[1]):
                                tr.lastph.krnl.usen = 0

    if nspan > 0:                    # any matches at all?
        tr.requeue()                 # if so, keep only longest of them

    return [ nspan , vmchs , suffx ]
def _lookUpNext ( self ):
    """
    look up possible next segments in input buffer by various means,
    keeping tokens only for the LONGEST segment

    arguments:
        self

    returns:
        True on successful lookup, False otherwise

    exceptions:
        ParseOverflow
    """

    self.sbu.skipSpaces()            # skip leading spaces
    s = self.sbu.buffer
    if len(s) == 0:                  # check for end of input
        return False                 # if so, done
    if self.trs != None:             # preanalysis of number expressions
        self.trs.rewriteNumber(s)
    self.sbu.expand()                # apply macro substitutions
    s = self.sbu.buffer
    if len(s) == 0:
        return True                  # macros can empty out buffer
    k = self.sbu.findBreak()         # find extent of first component for lookup
    if k == 0:
        k = 1                        # must have at least one char in token
    kl = len(s)
    if k + 1 < kl and s[k] == '+' and s[k+1] == ' ':
        k += 1                       # recognize possible prefix
    mr = self._scanText(k)           # text matching in various ways
    mx = mr[0]                       # overall maximum match length
    chs = mr[1]                      # any vocabulary element matched
    suf = mr[2]                      # any suffix removed in matching
    s = self.sbu.buffer
    if ( k < mx or k == mx and suf != '' ):
        # next word cannot produce token as long as already seen?
        if len(chs) > 0:             # any vocabulary matches?
            self.sbu.skip(mx)        # if so, they supersede
            if suf != '':            # handle any suffix removal
                self.sbu.prepend(list(suf))
        else:
            chs = self.sbu.extract(mx)
        to = ellyToken.EllyToken(chs)
        self.ctx.addTokenToListing(to)
        if suf != '':
            if not ellyChar.isApostrophe(suf[1]):
                to.dvdd = True       # must note suffix removal for token!
        return True
    wsk = self.sbu.buffer[:k]        # first component of buffer
    cap = ellyChar.isUpperCaseLetter(wsk[0])
    rws = u''.join(wsk)
    found = self.ptr.createPhrasesFromDictionary(rws.lower(),False,cap)
    if not found:
        # retry without trailing embedded combining char
        if k > mx and k > 1 and ellyChar.isEmbeddedCombining(rws[-1]):
            k -= 1
            rws = rws[:-1]
            found = self.ptr.createPhrasesFromDictionary(rws.lower(),False,cap)
    if found or mx > 0:              # match found in dictionary or by text scan
        if not found:
            k = mx                   # if by text scan, must make token longer
            rws = rws[:k]            # if mx > k
        self.sbu.skip(k)
        to = ellyToken.EllyToken(rws[:k])
        if len(suf) > 1:             # change token to show suffix properly
            cs = suf[1]              # first char in suffix after '-'
            rt = to.root             # this is a list!
            lk = -1                  # start at last char in token
            while rt[lk] != cs:
                lk -= 1
            sn = len(rt) + lk        # where to divide suffix from root
            to.root = rt[:sn]        # root without suffix
            self.sbu.prepend(suf)    # restore suffix to input for processing
        else:                        # no suffix
            chx = self.sbu.peek()    # look at next char after match
            if chx == '-':           # if hyphen, need to separate it
                self.sbu.skip()
                if ellyChar.isLetter(self.sbu.peek()):
                    self.sbu.prepend(' ')
                self.sbu.prepend('-')
        self.ctx.addTokenToListing(to)   # add token to listing for sentence
        return True
    chx = rws[0]                     # special hyphen check
    if chx == '-' and k > 1:
        if self.ptr.createPhrasesFromDictionary(chx,False,False):
            to = ellyToken.EllyToken(chx)    # treat hyphen as token
            self.ctx.addTokenToListing(to)   # add it to token list
            self.sbu.skip()                  # remove from input
            return True
    to = self._extractToken(mx)      # single-word matching with analysis and lookup
    if to == None:                   # if no match, we are done and will return
        return False if mx == 0 else True    # still success if _scanText() found something
    if self.ptr.lastph != None:
        self.ptr.lastph.lens = to.getLength()
    self.ctx.addTokenToListing(to)   # add token to listing for sentence
    return True                      # successful lookup
def match ( patn , text , offs=0 , limt=None ):
    """
    compare a pattern with wildcards to input text

    arguments:
        patn  - pattern to matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit of matching

    returns:
        bindings if match is successful, None otherwise
    """

    class Unwinding(object):
        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind  - 0=optional 1=* wildcard
            count - how many backups allowed
            pats  - saved pattern index for backup
            txts  - saved input text index
            bnds  - saved binding index
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - of winding
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1

        def __unicode__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self

            returns:
                attributes as array
            """
            return ( '[kd=' + unicode(self.kind)  +
                     ',ct=' + unicode(self.count) +
                     ',pa=' + unicode(self.pats)  +
                     ',tx=' + unicode(self.txts)  +
                     ',bd=' + unicode(self.bnds)  + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]    # stack for pattern match bindings (first usable index = 1)
    mbi = 1        # current binding index
    unw = [ ]      # stack for unwinding on match failure
    unj = 0        # current unwinding index

    ##
    # three private functions using local variables of match()
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame

        arguments:
            ns  - optional initial span of text for binding

        returns:
            binding frame
        """
        os = offs - 1                # offs was already advanced past matched char
        if ns == None: ns = 1        # by default, binding is to 1 char
        if mbi == len(mbd):          # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]                # next available record
        bf[0] = os                   # set binding to range of chars
        bf[1] = os + ns              #
        return bf

    def _modify ( ):
        """
        set special tag for binding

        arguments:

        """
        mbd[mbi].append(None)

    def _mark ( kind ):
        """
        set up for backing up pattern match

        arguments:
            kind  - 0=optional 1=* wildcard

        returns:
            unwinding frame
        """
        if unj == len(unw):          # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]                # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp
        uf.txts  = offs
        uf.bnds  = mbi
        return uf

    def _span ( typw ):
        """
        count chars available for wildcard match

        arguments:
            typw - wildcard

        returns:
            non-negative count if any match possible, otherwise -1
        """
        k = minMatch(patn[mp:])      # calculate min char count to match rest of pattern
        # calculate maximum chars a wildcard can match
        mx = ellyChar.findBreak(text,offs) - k   # max span reduced by exclusion
        if mx < 0: return -1         # cannot match if max span < 0
        tfn = Matching[typw]         # char type matching a wildcard
        nm = 0
        for i in range(mx):
            c = text[offs+i]         # next char in text from offset
            if not tfn(c): break     # stop when it fails to match
            nm += 1
        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False                  # successful pattern match?

    if limt == None: limt = len(text)

    mp = 0                           # pattern index
    ml = len(patn)                   # pattern match limit

    while True:

        ## literally match as many next chars as possible

        while mp < ml:
            if offs >= limt:
                last = ''            # no more text to scan
            else:
                last = text[offs].lower()
                offs += 1
            if patn[mp] != last: break
            mp += 1

        ## check whether mismatch is due to special pattern char

        if mp >= ml:                 # past end of pattern?
            matched = True           # if so, match is made
            break

        tc = patn[mp]                # otherwise, get unmatched pattern element
        mp += 1                      #

        if tc == cALL:               # a * wildcard?
            if last != '': offs -= 1
            nm = _span(cALL)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1 # get new binding record
            bf[0] = offs             # bind from current offset
            offs += nm               # move offset past end of span
            bf[1] = offs             # bind to new offset
            uf = _mark(1); unj += 1  # get new unwinding record
            uf.count = nm            # can back up this many times on mismatch
            continue

        elif tc == cEND:             # end specification
            if last == '':
                continue
            elif last in [ '.' , ',' , '-' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]
                if ellyChar.isWhiteSpace(txc) or txc in Trmls:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' ]:
                offs -= 1
                continue
            # otherwise fall through to unwinding below

        elif tc == cANY:             # alphanumeric wildcard?
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cDIG:             # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF:             # letter wildcard?
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC:             # space wildcard?
            if last != '' and ellyChar.isWhiteSpace(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cAPO:             # apostrophe wildcard?
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS:             # start of optional subsequence
            mf = _bind(0); mbi += 1  # dummy record to block
            mf[0] = -1               # later binding consolidation
            if last != '': offs -= 1 # try for rematch
            m = mp                   # find corresponding EOS
            while m < ml:            #
                if patn[m] == cEOS: break
                m += 1
            else:                    # no EOS?
                m -= 1               # if so, pretend there is one anyway
            uf = _mark(0); unj += 1  # for unwinding on any later match failure
            uf.pats = m + 1          # i.e. one char past next EOS
            uf.txts = offs           # start of text before optional match
            continue

        elif tc == cEOS:             # end of optional subsequence
            if last != '': offs -= 1 # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:   # spanning wildcards
            if last != '':           # still more to match?
                offs -= 1
                nm = _span(tc)       # maximum match possible
                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs     # bind from current offset
                    offs += nm       # move offset past end of span
                    bf[1] = offs     # bind to new offset
                    uf = _mark(1); unj += 1
                    uf.count = nm - 1    # at least one char must be matched
                    continue

        elif tc == '':               # end of pattern
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True       # successful match
                break

        ## match failure: rewind to last match branch

        while unj > 0:               # try unwinding, if possible
            uf = unw[unj-1]          # get most recent unwinding record
            if uf.count <= 0:        # if available count is used up,
                unj -= 1             # go to next unwinding record
                continue
            uf.count -= 1            # decrement available count
            uf.txts -= uf.kind       # back up one char for scanning text input
            mp = uf.pats             # unwind pattern pointer
            offs = uf.txts           # unwind text input
            mbi = uf.bnds            # restore binding
            mbd[mbi-1][1] -= uf.kind # reduce span of binding if for wildcard
            break
        else:
            break                    # quit if unwinding is exhausted

    ##
    ## clean up on match mode or on no match possible
    ##

    if not matched:
        return None                  # no bindings

    ## consolidate contiguous bindings for subsequent substitutions

    mbdo = mbd
    lb = -1                          # binding reference
    lbd = [ 0 , -1 ]                 # sentinel value, not real binding
    mbd = [ lbd ]                    # initialize with new offset after matching
    mbdo.pop(0)                      # ignore empty binding
    while len(mbdo) > 0:             #
        bd = mbdo.pop(0)             # get next binding
        if len(bd) > 2:              # specially tagged binding stays as is
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb = -1
        elif bd[0] < 0:              # check for optional match indicator here
            lb = -1                  # if so, drop from new consolidated bindings
        elif lb == bd[0]:            # check for binding continuous with previous
            lb = bd[1]               #
            lbd[1] = lb              # if so, combine with previous binding
        else:                        #
            mbd.append(bd)           # otherwise, add new binding
            lbd = bd                 #
            lb = bd[1]               #

    mbd[0] = offs                    # replace start of bindings with text length matched

    return mbd                       # consolidated bindings plus new offset
def _extractToken(self, mnl):
    """
    extract next token from input buffer and look up in grammar table

    arguments:
        self -
        mnl  - minimum length for any previous match

    returns:
        ellyToken on success, otherwise None

    exceptions:
        ParseOverflow
    """

    d = self.rul                     # grammar rule definitions
    tree = self.ptr                  # parse tree
    buff = self.sbu                  # input source

    try:
        w = buff.getNext()           # extract next token
        ws = ''.join(w.root)
    except ellyException.StemmingError as e:
        print('FATAL error', e, file=sys.stderr)    # report reason before exiting
        sys.exit(1)

    wcapzn = w.isCapitalized()
    wsplit = w.isSplit()

    found = False                    # FIX: must initialize; otherwise unbound below when wl <= mnl
    wl = len(ws)
    if wl > mnl:
        found = self._simpleTableLookUp(ws, tree, wsplit, wcapzn) > 0
    if wl >= mnl:
        if ws in self.rul.gtb.dctn:  # look up internally
            if tree.createPhrasesFromDictionary(ws, wsplit, wcapzn):
                found = True
    if found:                        # if any success, we are done
        return w
    if mnl > 0:
        return None                  # defer to previous lookup

    dvdd = False
    if d.man.analyze(w):             # any analysis possible?
        root = ''.join(w.root)       # if so, get parts of analysis
        tan = w.pres + [root] + w.sufs
        if len(w.sufs) > 0:
            sx = w.sufs[-1]
            dvdd = not ellyChar.isApostrophe(sx[1])
        while len(tan) > 0:          # and put back into input
            x = tan.pop()
            buff.prepend(x)
            buff.prepend(' ')
        w = buff.getNext()           # get token again with stemming and macros
        ws = ''.join(w.root)
        if ws[-1] == '+':            # root ends with prefix marker?
            m = d.ptb.match(w.root, tree)
            if m > 0:
                tree.lastph.bias = 2
                found = True

    if len(ws) < mnl:
        return None

    if self._simpleTableLookUp(ws, tree, False, wcapzn):    # external lookup
        found = True
    if ws in self.rul.gtb.dctn:      # internal lookup?
        if tree.createPhrasesFromDictionary(ws, wsplit, wcapzn):
            found = True
    if found:                        # if any success, we are done
        w.dvdd = dvdd
        return w

    lws = len(ws)
    if lws > 1:                      # special handling of + or -
        if ws[0] == '+' and ws[-1] != '+':
            # ws marks root with prefixes removed
            if self._simpleTableLookUp(ws[1:], tree) > 0:
                return w
        if ws[0] == '-':
            w.shortenBy(lws - 1)     # -X not recognized as suffix
            cn = buff.peek()         # try processing - separately
            if ellyChar.isLetterOrDigit(cn):
                buff.prepend(' ')
            buff.prepend(ws[1:])     # put back X for further analysis

    if self.pnc.match(w.root):       # check if next token is punctuation
        if tree.addLiteralPhrase(self.pnc.catg, self.pnc.synf):
            tree.lastph.lens = w.getLength()
            tree.lastph.krnl.semf.combine(self.pnc.semf)
    else:
        tree.createUnknownPhrase(w)  # unknown type as last resort
        tree.lastph.lens = len(ws)

    return w