def run ( self , segm ):

    """
    execute each extractor and store results

    arguments:
        self  -
        segm  - input buffer

    returns:
        number of chars matched on success, 0 otherwise
    """

    mx = 0
    ms = [ ]
    capd = ellyChar.isUpperCaseLetter(segm[0])
    for xr in self.exs:              # try each extraction procedure in order
        m = xr[0](segm)              #
        if m > 0:                    # match?
            if mx > m:               # if so, it has to be longer than the longest previous
                continue
            elif mx < m:             # if longer than longest previous, discard the previous
                mx = m
                ms = [ ]
            ms.append(xr[1:])        # add to match list

    nmatch = 0
    if mx > 0:                       # any matches?
        for mr in ms:                # if so, make phrases for them
            sbs = mr[2] if len(mr) > 2 else noBits
            bia = mr[3] if len(mr) > 3 else 0
            if self.ptr.addLiteralPhraseWithSemantics(mr[0],mr[1],sbs,bia,None,False,capd):
                self.ptr.lastph.lens = mx
                nmatch += 1
    return mx if nmatch > 0 else 0
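# The loop above unpacks each self.exs entry by position: xr[0] is the
# extraction procedure itself, and xr[1:] supplies the parse tree arguments,
# i.e. syntactic category, syntactic features, then optional semantic
# features and bias. A hedged sketch of one such table, with all entry
# names hypothetical:
#
#   exs = [
#       ( dateProc , 'date' , dateFeatures ) ,              # semf and bias default
#       ( acronym  , 'unkn' , unknFeatures , noBits , 1 )   # all five positions given
#   ]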
def acronym ( buffr ):

    """
    recognize parenthesized introduction of acronym in text

    arguments:
        buffr - current contents as list of chars

    returns:
        char count matched on success, 0 otherwise
    """

    lb = len(buffr)
    if lb > Lmax: lb = Lmax
    if lb < Lmin or buffr[0] != '(':
        return 0
    nu = 0                           # uppercase count
    ib = 1
    while ib < lb:
        bc = buffr[ib]
        ib += 1
        if bc == ')': break
        if not ellyChar.isLetter(bc):
            return 0
        if ellyChar.isUpperCaseLetter(bc):
            nu += 1
    else:
        return 0                     # must have enclosing ')'
    if ib < Lmin or ib - 2*nu > 0:
        return 0
    if len(buffr) > ib and ellyChar.isLetterOrDigit(buffr[ib]):
        return 0
    return ib
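# A hedged check of acronym() as defined above, assuming the module globals
# Lmin and Lmax (values not shown in this listing) allow spans of length 6:

print(acronym(list('(NASA) was created in 1958')))   # expect 6: '(NASA)' plus required break
print(acronym(list('(Nasa) was created in 1958')))   # expect 0: fails the ib - 2*nu majority test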
def matchtoo ( txt , pnc , ctx ):

    """
    complex checks - currently only for rightmost period of A.M. or P.M.

    arguments:
        txt   - list of text chars leading up to punctuation char
        pnc   - punctuation char
        ctx   - list of chars in context after punctuation

    returns:
        True on match, False otherwise
    """

    ln = len(txt)
    nxt = ctx[0] if len(ctx) > 0 else ''
    if pnc != '.' or not ellyChar.isWhiteSpace(nxt) or ln < 5:
        return False
    if not txt[-1] in ['M','m'] or txt[-2] != '.' or \
       not txt[-3] in ['P','p','A','a'] or txt[-4] != ' ':
        return False
    ch = txt[-5]
    if ellyChar.isDigit(ch):         # only 1 digit will be checked here!
        return True                  # erring on the side of not breaking the sentence
    elif not ellyChar.isLetter(ch):
        return False

    #
    # the following code is needed only when number transforms are turned off
    #

    nn = 6
    while nn <= ln and ellyChar.isLetter(txt[-nn]):
        nn += 1
    if nn < 3 or nn > 6:
        return False
    elif nn > ln:
        if not txt[-nn] in [ ' ' , '-' ]:
            return False
    wd = ''.join(txt[:-nn]).lower()
    if wd in [ 'one' , 'two'  , 'three' , 'four' , 'five'   , 'six'    ,
               'seven' , 'eight' , 'nine' , 'ten' , 'eleven' , 'twelve' ]:
        if len(ctx) < 2 or ellyChar.isUpperCaseLetter(ctx[1]):
            return False
        else:
            return True
    else:
        return False
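# A hedged check of the digit branch above, the only path that does not
# depend on the number-word scan (ellyChar is assumed imported as elsewhere
# in this listing):

print(matchtoo(list('at 9 P.M'), '.', [' ', 'T', 'h', 'e']))   # expect True: digit '9' before ' P.M.'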
def _planAhead ( buf ):

    """
    check for possible problems in the next scan while context
    is still available and set flags if needed

    arguments:
        buf - buffer to be scanned
    """

    global _toscan

    nsk = 0                          # total skip count
    lb = len(buf)
    if lb > 4:
        if buf[0] == '(':            # skip initial '('
            nsk += 1
            buf = buf[1:]
        if buf[0] == '"':            # skip initial '"'
            nsk += 1
            buf = buf[1:]
        lb -= nsk
    nix = 0                          # scan count
    if lb > 8:
        for chx in buf:              # go to first non-letter
            if not ellyChar.isLetter(chx):
                if ellyChar.isWhiteSpace(chx):
                    break            # must be space
                return
            nix += 1
        sst = ''.join(buf[:nix]).lower()
        if not sst in _det: return   # must find determiner
        nix += 1                     # skip space
        if ellyChar.isUpperCaseLetter(buf[nix]):
            nix += 1                 # skip first letter
            buf = buf[nix:]
            for ch in buf:           # go to next non-letter
                if not ellyChar.isLetter(ch):
                    if ellyChar.isWhiteSpace(ch):
                        break
                    return
                nix += 1
            _toscan = lb + nsk - nix
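# A hedged sketch of the side effect, using only the module-level names _det
# and _toscan shown above; the contents of _det are assumed to be lowercase
# determiners, which is not confirmed by this listing:
#
#   _planAhead(list('(The President went home today)'))
#   # if 'the' is in _det, _toscan is left marking how many chars remain
#   # after the capitalized word, so a later scan will not split that word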
def title ( buffr ):

    """
    recognize double-quoted title in text

    arguments:
        buffr - current contents as list of chars

    returns:
        char count matched on success, 0 otherwise
    """

    lb = len(buffr)
    if lb > Tmax: lb = Tmax
    if lb < Tmin:
        return 0
    qm = buffr[0]
    if qm != aDQ and qm != lDQ:
        return 0
    ib = 1
    while ib < lb:
        bc = buffr[ib]
        ib += 1
        if bc == rDQ: break
        if not ellyChar.isUpperCaseLetter(bc):
            return 0
        while ib < lb:
            bc = buffr[ib]
            ib += 1
            if bc == ' ': break
            if qm == aDQ:
                if bc == aDQ: break
            else:
                if bc == rDQ: break
            if bc in [ '!' , '?' ]:
                return 0
        if bc == rDQ or bc == aDQ:
            break
    else:
        return 0                     # must have enclosing rDQ or aDQ
    if ib < Tmin:
        return 0
    if len(buffr) > ib and ellyChar.isLetterOrDigit(buffr[ib]):
        return 0
    return ib
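# A hedged check of title(), assuming Tmin and Tmax admit an 11-char span and
# that aDQ is the ASCII double quote with lDQ/rDQ the curly left and right
# quotes (the names suggest this, but the constants are not shown here):

print(title(list('"Moby Dick" is long')))   # expect 11: quoted, capitalized title
print(title(list('"moby dick" is long')))   # expect 0: each word must start uppercase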
def infer ( tok ):

    """
    infer a token as a possible name component with side effect
    of converting it to lowercase ASCII

    arguments:
        tok - token as list of chars

    returns:
        True if inferred, False otherwise
    """

    nch = len(tok)
    if (nch < 5 or
        not ellyChar.isUpperCaseLetter(tok[0]) or
        len(ellyConfiguration.digraph) == 0):
        return False
    ellyChar.toLowerCaseASCII(tok,True)
    toks = ''.join(tok)
    miss = 0
    last = ''
    for i in range(1,nch):           # check plausibility of all digraphs
        digr = toks[i-1:i+1]
        if digr == last or not digr in ellyConfiguration.digraph:
            miss += 1
        last = digr
    if nch < 7:
        return miss == 0
    else:
        return miss <= 1
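# A hedged usage sketch: the verdict depends entirely on the digraph table
# loaded in ellyConfiguration, so the outcome shown is only what one would
# expect with a table of common English letter pairs:

tok = list('Hanson')
if infer(tok):                    # digraphs 'ha','an','ns','so','on' all plausible?
    print(''.join(tok))           # side effect: tok is now lowercase ASCII, 'hanson'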
def match ( patn , text , offs=0 , limt=None , nsps=0 ):

    """
    compare a pattern with wildcards to input text

    arguments:
        patn  - pattern to be matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit for any matching
        nsps  - number of spaces to match in pattern

    returns:
        bindings if match is successful, None otherwise
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind  - 0=optional 1=* wildcard
            count - how many backups allowed
            pats  - saved pattern index for backup
            txts  - saved input text index
            bnds  - saved binding index
            nsps  - saved count of spaces matched
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - of winding
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1
            self.nsps  = 0

        def __str__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self

            returns:
                attributes as array
            """
            return ( '[kd=' + str(self.kind)  + ',ct=' + str(self.count) +
                     ',pa=' + str(self.pats)  + ',tx=' + str(self.txts)  +
                     ',bd=' + str(self.bnds)  + ',ns=' + str(self.nsps)  + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]      # stack for pattern match bindings (first usable index = 1)
    mbi = 1          # current binding index
    unw = [ ]        # stack for unwinding on match failure
    unj = 0          # current unwinding index

    ##
    # four private functions using local variables of match() defined just above
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame

        arguments:
            ns - optional initial span of text for binding

        returns:
            binding frame
        """
        os = offs - 1
        if ns == None: ns = 1        # by default, binding is to 1 char
        if mbi == len(mbd):          # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]                # next available record
        bf[0] = os                   # set binding to range of chars
        bf[1] = os + ns              #
        return bf

    def _modify ( ):
        """
        set special tag for binding

        arguments:

        """
        mbd[mbi].append(None)

    def _mark ( kind , nsp ):
        """
        set up for backing up pattern match

        arguments:
            kind - 0=optional 1=* wildcard
            nsp  - number of spaces in pattern still unmatched

        returns:
            unwinding frame
        """
        if unj == len(unw):          # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]                # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp
        uf.txts  = offs
        uf.bnds  = mbi
        uf.nsps  = nsp
        return uf

    def _span ( typw , nsp=0 ):
        """
        count chars available for wildcard match

        arguments:
            typw - wildcard
            nsp  - spaces to be matched in pattern

        returns:
            non-negative count if any match possible, otherwise -1
        """
        k = minMatch(patn[mp:])      # calculate min char count to match rest of pattern

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findExtendedBreak(text,offs,nsp)
        mx -= k                      # max span reduced by exclusion
        if mx < 0: return -1         # cannot match if max span < 0
        tfn = Matching[typw]         # matchup function for wildcard type
        nm = 0
        for i in range(mx):
            c = text[offs+i]         # next char in text from offset
            if not tfn(c): break     # stop when it fails to match
            nm += 1
        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False                  # successful pattern match yet?

    if limt == None: limt = len(text)

    mp = 0                           # pattern index
    ml = len(patn)                   # pattern match limit
    last = ''

    while True:

        ## literally match as many next chars as possible

        while mp < ml:
            if offs >= limt:
                last = ''
            elif limt == 0:
                break
            else:
                last = text[offs]
                offs += 1
            mc = patn[mp]
            if mc != last:
                if mc != last.lower():
                    if mc == Hyphn and last == ' ' and limt - offs > 2:
                        if text[offs] != Hyphn or text[offs+1] != ' ':
                            break
                        offs += 2
                    else:
                        break
            mp += 1

        ## check whether mismatch is due to special pattern char

        if mp >= ml:                 # past end of pattern?
            matched = True           # if so, match is made
            break

        tc = patn[mp]                # otherwise, get unmatched pattern element
        mp += 1                      #

        if tc == cALL:               # a * wildcard?
            if last != '': offs -= 1
            nm = _span(cALL,nsps)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1 # get new binding record
            bf[0] = offs             # bind from current offset
            offs += nm               # move offset past end of span
            bf[1] = offs             # bind to new offset
            uf = _mark(1,nsps); unj += 1   # get new unwinding record
            uf.count = nm            # can back up this many times on mismatch
            continue

        elif tc == cEND:             # end specification
            if last == '':
                continue
            elif last == '-':
                offs -= 1
                continue
            elif last in [ '.' , ',' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]
                if ellyChar.isWhiteSpace(txc) or txc in Trmls or txc in Grk:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' , ellyChar.HYPH ]:
                offs -= 1
                continue
            elif not ellyChar.isText(last):
                offs -= 1
                continue

        elif tc == cANY:             # alphanumeric wildcard?
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cCAN:             # nonalphanumeric wildcard?
            if last != ellyChar.AMP:
                if last == '' or not ellyChar.isLetterOrDigit(last):
                    _bind(); mbi += 1
                    continue

        elif tc == cDIG:             # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF:             # letter wildcard?
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cUPR:             # uppercase letter wildcard?
            if last != '' and ellyChar.isUpperCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cLWR:             # lowercase letter wildcard?
            if last != '' and ellyChar.isLowerCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC:             # space wildcard?
            if last != '' and ellyChar.isWhiteSpace(last):
                nsps -= 1
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cAPO:             # apostrophe wildcard?
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS:             # start of optional sequence?
            mf = _bind(0); mbi += 1  # dummy record to block
            mf[0] = -1               # later binding consolidation
            if last != '':
                offs -= 1            # try for rematch
            m = mp                   # find corresponding EOS
            while m < ml:            #
                if patn[m] == cEOS: break
                m += 1
            else:                    # no EOS?
                m -= 1               # if so, pretend there is one anyway
            uf = _mark(0,nsps); unj += 1   # for unwinding on any later match failure
            uf.pats = m + 1          # i.e. one char past next EOS
            uf.txts = offs           # start of text before optional match
            continue

        elif tc == cEOS:             # end of optional sequence?
            if last != '':
                offs -= 1            # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:   # spanning wildcard?
            if last != '':           # still more to match?
                offs -= 1
                nm = _span(tc,nsps)  # maximum match possible
                if nm == 0:          # compensate for findExtendedBreak peculiarity
                    if offs + 1 < limt and mp < ml:   # with closing ] or ) to be matched
                        if patn[mp] in Enc:           # in pattern from text input
                            nm += 1
                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs     # bind from current offset
                    offs += nm       # move offset past end of span
                    bf[1] = offs     # bind to new offset
                    uf = _mark(1,nsps); unj += 1
                    uf.count = nm - 1     # at least one char must be matched
                    last = text[offs] if offs < limt else ''
                    continue

        elif tc == '':
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True       # successful match
                break

        ## match failure: rewind to last match branch
        ##

        while unj > 0:               # try unwinding, if possible
            uf = unw[unj-1]          # get most recent unwinding record
            if uf.count <= 0:        # if available count is used up,
                unj -= 1             # go to next unwinding record
                continue
            uf.count -= 1            # decrement available count
            uf.txts -= uf.kind       # back up one char for scanning text input
            mp = uf.pats             # unwind pattern pointer
            offs = uf.txts           # unwind text input
            mbi = uf.bnds            # restore binding
            mbd[mbi-1][1] -= uf.kind # reduce span of binding if for wildcard
            nsps = uf.nsps           #
            break
        else:
            break                    # quit if unwinding is exhausted

    ##
    ## clean up on match mode or on no match possible
    ##

    if not matched: return None      # no bindings

    ## consolidate contiguous bindings for subsequent substitutions

    mbdo = mbd
    lb  = -1               # binding reference
    lbd = [ 0 , -1 ]       # sentinel value, not a real binding
    mbd = [ lbd ]          # initialize with new offset after matching
    mbdo.pop(0)            # ignore empty binding
    while len(mbdo) > 0:   #
        bd = mbdo.pop(0)   # get next binding
        if len(bd) > 2:
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb = -1
        elif bd[0] < 0:    # check for optional match indicator here
            lb = -1        # if so, drop from new consolidated bindings
        elif lb == bd[0]:  # check for binding continuous with previous
            lb = bd[1]     #
            lbd[1] = lb    # if so, combine with previous binding
        else:              #
            mbd.append(bd) # otherwise, add new binding
            lbd = bd       #
            lb = bd[1]     #

    mbd[0] = offs          # replace start of bindings with text length matched

    return mbd             # consolidated bindings plus new offset
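# A hedged demonstration using the match() just defined: a pattern of plain
# literal chars needs no wildcard tables, so a successful call returns only
# the ending offset with no bindings.

print(match('abc', 'xabcy', 1))   # expect [4]: 'abc' found starting at offset 1
print(match('abz', 'xabcy', 1))   # expect None: literal mismatch, nothing to unwind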
def _scanText ( self , k ):

    """
    try to match in buffer regardless of word boundaries using
    Elly vocabulary, pattern, and template tables and also
    running Elly entity extractors

    arguments:
        self  -
        k     - length of first component in buffer

    returns:
        match parameters [ text span of match , vocabulary match , suffix removed ]

    exceptions:
        ParseOverflow
    """

    sb = self.sbu.buffer             # input buffer
    tr = self.ptr                    # parse tree for results

    # initialize match status

    nspan = 0                        # total span of match
    vmchs = [ ]                      # chars of vocabulary entry matched
    suffx = ''                       # any suffix removed in match

    d = self.rul                     # grammar rule definitions

    m = d.ptb.match(sb,tr)           # try token by pattern match next
    if nspan < m:
        nspan = m                    # on longer match, update maximum

    m = d.ctb.match(sb,tr)           # try multi-word template match next
    if nspan < m:
        nspan = m                    # on longer match, update maximum

    m = self.iex.run(sb)             # try entity extractors next
    if nspan < m:
        nspan = m                    # on longer match, update maximum

    lm = len(sb)                     # scan limit
    capd = ellyChar.isUpperCaseLetter(sb[0])

    if self.vtb != None:             # look in external dictionary, if it exists
        ls = list(sb[:k])
        ellyChar.toLowerCaseASCII(ls)
        ss = ''.join(ls)                       # where to start for vocabulary indexing
        n = vocabularyTable.delimitKey(ss)     # get actual indexing
        rl = self.vtb.lookUp(sb,n)             # get list of the maximum text matches
        if len(rl) > 0:                        #
            r0 = rl[0]                         # look at first record
            vmln = r0.nspan                    # should be same for all matches
            vchs = r0.vem.chs                  #
            vsfx = r0.suffx                    #
            if vmln > nspan or vmln == nspan and vsfx == '':
                nspan = vmln                   # keep vocabulary matches
                vmchs = vchs                   #
                suffx = vsfx                   #
                for r in rl:
                    ve = r.vem                 # get vocabulary entry
                    if tr.addLiteralPhraseWithSemantics(
                            ve.cat,ve.syf,ve.smf,ve.bia,ve.gen,len(suffx) > 0):
                        tr.lastph.lens = nspan # char length of leaf phrase node,
                                               # needed for later selection
                        tr.lastph.krnl.cncp = ve.con
                        if capd:
                            tr.lastph.krnl.semf.set(0)
                        if suffx != '':
                            if ellyChar.isApostrophe(suffx[1]):
                                tr.lastph.krnl.usen = 0

    if nspan > 0:                    # any matches at all?
        tr.requeue()                 # if so, keep only longest of them

    return [ nspan , vmchs , suffx ]
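# A hedged sketch of how the caller consumes the returned triple (compare
# _lookUpNext below); the parser instance eb is hypothetical:
#
#   nspan, vmchs, suffx = eb._scanText(k)
#   # nspan: longest span found by pattern, template, extractor, or vocabulary
#   # vmchs: chars of a winning vocabulary entry, [] if none
#   # suffx: suffix removed in the vocabulary match, '' if none
#
# Note the tie-break above: a vocabulary match of equal length wins only
# when it removed no suffix.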
def _lookUpNext ( self ):

    """
    look up possible next segments in input buffer by various means,
    keeping tokens only for the LONGEST segment

    arguments:
        self

    returns:
        True on successful lookup, False otherwise

    exceptions:
        ParseOverflow
    """

    self.sbu.skipSpaces()            # skip leading spaces
    s = self.sbu.buffer
    if len(s) == 0:                  # check for end of input
        return False                 # if so, done

    if self.trs != None:             # preanalysis of number expressions
        self.trs.rewriteNumber(s)

    self.sbu.expand()                # apply macro substitutions
    s = self.sbu.buffer

    if len(s) == 0: return True      # macros can empty out buffer

    k = self.sbu.findBreak()         # find extent of first component for lookup
    if k == 0:
        k = 1                        # must have at least one char in token

    kl = len(s)
    if k + 1 < kl and s[k] == '+' and s[k+1] == ' ':
        k += 1                       # recognize possible prefix

    mr = self._scanText(k)           # text matching in various ways
    mx  = mr[0]                      # overall maximum match length
    chs = mr[1]                      # any vocabulary element matched
    suf = mr[2]                      # any suffix removed in matching
    s = self.sbu.buffer

    if k < mx or k == mx and suf != '':
        # next word cannot produce token as long as already seen?
        if len(chs) > 0:             # any vocabulary matches?
            self.sbu.skip(mx)        # if so, they supersede
            if suf != '':            # handle any suffix removal
                self.sbu.prepend(list(suf))
        else:
            chs = self.sbu.extract(mx)
        to = ellyToken.EllyToken(chs)
        self.ctx.addTokenToListing(to)
        if suf != '':
            if not ellyChar.isApostrophe(suf[1]):
                to.dvdd = True       # must note suffix removal for token!
        return True

    wsk = self.sbu.buffer[:k]
    cap = ellyChar.isUpperCaseLetter(wsk[0])
    rws = ''.join(wsk)
    found = self.ptr.createPhrasesFromDictionary(rws.lower(),False,cap)
    if not found:
        if k > mx and k > 1 and ellyChar.isEmbeddedCombining(rws[-1]):
            k -= 1
            rws = rws[:-1]
            found = self.ptr.createPhrasesFromDictionary(rws.lower(),False,cap)

    if found or mx > 0:              # match found in dictionary or by text scan
        if not found:
            k = mx                   # if by text scan, must make token longer
            rws = rws[:k]            # if mx > k
        self.sbu.skip(k)
        to = ellyToken.EllyToken(rws[:k])
        if len(suf) > 1:             # change token to show suffix properly
            cs = suf[1]              # first char in suffix after '-'
            rt = to.root             # this is a list!
            lk = -1                  # start at last char in token
            while rt[lk] != cs:
                lk -= 1
            sn = len(rt) + lk        # where to divide suffix from root
            to.root = rt[:sn]        # root without suffix
            self.sbu.prepend(suf)    # restore suffix to input for processing
        else:                        # no suffix
            chx = self.sbu.peek()    # look at next char after match
            if chx == '-':           # if hyphen, need to separate it
                self.sbu.skip()
                if ellyChar.isLetter(self.sbu.peek()):
                    self.sbu.prepend(' ')
                self.sbu.prepend('-')
        self.ctx.addTokenToListing(to)    # add token to listing for sentence
        return True

    chx = rws[0]                     # special hyphen check
    if chx == '-' and k > 1:
        if self.ptr.createPhrasesFromDictionary(chx,False,False):
            to = ellyToken.EllyToken(chx)     # treat hyphen as token
            self.ctx.addTokenToListing(to)    # add it to token list
            self.sbu.skip()                   # remove from input
            return True

    to = self._extractToken(mx)      # single-word matching with analysis and lookup
    if to == None:                   # if no match, we are done and will return
        return False if mx == 0 else True     # still success if _scanText() found something
    if self.ptr.lastph != None:
        self.ptr.lastph.lens = to.getLength()

    self.ctx.addTokenToListing(to)   # add token to listing for sentence
    return True                      # successful lookup
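# A worked illustration of the suffix split above (values hypothetical):
# with to.root = list('walking') and suf = '-ing', cs is 'i'; scanning back
# from the end finds rt[-3] == 'i', so lk = -3 and sn = 7 + (-3) = 4,
# leaving to.root = list('walk') while '-ing' goes back into the input.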
def match ( self , segm , tree ):

    """
    compare text segment against all FSA patterns from state 0

    arguments:
        self  -
        segm  - segment to match against
        tree  - parse tree in which to put leaf nodes for final matches

    returns:
        text length matched by FSA
    """

    if len(self.indx) == 0: return 0 # no matches if FSA is empty
    if len(segm) == 0: return 0      # string is empty

    lim = bound(segm)                # get text limit for matching

    mtl  = 0                         # accumulated match length
    mtls = 0                         # saved final match length

    state = 0                        # set to mandatory initial state for FSA
    stk = [ ]                        # for tracking possible multiple matches
    ls = self.indx[state]            # for state 0!
    ix = 0                           # index into current possible transitions
    sg = segm[:lim]                  # text subsegment for matching
    capd = False if len(sg) == 0 else ellyChar.isUpperCaseLetter(sg[0])

    while True:                      # run FSA to find all possible matches

        nls = len(ls)                # how many links from current state
        if ix == nls:                # if none, then must back up
            if len(stk) == 0: break
            r = stk.pop()            # restore match status
            state = r[0]             # FSA state
            ls    = r[1]             # remaining links to check
            sg    = r[2]             # input string
            mtl   = r[3]             # total match length
            ix = 0
            continue

        m = 0
        while ix < nls:
            lk = ls[ix]              # get next link at current state
            ix += 1                  # and increment link index
            po = lk.patn[0]
            if po == '\x00':         # do state change without matching?
                m = 0                # no match length
            elif po != ellyWildcard.cEND:
                bds = ellyWildcard.match(lk.patn,sg)
                if bds == None: continue
                m = bds[0]           # get match length, ignore wildcard bindings
            elif (len(sg) > 0 and
                  (ellyChar.isLetterOrDigit(sg[0]) or sg[0] == ellyChar.PRME)):
                continue             # unmatched solitary $
            else:
                m = 0                # matched solitary $

            if lk.nxts < 0:          # final state?
                if lk.nxts == -2:
                    m = 0            # last part of match not counted
                if tree.addLiteralPhraseWithSemantics(
                        lk.catg,lk.synf,lk.semf,lk.bias,cap=capd):   # make phrase
                    ml = mtl + m
                    if mtls < ml: mtls = ml
                    tree.lastph.lens = mtls   # save its length

            if ix < nls:             # any links not yet checked?
                r = [ state , ls[ix:] , sg , mtl ]
                stk.append(r)        # if so, save info for later continuation
            mtl += m                 # update match length
            break                    # leave loop at this state, go to next state
        else:
            continue                 # all patterns exhausted for state

        ix = 0
        sg = sg[m:]                  # move up in text input
        state = lk.nxts              # next state
        if state < 0:
            ls = [ ]
        else:
            ls = self.indx[state]

    return mtls
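# A hedged note on the backtracking above: r = [ state , ls[ix:] , sg , mtl ]
# snapshots the untried links, the remaining text, and the length so far, so
# every FSA path gets explored and mtls ends up as the longest accepting
# total. Hypothetical use, with ptb an already-loaded pattern table and tree
# a parse tree:
#
#   n = ptb.match(list('3rd quarter results'), tree)   # n = longest match, or 0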
def getNext ( self ):

    """
    extract next sentence for Elly translation from input stream

    arguments:
        self

    returns:
        list of chars for next sentence on success, None on empty stream
    """

    self.resetBracketing()
    inBrkt = False

    nspc = 0                         # set space count

    sent = [ ]                       # list buffer to fill

    x = self.inp.read()
    if x == SP:
        x = self.inp.read()

    if x == END:                     # EOF check
        return None

    c  = END                         # reset
    lc = END

    self.inp.unread(x,SP)            # put first char back to restore input

    # fill sentence buffer up to next stop punctuation in input

    nAN = 0                          # alphanumeric count in sentence

    while True:

        x = self.inp.read()          # next input char

        if x == END:                 # handle any EOF
            break

        # check for table delimiters in text

        if len(sent) == 0:
            if x == '.' or x == '-':         # look for multiple '.' or '-'
                while True:                  # scan up to end of current buffering
                    y = self.inp.read()      #
                    if y != x and y != SP:   # no more delimiter chars or spaces?
                        self.inp.unread(y)   # if so, done
                        break                #
                continue                     # ignore everything seen so far

        ####################################################
        # accumulate chars and count alphanumeric and spaces
        ####################################################

        lc = c
        c  = x
        nc = self.inp.peek()
        if ellyChar.isWhiteSpace(nc): nc = SP

        if lc == SP or lc == END:    # normalize chars for proper bracketing
            if x == SQuo:            #
                x = LSQm             # a SQuo preceded by a space becomes LSQm
            elif x == DQuo:          #
                x = LDQm             # a DQuo preceded by a space becomes LDQm
        if nc == SP or nc == END:    #
            if x == SQuo:            # a SQuo followed by a space becomes RSQm
                x = RSQm             #
            elif x == DQuo:          # a DQuo followed by a space becomes RDQm
                x = RDQm             #
        elif not ellyChar.isLetterOrDigit(nc):
            if x == SQuo:            # a SQuo followed by nonalphanumeric becomes RSQm
                x = RSQm             #
            elif x == DQuo:          # a DQuo followed by nonalphanumeric becomes RDQm
                x = RDQm             #
        elif ellyChar.isWhiteSpace(c) and inBrkt:
            nspc += 1

        svBrkt = inBrkt
        inBrkt = self.checkBracketing(x)     # do bracketing check with modified chars
        if svBrkt and not inBrkt: nspc = 0

        sent.append(c)               # put original char into sentence buffer
        if ellyChar.isLetterOrDigit(c):
            nAN += 1
            continue                 # if alphanumeric, just add to sentence

        if c == SP:
            continue                 # if space, just add to sentence

        # NL will break a sentence

        if c == NL:
            sent.pop()               # remove from sentence chars
            break

        # certain Unicode punctuation will always break

        if c in Hards:
            break

        # char was not alphanumeric or space
        # look for stop punctuation exception

        cx = self.inp.preview()      # for context of match call

        if c in Stops and len(cx) > 0 and cx[0] == SP:
            if self.stpx.match(sent[:-1],c,cx):
                if self.drop:
                    sent.pop()       # remove punctuation char from sentence
                    lc = SP
                continue

        # handle any nonstandard punctuation

        exoticPunctuation.normalize(c,self.inp)

        # check for dash

        if c == '-':
            d = self.inp.read()
            if d == '-':
                while True:
                    d = self.inp.read()
                    if d != '-': break
                sent.append(c)
            self.inp.unread(d)
            continue

        # check for sentence break on punctuation

        if c in QUOs or c in RBs:

            # special check for single or double quotes or
            # bracketing, which can immediately follow stop
            # punctuation for current sentence

            if not inBrkt:
                z = self.inp.read()
                if self.shortBracketing(sent,z): break
                self.inp.unread(z)
                if z == END or ellyChar.isWhiteSpace(z) and lc in Stops:
                    if nAN > 1: break
            elif c in QUOs and lc in Stops:
                z = self.inp.read()
                if z in RBs:                   # stop+quote
                    sent.append(z)
                    y = self.inp.read()
                    if y in Stops:
                        sent.append(y)
                    elif not ellyChar.isWhiteSpace(y):
                        self.inp.unread(y)
                    inBrkt = False
                    break
                elif z in QUOs:                # stop+quote+quote
                    sent.append(z)
                    inBrkt = False
                    break
                self.inp.unread(z)
            continue

        elif not c in Stops:
            continue

        else:                        # check stopping!
            d = self.inp.read()
            if d == None: d = '!'

            # ellipsis check

            if c == '.' and c == d:
                if self.inp.peek() != c:     # look for third '.' in ellipsis
                    self.inp.unread(d)       # if none, keep only first '.'
                else:
                    self.inp.skip()          # found ellipsis
                    sent.append(d)           # complete it in sentence buffer
                    sent.append(d)           #
                    x = self.inp.peek()      # look at char after ellipsis
                    if ellyChar.isCombining(x):
                        sent.append(SP)      # if part of token, put in space as separator
                continue

            if c == ELLP:
                if ellyChar.isUpperCaseLetter(d):
                    self.inp.unread(d)       # super special case of bad punctuation
                    self.inp.unread(' ')     # put in implied period and space
                    self.inp.unread('.')     #

            # special check for multiple stops

            if d in Stops:
                while True:
                    d = self.inp.read()
                    if not d in Stops: break
                self.inp.unread(d)
                if not ellyChar.isWhiteSpace(d):
                    d = SP                   # make rightside context for stop

            # special check for blank or null after stops

            elif d != END and not ellyChar.isWhiteSpace(d):
                if self.shortBracketing(sent,d): break
                if d in self._cl and self._cl[d] == 1:
                    dn = self.inp.peek()
                    if ellyChar.isWhiteSpace(dn):
                        sent.append(d)
                        break
                self.inp.unread(d)
                continue                     # if no match for lookahead, put back

            elif d != END:
                self.inp.unread(d)           # possible stop

            # check special case of number ending in decimal point

            if c == '.':
                ixb = len(sent) - 2
                ixn = ixb + 1
                cxn = ''
                while ixn > 0:
                    ixn -= 1
                    cxn = sent[ixn]
                    if not ellyChar.isDigit(cxn): break
                if ixn < ixb and cxn in [ ' ' , '-' , '+' ]:
                    prvw = self.inp.preview()
                    if len(prvw) > 1 and not ellyChar.isUpperCaseLetter(prvw[1]):
                        continue

            # final check: is sentence long enough?

            if inBrkt:
                if c in [ ':' , ';' ] or nspc < 3:
                    sent.append(d)           # add to sentence
                    self.inp.skip()
                    nspc -= 1
                    continue

            cx = self.inp.peek()
            if cx == None: cx = '!!'

            if nAN > 1:
                break

    if sent == [ '\u2026' ]:         # special case of sentence
        return list("-.-")           # with lone ellipsis
    elif len(sent) > 0 or self.last != END:
        return sent
    else:
        return None
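# A hedged end-to-end sketch, assuming this method belongs to PyElly's
# sentence reader class wrapping a filtered char stream (class and module
# names are assumptions here):
#
#   rdr = ellySentenceReader.EllySentenceReader(inp)
#   while True:
#       sent = rdr.getNext()
#       if sent == None: break        # empty stream
#       print(''.join(sent))          # one extracted sentence at a time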
def read ( self ):

    """
    get next char from input stream with filtering

    arguments:
        self

    returns:
        single Unicode char on success, null string otherwise
    """

    while True:

        if not self._reload():       # check if buffer empty and reload if needed
            return END               # return EOF if no more chars available

        c = self.buf.pop(0)          # next raw char in buffer

        if c == SHYP:                # ignore soft hyphen
            if len(self.buf) > 0:
                if self.buf[0] == SP:
                    c = self.buf.pop(0)
            continue

        if not ellyChar.isText(c):   # unrecognizable Elly char?
            if ellyChar.isCJK(c):
                c = '_'              # special handling for Chinese
            else:
                c = NBSP             # by default, replace with no-break space

        lc = self._lc                # copy saved last char
        self._lc = c                 # set new last char

        if c == HYPH:                # special treatment for isolated hyphens
            if spc(lc) and spc(self.peek()):
                c = DASH
            break
        elif c == '.':               # check for ellipsis
            bb = self.buf
            bl = len(bb)
            if bl >= 2 and bb[0] == '.' and bb[1] == '.':
                self.buf = bb[2:]
                c = ELLP
            elif bl >= 4 and bb[0] == ' ' and bb[1] == '.' and bb[2] == ' ' and bb[3] == '.':
                self.buf = bb[4:]
                c = ELLP
            break
        elif c == RSQm:              # check for single quote
            nc = self.peek()         # look at next char
            if nc == RSQm:           # doubling of single quote?
                self.buf.pop(0)      # if so, combine two single quotes
                c = RDQm             # into one double quote
            break
        elif not ellyChar.isWhiteSpace(c):
            if ellyChar.isWhiteSpace(lc):
                self._cap = ellyChar.isUpperCaseLetter(c)
            break
        elif c == CR:                # always ignore
            continue
        elif c == NL:                # special handling of \n
            nc = self.peek()         # look at next char
            while nc == CR:
                self.buf.pop(0)      # skip over CR's
                nc = self.peek()
            if lc != NL and nc == NL:
                self.buf.pop(0)      # special case when NL can be returned
                break
            if nc == NL:             # NL followed NL?
                while nc == NL or nc == CR:
                    self.buf.pop(0)  # ignore subsequent new line chars
                    nc = self.peek()
            elif nc == END or ellyChar.isWhiteSpace(nc):
                continue             # NL followed by space is ignored
            elif nc == '.' or nc == '-':
                pass
            else:
                c = SP               # convert NL to SP if not before another NL
        else:
            c = SP                   # otherwise, convert white space to plain space
        self._cap = False
        if not ellyChar.isWhiteSpace(lc):    # preceding char was not white space?
            break                    # if so, keep space in stream

    return c                         # next filtered char
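# A hedged illustration of the filtering, assuming the class name and
# constructor from PyElly's ellyCharInputStream module (an assumption):
#
#   inp = ellyCharInputStream.EllyCharInputStream(io.StringIO('co\u00ADoperate... '))
#   # successive read() calls drop the soft hyphen, collapse runs of white
#   # space to one SP, and return the three periods as the single char ELLP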
def match(self, txt, pnc, ctx):

    """
    compare a punctuation mark and its context with a pattern

    arguments:
        self  -
        txt   - list of text chars leading up to punctuation char
        pnc   - punctuation char
        ctx   - next chars after punctuation

    returns:
        True on match, False otherwise
    """

    # print ( 'matching for txt=' , txt , 'pnc= [' , pnc , '] ctx=' , ctx )

    if matchtoo(txt, pnc, ctx):        # exception by complex match?
        return True
    # print ( 'matchtoo() returned False' )

    sep = ctx[0] if len(ctx) > 0 else ''
    if sep == ellyChar.THS:
        return True

    nxt = ctx[1] if len(ctx) > 1 else ''

    # print ( 'lstg=' , self.lstg.keys() )
    if pnc not in self.lstg:           # get stored patterns for punctuation
        return False

    lp = self.lstg[pnc]

    # print ( len(lp) , 'patterns' )

    ltx = len(txt)                     # current length of accumulated text so far
    ntr = 1
    while ntr <= ltx:
        if not ellyChar.isLetterOrDigit(txt[-ntr]):
            break
        ntr += 1
    nrg = ntr
    ntr -= 1                           # available trailing chars for wildcard * match

    while nrg <= ltx:
        c = txt[-nrg]
        if not ellyChar.isLetterOrDigit(c) and not ellyChar.isEmbeddedCombining(c):
            # print ( 'break at nrg=' , nrg , txt[-nrg] )
            break
        nrg += 1
    nrg -= 1                           # end of range for all pattern matching

    # print ( 'ntr=' , ntr , 'nrg=' , nrg )

    txt = txt[-nrg:]                   # reset text to limit for matching
    ltx = len(txt)                     # its new length

    # print ( 'txt= ' + str(txt) + ' pnc= [' + pnc + '] nxt=[' + nxt + ']' )

    for p in lp:                       # try matching each listed exception pattern

        if p.left is not None and len(p.left) > 0:

            pat = p.left
            star = pat[-1] == ellyWildcard.cALL
            n = len(pat)               # each pattern element matches one sequence char
            if star:                   # except for a final wildcard *
                # print ( 'pattern ending with *' )
                n -= 1
                # print ( 'ltx=' , ltx , 'n=' , n )
                if ltx < n:
                    continue           # cannot match pattern properly
                pat = pat[:-1]
                t = txt[:n]
            else:
                if ltx < n:
                    continue           # cannot match pattern properly
                t = txt[-n:]

            if not ellyWildcard.match(pat, t, 0):
                # print ( 'no possible pattern match' )
                continue

            k = ltx - n                # extra chars beyond any match
            # print ( 'k=' , k , 't=' , t )
            # print ( 'txt=' , txt )
            # print ( 'pat=' , '[' + ellyWildcard.deconvert(pat) + ']' )
            # print ( 'matches' , n , 'chars' )
            if not star and k > 0:
                # print ( 'check text before [' , txt[-n] , ']' )
                if ellyChar.isLetterOrDigit(txt[-n]):
                    c = txt[-n - 1]
                    # print ( 'preceding= [' , c , ']' )
                    if ellyChar.isLetterOrDigit(c) or c == '&':
                        continue       # because break in text is required

        # print ( 'pat=' , ellyWildcard.deconvert(p.left) )
        # print ( 'n=' , n , 'ltx=' , ltx )
        # print ( 'txt=' , txt )

        # nc = '\\n' if nxt == '\n' else nxt
        # print ( 'right pat=' , '[' + ellyWildcard.deconvert(p.right) + ']' )
        # print ( 'versus c=' , nc )

        rp = p.right
        if rp == [] or rp[0] == ellyWildcard.cALL:
            return True

        pcx = rp[0]
        if pcx == nxt:                     # check for specific char after possible stop
            # print ( 'right=' , nxt )
            return True
        elif pcx == ellyWildcard.cALF:     # check for alphabetic
            if ellyChar.isLetter(nxt):
                # print ( 'right is alphabetic=' , nxt )
                return True
        elif pcx == ellyWildcard.cDIG:     # check for numeric
            if ellyChar.isDigit(nxt):
                # print ( 'right is numeric=' , nxt )
                return True
        elif pcx == ellyWildcard.cUPR:     # check for upper case
            if ellyChar.isUpperCaseLetter(nxt):
                return True
        elif pcx == ellyWildcard.cLWR:     # check for lower case
            if ellyChar.isLowerCaseLetter(nxt):
                return True
        elif pcx == ellyWildcard.cCAN:     # check for non-alphanumeric
            if not ellyChar.isLetterOrDigit(nxt):
                return True

    # print ( 'no matches' )
    return False
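The ntr/nrg computation above decides how much left context is eligible for matching: ntr counts the trailing alphanumerics available to a final * wildcard, while nrg extends the range over embedded combining chars as well. A standalone sketch of just that delimitation, with str.isalnum() standing in for ellyChar.isLetterOrDigit() and the set of embedded combining chars assumed, for illustration, to be '.' and '-':

def delimit ( txt , combining=( '.' , '-' ) ):

    """
    compute trailing-char count and matchable range for left context

    arguments:
        txt       - list of text chars
        combining - chars treated as embedded combining

    returns:
        tuple ( ntr , nrg )
    """

    ltx = len(txt)
    ntr = 1
    while ntr <= ltx and txt[-ntr].isalnum():
        ntr += 1
    nrg = ntr
    ntr -= 1                 # available trailing chars for wildcard * match
    while nrg <= ltx:
        c = txt[-nrg]
        if not (c.isalnum() or c in combining):
            break
        nrg += 1
    nrg -= 1                 # end of range for all pattern matching
    return ntr, nrg

print(delimit(list('see U.S.A')))   # (1, 5): 'A' trails, 'U.S.A' is matchable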
def _lookUpNext ( self ):

    """
    look up possible next segments in input buffer by various means,
    keeping tokens only for the LONGEST segment

    arguments:
        self

    returns:
        True on successful lookup, False otherwise

    exceptions:
        ParseOverflow
    """

    self.sbu.skipSpaces()          # skip leading spaces
    s = self.sbu.buffer
    # print '_lookUp@0 buffer=' , s

    if len(s) == 0:                # check for end of input
        return False               # if so, done

    # print 'in =' , unicode(self.sbu)
    if self.trs != None:           # preanalysis of number expressions
        self.trs.rewriteNumber(s)

    # print '_lookUp@1 buffer=' , self.sbu.buffer
    # print 'macro expansion s[0]=' , s[0]
    self.sbu.expand()              # apply macro substitutions
    # print 'macro expanded s[0]=' , s[0]
    # print '_lookUp@2 buffer=' , self.sbu.buffer

    s = self.sbu.buffer
    # print 'expanded len=' , len(s)
    if len(s) == 0:
        return True                # macros can empty out buffer

    k = self.sbu.findBreak()       # find extent of first component for lookup
    if k == 0:
        k = 1                      # must have at least one char in token

    # print 'break at k=' , k
    kl = len(s)
    if k + 1 < kl and s[k] == '+' and s[k+1] == ' ':
        k += 1                     # recognize possible prefix

    # print 'len(s)=' , kl , 'k=' , k , 's=' , s
    # print '_lookUp@3 buffer=' , self.sbu.buffer

    mr = self._scanText(k)         # text matching in various ways
    mx  = mr[0]                    # overall maximum match length
    chs = mr[1]                    # any vocabulary element matched
    suf = mr[2]                    # any suffix removed in matching
    # print '_lookUp@4 buffer=' , self.sbu.buffer
    s = self.sbu.buffer
    # print 'k=' , k
    # print 'scan result mx=' , mx , 'chs=' , chs , 'suf=' , suf
    # print 'len(s)=' , len(s) , 's=' , s

    if ( k < mx or
         k == mx and suf != '' ):  # next word cannot produce token as long as already seen?

        # print 'queue:' , len(self.ptr.queue)
        # print 'chs=' , chs
        if len(chs) > 0:           # any vocabulary matches?
            # print 'put back' , suf , mx , s
            self.sbu.skip(mx)      # if so, they supersede
            if suf != '':          # handle any suffix removal
                self.sbu.prepend(list(suf))
                # print 'suf=' , suf
        else:
            chs = self.sbu.extract(mx)

        # print 'extracted chs=' , chs
        # print 'token chs=' , chs
        to = ellyToken.EllyToken(chs)
        # print 'long token=' , unicode(to)
        self.ctx.addTokenToListing(to)
        if suf != '':
            if not ellyChar.isApostrophe(suf[1]):
                to.dvdd = True     # must note suffix removal for token!
        # print 'only queue:' , len(self.ptr.queue)
        return True

    # print 'mx=' , mx
    # print 'plus queue:' , len(self.ptr.queue)
    wsk = self.sbu.buffer[:k]
    cap = ellyChar.isUpperCaseLetter(wsk[0])
    # print 'wsk=' , wsk
    rws = u''.join(wsk)
    found = self.ptr.createPhrasesFromDictionary(rws.lower(),False,cap)
    if not found:
        # print 'not found, k=' , k
        if k > mx and k > 1 and ellyChar.isEmbeddedCombining(rws[-1]):
            k -= 1
            rws = rws[:-1]
            found = self.ptr.createPhrasesFromDictionary(rws.lower(),False,cap)
    # print 'found in dictionary=' , found

    if found or mx > 0:            # match found in dictionary or by text scan
        if not found:
            k = mx                 # if by text scan, must make token longer
            rws = rws[:k]          # if mx > k
        self.sbu.skip(k)
        # print 'next=' , self.sbu.buffer[self.sbu.index:]
        # print 'queue after =' , len(self.ptr.queue)
        to = ellyToken.EllyToken(rws[:k])
        if len(suf) > 1:           # change token to show suffix properly
            # print 'suf=' , suf
            cs = suf[1]            # first char in suffix after '-'
            rt = to.root           # this is a list!
            lk = -1                # start at last char in token
            while rt[lk] != cs:
                lk -= 1
            sn = len(rt) + lk      # where to divide suffix from root
            # print 'sn=' , sn , rt
            to.root = rt[:sn]      # root without suffix
            self.sbu.prepend(suf)  # restore suffix to input for processing
        else:                      # no suffix
            chx = self.sbu.peek()  # look at next char after match
            if chx == '-':         # if hyphen, need to separate it
                self.sbu.skip()
                if ellyChar.isLetter(self.sbu.peek()):
                    self.sbu.prepend(' ')
                self.sbu.prepend('-')
        # print 'add' , unicode(to)
        self.ctx.addTokenToListing(to)  # add token to listing for sentence
        return True

    # print '[' + rws + ']' , 'still unrecognized'

    chx = rws[0]                   # special hyphen check
    if chx == '-' and k > 1:
        # print 'look in internal dictionary'
        if self.ptr.createPhrasesFromDictionary(chx,False,False):
            # print 'found!'
            to = ellyToken.EllyToken(chx)   # treat hyphen as token
            self.ctx.addTokenToListing(to)  # add it to token list
            self.sbu.skip()                 # remove from input
            return True

    to = self._extractToken(mx)    # single-word matching with analysis and lookup
    # print 'extracted to=' , unicode(to)
    if to == None:                 # if no match, we are done and will return
        # print 'mx=' , mx
        return False if mx == 0 else True   # still success if _scanText() found something

    if self.ptr.lastph != None:
        self.ptr.lastph.lens = to.getLength()

    # print 'to=' , unicode(to) , 'len(s)=' , len(s) , s
    # posn = self.ctx.countTokensInListing()
    # print 'at' , posn , 'in token list'
    self.ctx.addTokenToListing(to) # add token to listing for sentence
    # tol = self.ctx.getNthTokenInListing(-1)
    # print 'last token root=' , tol.root
    return True                    # successful lookup
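The suffix re-division step above, where a removed suffix like '-ing' is split back off the stored root, is compact enough to test on its own. A minimal sketch with illustrative names, assuming, as the caller does, that the suffix's first char actually occurs in the root:

def split_suffix ( root , suf ):

    """
    divide a token root at the last occurrence of the suffix's first char

    arguments:
        root  - token root as list of chars
        suf   - removed suffix of the form '-xx'

    returns:
        tuple ( stem chars , suffix chars )
    """

    cs = suf[1]                  # first char in suffix after '-'
    lk = -1                      # start at last char in token
    while root[lk] != cs:
        lk -= 1
    sn = len(root) + lk          # where to divide suffix from root
    return root[:sn], root[sn:]

stem, tail = split_suffix(list('walking'), '-ing')
print(''.join(stem), ''.join(tail))   # walk ing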
def _scanText ( self , k ):

    """
    try to match in buffer regardless of word boundaries,
    using Elly vocabulary, pattern, and template tables and
    also running Elly entity extractors

    arguments:
        self  -
        k     - length of first component in buffer

    returns:
        match parameters [ text span of match , vocabulary match , suffix removed ]

    exceptions:
        ParseOverflow
    """

    # print '_scanText k=' , k
    sb = self.sbu.buffer           # input buffer
    tr = self.ptr                  # parse tree for results

    # print '_scanText sb=' , sb
    # initialize match status
    nspan = 0                      # total span of match
    vmchs = [ ]                    # chars of vocabulary entry matched
    suffx = ''                     # any suffix removed in match

    d = self.rul                   # grammar rule definitions

    m = d.ptb.match(sb,tr)         # try token by pattern match next
    # print 'pattern m=' , m
    if nspan < m:
        nspan = m                  # on longer match, update maximum

    m = d.ctb.match(sb,tr)         # try multi-word template match next
    # print 'template m=' , m
    if nspan < m:
        nspan = m                  # on longer match, update maximum

    m = self.iex.run(sb)           # try entity extractors next
    # print 'extractor m=' , m
    if nspan < m:
        nspan = m                  # on longer match, update maximum

    # lm = len(sb)                 # scan limit
    # print 'lm=' , lm , 'm=' , m
    capd = ellyChar.isUpperCaseLetter(sb[0])
    # print 'next component=' , sb[:k] , ', context=' , sb[k:lm]

    if self.vtb != None:           # look in external dictionary, if it exists
        ls = list(sb[:k])
        # print 'ls 0=' , ls
        ellyChar.toLowerCaseASCII(ls)
        ss = u''.join(ls)          # where to start for vocabulary indexing
        # print 'ls 1=' , ls
        n = vocabularyTable.delimitKey(ss)  # get actual indexing
        # print 'delimiting n=' , n , '=' , '<' + ss[:n] + '>'
        # print vocabularyTable.listDBKeys(self.vtb.cdb)

        rl = self.vtb.lookUp(sb,n) # get list of the maximum text matches
        # print len(rl) , 'matches'
        if len(rl) > 0:
            r0 = rl[0]             # look at first record
            # print 'r0=' , r0
            vmln = r0.nspan        # should be same for all matches
            vchs = r0.vem.chs      #
            vsfx = r0.suffx        #
            # print 'nspan=' , vmln , vsfx

            if ( vmln > nspan or
                 vmln == nspan and vsfx == '' ):

                nspan = vmln       # keep vocabulary matches
                vmchs = vchs       #
                suffx = vsfx       #

                for r in rl:
                    ve = r.vem     # get vocabulary entry
                    # print 've=' , ve
                    # if ve.gen != None: print 've.gen=' , ve.gen
                    if tr.addLiteralPhraseWithSemantics(
                            ve.cat,ve.syf,ve.smf,ve.bia,ve.gen,len(suffx) > 0):
                        tr.lastph.lens = nspan  # char length of leaf phrase node
                                                # needed for later selection
                        tr.lastph.krnl.cncp = ve.con
                        if capd:
                            tr.lastph.krnl.semf.set(0)
                        # print 'vocab phr=' , tr.lastph , 'len=' , tr.lastph.lens
                        if suffx != '':
                            if ellyChar.isApostrophe(suffx[1]):
                                tr.lastph.krnl.usen = 0

            # print 'vocabulary m=' , vmln
            # print 'queue after table lookup:' , len(self.ptr.queue)

        # print 'sb=' , sb

    # print 'maximum match=' , nspan
    # print 'input=' , self.sbu.buffer[:nspan]

    if nspan > 0:                  # any matches at all?
        tr.requeue()               # if so, keep only longest of them
        # print 'queue after scan:' , len(self.ptr.queue)

    # print 'returns [' , nspan , ',' , vmchs , ',' , suffx , ']'
    return [ nspan , vmchs , suffx ]
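Each matching pass in _scanText() applies the same keep-the-longest policy, so the control flow reduces to a small fold over matcher callables. A sketch with dummy matchers standing in for d.ptb.match, d.ctb.match, and self.iex.run:

def longest_span ( buffer , matchers ):

    """
    run each matcher on the buffer and keep the longest match length

    arguments:
        buffer   - input chars
        matchers - callables returning a match length

    returns:
        maximum match length
    """

    nspan = 0
    for m in matchers:
        n = m(buffer)
        if nspan < n:          # on longer match, update maximum
            nspan = n
    return nspan

matchers = [ lambda b: 0 , lambda b: 3 , lambda b: 2 ]
print(longest_span(list('abcde'), matchers))   # 3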
def read(self):

    """
    get next char from input stream with filtering

    arguments:
        self

    returns:
        single Unicode char on success, null string otherwise
    """

    # print 'reading: buf=' , self.buf

    while True:

        if not self._reload():         # check if buffer empty and reload if needed
            return END                 # return EOF if no more chars available

        # print 'buf=' , self.buf

        c = self.buf.pop(0)            # next raw char in buffer

        if c == SHYP:                  # ignore soft hyphen
            if len(self.buf) > 0:
                if self.buf[0] == SP:
                    c = self.buf.pop(0)
            continue

        if not ellyChar.isText(c):     # unrecognizable Elly char?
            # print 'c=' , '{0:04x}'.format(ord(c))
            if ellyChar.isCJK(c):
                if ellyConfiguration.language != 'ZH':
                    c = '_'            # special handling for non-Chinese input
            elif not c in [ u'\uff0c' , u'\u3002' ]:
                # print 'replace' , c , 'with NBSP'
                c = NBSP               # by default, replace with no-break space

        lc = self._lc                  # copy saved last char
        # print 'lc=' , ord(lc)
        self._lc = c                   # set new last char

        # if c == "'":
        #     print 'apostrophe' , self.buf

        # print 'c=' , '<' + c + '>'
        if c == HYPH:                  # special treatment for isolated hyphens
            if spc(lc) and spc(self.peek()):
                c = DASH
            break
        elif c == '.':                 # check for ellipsis
            bb = self.buf
            bl = len(bb)
            # print 'bl=' , bl , 'bb=' , bb
            if bl >= 2 and bb[0] == '.' and bb[1] == '.':
                self.buf = bb[2:]
                c = ELLP
            elif bl >= 4 and bb[0] == ' ' and bb[1] == '.' and bb[2] == ' ' and bb[3] == '.':
                self.buf = bb[4:]
                c = ELLP
            break
        elif c == RSQm:                # check for single quote
            # print 'at single quote'
            nc = self.peek()           # look at next char
            # print 'next=' , nc
            if nc == RSQm:             # doubling of single quote?
                self.buf.pop(0)        # if so, combine two single quotes
                c = RDQm               # into one double quote
            break
        elif not ellyChar.isWhiteSpace(c):
            if ellyChar.isWhiteSpace(lc):
                self._cap = ellyChar.isUpperCaseLetter(c)
            break
        elif c == CR:                  # always ignore
            continue
        elif c == NL:                  # special handling of \n
            # print 'got NL'
            nc = self.peek()           # look at next char
            while nc == CR:
                self.buf.pop(0)        # skip over CR's
                nc = self.peek()
            # print "lc= '" + lc + "'"
            if lc != NL and nc == NL:
                self.buf.pop(0)        # special case when NL can be returned
                break
            if nc == NL:               # NL followed by NL?
                while nc == NL or nc == CR:
                    self.buf.pop(0)    # ignore subsequent newline chars
                    nc = self.peek()
            elif nc == END or ellyChar.isWhiteSpace(nc):
                continue               # NL followed by space is ignored
            elif nc == u'.' or nc == u'-':
                pass
            else:
                # print 'NL to SP, lc=' , ord(lc)
                c = SP                 # convert NL to SP if not before another NL
        else:
            # print 'lc=' , ord(lc) , 'c=' , ord(c)
            c = SP                     # otherwise, convert white space to plain space
        self._cap = False
        if not ellyChar.isWhiteSpace(lc):  # preceding char was not white space?
            # print 'return SP'
            break                      # if so, keep space in stream

    return c                           # next filtered char
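This variant of read() differs from the earlier one only in its CJK policy: ideographs pass through when the configured language is 'ZH' and otherwise become '_', while the fullwidth comma and ideographic full stop survive in either case. A sketch of just that replacement rule, using a rough ideograph range test in place of ellyChar.isCJK(), which is an assumption here:

NBSP = '\u00a0'    # no-break space

def replace_nontext ( c , language ):

    """
    decide replacement for a non-text char under a language setting

    arguments:
        c        - single char
        language - configuration language code, e.g. 'ZH' or 'EN'

    returns:
        replacement char
    """

    if '\u4e00' <= c <= '\u9fff':          # rough CJK ideograph test
        return c if language == 'ZH' else '_'
    if c in ( '\uff0c' , '\u3002' ):       # fullwidth comma, ideographic full stop
        return c
    return NBSP

print(replace_nontext('\u4e2d', 'ZH'))     # kept for Chinese input
print(replace_nontext('\u4e2d', 'EN'))     # '_'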
def match ( self , txt , pnc , ctx ):

    """
    compare a punctuation mark and its context with a pattern

    arguments:
        self  -
        txt   - list of text chars leading up to punctuation char
        pnc   - punctuation char
        ctx   - next chars after punctuation

    returns:
        True on match, False otherwise
    """

    # print 'matching for txt=' , txt , 'pnc= [' , pnc , '] ctx=' , ctx

    if matchtoo(txt,pnc,ctx):          # exception by complex match?
        return True
    # print 'matchtoo() returned False'

    sep = ctx[0] if len(ctx) > 0 else ''
    if sep == ellyChar.THS:
        return True

    nxt = ctx[1] if len(ctx) > 1 else ''

    # print 'lstg=' , self.lstg.keys()
    if pnc not in self.lstg:           # get stored patterns for punctuation
        return False

    lp = self.lstg[pnc]

    # print len(lp) , 'patterns'

    ltx = len(txt)                     # current length of accumulated text so far
    ntr = 1
    while ntr <= ltx:
        if not ellyChar.isLetterOrDigit(txt[-ntr]):
            break
        ntr += 1
    nrg = ntr
    ntr -= 1                           # available trailing chars for wildcard * match

    while nrg <= ltx:
        c = txt[-nrg]
        if not ellyChar.isLetterOrDigit(c) and not ellyChar.isEmbeddedCombining(c):
            # print 'break at nrg=' , nrg , txt[-nrg]
            break
        nrg += 1
    nrg -= 1                           # end of range for all pattern matching

    # print 'ntr=' , ntr , 'nrg=' , nrg

    txt = txt[-nrg:]                   # reset text to limit for matching
    ltx = len(txt)                     # its new length

    # print 'txt= ' + unicode(txt) + ' pnc= [' + pnc + '] nxt=[' + nxt + ']'

    for p in lp:                       # try matching each listed exception pattern

        if p.left != None and len(p.left) > 0:

            pat = p.left
            star = pat[-1] == ellyWildcard.cALL
            n = len(pat)               # each pattern element matches one sequence char
            if star:                   # except for a final wildcard *
                # print 'pattern ending with *'
                n -= 1
                # print 'ltx=' , ltx , 'n=' , n
                if ltx < n:
                    continue           # cannot match pattern properly
                pat = pat[:-1]
                t = txt[:n]
            else:
                if ltx < n:
                    continue           # cannot match pattern properly
                t = txt[-n:]

            if not ellyWildcard.match(pat,t,0):
                # print 'no possible pattern match'
                continue

            k = ltx - n                # extra chars beyond any match
            # print 'k=' , k , 't=' , t
            # print 'txt=' , txt
            # print 'pat=' , '[' + ellyWildcard.deconvert(pat) + ']'
            # print 'matches' , n , 'chars'
            if not star and k > 0:
                # print 'check text before [' , txt[-n] , ']'
                if ellyChar.isLetterOrDigit(txt[-n]):
                    c = txt[-n-1]
                    # print 'preceding= [', c , ']'
                    if ellyChar.isLetterOrDigit(c) or c == '&':
                        continue       # because break in text is required

        # print 'pat=' , ellyWildcard.deconvert(p.left)
        # print 'n=' , n , 'ltx=' , ltx
        # print 'txt=' , txt

        # nc = '\\n' if nxt == '\n' else nxt
        # print 'right pat=' , '[' + ellyWildcard.deconvert(p.right) + ']'
        # print 'versus c=' , nc

        rp = p.right
        if rp == [] or rp[0] == ellyWildcard.cALL:
            return True

        pcx = rp[0]
        if pcx == nxt:                     # check for specific char after possible stop
            # print 'right=' , nxt
            return True
        elif pcx == ellyWildcard.cALF:     # check for alphabetic
            if ellyChar.isLetter(nxt):
                # print 'right is alphabetic=' , nxt
                return True
        elif pcx == ellyWildcard.cDIG:     # check for numeric
            if ellyChar.isDigit(nxt):
                # print 'right is numeric=' , nxt
                return True
        elif pcx == ellyWildcard.cUPR:     # check for upper case
            if ellyChar.isUpperCaseLetter(nxt):
                return True
        elif pcx == ellyWildcard.cLWR:     # check for lower case
            if ellyChar.isLowerCaseLetter(nxt):
                return True
        elif pcx == ellyWildcard.cCAN:     # check for non-alphanumeric
            if not ellyChar.isLetterOrDigit(nxt):
                return True

    # print "no matches"
    return False