def getNext(self):
    """
    get single Chinese character, or a whole run of digits, as next token

    arguments:
        self

    returns:
        a token or None if buffer is empty

    exceptions:
        StemmingError
    """

    avail = len(self.buffer)
    if avail == 0:
        return None

    # a token is one char long unless the buffer starts with a digit,
    # in which case all the immediately following digits go with it
    count = 1
    if ellyChar.isDigit(self.buffer[0]):
        for k in range(1, avail):
            if not ellyChar.isDigit(self.buffer[k]):
                break
            count += 1

    return ellyToken.EllyToken(self.extract(count))
def get ( self , ts , n=N ):
    """
    get normalized substring in lower case for subsequent comparisons

    arguments:
        self  -
        ts    - list of chars to get substring from
        n     - limit on count of chars to get

    returns:
        count of chars scanned for substring; 0 if nothing acceptable
        was found (self.string is set only when the scan is accepted)
    """

    total = len(ts)
    if total == 0:
        return 0                       # no chars to scan
    top = n if n <= total else total   # never scan past the char limit

    picked = [ ]                       # lowercased chars accepted so far
    prev = ''                          # last accepted char
    k = 0
    while k < top:
        ch = ts[k]
        if ch == PERIOD:
            # a single period is kept, but two in a row end the scan
            if prev == PERIOD:
                break
        elif ch == COMMA:
            # a comma is kept only when it sits between two digits
            between = ( ellyChar.isDigit(prev) and
                        k + 1 != top and
                        ellyChar.isDigit(ts[k + 1]) )
            if not between:
                break
        elif not ellyChar.isLetterOrDigit(ch) and not ch in ALSO:
            break                      # anything else must be alphanumeric or in ALSO
        picked.append(ch.lower())
        prev = ch
        k += 1

    # the scan must not stop in the middle of an alphanumeric run
    if k < top and ellyChar.isLetterOrDigit(ts[k]):
        return 0

    self.string = u''.join(picked)
    return k                           # scan count
def matchtoo(txt, pnc, ctx):
    """
    complex checks - currently only for rightmost period of A.M. or P.M.

    The period is treated as part of a time expression (no sentence break)
    when the text ends with "<digit> A.M" / "<digit> P.M", or with a number
    word from "one" to "twelve" followed by " A.M" / " P.M" and the context
    does not continue with an uppercase letter.

    arguments:
        txt   - list of text chars leading up to punctuation char
        pnc   - punctuation char
        ctx   - list of chars in context after punctuation

    returns:
        True on match, False otherwise
    """

    ln = len(txt)
    nxt = ctx[0] if len(ctx) > 0 else ''
    if pnc != '.' or not ellyChar.isWhiteSpace(nxt) or ln < 5:
        return False

    # text must end in " A.M" or " P.M" in either case
    if (txt[-1] not in ('M', 'm') or txt[-2] != '.' or
            txt[-3] not in ('P', 'p', 'A', 'a') or txt[-4] != ' '):
        return False

    ch = txt[-5]
    if ellyChar.isDigit(ch):       # only 1 digit will be checked here!
        return True                # erring on the side of not to break sentence
    if not ellyChar.isLetter(ch):
        return False

    #
    # the following code is needed only when number transforms are turned off
    #

    # scan backward over the letters of the word just before " A.M";
    # on exit, nn is the negative index of the char preceding the word,
    # or ln + 1 when the word starts the text
    nn = 6
    while nn <= ln and ellyChar.isLetter(txt[-nn]):
        nn += 1

    # the word occupies txt[-nn+1:-4]; number words have 3 to 6 letters
    # (the original compared nn itself with 3 and 6, which rejected every
    # word longer than a single letter)
    nw = nn - 5
    if nw < 3 or nw > 6:
        return False

    # any char before the word must separate it from preceding text
    # (the original tested this only when nn > ln, where txt[-nn] is
    # out of range and raises IndexError)
    if nn <= ln and txt[-nn] not in (' ', '-'):
        return False

    # take the word itself, not the text in front of it
    wd = ''.join(txt[-nn + 1:-4]).lower()
    if wd not in ('one', 'two', 'three', 'four', 'five', 'six', 'seven',
                  'eight', 'nine', 'ten', 'eleven', 'twelve'):
        return False

    # an uppercase letter after the space suggests a new sentence
    if len(ctx) < 2 or ellyChar.isUpperCaseLetter(ctx[1]):
        return False
    return True
def get(self, ts, n=N):
    """
    get normalized substring in lower case for subsequent comparisons

    arguments:
        self  -
        ts    - list of chars to get substring from
        n     - limit on count of chars to get

    returns:
        count of chars scanned for substring; 0 if nothing acceptable
        was found (self.string is set only when the scan is accepted)
    """

    total = len(ts)
    if total == 0:
        return 0                       # no chars to scan
    top = n if n <= total else total   # never scan past the char limit

    picked = []                        # lowercased chars accepted so far
    prev = ''                          # last accepted char
    k = 0
    while k < top:
        ch = ts[k]
        if ch == COMMA:
            # a comma is kept only as a thousands separator: a digit
            # before it and three digits after it, all within the limit
            grouped = (ellyChar.isDigit(prev) and
                       k + 3 < top and
                       ellyChar.isDigit(ts[k + 1]) and
                       ellyChar.isDigit(ts[k + 2]) and
                       ellyChar.isDigit(ts[k + 3]))
            if not grouped:
                break
        elif not ellyChar.isLetterOrDigit(ch) and not ch in ALSO:
            break                      # anything else must be alphanumeric or in ALSO
        picked.append(ch.lower())
        prev = ch
        k += 1

    # the scan must not stop in the middle of an alphanumeric run
    if k < top and ellyChar.isLetterOrDigit(ts[k]):
        return 0

    self.string = ''.join(picked)
    return k                           # scan count
def matchtoo ( txt , pnc , ctx ):
    """
    complex checks - currently only for rightmost period of A.M. or P.M.

    The period is treated as part of a time expression (no sentence break)
    when the text ends with "<digit> A.M" / "<digit> P.M", or with a number
    word from "one" to "twelve" followed by " A.M" / " P.M" and the context
    does not continue with an uppercase letter.

    arguments:
        txt   - list of text chars leading up to punctuation char
        pnc   - punctuation char
        ctx   - list of chars in context after punctuation

    returns:
        True on match, False otherwise
    """

    ln = len(txt)
    nxt = ctx[0] if len(ctx) > 0 else ''
    if pnc != '.' or not ellyChar.isWhiteSpace(nxt) or ln < 5:
        return False

    # text must end in " A.M" or " P.M" in either case
    if ( txt[-1] not in ( 'M' , 'm' ) or txt[-2] != '.' or
         txt[-3] not in ( 'P' , 'p' , 'A' , 'a' ) or txt[-4] != ' ' ):
        return False

    ch = txt[-5]
    if ellyChar.isDigit(ch):       # only 1 digit will be checked here!
        return True                # erring on the side of not to break sentence
    if not ellyChar.isLetter(ch):
        return False

    #
    # the following code is needed only when number transforms are turned off
    #

    # scan backward over the letters of the word just before " A.M";
    # on exit, nn is the negative index of the char preceding the word,
    # or ln + 1 when the word starts the text
    nn = 6
    while nn <= ln and ellyChar.isLetter(txt[-nn]):
        nn += 1

    # the word occupies txt[-nn+1:-4]; number words have 3 to 6 letters
    # (the original compared nn itself with 3 and 6, which rejected every
    # word longer than a single letter)
    nw = nn - 5
    if nw < 3 or nw > 6:
        return False

    # any char before the word must separate it from preceding text
    # (the original tested this only when nn > ln, where txt[-nn] is
    # out of range and raises IndexError)
    if nn <= ln and txt[-nn] not in ( ' ' , '-' ):
        return False

    # take the word itself, not the text in front of it
    wd = ''.join(txt[-nn + 1:-4]).lower()
    if wd not in ( 'one' , 'two' , 'three' , 'four' , 'five' , 'six' , 'seven' ,
                   'eight' , 'nine' , 'ten' , 'eleven' , 'twelve' ):
        return False

    # an uppercase letter after the space suggests a new sentence
    if len(ctx) < 2 or ellyChar.isUpperCaseLetter(ctx[1]):
        return False
    return True
def stateZip(buffr):
    """
    recognize U.S. state abbreviation and zip code

    Expected input is 2 letters, a space, and then a 5-digit zip code
    that may be extended by a hyphen and 4 more digits.

    arguments:
        buffr - current contents as list of chars

    returns:
        char count matched on success (8 for a 5-digit zip, 13 for a
        9-digit zip), 0 otherwise; always an int
    """

    if len(buffr) < 8 or buffr[2] != ' ':
        return 0
    st = ''.join(buffr[:2]).upper()    # expected 2-char state abbreviation
    if st not in ziprs:
        return 0                       # if not known, quit
    zc = ziprs[st]                     # get zip-code start
    b = buffr[3:]                      # expected start of zipcode

    i = 0
    for c in zc:                       # check starting digits of zipcode
        if c != b[i]:
            return 0
        i += 1
    while i < 5:                       # check for digits in rest of zipcode
        if not ellyChar.isDigit(b[i]):
            return 0
        i += 1

    b = b[5:]                          # look for proper termination
    if len(b) == 0:                    # if end of input, success
        return 8                       # success: 5-digit zip
    if ellyChar.isLetterOrDigit(b[0]): # if next char is alphanumeric, failure
        return 0
    if b[0] != '-':
        return 8                       # success: 5-digit zip

    # a hyphen must introduce exactly 4 more digits; the original required
    # a further char after them (len(b) > 5) and otherwise fell off the end
    # of the function, returning None instead of a count
    ext = b[1:5]
    if len(ext) < 4:
        return 0
    for c in ext:
        if not ellyChar.isDigit(c):
            return 0                   # check for 4 more digits
    if len(b) > 5 and ellyChar.isLetterOrDigit(b[5]):
        return 0                       # termination check
    return 8 + 5                       # success: 9-digit zip
def _aYear ( self , ts ):
    """
    parse a year

    A year is 2 digits, or 4 digits starting with 1 or 2, optionally
    followed by a space and an epoch listed in Ep. A 2-digit year is
    accepted only when such an epoch follows it.

    arguments:
        self  -
        ts    - text stream as list of chars

    returns:
        total number of chars matched, 0 if no acceptable year is found

    side effects:
        self._yr is overwritten in place with the 4 chars of the year as
        soon as 2 or 4 leading digits are accepted, even if 0 is returned
        afterwards; self._ep is set when an epoch is recognized
    """

    lts = len(ts)
    if lts < 2:
        return 0                   # year must be at least 2 digits
    k = 0
    while k < lts and ellyChar.isDigit(ts[k]):
        k += 1                     # scan for digits in input list
    if k != 2 and k != 4:          # simple check for year range (change this as needed)
        return 0
    if k == 4 and ts[0] != '1' and ts[0] != '2':
        return 0

    # first 2 chars of year: explicit for 4 digits, otherwise a century
    # chosen by comparing the 2-digit year with self.ycur
    if k == 2:
        ce = self.cent[0] if ts[k-2:k] > self.ycur else self.cent[1]
    else:
        ce = ts[k-4:k-2]

    # store exactly century + last 2 digits; the original sliced to the
    # end of ts, which also copied whatever text followed the year
    self._yr[:] = list(ce) + list(ts[k-2:k])

    t = ts[k:]                     # look for what follows year
    ns = 0
    if len(t) > 0 and t[0] == ' ':
        t = t[1:]
        ns = 1
    lss = self.get(t)

    # get() leaves self.string untouched when it returns 0 for empty or
    # rejected input, so only trust self.string after a successful scan
    if lss > 0 and self.string in Ep:
        self._ep = list(self.string)
        k += ns + lss
    elif k < 4:
        return 0
    return k if k > 3 else 0
def stateZip ( buffr ):
    """
    recognize U.S. state abbreviation and zip code

    Expected input is 2 letters, a space, and then a 5-digit zip code
    that may be extended by a hyphen and 4 more digits.

    arguments:
        buffr - current contents as list of chars

    returns:
        char count matched on success (8 for a 5-digit zip, 13 for a
        9-digit zip), 0 otherwise; always an int
    """

    if len(buffr) < 8 or buffr[2] != ' ':
        return 0
    st = ''.join(buffr[:2]).upper()    # expected 2-char state abbreviation
    if st not in ziprs:
        return 0                       # if not known, quit
    zc = ziprs[st]                     # get zip-code start
    b = buffr[3:]                      # expected start of zipcode

    i = 0
    for c in zc:                       # check starting digits of zipcode
        if c != b[i]:
            return 0
        i += 1
    while i < 5:                       # check for digits in rest of zipcode
        if not ellyChar.isDigit(b[i]):
            return 0
        i += 1

    b = b[5:]                          # look for proper termination
    if len(b) == 0:                    # if end of input, success
        return 8                       # success: 5-digit zip
    if ellyChar.isLetterOrDigit(b[0]): # if next char is alphanumeric, failure
        return 0
    if b[0] != '-':
        return 8                       # success: 5-digit zip

    # a hyphen must introduce exactly 4 more digits; the original required
    # a further char after them (len(b) > 5) and otherwise fell off the end
    # of the function, returning None instead of a count
    ext = b[1:5]
    if len(ext) < 4:
        return 0
    for c in ext:
        if not ellyChar.isDigit(c):
            return 0                   # check for 4 more digits
    if len(b) > 5 and ellyChar.isLetterOrDigit(b[5]):
        return 0                       # termination check
    return 8 + 5                       # success: 9-digit zip
def _aYear(self, ts):
    """
    parse a year

    A year is 2 digits, or 4 digits starting with 1 or 2, optionally
    followed by a space and an epoch listed in Ep. A 2-digit year is
    accepted only when such an epoch follows it.

    arguments:
        self  -
        ts    - text stream as list of chars

    returns:
        total number of chars matched, 0 if no acceptable year is found

    side effects:
        self._yr is overwritten in place with the 4 chars of the year as
        soon as 2 or 4 leading digits are accepted, even if 0 is returned
        afterwards; self._ep is set when an epoch is recognized
    """

    lts = len(ts)
    if lts < 2:
        return 0                   # year must be at least 2 digits
    k = 0
    while k < lts and ellyChar.isDigit(ts[k]):
        k += 1                     # scan for digits in input list
    if k != 2 and k != 4:          # simple check for year range (change this as needed)
        return 0
    if k == 4 and ts[0] != '1' and ts[0] != '2':
        return 0

    # first 2 chars of year: explicit for 4 digits, otherwise a century
    # chosen by comparing the 2-digit year with self.ycur
    if k == 2:
        ce = self.cent[0] if ts[k - 2:k] > self.ycur else self.cent[1]
    else:
        ce = ts[k - 4:k - 2]

    # store exactly century + last 2 digits; the original sliced to the
    # end of ts, which also copied whatever text followed the year
    self._yr[:] = list(ce) + list(ts[k - 2:k])

    t = ts[k:]                     # look for what follows year
    ns = 0
    if len(t) > 0 and t[0] == ' ':
        t = t[1:]
        ns = 1
    lss = self.get(t)

    # get() leaves self.string untouched when it returns 0 for empty or
    # rejected input, so only trust self.string after a successful scan
    if lss > 0 and self.string in Ep:
        self._ep = list(self.string)
        k += ns + lss
    elif k < 4:
        return 0
    return k if k > 3 else 0
def __init__ ( self , tree , nsave , resto , recur=False , post='' ):
    """
    initialization

    arguments:
        self  -
        tree  - logic tree containing node with action
        nsave - how many matched chars to keep in root
        resto - how to restore root after keeping
        recur - recursive matching to look for more affixes?
        post  - how to define a removed affix
    """

    self.tree  = tree
    self.nsave = nsave
    self.resto = resto
    self.recur = recur

    # an action string may start with one digit giving a drop count;
    # whatever follows that digit is kept as the affix modification
    self.ndrop = 0                     # default
    if post != '' and ellyChar.isDigit(post[0]):
        self.ndrop = int(post[0])      # expect only single digit here, if any
        post = post[1:]                # rest of action string
    self.amod = post
def __init__(self, tree, nsave, resto, recur=False, post=''):
    """
    initialization

    arguments:
        self  -
        tree  - logic tree containing node with action
        nsave - how many matched chars to keep in root
        resto - how to restore root after keeping
        recur - recursive matching to look for more affixes?
        post  - how to define a removed affix
    """

    self.tree = tree
    self.nsave = nsave
    self.resto = resto
    self.recur = recur

    # an action string may start with one digit giving a drop count;
    # whatever follows that digit is kept as the affix modification
    self.ndrop = 0                     # default
    if post != '' and ellyChar.isDigit(post[0]):
        self.ndrop = int(post[0])      # expect only single digit here, if any
        post = post[1:]                # rest of action string
    self.amod = post
def _rightside(stb, txt, sta):
    """
    process actions for a clause

    arguments:
        stb   - symbol table
        txt   - string input for single clause
        sta   - for status reporting

    returns:
        action list on success, None otherwise
    """

    actions = []
    delta = 0          # plausibility change
    concept = ''       # default is no concept specified

    # an explicit concept follows the last space, but only when that
    # space is not inside a semantic feature specification
    close = txt.rfind(']')
    space = txt.rfind(' ')
    if space > close:
        concept = txt[space:].strip().upper()
        txt = txt[:space]              # remove concept from right side of clause

    # inherit from phrase component?
    if len(txt) > 1 and txt[0] == '*':
        which = txt[1]
        if which == 'l':
            actions.append([semanticCommand.Clhr])
            sta.lh = True
        elif which == 'r':
            actions.append([semanticCommand.Crhr])
            sta.rh = True
        else:
            return _err('bad inheritance')
        txt = txt[2:].strip()

    # set or unset semantic features for phrase?
    if len(txt) > 3 and txt[0] == '[':
        end = txt.find(']')
        if end < 3:
            return _err('incomplete semantic features to set or unset')
        spec = txt[:end + 1]
        try:
            f = featureSpecification.FeatureSpecification(stb, spec,
                                                          semantic=True)
            sta.res = f
        except ellyException.FormatFailure:
            return _err('bad semantic features to set or unset')
        if sta.id[sS] == None:
            sta.id[sS] = f.id
        elif f.id != sta.id[sS]:
            _err('inconsistency: final features=' + spec)
            return None
        actions.append([semanticCommand.Csetf, f.positive])
        if not f.negative.zeroed():
            f.negative.complement()
            actions.append([semanticCommand.Crstf, f.negative])
        txt = txt[end + 1:]

    # whatever is left has to be a plausibility change
    if len(txt) > 0:
        sign = txt[0]
        if sign not in ('+', '-'):
            if ellyChar.isDigit(sign):
                return _err('plausibility must begin with + or -')
            return _err('bad cognitive semantic action: ' + txt)
        if len(txt) == 1:
            delta = 1
        elif ellyChar.isDigit(txt[1]):
            try:
                delta = int(txt[1:])   # explicit numerical change
            except ValueError:
                return _err('bad cognitive plausibility: ' + txt)
        elif txt[1] == sign:
            # alternate notation: a run of the same sign, one unit per char
            if any(xc != sign for xc in txt[2:]):
                return _err('must be all + or all -')
            delta = len(txt)
        else:
            return _err('cannot interpret clause: ' + txt)
        if sign == '-':
            delta = -delta             # get right sign

    if len(concept) > 0:
        actions.append([semanticCommand.Csetc, concept])
    if delta != 0:
        actions.append([semanticCommand.Cadd, delta])
    return actions
def getNext(self):
    """
    extract next sentence for Elly translation from input stream

    Chars are read one at a time from self.inp and collected until a
    sentence break is decided: end of input, a newline, a char in Hards,
    or stop punctuation that survives the exception, bracketing, ellipsis,
    decimal point and minimum-length checks below.

    arguments:
        self

    returns:
        list of chars for next sentence on success, None on empty stream
    """

    self.resetBracketing()
    inBrkt = False

    nspc = 0           # set space count

    sent = []          # list buffer to fill

    x = self.inp.read()
    if x == SP:
        x = self.inp.read()

    if x == END:       # EOF check
        return None

    c = END            # reset
    lc = END

    self.inp.unread(x, SP)       # put first char back to restore input

    # fill sentence buffer up to next stop punctuation in input

    nAN = 0            # alphanumeric count in sentence

    while True:

        x = self.inp.read()      # next input char

        if x == END:             # handle any EOF
            break

        # check for table delimiters in text

        if len(sent) == 0:
            if x == '.' or x == '-':          # look for multiple '.' or '-'
                while True:                   # scan up to end of current buffering
                    y = self.inp.read()       #
                    if y != x and y != SP:    # no more delimiter chars or spaces?
                        self.inp.unread(y)    # if so, done
                        break                 #
                continue                      # ignore everything seen so far

        ####################################################
        # accumulate chars and count alphanumeric and spaces
        ####################################################

        lc = c                   # previous char
        c = x                    # current char as read, before quote normalization
        nc = self.inp.peek()     # next char, not yet consumed
        if ellyChar.isWhiteSpace(nc): nc = SP

        if lc == SP or lc == END:             # normalize chars for proper bracketing
            if x == SQuo:                     #
                x = LSQm                      # a SQuo preceded by a space becomes LSQm
            elif x == DQuo:                   #
                x = LDQm                      # a DQuo preceded by a space becomes LDQm
        if nc == SP or nc == END:             #
            if x == SQuo:                     # a SQuo followed by a space becomes RSQm
                x = RSQm                      #
            elif x == DQuo:                   # a DQuo followed by a space becomes RDQm
                x = RDQm                      #
        elif not ellyChar.isLetterOrDigit(nc):
            if x == SQuo:                     # a SQuo followed by nonalphanumeric becomes RSQm
                x = RSQm                      #
            elif x == DQuo:                   # a DQuo followed by nonalphanumeric becomes RDQm
                x = RDQm                      #
        elif ellyChar.isWhiteSpace(c) and inBrkt:
            nspc += 1            # count spaces seen while inside bracketing

        svBrkt = inBrkt
        inBrkt = self.checkBracketing(x)      # do bracketing check with modified chars
        if svBrkt and not inBrkt: nspc = 0    # bracketing just closed

        sent.append(c)           # put original char into sentence buffer
        if ellyChar.isLetterOrDigit(c):
            nAN += 1
            continue             # if alphanumeric, just add to sentence

        if c == SP:
            continue             # if space, just add to sentence

        # NL will break a sentence

        if c == NL:
            sent.pop()           # remove from sentence chars
            break

        # certain Unicode punctuation will always break

        if c in Hards:
            break

        # char was not alphanumeric or space
        # look for stop punctuation exception

        cx = self.inp.preview()  # for context of match call

        if c in Stops and len(cx) > 0 and cx[0] == SP:
            if self.stpx.match(sent[:-1], c, cx):
                if self.drop:
                    sent.pop()   # remove punctuation char from sentence
                    lc = SP
                continue         # exception matched: punctuation does not end sentence

        # handle any nonstandard punctuation

        exoticPunctuation.normalize(c, self.inp)

        # check for dash

        if c == '-':
            d = self.inp.read()
            if d == '-':
                while True:      # skip over any further hyphens in the run
                    d = self.inp.read()
                    if d != '-': break
                sent.append(c)   # keep a second hyphen to represent the dash
            self.inp.unread(d)
            continue

        # check for sentence break on punctuation

        if c in QUOs or c in RBs:

            # special check for single or double quotes or
            # bracketing, which can immediately follow stop
            # punctuation for current sentence

            if not inBrkt:
                z = self.inp.read()
                if self.shortBracketing(sent, z):
                    break
                self.inp.unread(z)
                # NOTE(review): 'and' binds tighter than 'or' here, so z == END
                # alone is enough to reach the length check - confirm intent
                if z == END or ellyChar.isWhiteSpace(z) and lc in Stops:
                    if nAN > 1:
                        break
            elif c in QUOs and lc in Stops:
                # stop punctuation followed by a quote while still in bracketing
                z = self.inp.read()
                if z in RBs:
                    sent.append(z)
                    y = self.inp.read()
                    if y in Stops:
                        sent.append(y)
                    elif not ellyChar.isWhiteSpace(y):
                        self.inp.unread(y)
                    inBrkt = False
                    break
                elif z in QUOs:
                    sent.append(z)
                    inBrkt = False
                    break
                self.inp.unread(z)
            continue

        elif not c in Stops:
            continue

        else:
            d = self.inp.read()  # lookahead char after stop punctuation

            if d == None: d = '!'

            # ellipsis check

            if c == '.' and c == d:
                if self.inp.peek() != c:      # look for third '.' in ellipsis
                    self.inp.unread(d)        # if none, keep only first '.'
                else:
                    self.inp.skip()           # found ellipsis
                    sent.append(d)            # complete it in sentence buffer
                    sent.append(d)            #
                    x = self.inp.peek()       # look at char after ellipsis
                    if ellyChar.isCombining(x):
                        sent.append(SP)       # if part of token, put in space as separator
                    continue

            if c == ELLP:
                if ellyChar.isUpperCaseLetter(d):
                    self.inp.unread(d)        # super special case of bad punctuation
                    self.inp.unread(' ')      # put in implied period and space
                    self.inp.unread('.')      #

            # special check for multiple stops

            if d in Stops:
                while True:
                    d = self.inp.read()
                    if not d in Stops: break
                self.inp.unread(d)
                if not ellyChar.isWhiteSpace(d):
                    d = SP                    # make rightside context for stop

            # special check for blank or null after stops

            elif d != END and not ellyChar.isWhiteSpace(d):
                if self.shortBracketing(sent, d): break
                if d in self._cl and self._cl[d] == 1:
                    dn = self.inp.peek()
                    if ellyChar.isWhiteSpace(dn):
                        sent.append(d)
                        break
                self.inp.unread(d)
                continue                      # if no match for lookahead, put back

            elif d != END:
                self.inp.unread(d)

            # check special case of number ending in decimal point

            if c == '.':
                ixb = len(sent) - 2           # index of char just before the period
                ixn = ixb + 1
                cxn = ''
                while ixn > 0:                # scan back over any digits
                    ixn -= 1
                    cxn = sent[ixn]
                    if not ellyChar.isDigit(cxn): break
                if ixn < ixb and cxn in [' ', '-', '+']:
                    prvw = self.inp.preview()
                    if len(prvw) > 1 and not ellyChar.isUpperCaseLetter(prvw[1]):
                        continue

        # final check: is sentence long enough?

        if inBrkt:
            if c in [':', ';'] or nspc < 3:
                sent.append(d)
                self.inp.skip()
                nspc -= 1
                continue

        cx = self.inp.peek()
        if cx == None: cx = '!!'

        if nAN > 1:              # break only with more than one alphanumeric char
            break

    if sent == ['\u2026']:       # special case of sentence
        return list("-.-")       # with lone ellipsis
    elif len(sent) > 0 or self.last != END:
        return sent
    else:
        return None
def match ( patn , text , offs=0 , limt=None , nsps=0 ):
    """
    compare a pattern with wildcards to input text

    Pattern chars are matched literally as far as possible; the first
    pattern char that fails is then interpreted as a wildcard. On failure,
    matching backs up to the most recent unwinding record and retries.

    arguments:
        patn  - pattern to matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit for any matching
        nsps  - number of spaces to match in pattern

    returns:
        bindings if match is successful, None otherwise; element 0 of the
        bindings is the text offset reached by the match and the remaining
        elements are the consolidated wildcard bindings
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
            nsps   - saved count of spaces matched
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - of winding
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1
            self.nsps  = 0

        def __str__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self

            returns:
                attributes as array
            """
            return ( '[kd=' + str(self.kind)  +
                     ',ct=' + str(self.count) +
                     ',pa=' + str(self.pats)  +
                     ',tx=' + str(self.txts)  +
                     ',bd=' + str(self.bnds)  +
                     ',ns=' + str(self.nsps)  + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # four private functions using local variables of match() defined just above
    # (they only read mbi, unj, mp and offs; the callers advance those indices)
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame

        arguments:
            ns  - optional initial span of text for binding

        returns:
            binding frame
        """
        os = offs - 1
        if ns == None: ns = 1       # by default, binding is to 1 char
        if mbi == len(mbd):         # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]               # next available record
        bf[0] = os                  # set binding to range of chars
        bf[1] = os + ns             #
        return bf

    def _modify ( ):
        """
        set special tag for binding

        arguments:
        """
        mbd[mbi].append(None)       # a third element marks the binding as special

    def _mark ( kind , nsp ):
        """
        set up for backing up pattern match

        arguments:
            kind  - 0=optional 1=* wildcard
            nsp   - number of spaces in pattern still unmatched

        returns:
            unwinding frame
        """
        if unj == len(unw):         # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]               # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp
        uf.txts  = offs
        uf.bnds  = mbi
        uf.nsps  = nsp
        return uf

    def _span ( typw , nsp=0 ):
        """
        count chars available for wildcard match

        arguments:
            typw  - wildcard
            nsp   - spaces to be matched in pattern

        returns:
            non-negative count if any match possible, otherwise -1
        """
        k = minMatch(patn[mp:])     # calculate min char count to match rest of pattern

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findExtendedBreak(text,offs,nsp)
        mx -= k                     # max span reduced by exclusion
        if mx < 0: return -1        # cannot match if max span < 0

        tfn = Matching[typw]        # matchup function for wildcard type

        nm = 0
        for i in range(mx):
            c = text[offs+i]        # next char in text from offset
            if not tfn(c): break    # stop when it fails to match
            nm += 1

        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False     # successful pattern match yet?

    if limt == None: limt = len(text)

    mp = 0              # pattern index
    ml = len(patn)      # pattern match limit
    last = ''           # last text char taken; '' once past the limit

    while True:

        ## literally match as many next chars as possible

        while mp < ml:
            if offs >= limt:
                last = ''
            elif limt == 0:
                break
            else:
                last = text[offs]
                offs += 1
            mc = patn[mp]
            if mc != last:
                if mc != last.lower():
                    # a hyphen in the pattern may also match ' - ' in the text
                    if mc == Hyphn and last == ' ' and limt - offs > 2:
                        if text[offs] != Hyphn or text[offs+1] != ' ':
                            break
                        offs += 2
                    else:
                        break
            mp += 1

        ## check whether mismatch is due to special pattern char

        if mp >= ml:        # past end of pattern?
            matched = True  # if so, match is made
            break

        tc = patn[mp]       # otherwise, get unmatched pattern element
        mp += 1             #

        if tc == cALL:      # a * wildcard?

            if last != '': offs -= 1

            nm = _span(cALL,nsps)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1      # get new binding record
            bf[0] = offs                  # bind from current offset
            offs += nm                    # move offset past end of span
            bf[1] = offs                  # bind to new offset
            uf = _mark(1,nsps); unj += 1  # get new unwinding record
            uf.count = nm                 # can back up this many times on mismatch
            continue

        elif tc == cEND:    # end specification
            if last == '':
                continue
            elif last == '-':
                offs -= 1
                continue
            elif last in [ '.' , ',' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]
                if ellyChar.isWhiteSpace(txc) or txc in Trmls or txc in Grk:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' , ellyChar.HYPH ]:
                offs -= 1
                continue
            elif not ellyChar.isText(last):
                offs -= 1
                continue

        elif tc == cANY:    # alphanumeric wildcard?
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cCAN:    # nonalphanumeric wildcard?
            if last != ellyChar.AMP:
                if last == '' or not ellyChar.isLetterOrDigit(last):
                    _bind(); mbi += 1
                    continue

        elif tc == cDIG:    # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF:    # letter wildcard?
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cUPR:    # uppercase letter wildcard?
            if last != '' and ellyChar.isUpperCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cLWR:    # lowercase letter wildcard?
            if last != '' and ellyChar.isLowerCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC:    # space wildcard?
            if last != '' and ellyChar.isWhiteSpace(last):
                nsps -= 1
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cAPO:    # apostrophe wildcard?
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS:    # start of optional sequence
            mf = _bind(0); mbi += 1      # dummy record to block
            mf[0] = -1                   #   later binding consolidation
            if last != '':
                offs -= 1                # try for rematch
            m = mp                       # find corresponding EOS
            while m < ml:                #
                if patn[m] == cEOS: break
                m += 1
            else:                        # no EOS?
                m -= 1                   # if so, pretend there is one anyway
            uf = _mark(0,nsps); unj += 1 # for unwinding on any later match failure
            uf.pats = m + 1              # i.e. one char past next EOS
            uf.txts = offs               # start of text before optional match
            continue

        elif tc == cEOS:    # end of optional sequence
            if last != '':
                offs -= 1                # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:   # spanning wildcards
            if last != '':               # still more to match?
                offs -= 1

                nm = _span(tc,nsps)      # maximum match possible

                if nm == 0:                             # compensate for findExtendedBreak peculiarity
                    if offs + 1 < limt and mp < ml:     # with closing ] or ) to be matched in pattern
                        if patn[mp] in Enc:             # from text input
                            nm += 1

                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs         # bind from current offset
                    offs += nm           # move offset past end of span
                    bf[1] = offs         # bind to new offset
                    uf = _mark(1,nsps); unj += 1
                    uf.count = nm - 1    # at least one char must be matched
                    last = text[offs] if offs < limt else ''
                    continue

        elif tc == '':
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True           # successful match
                break

        ## match failure: rewind to last match branch
        ##

        while unj > 0:               # try unwinding, if possible
            uf = unw[unj-1]          # get most recent unwinding record
            if uf.count <= 0:        # if available count is used up,
                unj -= 1             # go to next unwinding record
                continue
            uf.count -= 1            # decrement available count

            uf.txts -= uf.kind       # back up one char for scanning text input
            mp = uf.pats             # unwind pattern pointer
            offs = uf.txts           # unwind text input
            mbi = uf.bnds            # restore binding
            mbd[mbi-1][1] -= uf.kind # reduce span of binding if for wildcard
            nsps = uf.nsps           #
            break
        else:
            break                    # quit if unwinding is exhausted

    ##
    ## clean up on match mode or on no match possible
    ##

    if not matched: return None      # no bindings

    ## consolidate contiguous bindings for subsequent substitutions

    mbdo = mbd
    lb  = -1               # binding reference
    lbd = [ 0 , -1 ]       # sentinel value, not real binding
    mbd = [ lbd ]          # initialize with new offset after matching
    mbdo.pop(0)            # ignore empty binding
    while len(mbdo) > 0:   #
        bd = mbdo.pop(0)   # get next binding
        if len(bd) > 2:    # specially tagged binding is kept apart
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb = -1
        elif bd[0] < 0:    # check for optional match indicator here
            lb = -1        # if so, drop from new consolidated bindings
        elif lb == bd[0]:  # check for binding continuous with previous
            lb = bd[1]     #
            lbd[1] = lb    # if so, combine with previous binding
        else:              #
            mbd.append(bd) # otherwise, add new binding
            lbd = bd       #
            lb = bd[1]     #

    mbd[0] = offs          # replace start of bindings with text length matched

    return mbd             # consolidated bindings plus new offset
def _leftside(stb, txt, sta): """ process conditions for a clause and store arguments: stb - symbol table txt - string input for left side of single clause sta - for status reporting returns: predicate list on success, None otherwise """ # print ( "left side" ) pred = [] txt = txt.rstrip() while len(txt) > 0: txt = txt.lstrip() # print ( 'clause=' , txt ) if len(txt) <= 1: _err('malformed conditions for clause') return None side = txt[0] txt = txt[1:] if side in ['n', 'p', 'c']: sns = txt[0] txt = txt[1:] if sns != '<' and sns != '>': _err('invalid comparison in clause condition=' + sns) return None if side == 'n': op = semanticCommand.Cngt if sns == '>' else semanticCommand.Cnlt elif side == 'p': op = semanticCommand.Cpgt if sns == '>' else semanticCommand.Cplt else: op = semanticCommand.Ccgt if sns == '>' else semanticCommand.Cclt nd = 0 lt = len(txt) while nd < lt: if not ellyChar.isDigit(txt[nd]): break nd += 1 if nd == 0: _err('no token count for condition') return None test = int(txt[:nd]) txt = txt[nd:] pred.append([op, test]) continue if not side in ['l', 'r']: _err('invalid side for test=' + side) return None k = 0 if txt[0] == '[': # semantic feature check? 
k = txt.find(']') # if so, look for closing bracket if k < 0: return _err('incomplete semantic features to check') p = txt[:k + 1] # get semantic feature string # print ( "side:" , side , "test:" , p ) try: f = featureSpecification.FeatureSpecification(stb, p, semantic=True) except ellyException.FormatFailure: return _err('bad semantic features to check') if side == 'l': if sta.id[lS] == None: sta.id[lS] = f.id elif f.id != sta.id[lS]: _err('inconsistency: left features=' + p) return None else: if sta.id[rS] == None: sta.id[rS] = f.id elif f.id != sta.id[rS]: _err('inconsistency: right features=' + p) return None op = semanticCommand.Crhtf if side == 'r' else semanticCommand.Clftf if side == 'r': sta.rht = f else: sta.lft = f # print ( 'test:' , f.positive.hexadecimal() , f.negative.hexadecimal() ) test = ellyBits.join(f.positive, f.negative) # print ( test ) pred.append([op, test]) elif txt[0] == '(': # semantic concept check? # print ( "txt=\"" + txt +"\"" ) k = txt.find(')') # if so, look for closing parenthesis if k < 0: return _err('incomplete concept check') s = txt[1:k].strip().upper() # normalize concepts p = s.split(',') # allow for multiple disjunctive checks # print ( "p=\"" + p + "\"" ) op = semanticCommand.Crhtc if side == 'r' else semanticCommand.Clftc pred.append([op, p]) else: _err('unknown test in clause=' + side + txt) return None txt = txt[k + 1:].lstrip() # advance to next predicate # print ( "NEXT" ) return pred
def match(self, txt, pnc, ctx):
    """
    compare a punctuation mark and its context with the stored exception
    patterns for that mark

    A pattern has an optional left part, matched against the text just
    before the punctuation, and a right part, whose first element is
    checked against the second char of context after the punctuation.

    arguments:
        self  -
        txt   - list of text chars leading up to punctuation char
        pnc   - punctuation char
        ctx   - next chars after punctuation

    returns:
        True on match, False otherwise
    """

    if matchtoo(txt, pnc, ctx):          # exception by complex match?
        return True

    sep = ctx[0] if len(ctx) > 0 else ''
    if sep == ellyChar.THS:
        return True
    nxt = ctx[1] if len(ctx) > 1 else ''

    if not pnc in self.lstg:             # get stored patterns for punctuation
        return False
    lp = self.lstg[pnc]

    ltx = len(txt)                       # current length of accumulated text so far

    nrg = 1                              # first skip back over trailing letters and digits
    while nrg <= ltx:
        if not ellyChar.isLetterOrDigit(txt[-nrg]):
            break
        nrg += 1
    while nrg <= ltx:                    # then also over embedded combining chars
        c = txt[-nrg]
        if not ellyChar.isLetterOrDigit(c) and not ellyChar.isEmbeddedCombining(c):
            break
        nrg += 1
    nrg -= 1                             # end of range for all pattern matching

    # NOTE(review): when nrg == 0, txt[-0:] keeps the whole text rather than
    # none of it; left unchanged because patterns may depend on it - confirm
    txt = txt[-nrg:]                     # reset text to limit for matching
    ltx = len(txt)                       # its new length

    for p in lp:                         # try matching each listed exception pattern

        if p.left != None and len(p.left) > 0:

            pat = p.left
            star = pat[-1] == ellyWildcard.cALL
            n = len(pat)                 # each pattern element matches one sequence char
            if star:                     # except for a final wildcard *
                n -= 1
                if ltx < n:
                    continue             # cannot match pattern properly
                pat = pat[:-1]
                t = txt[:n]
            else:
                if ltx < n:
                    continue             # cannot match pattern properly
                t = txt[-n:]

            if not ellyWildcard.match(pat, t, 0):
                continue

            k = ltx - n                  # extra chars beyond any match
            if not star and k > 0:
                if ellyChar.isLetterOrDigit(txt[-n]):
                    c = txt[-n - 1]
                    if ellyChar.isLetterOrDigit(c) or c == '&':
                        continue         # because break in text is required

        rp = p.right
        if rp == [] or rp[0] == ellyWildcard.cALL:
            return True
        pcx = rp[0]
        if pcx == nxt:                   # check for specific char after possible stop
            return True
        elif pcx == ellyWildcard.cALF:   # check for alphabetic
            if ellyChar.isLetter(nxt):
                return True
        elif pcx == ellyWildcard.cDIG:   # check for numeric
            if ellyChar.isDigit(nxt):
                return True
        elif pcx == ellyWildcard.cUPR:   # check for upper case
            if ellyChar.isUpperCaseLetter(nxt):
                return True
        elif pcx == ellyWildcard.cLWR:   # check for lower case
            if ellyChar.isLowerCaseLetter(nxt):
                return True
        elif pcx == ellyWildcard.cCAN:   # check for non-alphanumeric
            # fixed: this branch used to test isLetter(), which is the
            # opposite of what the wildcard is for; an actual char is still
            # required, as with the other wildcard checks
            if nxt != '' and not ellyChar.isLetterOrDigit(nxt):
                return True

    return False
def compile ( name , stb , defn ):

    """
    static method to create an Elly vocabulary database from text file input

    Reads definition lines of the form  term : definition  from defn,
    checks each one, and inserts a (key, record) row into a new SQLite
    table Vocab(Keyx, Defn). A line that fails a check is reported on
    stderr with its term and definition, and the loop goes on to the
    next line.

    arguments:
        name  - for new SQLite database
        stb   - Elly symbol table
        defn  - Elly definition reader for vocabulary

    exceptions:
        TableFailure on error
    """

    global nerr
    nerr = 0
    cdb = None  # SQLite db connection
    cur = None  # SQLite db cursor

    if stb == None :
        print >> sys.stderr, 'no symbol table'
        raise ellyException.TableFailure

    try:
        zfs = FSpec(stb,'[$]',True).positive.hexadecimal(False)
    except ellyException.FormatFailure: # should never need this
        print >> sys.stderr , 'unexpected failure with zero features'
        raise ellyException.TableFailure

    # zfs is the hexadecimal for all features off, used as default semantic features

    tsave = ''  # original term
    dsave = ''  # definition

    try:
        filn = name + vocabulary        # where to put vocabulary database
        try:
            os.remove(filn)             # delete the file if it exists
        except OSError:
            print >> sys.stderr , 'no' , filn   # if no such file, warn but proceed

        #### SQLite: start with a fresh, empty table
        try:
            cdb = dbs.connect(filn)     # create new database
            cur = cdb.cursor()
            cur.execute("CREATE TABLE Vocab(Keyx TEXT, Defn TEXT)")
            cdb.commit()
        except dbs.Error , e:
            print >> sys.stderr , e
            raise ellyException.TableFailure    # give up on any database failure

        r = None                        # for error reporting
        while True:                     # process vocabulary definition records

            try:                        # for catching FormatFailure exception
                # presumably _err() raises FormatFailure, which is what ends
                # the processing of a bad entry - verify against its definition

                r = defn.readline()             # next definition
                if len(r) == 0: break           # stop on EOF

                k = r.find(':')                 # look for first ':'
                if k < 0:
                    tsave = r
                    dsave = None
                    _err()                      # report error and quit entry

                t = r[:k].strip()               # term to go into dictionary
                d = r[k+1:].strip()             # its definition
                tsave = t                       # save for any error reporting
                dsave = d                       #

                if len(t) == 0 or len(d) == 0:
                    _err()                      # quit on missing parts
                c = t[0]
                if not ellyChar.isLetterOrDigit(c) and c != '.' and c != '"':
                    _err('bad term')

                n = delimitKey(t)               # get part of term to index
                if n <= 0:
                    _err()                      # quit on bad term
                wky = toKey(t[:n])              # key part of term to define

                ns = syntaxSpecification.scan(d)    # find extent of syntax info
                if ns <= 0: _err('bad syntax specification')

                syn = d[:ns]                    # syntax info as string
                d = d[ns:].strip()              # rest of definition

                try:
                    ss = SSpec(stb,syn)         # decode syntax info
                except ellyException.FormatFailure:
                    _err('malformed syntax specification')
                cat = str(ss.catg)              # syntax category
                syf = ss.synf.positive.hexadecimal(False)   # syntactic flags

                smf = zfs                       # initialize defaults for
                pb = '0'                        #   cognitive semantics
                cn = conceptualHierarchy.NOname #

                if len(d) > 1:                  # check for cognitive semantics
                    x = d[0]
                    if x == '[' or x == '0' or x == '-':    # semantic features?
                        if x != '[':            # a '0' or '-' means to take default
                            if len(d) == 1 or d[1] != ' ':
                                _err('missing semantic features')
                            d = d[2:].strip()   # skip over
                        else:
                            ns = featureSpecification.scan(d)   # look for ']' of features
                            if ns < 0:
                                _err()
                            sem = d[:ns]        # get semantic features
                            d = d[ns:].strip()  # skip over
                            try:
                                fs = FSpec(stb,sem,True)
                            except ellyException.FormatFailure:
                                _err('bad semantic features')
                            smf = fs.positive.hexadecimal(False)    # convert to hex

                        # plausibility bias: optional sign plus digits
                        ld = len(d)
                        if ld == 0:
                            _err('missing plausibility')
                        np = 0
                        x = d[np]
                        if x == '+' or x == '-':
                            np += 1             # take any plus or minus sign
                        while np < ld:          # and successive digits
                            if ellyChar.isDigit(d[np]): np += 1
                            else: break
                        if np == 0:
                            _err('missing plausibility')
                        pb = d[:np]             # plausibility bias
                        d = d[np:]
                        ld = len(d)
                        if ld > 1:              # any more to process?
                            c = d[0]            # get next char after bias
                            d = d[1:]           # advance scan
                            ld -= 1
                            if c == '/':        # check for explicit concept
                                np = 0
                                while np < ld:  # get extent of concept
                                    if ellyChar.isWhiteSpace(d[np]): break
                                    np += 1
                                if np == 0:
                                    _err('missing concept for plausibility')
                                cn = d[:np]     # extract concept
                                d = d[np:]
                            elif c != ' ':
                                _err()          # signal bad format
                        elif ld > 0:
                            _err()              # unidentifiable trailing text

                d = d.strip()                   # rest of definition
                if len(d) > 0 and d[-1] == '=':
                    if len(d) == 1 or d[0] != '=':
                        _err('incomplete definition')

                # normalize the translation: drop spaces next to '=' and ','
                # and recode each ',' as '#'
                ld = [ ]                        # for normalizing definition
                k = 0                           # count spaces removed
                sd = ''                         # previous char seen
                for cd in d:                    # scan all chars in translation
                    if cd == ' ':
                        if sd == '=' or sd == ',' or sd == ' ':
                            k += 1
                            sd = cd
                            continue
                    elif cd == '=' or cd == ',':    # no spaces before '=' or ','
                        if sd == ' ':
                            k += 1
                            ld.pop()
                        if cd == ',':
                            if sd == '=':
                                _err('missing translation')
                            cd = '#'            # format for PICK operation
                        elif cd == '=' and sd == '=':
                            print >> sys.stderr , '** WARNING \'=\' followed by \'=\''
                            print >> sys.stderr , '* at [' , tsave , ']'
                    sd = cd
                    ld.append(cd)               # add char to reformatted definition

                if k > 0:
                    d = ''.join(ld)             # definition with spaces removed

                vrc = [ t , ':' , cat , syf , smf , pb , cn ]   # start data record
                vss = u' '.join(vrc)            # convert to string
                vss += u' ' + d                 # fill out record with rest of input

            except ellyException.FormatFailure:
                # report the offending entry; trailing commas keep it on one line
                print >> sys.stderr , '* at [' , tsave ,
                if dsave != None:
                    print >> sys.stderr , ':' , dsave ,
                print >> sys.stderr , ']'
                continue                        # skip rest of processing

            #### SQLite: store the record under its key
            try:
                sql = "INSERT INTO Vocab VALUES(?,?)"
                cur.execute(sql,(wky,vss))
            except dbs.Error , e:
                print >> sys.stderr , 'FATAL' , e
                sys.exit(1)

    # NOTE(review): in this copy of the function the outer try above has no
    # except or finally clause and there is no final commit or close
def build(name, stb, defn):
    """
    static method to create an Elly vocabulary database from text file input

    Each line read from defn is normalized, split at ' : ' into a term and
    its definition, checked, and stored as one row of a new SQLite table
    Vocab(Keyx, Defn). A malformed entry is reported on stderr and skipped;
    changes are committed only if no error was counted.

    arguments:
        name  - for new SQLite database
        stb   - Elly symbol table
        defn  - Elly definition reader for vocabulary

    exceptions:
        TableFailure on error
    """

    global nerr
    nerr = 0
    cdb = None  # SQLite db connection
    cur = None  # SQLite db cursor

    if stb is None:
        print('no symbol table', file=sys.stderr)
        raise ellyException.TableFailure

    try:
        zfs = FSpec(stb, '[$]', True).positive.hexadecimal(False)
    except ellyException.FormatFailure:     # should never need this
        print('unexpected failure with zero features', file=sys.stderr)
        raise ellyException.TableFailure

    # zfs is the hexadecimal for all features off, the default semantic features

    tsave = ''  # original term
    dsave = ''  # definition

    try:
        filn = name + vocabulary            # where to put vocabulary database
        try:
            os.remove(filn)                 # delete the file if it exists
        except OSError:
            print('no', filn, file=sys.stderr)  # if no such file, warn but proceed

        #### SQLite DB operations: start with a fresh, empty table
        try:
            cdb = dbs.connect(filn)         # create new database
            cur = cdb.cursor()
            cur.execute("CREATE TABLE Vocab(Keyx TEXT, Defn TEXT)")
            cdb.commit()
        except dbs.Error as e:
            print(e, file=sys.stderr)
            raise ellyException.TableFailure    # give up on any database failure

        r = None                            # for error reporting
        while True:                         # process vocabulary definition records

            try:                            # for catching FormatFailure exception from _err()

                r = defn.readline()         # next definition
                if len(r) == 0:
                    break                   # stop on EOF
                r = definitionLine.normalize(r)

                k = r.find(' : ')           # look for first ' : '
                if k < 0:
                    tsave = r
                    dsave = None
                    _err()                  # report error and quit entry

                t = r[:k].strip()           # term to go into dictionary
                d = r[k + 2:].strip()       # its definition
                tsave = t                   # save for any error reporting
                dsave = d                   #

                if len(t) == 0 or len(d) == 0:
                    _err()                  # quit on missing parts

                if ellyConfiguration.language == 'ZH':  # special key for Chinese
                    wky = toKeyZH(t[0])
                else:
                    c = t[0]
                    if not ellyChar.isLetterOrDigit(c) and not c in initChr:
                        _err('bad term')
                    n = delimitKey(t)       # get part of term to index
                    if n <= 0:
                        _err()              # quit on bad term
                    wky = toKey(t[:n])      # key part of term to define

                ns = syntaxSpecification.scan(d)    # find extent of syntax info
                if ns <= 0:
                    _err('bad syntax specification')
                if ns < len(d) and d[ns] != ' ':
                    _err('trailing chars in syntax specification')

                syn = d[:ns]                # syntax info as string
                d = d[ns:].strip()          # rest of definition

                try:
                    ss = SSpec(stb, syn)    # decode syntax info
                except ellyException.FormatFailure:
                    _err('malformed syntax specification')
                cat = str(ss.catg)          # syntax category
                cid = _smfchk[ss.catg]      # associated semantic feature ID
                syf = ss.synf.positive.hexadecimal(False)   # syntactic flags

                smf = zfs                       # initialize defaults for
                pb = '0'                        #   cognitive semantics
                cn = conceptualHierarchy.NOname #

                if len(d) > 1:              # check for cognitive semantics
                    x = d[0]
                    if x == '[' or x == '0' or x == '-':    # semantic features?
                        if x != '[':        # a '0' or '-' means to take default
                            if d[1] != ' ':
                                _err('missing semantic features')
                            d = d[2:].strip()   # skip over
                        else:
                            ns = featureSpecification.scan(d)   # look for ']' of features
                            if ns < 0:
                                _err()
                            sem = d[:ns]        # get semantic features
                            d = d[ns:].strip()  # skip over for subsequent processing
                            sid = sem[1]        # feature ID
                            if sid != cid:      # a category keeps a single feature ID
                                if cid is not None:
                                    _err('inconsistent semantic feature id')
                                _smfchk[ss.catg] = sid
                            try:
                                fs = FSpec(stb, sem, True)
                            except ellyException.FormatFailure:
                                _err('bad semantic features')
                            smf = fs.positive.hexadecimal(False)    # convert to hex

                        # plausibility bias: optional sign plus digits
                        ld = len(d)
                        if ld == 0:
                            _err('missing plausibility')
                        np = 0
                        x = d[np]
                        if x == '+' or x == '-':
                            np += 1         # take any plus or minus sign
                        while np < ld:      # and successive digits
                            if ellyChar.isDigit(d[np]):
                                np += 1
                            else:
                                break
                        if np == 0:
                            _err('missing plausibility')
                        pb = d[:np]         # plausibility bias
                        d = d[np:]
                        ld = len(d)
                        if ld > 1:          # any more to process?
                            c = d[0]        # get next char after bias
                            d = d[1:]       # advance scan
                            ld -= 1
                            if c == '/':    # check for explicit concept
                                np = 0
                                while np < ld:  # get extent of concept
                                    if ellyChar.isWhiteSpace(d[np]):
                                        break
                                    np += 1
                                if np == 0:
                                    _err('missing concept for plausibility')
                                cn = d[:np]     # extract concept
                                d = d[np:]
                            elif c != ' ':
                                _err()      # signal bad format
                        elif ld > 0:
                            _err()          # unidentifiable trailing text

                    elif d[0] != '(':
                        # skip a leading alphanumeric run, which must be
                        # followed by '='; the length test stops the scan
                        # from indexing past the end of an all-alphanumeric
                        # definition, which then gets reported as an error
                        dd = d
                        while len(dd) > 0 and ellyChar.isLetterOrDigit(dd[0]):
                            dd = dd[1:]
                        if len(dd) == 0 or dd[0] != '=':
                            _err()

                d = d.strip()               # rest of definition
                if len(d) > 0 and d[-1] == '=':
                    if len(d) == 1 or d[0] != '=':
                        _err('incomplete definition')

                # normalize the translation: drop spaces next to '=' and ','
                # and recode each ',' as '#'
                ld = []                     # for normalizing definition
                k = 0                       # count spaces removed
                sd = ''                     # previous char seen
                for cd in d:                # scan all chars in translation
                    if cd == ' ':
                        if sd == '=' or sd == ',' or sd == ' ':
                            k += 1
                            sd = cd
                            continue
                    elif cd == '=' or cd == ',':    # no spaces before '=' or ','
                        if sd == ' ':
                            k += 1
                            ld.pop()
                        if cd == ',':
                            if sd == '=':
                                _err('missing translation')
                            cd = '#'        # format for PICK operation
                        elif cd == '=' and sd == '=':
                            print('** WARNING \'=\' followed by \'=\'', file=sys.stderr)
                            print('* at [', tsave, ']', file=sys.stderr)
                    sd = cd
                    ld.append(cd)           # add char to reformatted definition

                if k > 0:
                    d = ''.join(ld)         # definition with spaces removed

                vrc = [t, '=:', cat, syf, smf, pb, cn]  # start data record
                vss = ' '.join(vrc)         # convert to string
                vss += ' ' + d              # fill out record with rest of input

            except ellyException.FormatFailure:     # will catch exceptions from _err()
                print('* at [', tsave, end=' ', file=sys.stderr)
                if dsave is not None:
                    print(':', dsave, end=' ', file=sys.stderr)
                print(']', file=sys.stderr)
                continue                    # skip rest of processing this rule

            #### SQLite DB operation: store the record under its key
            try:
                sql = "INSERT INTO Vocab VALUES(?,?)"
                cur.execute(sql, (wky, vss))
            except dbs.Error as e:
                print('FATAL', e, file=sys.stderr)
                sys.exit(1)

        #### SQLite DB operations
        if nerr == 0:
            cdb.commit()                    # keep changes only for a clean build
        cdb.close()                         # clean up

    # NOTE(review): the bare name Error must be defined at module level for
    # this clause to work - confirm
    except Error as e:                      # catch any other errors
        print('**', e, file=sys.stderr)
        print('* at', r, file=sys.stderr)
        nerr += 1

    if nerr > 0:
        print('**', nerr, 'vocabulary table errors in all', file=sys.stderr)
        print('* compilation FAILed', file=sys.stderr)
        if cdb is not None:                 # failure may precede the connection
            cdb.close()                     # discard any changes
        raise ellyException.TableFailure
def compile ( name , stb , defn , stem=None ):

    """
    static method to create an Elly vocabulary database from text file input

    Reads definition lines of the form  term : definition  from defn,
    skipping lines that start with '#', checks each one, and saves a
    UTF-8 encoded record in a new BSDDB hash database under a lookup key
    made from the first part of the term. Keys may have duplicate records.

    arguments:
        name  - for new BSDDB database
        stb   - Elly symbol table
        defn  - Elly definition reader for vocabulary
        stem  - optional stemmer for indexing

    exceptions:
        TableFailure on error
    """

    global nerr
    nerr = 0

    if stb == None :
        print >> sys.stderr, 'no symbol table'
        raise ellyException.TableFailure
    if db == None :
        print >> sys.stderr, 'no Python db package'
        raise ellyException.TableFailure

    try:
        zfs = FSpec(stb,'[$]',True).positive.hexadecimal(False)
    except ellyException.FormatFailure: # should never need this
        print >> sys.stderr , 'unexpected failure with zero features'
        raise ellyException.TableFailure

    # zfs is the hexadecimal for all features off, used as default semantic features

    tsave = ''  # original term
    dsave = ''  # definition

    try:
        filn = name + vocabulary        # where to put vocabulary database
        try:
            os.remove(filn)             # delete the file if it exists
        except OSError:
            print >> sys.stderr , 'no' , filn

        dbs = db.DB()                   # create new database
        dbs.set_flags(db.DB_DUP)        # keys may identify multiple records
        dbs.open(filn,None,db.DB_HASH,db.DB_CREATE) # open new database file

        r = None                        # for error reporting
        while True:                     # process vocabulary records

            try:
                # NOTE(review): some _err() calls below are followed by
                # continue and some are not; which statements are reachable
                # depends on whether _err() raises FormatFailure - confirm

                r = defn.readline()             # next definition
                if len(r) == 0: break           # stop on EOF
                if r[0] == '#': continue        # skip comment line

                k = r.find(':')                 # look for first ':'
                if k < 0:
                    tsave = r
                    dsave = None
                    _err()                      # report error and quit entry
                    continue

                t = r[:k].strip()               # term to go into dictionary
                d = r[k+1:].strip()             # its definition
                tsave = t                       # save for any error reporting
                dsave = d                       #

                if len(t) == 0 or len(d) == 0:
                    _err()                      # quit on missing parts
                    continue
                c = t[0]
                if not ellyChar.isLetterOrDigit(c) and c != '.' and c != '"':
                    _err('bad term')
                    continue

                n = toIndex(t)                  # get part of term to index
                if n == 0:
                    _err()                      # quit on bad term
                    continue
                w = t[:n]                       # first word of term to define
                if stem != None:
                    try:
                        w = stem.simplify(w)    # reduce for lookup key
                    except ellyException.StemmingError:
                        _err('bad stemming logic')
                        continue
                lcw = lcAN(w)                   # convert to ASCII lower case

                ns = syntaxSpecification.scan(d)    # find extent of syntax info
                if ns <= 0: _err('bad syntax specification')

                syn = d[:ns]                    # syntax info as string
                d = d[ns:].strip()              # rest of definition

                try:
                    ss = SSpec(stb,syn)         # decode syntax info
                except ellyException.FormatFailure:
                    _err('malformed syntax specification')
                    continue
                cat = str(ss.catg)              # syntax category
                syf = ss.synf.positive.hexadecimal(False)   # syntactic flags

                smf = zfs                       # initialize defaults for
                pb = '0'                        #   cognitive semantics
                cn = '-'                        #

                if len(d) > 1:                  # check for cognitive semantics
                    x = d[0]
                    if x == '[' or x == '0' or x == '-':    # semantic features?
                        if x != '[':            # a '0' or '-' means to take default
                            if len(d) == 1 or d[1] != ' ':
                                _err('missing semantic features')
                                continue
                            d = d[2:].strip()   # skip over
                        else:
                            ns = featureSpecification.scan(d)   # look for ']' of features
                            if ns < 0:
                                _err()
                                continue
                            sem = d[:ns]        # get semantic features
                            d = d[ns:].strip()  # skip over
                            try:
                                fs = FSpec(stb,sem,True)
                            except ellyException.FormatFailure:
                                _err('bad semantic features')
                                continue
                            smf = fs.positive.hexadecimal(False)    # convert to hex

                        # plausibility bias: optional sign plus digits
                        ld = len(d)
                        if ld == 0:
                            _err('missing plausibility')
                            continue
                        np = 0
                        x = d[np]
                        if x == '+' or x == '-':
                            np += 1             # take any plus or minus sign
                        while np < ld:          # and successive digits
                            if ellyChar.isDigit(d[np]): np += 1
                            else: break
                        if np == 0:
                            _err('missing plausibility')
                            continue
                        pb = d[:np]             # plausibility bias
                        d = d[np:]
                        ld = len(d)
                        if ld > 1:              # any more to process?
                            c = d[0]            # get next char after bias
                            d = d[1:]           # advance scan
                            ld -= 1
                            if c == '/':        # check for explicit concept
                                np = 0
                                while np < ld:  # get extent of concept
                                    if ellyChar.isWhiteSpace(d[np]): break
                                    np += 1
                                if np == 0:
                                    _err('missing concept for plausibility')
                                    continue
                                cn = d[:np]     # extract concept
                                d = d[np:]
                            elif c != ' ':
                                _err()          # signal bad format
                                continue
                        elif ld > 0:
                            _err()              # unidentifiable trailing text
                            continue

                d = d.strip()                   # rest of definition
                if len(d) > 0 and d[-1] == '=':
                    if len(d) == 1 or d[0] != '=':
                        _err('incomplete definition')
                        continue

                # normalize the translation: drop spaces next to '=' and ','
                # and recode each ',' as '#'
                ld = [ ]                        # for normalizing definition
                k = 0                           # count spaces removed
                sd = ''                         # previous char seen
                for cd in d:                    # scan all chars in translation
                    if cd == ' ':
                        if sd == '=' or sd == ',' or sd == ' ':
                            k += 1
                            sd = cd
                            continue
                    elif cd == '=' or cd == ',':    # no spaces before '=' or ','
                        if sd == ' ':
                            k += 1
                            ld.pop()
                        if cd == ',':
                            if sd == '=':
                                _err('missing translation')
                            cd = '#'            # format for PICK operation
                        elif cd == '=' and sd == '=':
                            print >> sys.stderr , '** WARNING \'=\' followed by \'=\''
                            print >> sys.stderr , '* at [' , tsave , ']'
                    sd = cd
                    ld.append(cd)               # add char to reformatted definition

                if k > 0:
                    d = ''.join(ld)             # definition with spaces removed

                vrc = [ t , ':' , cat , syf , smf , pb , cn ]   # start BdB data record
                vss = u' '.join(vrc)            # convert to string
                vss += u' ' + d                 # fill out record with rest of input
                rss = vss.encode('utf8')        # convert to UTF-8

            except ellyException.FormatFailure:
                # report the offending entry; trailing commas keep it on one line
                print >> sys.stderr , '* at [' , tsave ,
                if dsave != None:
                    print >> sys.stderr , ':' , dsave ,
                print >> sys.stderr , ']'
                continue

            dbs.put(lcw,rss)                    # save in database

        dbs.close()                             # clean up

    except StandardError , e:                   # catch any other errors
        print >> sys.stderr , '**' , e
        print >> sys.stderr , '* at' , r
        nerr += 1
def _aDay ( self , ts ): """ parse a day number arguments: self - ts - text stream as list of chars returns: total number of chars matched """ # print 'aDay', ts if len(ts) == 0: return 0 k = 0 # running match count x = ts[0] y = '' if not ellyChar.isDigit(x): if not self.rewriteNumber(ts): return 0 else: x = ts[0] # print 'rewritten ts=' , ts ls = len(ts) if ls == 1: if x == '0': return 0 # cannot have 0 as day self._dy.append(x) # accept at end of input as possible date return 1 elif not ellyChar.isDigit(ts[1]): k = 1 elif x > '3': # reject first digit bigger than '3' return 0 else: y = x # save first digit x = ts[1] # this will be second digit if y == '3' and x > '1': # reject day > 31 return 0 k = 2 ls -= k if k == 2: self._dy.append(y) self._dy.append(x) if ls == 0: return k z = ts[k] if ellyChar.isDigit(z): return 0 # reject 3-digit day if z == '.' and ls > 1 and ellyChar.isDigit(ts[k+1]): return 0 # reject digit after decimal point if ls >= 2: # at least 2 chars to check after day number if z == u'-': # print 'hypen ls=' , ls , 'k=' , k if ellyChar.isDigit(ts[k+1]): # hyphen, digit match # print 'digit=' , ts[k+1] self._dy.append(z) self._dy.append(ts[k+1]) if ls == 2: # only 2 chars to check? k += 2 # add hyphen, digit to day elif ls == 3: # only 3 chars to check? # print 'ts[k]=' , ts[k:] if not ellyChar.isLetterOrDigit(ts[k+2]): # k += 2 # add hyphen, digit to day elif ellyChar.isDigit(ts[k+2]): # found second digit to add? self._dy.append(ts[k+2]) # if so, add to day string k += 3 elif not ellyChar.isLetterOrDigit(ts[k+2]): # more than 3 chars to check? 
k += 2 # if not, we are done elif ellyChar.isDigit(ts[k+2]): # check for second digit # print 'k=' , k if ls > 3 and ellyChar.isDigit(ts[k+3]): return 0 if ts[k+1] > '3': # check for valid day return 0 if ts[k+1] == '3' and ts[k+2] > '1': return 0 self._dy.append(ts[k+2]) k += 3 else: return 0 # no other hyphen allowed in day else: return 0 # t = ts[k:] # print 'k=' , k , 't=' , t if len(t) == 0 or not ellyChar.isLetterOrDigit(t[0]): return k if ellyChar.isDigit(t[0]) or len(t) < 2: return 0 sx = t[0].lower() + t[1].lower() # print 'y=' , y , 'x=' , x , 'sx=' , sx if x == '1': # print 'end of day=' , y if y == '1': if sx != 'th': return 0 elif sx != 'st': return 0 elif x == '2': if sx != 'nd': return 0 elif x == '3': if sx != 'rd': return 0 else: # print 'default ordinal indicator' if sx != 'th': return 0 # print 'ord k=' , k t = t[2:] k += 2 # print 'k=' , k , 'len=' , len(ts) if len(ts) == k: # check next char in stream return k # if none, match succeeds elif ellyChar.isLetterOrDigit(ts[k]): # print 'ts[k]=' , ts[k] , k return 0 # otherwise, match fails if next char is alphanumeric else: # print 'return k=' , k return k # otherwise succeed
def convert ( strg ):

    """
    convert wildcard and escaped chars in a string to coded chars

    Square brackets delimit an optional segment of a pattern. They cannot
    be nested, must be balanced, and may enclose only ordinary chars.

    arguments:
        strg  - the original string

    returns:
        the converted string on success, None otherwise
    """

    if strg is None:
        return None

    lng = len(strg)
    nlb = 0                        # check balancing of brackets
    t = [ ]                        # converted output
    i = 0
    while i < lng:
        wild = True                # flag for wildcard char, True by default
        x = strg[i]
        if x == wANY:              # check for wildcard
            t.append(cANY)
        elif x == wALF:
            t.append(cALF)
        elif x == wUPR:
            t.append(cUPR)
        elif x == wDIG:
            t.append(cDIG)
        elif x == wVWL:
            t.append(cVWL)
        elif x == wCNS:
            t.append(cCNS)
        elif x == wSPC:
            t.append(cSPC)
        elif x == wAPO:
            t.append(cAPO)
        elif x == wEND:
            t.append(cEND)
        elif x == wALL:
            if len(t) == 0 or t[-1] != cALL:   # collapse consecutive wALL into one
                t.append(cALL)
        elif x == wSPN:            # check for repetition of wildcard
            if i + 1 == lng:
                t.append(x)        # nothing to repeat, keep char as is
            else:
                i += 1
                y = strg[i]
                if y == wANY:      # only these wildcards can be repeated
                    op = cSAN
                elif y == wDIG:
                    op = cSDG
                elif y == wALF:
                    op = cSAL
                else:
                    continue       # drop the repetition char and rescan y by itself
                t.append(op)
        elif x == ellyChar.LBR:
            if nlb != 0:
                return None        # no nesting of optional segments
            nlb += 1
            t.append(cSOS)         # start of optional match in pattern
        elif x == ellyChar.RBR:
            if nlb != 1:
                return None        # nothing to close
            nlb -= 1
            t.append(cEOS)         # end of optional match
        elif x == ellyChar.BSL:    # escape char
            if i + 1 == lng:       # nothing to escape?
                t.append(x)
            elif strg[i+1] == ' ': # escaped space?
                t.append(ellyChar.NBS)
                i += 1
            else:                  # escaped non-space?
                z = strg[i+1]
                if ellyChar.isDigit(z):
                    t.append(x)    # if digit, preserve backslash to indicate substitution
                else:
                    t.append(z)    # otherwise, keep the next char literally
                    i += 1
        else:
            t.append(x)
            wild = False

        if wild and nlb > 0 and x != ellyChar.LBR:
            return None            # no wildcards allowed in optional segments
        i += 1

    if nlb != 0:
        return None                # optional segment was never closed

    return u''.join(t).lower()     # converted string to match against
def _getRaw ( self ):

    """
    obtain next raw token from buffer

    The token ends at the next separator, except that a hyphen or comma
    directly followed by a digit is kept inside the token. Trailing chars
    that cannot end a token are put back into the buffer.

    arguments:
        self

    returns:
        EllyToken on success, None otherwise
    """

    self.skipSpaces()
    ln = len(self.buffer)
    if ln == 0:
        return None

    ## get length of next token and if it has
    ## initial - or +, check for word fragment

    k = 0                          # number of chars for next token

    if self.match(MIN):            # check for hyphen
        if self.match(DSH):        # it is a dash when doubled
            k = 2
        else:
            k = self.find(separators,1)
    elif self.match(PLS):          # check for elly prefix
        k = self.find(separators,1)
    elif self.match(DOT):          # check for period
        if self.match(ELP):        # it is ellipsis when tripled
            k = 3
        else:
            k = 1
    elif not ellyChar.isCombining(self.buffer[0]):
        k = 1                      # if next char cannot start a token, take it as a token
    else:
        k = self.find(separators)
        if k < 0:                  # break a token at next separator
            k = ln
        while k < ln:              # look at separator if it exists
            x = self.buffer[k]
            if x != MIN and x != COM:
                break              # only a hyphen or comma is not an absolute break
            # a hyphen or comma at the very end of the buffer has nothing
            # after it, so it must end the token like any other separator
            if k + 1 >= ln or not ellyChar.isDigit(self.buffer[k+1]):
                break              # break at hyphen or comma NOT followed by digit
            k = self.find(separators,k+2)   # otherwise, look for another separator
            if k < 0:
                k = ln

    ## if token not delimited, take rest of buffer as
    ## will fit into token working area

    if k < 0: k = ln

    buf = self.extract(k)          # get k characters

    ## special check for - next in buffer after extraction

    if self.match(MIN):                    # hyphen immediately following?
        self.skip()                        # if so, take it
        if self.atSpace():                 # when followed by space
            buf.append(MIN)                # append hyphen to candidate token
            k += 1
        else:
            if not self.match(MIN):        # when not followed by another hyphen
                self.prepend(ellyChar.SPC) # put back a space
            else:
                self.skip()                # double hyphen = dash
                self.prepend(ellyChar.SPC) # put back space after dash
                self.prepend(MIN)          # put back second hyphen
            self.prepend(MIN)              # put back first
            self.prepend(ellyChar.SPC)     # put extra space before hyphen or dash

    ## fill preallocated token for current position from working area

    to = ellyToken.EllyToken(u''.join(buf))

    ## strip off trailing non-token chars from token and put back in buffer

    km = k - 1
    while km > 0:
        x = buf[km]
        if ellyChar.isLetterOrDigit(x) or x == MIN or x == PLS or x == UNS:
            break
        if x == APO and buf[km - 1] == 's':   # keep apostrophe after final s
            break
        self.prepend(x)
        km -= 1
    km += 1
    if km < k:
        to.shortenBy(k - km,both=True)

    return to
def build ( self , inp ): """ build tree logic from definition reader input arguments: self - inp - definition text for logic exceptions: TableFailure on error """ if inp == None: return nerr = 0 # error count # read in affixes and associated actions while True: line = inp.readline() # next input line if line == u'': # check for EOF break modf = '' elem = line.strip().lower().split(' ') # print 'elem=' , elem le = len(elem) if le < 4: nerr += 1 print >> sys.stderr , "** affix error: incomplete input" print >> sys.stderr , "* at: [" , line , "]" continue # skip incomplete line if le > 4: # affix mod specified? modf = elem.pop() # if so, get it # print elem[0] , modf do = elem.pop() # note main action # get affix within definition line aff = list(elem.pop(0)) # affix as list of chars # check for proper form aff = self.sequence(aff) # backward or forward matching? # print 'aff=' , aff c = aff[0] # get first char to compare with aff = aff[1:] if (not ellyChar.isLetter(c) and c != '+'): # affix must start with letter or '+' nerr += 1 print >> sys.stderr , "** affix error: must start with letter or '+'" print >> sys.stderr , "* at: [" , line , "]" continue # ignore line if not c in self.indx: # node not already in tree index? 
self.indx[c] = Node() # add new node node = self.indx[c] for a in aff: # now check each successive char in affix if a in node.contn: node = node.contn[a] # go to existing node if found else: new = Node() # otherwise make new node node.contn[a] = new # and insert into tree node = new # and move down # at final node in tree logic node.condn = int(elem.pop(0)) # condition for match try: nsave = 0 if len(elem) == 0 else int(elem.pop()) except ValueError , e: print >> sys.stderr , e print >> sys.stderr , "* at: [" , line , "]" continue # ignore line resto = [ Add ] # set to defaults recur = False # mode = do[-1] # kind of recursion rest = do[:-1] # added chars to fill out root # print 'mode=' + '<' + mode + '>' , 'rest=' , rest if mode == u'?': node.condn = 1 resto = [ Fail ] # will generate fatal error elif ellyChar.isDigit(mode): nerr += 1 print >> sys.stderr , "* bad action mode=" , mode continue else: if mode == ',': # allow recursion? recur = True # if so, change default if len(rest) == 1 and rest[0] == '&': resto = [ RestorE ] else: resto += list(rest) if self.addn != None: resto.insert(1,self.addn) # insert AFTER first char of list # print 'resto=' , resto # insert action node.actns = Action(self,nsave,resto,recur,modf) node.tag()
def _getRaw(self):
    """
    take the next raw token off the front of the buffer

    The token length is chosen from the first char: a leading hyphen or
    plus runs up to the next separator, an apostrophe is taken alone or
    together with a following 's', a comma, period, or Unicode ellipsis
    stands by itself, and anything else runs up to the next separator,
    skipping hyphens and commas that are directly followed by a digit.

    arguments:
        self

    returns:
        EllyToken on success, None otherwise
    """

    self.skipSpaces()
    size = len(self.buffer)
    if size == 0:
        return None

    ## work out how many chars belong to the next token

    first = self.buffer[0]

    if first == MIN or first == PLS:
        count = self.findSeparator(1)       # possible word fragment
    elif first == APO:
        possessive = (size > 2 and self.buffer[1].lower() == 's'
                      and self.buffer[2] in separators)
        count = 2 if possessive else 1
    elif first in [COM, DOT, UELP]:         # these can be tokens by themselves
        count = 1
    else:
        count = self.findSeparator()
        if count < 0:
            count = size                    # no separator, go up to end of buffer
        elif count == 0:
            count = 1                       # immediate break in scanning
        else:
            while count < size:             # check separator and what follows it
                sep = self.buffer[count]
                if sep not in (MIN, COM):
                    break                   # only hyphen or comma needs a closer look
                nxt = count + 1
                if nxt >= size or not ellyChar.isDigit(self.buffer[nxt]):
                    break                   # real break when NOT followed by digit
                count = self.findSeparator(count + 2)
                if count < 0:
                    count = size

    ## an undelimited token takes the rest of the buffer

    if count < 0:
        count = size

    chars = self.extract(count)             # get the chars of the token

    ## special check for hyphen next in buffer after extraction

    if self.match(MIN):
        self.skip()                         # take the hyphen
        if self.atSpace():
            chars.append(MIN)               # trailing hyphen stays with token
            count += 1
        else:
            if self.match(MIN):
                self.skip()                 # double hyphen = dash
                self.prepend(ellyChar.SPC)  # put back space after dash
                self.prepend(MIN)           # put back second hyphen
            else:
                self.prepend(ellyChar.SPC)  # put back a space after single hyphen
            self.prepend(MIN)               # put back first hyphen
            self.prepend(ellyChar.SPC)      # extra space before hyphen or dash

    ## make token from working area

    token = ellyToken.EllyToken(''.join(chars))

    ## strip off trailing non-token chars from token and put back in buffer

    last = count - 1
    while last > 0:
        ch = chars[last]
        if ellyChar.isLetterOrDigit(ch) or ch in (MIN, PLS, UNS):
            break
        if ch in (APO, APX) and chars[last - 1] == 's':
            break                           # apostrophe after 's' stays
        self.prepend(ch)
        last -= 1

    excess = count - (last + 1)             # how many chars were pushed back
    if excess > 0:
        token.shortenBy(excess, both=True)

    return token
def _matchN ( self , ts ):

    """
    recognize a numeric time expression at the start of a text stream

    Accepts either H or HH directly followed by an AM or PM marker, or
    H:MM / HH:MM with optional :SS. On success the hour is left in
    self._hr, and minutes and seconds in self._m and self._s as
    2-char strings.

    arguments:
        self  -
        ts    - text stream as list of chars

    returns:
        total number of chars matched, 0 on failure
    """

    self._m = u'00'                     # minutes and seconds default to zero
    self._s = u'00'

    nts = len(ts)
    if nts < 3:                         # too short for any time expression
        return 0

    nh = 2 if ellyChar.isDigit(ts[1]) else 1    # count of hour digits

    if ts[nh] != ':':                   # no colon: hour must come with AM or PM
        if nh == 2:
            hour = u''.join(ts[:nh])
            if hour > '12':             # 2-digit hour out of 12-hour range
                return 0
        else:
            hour = ts[0]
            if hour == '0':             # 1-digit hour cannot be zero
                return 0
        nap = self._findAMorPM(ts[nh:])
        if nap == 0:                    # short expression needs AM or PM
            return 0
        self._hr = int(hour)
        return nh + nap

    self._hr = int(u''.join(ts[:nh]))   # hour in front of colon
    if self._hr >= 24:
        return 0

    at = nh + 1                         # index of first minute digit
    if nts - at < 2:
        return 0
    hi , lo = ts[at] , ts[at + 1]
    if not (ellyChar.isDigit(hi) and ellyChar.isDigit(lo)):
        return 0
    if hi > '5':                        # minutes run 00 to 59
        return 0
    self._m = u''.join(ts[at:at + 2])
    at += 2

    left = nts - at                     # chars remaining after minutes
    if left > 2 and ts[at] == ':':      # optional seconds
        hi , lo = ts[at + 1] , ts[at + 2]
        if not (ellyChar.isDigit(hi) and ellyChar.isDigit(lo)):
            return 0
        if hi > '5':
            return 0
        if left > 3 and ellyChar.isDigit(ts[at + 3]):
            return 0                    # no third digit for seconds
        self._s = u''.join(ts[at + 1:at + 3])
        at += 3

    if at < nts and ellyChar.isDigit(ts[at]):
        return 0                        # a time cannot run into more digits
    return at
def getNext ( self ):

    """
    extract next sentence for Elly translation from input stream

    Chars are read one at a time from self.inp and collected in a list
    until a sentence break is decided. Along the way the method
      - discards leading runs of '.' or '-' (table delimiters),
      - maps plain quotes to left or right quotation marks for the
        bracketing check only (the original char is what is stored),
      - consults self.stpx for stop punctuation exceptions,
      - collapses a run of hyphens, keeping two in the sentence,
      - handles quotes and brackets right after stop punctuation,
      - handles ASCII and Unicode ellipsis and multiple stops,
      - avoids breaking after a number ending in a decimal point.
    A break on punctuation is taken only when more than one
    alphanumeric char has been collected (nAN > 1). NL and EOF always
    end the collection.

    arguments:
        self

    returns:
        list of chars for next sentence on success, None on empty stream
    """

#   print 'getNext'

    self.resetBracketing()
    inBrkt = False              # whether currently inside bracketing

    nspc = 0                    # set space count

    sent = [ ]                  # list buffer to fill

    x = self.inp.read()
    if x == SP:                 # skip a single leading space
        x = self.inp.read()

    if x == END:                # EOF check
        return None

    c  = END                    # reset
    lc = END

#   print 'x=' , '<' + x + '>' , ord(x)
    self.inp.unread(x,SP)       # put first char back to restore input
#   print '0 <<" , self.inp.buf

    # fill sentence buffer up to next stop punctuation in input

    nAN = 0                     # alphanumeric count in sentence

    while True:

        x = self.inp.read()     # next input char

        if x == END:            # handle any EOF
            break

#       print 'x=' , '<' + x + '>' , 'c=' , '<' + c + '>'
#       print 'sent=' , sent , 'nspc=' , nspc

        # check for table delimiters in text

        if len(sent) == 0:
#           print 'table'
#           print '1 <<' , self.inp.buf

            if x == u'.' or x == u'-':      # look for multiple '.' or '-'
                while True:                 # scan up to end of current buffering
                    y = self.inp.read()     #
                    if y != x and y != SP:  # no more delimiter chars or spaces?
                        self.inp.unread(y)  # if so, done
                        break               #
                continue                    # ignore everything seen so far

        ####################################################
        # accumulate chars and count alphanumeric and spaces
        ####################################################

        lc = c                  # previous char
        c  = x                  # current char as read
        nc = self.inp.peek()    # next char, not yet consumed
        if ellyChar.isWhiteSpace(nc): nc = SP

#       print 'lc=' , '<' + lc + '>, nc=' , '<' + nc + '>'
        if lc == SP or lc == END:           # normalize chars for proper bracketing
            if x == SQuo:                   #
                x = LSQm                    # a SQuo preceded by a space becomes LSQm
            elif x == DQuo:                 #
                x = LDQm                    # a DQuo preceded by a space becomes LDQm
        if nc == SP or nc == END:           #
            if x == SQuo:                   # a SQuo followed by a space becomes RSQm
                x = RSQm                    #
            elif x == DQuo:                 # a DQuo followed by a space becomes RDQm
                x = RDQm                    #
        elif not ellyChar.isLetterOrDigit(nc):
            if x == SQuo:                   # a SQuo followed by nonalphanumeric becomes RSQm
                x = RSQm                    #
            elif x == DQuo:                 # a DQuo followed by nonalphanumeric becomes RDQm
                x = RDQm                    #
        elif ellyChar.isWhiteSpace(c) and inBrkt:
            nspc += 1                       # count spaces seen inside bracketing

        svBrkt = inBrkt
        inBrkt = self.checkBracketing(x)    # do bracketing check with modified chars
        if svBrkt and not inBrkt: nspc = 0  # leaving bracketing resets space count

#       print 'lc=' , '<' + lc + '>' , 'bracketing x=' , '<' + x + '>' , inBrkt

        sent.append(c)                      # put original char into sentence buffer
        if ellyChar.isLetterOrDigit(c):
            nAN += 1
            continue                        # if alphanumeric, just add to sentence

        if c == SP:
            continue                        # if space, just add to sentence

        # NL will break a sentence

        if c == NL:
            sent.pop()                      # remove from sentence chars
            break

        # char was not alphanumeric or space
        # look for stop punctuation exception

        cx = self.inp.preview()             # for context of match call

#       print '0 <<' , self.inp.buf

#       print 'sent=' , sent[:-1]
#       print 'punc=' , '<' + c + '>'
#       print 'next=' , cx
        if c in Stops and len(cx) > 0 and cx[0] == SP:
            if self.stpx.match(sent[:-1],c,cx):
#               print 'stop exception MATCH'
                if self.drop:
                    sent.pop()              # remove punctuation char from sentence
                    lc = SP                 # (lc is reassigned from c at the top of the loop)
                continue

#       print 'no stop exception MATCH for' , c

#       print '@1 <<' , self.inp.buf

        # handle any nonstandard punctuation

        exoticPunctuation.normalize(c,self.inp)

#       print '@2 <<' , self.inp.buf

        # check for dash

        if c == u'-':
            d = self.inp.read()
            if d == u'-':
#               print 'dash'
                while True:                 # consume the rest of a run of hyphens
                    d = self.inp.read()
                    if d != u'-': break
                sent.append(c)              # keep a second hyphen in the sentence
            self.inp.unread(d)              # put back first char after the hyphen(s)
            continue

        # check for sentence break on punctuation

#       print '@3 c=' , c , inBrkt

        if c in QUOs or c in RBs:

            # special check for single or double quotes or
            # bracketing, which can immediately follow stop
            # punctuation for current sentence

#           print 'bracketing c=' , c , ord(c) , inBrkt , 'at' , len(sent)

            if not inBrkt:
#               print sent , 'so far'
                z = self.inp.read()
                if self.shortBracketing(sent,z):
                    break
                self.inp.unread(z)
#               print 'z=' , '[' + z + ']' , 'lc=' , '[' + lc + ']'
                # NOTE(review): 'and' binds tighter than 'or' here, so EOF alone
                # satisfies the test regardless of lc - confirm this is intended
                if z == END or ellyChar.isWhiteSpace(z) and lc in Stops:
                    if nAN > 1:
                        break
            elif c in QUOs and lc in Stops:
#               print 'stop+quote'
                z = self.inp.read()
                if z in RBs:
                    sent.append(z)
                    y = self.inp.read()
                    if y in Stops:
                        sent.append(y)
                    elif not ellyChar.isWhiteSpace(y):
                        self.inp.unread(y)
                    inBrkt = False
                    break
                elif z in QUOs:
#                   print 'stop+quote+quote'
                    sent.append(z)
                    inBrkt = False
                    break
                self.inp.unread(z)
#           print 'continue'
            continue

        elif not c in Stops:
            continue

        else:
#           print 'check stopping!'
            d = self.inp.read()             # char right after the stop punctuation
#           print '@3 <<' , self.inp.buf

            if d == None: d = u'!'
#           print 'stop=' , '<' + c + '> <' + d + '>'

#           print 'ellipsis check'
            if c == u'.' and c == d:
                if self.inp.peek() != c:    # look for third '.' in ellipsis
                    self.inp.unread(d)      # if none, keep only first '.'
                else:
                    self.inp.skip()         # found ellipsis
                    sent.append(d)          # complete it in sentence buffer
                    sent.append(d)          #
                    x = self.inp.peek()     # look at char after ellipsis
                    if ellyChar.isCombining(x):
                        sent.append(SP)     # if part of token, put in space as separator
                continue

            if c == ELLP:
#               print 'found Unicode ellipsis, d=' , d
                if ellyChar.isUpperCaseLetter(d):
                    self.inp.unread(d)      # super special case of bad punctuation
                    self.inp.unread(' ')    # put in implied period and space
                    self.inp.unread('.')    #

            # special check for multiple stops

#           print 'next char d=' , d , ord(d) if d != END else 'NONE'
            if d in Stops:
                while True:                 # consume the rest of a run of stops
                    d = self.inp.read()
                    if not d in Stops: break
                self.inp.unread(d)
                if not ellyChar.isWhiteSpace(d):
                    d = SP                  # make rightside context for stop

            # special check for blank or null after stops

            elif d != END and not ellyChar.isWhiteSpace(d):
                if self.shortBracketing(sent,d): break
                if d in self._cl and self._cl[d] == 1:
                    dn = self.inp.peek()
                    if ellyChar.isWhiteSpace(dn):
                        sent.append(d)
                        break
                self.inp.unread(d)
#               print 'no space after punc'
                continue

            # if no match for lookahead, put back

            elif d != END:
#               print 'unread d=' , d
                self.inp.unread(d)

#           print 'possible stop'

            # check special case of number ending in decimal point

            if c == '.':
                ixb = len(sent) - 2         # index of char just before the '.'
                ixn = ixb + 1
                cxn = ''
#               print 'sent=' , sent
#               print 'ixn=' ,ixn
                while ixn > 0:              # scan back over digits before the '.'
                    ixn -= 1
                    cxn = sent[ixn]
#                   print 'cxn=' , cxn
                    if not ellyChar.isDigit(cxn): break
#               print 'break: ixn=' , ixn , 'ixb=' , ixb
                if ixn < ixb and cxn in [ ' ' , '-' , '+' ]:
                    prvw = self.inp.preview()
#                   print 'prvw=' , prvw
                    if len(prvw) > 1 and not ellyChar.isUpperCaseLetter(prvw[1]):
                        continue

            # final check: is sentence long enough?

            if inBrkt:
#               print 'c=' , '<' + c + '>' , 'd=' , '<' + d + '>' , 'preview=' , self.inp.preview()
#               print 'nspc=' , nspc
                if c in [ ':' , ';' ] or nspc < 3:
                    sent.append(d)
#                   print 'add' , '<' + d + '> to sentence'
#                   print 'sent=' , sent
                    self.inp.skip()
                    nspc -= 1
                    continue

#           print '@4 <<' , self.inp.buf
            cx = self.inp.peek()
            if cx == None: cx = u'!!'

#           print 'sentence break: next=' , '<' + cx + '>' , len(cx) , sent
#           print 'nAN=' , nAN , 'inBrkt=' , inBrkt
            if nAN > 1:                     # break only with more than one alphanumeric
                break

    if sent == [ u'\u2026' ]:               # special case of sentence
        return list("-.-")                  # with lone ellipsis
    elif len(sent) > 0 or self.last != END:
        return sent
    else:
        return None
def rewrite ( self , ts ):

    """
    check for date at current text position and rewrite if found

    Despite the wording above, the pattern handled here is a time of
    day: digits matched by _matchN(), an optional AM or PM marker, and
    an optional time zone. On a match the chars are replaced in place
    at the front of ts by HH:MM:SS followed by the zone, and the length
    of that replacement is saved in self._rwl.

    arguments:
        self  -
        ts    - text stream as list of chars

    returns:
        True on any rewriting, False otherwise
    """

    lts = len(ts)
    if lts < Lm: return False

    tz = self._tz                       # default
    self._xm = ''                       # default
    self._m = u'00'                     # defaults
    self._s = u'00'

    c = ts[0]                           # first char
    if not ellyChar.isDigit(c):
        return False                    # time can never start with a letter
                                        # because of number transforms

    k = self._matchN(ts)
    if k == 0:
        return False

    k += self._findAMorPM(ts[k:])
    if self._xm == 'p' and self._hr < 12:       # convert to 24-hour time
        self._hr += 12
    elif self._xm == 'a' and self._hr == 12:    #
        self._hr = 0

    t = ts[k:]                          # remainder of text
    dk = 0                              # skip count
    ns = 0                              # space count
    if len(t) > 0:                      # look for time zone
        if t[0] == ' ':                 # skip any initial space
            dk += 1
            ns = 1
        nz = self.get(t[dk:])           # extract next token from input
        dk += nz

        # get() can return 0 without resetting self.string (empty input
        # or rejected substring), so only trust self.string after an
        # actual scan; otherwise a zone from an earlier call would leak in

        ss = self.string if nz > 0 else u''
        if ss in Zn:                    # match to known time zone?
            tz = ss
        elif ns == 0 and ss == u'z':    # military ZULU time
            tz = u'gmt'                 # translate
        else:
            dk = 0                      # no match
        k += dk                         # update match count
        t = t[dk:]                      # advance scan

    if len(t) > 0 and ellyChar.isLetterOrDigit(t[0]): return False

    del ts[:k]                          # strip matched substring to be rewritten

    r = str(self._hr).zfill(2) + u':' + self._m + u':' + self._s + tz
    ts[0:0] = r                         # do rewriting at front of stream
    self._rwl = len(r)

    return True
def _matchN ( self , ts ):

    """
    apply logic for numeric only date recognition

    Scans up to Ln chars of digits separated by one or two '/' or '-'
    and splits them into components. On success month and day are
    saved as 2-char lists in self._mo and self._dy, and a year, when
    present, as a 4-char list in self._yr.

    arguments:
        self  -
        ts    - text stream as list of chars

    returns:
        total number of chars matched
    """

    lts = len(ts)
    if lts < Lm: return 0               # shortest date is 0/0
    if not ellyChar.isDigit(ts[0]): return 0

    n = Ln
    if n > lts: n = lts

    ss = [ ]                            # substring to compare
    ns = 0                              # slash count

    k = 0
    while k < n:
        c = ts[k]
        if c == '/':
            ns += 1
        elif c == '-':
            ns += 1
            c = '/'                     # treat hyphen as slash
        elif not ellyChar.isDigit(c):
            break
        ss.append(c)
        k += 1

    if k < Lm: return 0
    if ns != 1 and ns != 2: return 0

    if k < lts and ellyChar.isLetterOrDigit(ts[k]): return 0

    dt = ''.join(ss).split('/')
    dt0 = dt.pop(0)                     # get first two date components
    dt1 = dt.pop(0)                     #
    if len(dt0) == 4 or dt0[0] == '0':
        if ns == 1: return 0            #
        dt.append(dt0)                  # put first component at end if it looks like year
        dt0 = dt1                       # move month up
        # NOTE(review): pop() takes back the component just appended, so a
        # 4-digit first component always fails the day range check below;
        # pop(0) may have been intended - confirm before changing
        dt1 = dt.pop()                  # move date up

    # adjacent or trailing separators leave an empty component, which
    # int() rejects; treat that as no match instead of raising

    try:
        m = int(dt0)
        d = int(dt1)
    except ValueError:
        return 0
    if m < 1 or m > 12: return 0        # check validity of month
    if d < 1 or d > 31: return 0        # check validity of day

    if ns == 2:
        y = dt.pop(0)                   # if there is a year, process it also
        ly = len(y)
        if ly == 4:                     # 4-digit year?
            s = y[0]
            if s != '1' and s != '2': return 0
            yls = list(y)
        elif ly == 2:                   # 2-digit year gets a century prefix
            ix = 0 if y > self.ycur else 1
            yls = list(self.cent[ix] + y)
        else:
            return 0                    # fail on any other number of year digits
        self._yr = yls                  # handle year

    self._mo = list(dt0.zfill(2))       # handle month
    self._dy = list(dt1.zfill(2))       # handle day

    return k
def _leftside ( stb , txt , sta ):

    """
    translate the conditions of a clause into a predicate list

    Conditions are read left to right from the text. A condition
    starting with n, p, or c is a comparison (< or >) against a number.
    A condition starting with l or r is either a semantic feature check
    in [ ] or a concept check in ( ). Feature checks also record the
    feature set id and the feature specification in the status object.

    arguments:
        stb   - symbol table
        txt   - string input for left side of single clause
        sta   - for status reporting

    returns:
        predicate list on success, None otherwise
    """

    preds = [ ]
    rest = txt.rstrip()

    while len(rest) > 0:

        rest = rest.lstrip()
        if len(rest) <= 1:
            _err('malformed conditions for clause')
            return None

        side , rest = rest[0] , rest[1:]

        if side in [ 'n' , 'p' , 'c' ]:         # numeric comparison

            sns , rest = rest[0] , rest[1:]
            if sns != '<' and sns != '>':
                _err('invalid comparison in clause condition=' + sns)
                return None

            more = (sns == '>')
            if side == 'n':
                op = semanticCommand.Cngt if more else semanticCommand.Cnlt
            elif side == 'p':
                op = semanticCommand.Cpgt if more else semanticCommand.Cplt
            else:
                op = semanticCommand.Ccgt if more else semanticCommand.Cclt

            nd = 0                              # count leading digits
            while nd < len(rest) and ellyChar.isDigit(rest[nd]):
                nd += 1
            if nd == 0:
                _err('no token count for condition')
                return None

            count = int(rest[:nd])
            rest = rest[nd:]
            preds.append([ op , count ])
            continue

        if side != 'l' and side != 'r':
            _err('invalid side for test=' + side)
            return None

        onRight = (side == 'r')
        k = 0
        start = rest[0]

        if start == '[':                        # semantic feature check

            k = rest.find(']')                  # look for closing bracket
            if k < 0:
                return _err('incomplete semantic features to check')
            spec = rest[:k+1]                   # feature string with brackets

            try:
                f = featureSpecification.FeatureSpecification(stb,spec,semantic=True)
            except ellyException.FormatFailure:
                return _err('bad semantic features to check')

            # all feature checks on one side must use the same feature set

            if onRight:
                if sta.id[rS] == None:
                    sta.id[rS] = f.id
                elif f.id != sta.id[rS]:
                    _err('inconsistency: right features=' + spec)
                    return None
                op = semanticCommand.Crhtf
                sta.rht = f
            else:
                if sta.id[lS] == None:
                    sta.id[lS] = f.id
                elif f.id != sta.id[lS]:
                    _err('inconsistency: left features=' + spec)
                    return None
                op = semanticCommand.Clftf
                sta.lft = f

            preds.append([ op , ellyBits.join(f.positive,f.negative) ])

        elif start == '(':                      # semantic concept check

            k = rest.find(')')                  # look for closing parenthesis
            if k < 0:
                return _err('incomplete concept check')
            names = rest[1:k].strip().upper().split(',')    # disjunctive concepts
            op = semanticCommand.Crhtc if onRight else semanticCommand.Clftc
            preds.append([ op , names ])

        else:
            _err('unknown test in clause=' + side + rest)
            return None

        rest = rest[k+1:].lstrip()              # advance to next predicate

    return preds
def _getRaw ( self ):

    """
    take the next raw token off the front of the buffer

    The token length depends on how the buffer starts: a doubled
    hyphen or a tripled period is taken whole, a single hyphen or a
    plus runs up to the next separator, a single period or apostrophe
    stands alone, and anything else runs up to the next separator,
    skipping hyphens and commas that are directly followed by a digit.

    arguments:
        self

    returns:
        EllyToken on success, None otherwise
    """

    self.skipSpaces()
    size = len(self.buffer)
    if size == 0:
        return None

    ## work out how many chars belong to the next token

    head = self.buffer[0]

    if self.match(MIN):                         # hyphen, or dash when doubled
        count = 2 if self.match(DSH) else self.findSeparator(1)
    elif self.match(PLS):                       # Elly prefix
        count = self.findSeparator(1)
    elif self.match(DOT):                       # period, or ellipsis when tripled
        count = 3 if self.match(ELP) else 1
    elif head == APO:
        count = 1
    else:
        count = self.findSeparator()
        if count < 0:
            count = size                        # no separator, go up to end of buffer
        elif count == 0:
            count = 1                           # immediate break in scanning
        else:
            while count < size:                 # check separator and what follows it
                sep = self.buffer[count]
                if sep not in ( MIN , COM ):
                    break                       # only hyphen or comma needs a closer look
                nxt = count + 1
                if nxt >= size or not ellyChar.isDigit(self.buffer[nxt]):
                    break                       # real break when NOT followed by digit
                count = self.findSeparator(count+2)
                if count < 0:
                    count = size

    ## an undelimited token takes the rest of the buffer

    if count < 0:
        count = size

    chars = self.extract(count)                 # get the chars of the token

    ## special check for hyphen next in buffer after extraction

    if self.match(MIN):
        self.skip()                             # take the hyphen
        if self.atSpace():
            chars.append(MIN)                   # trailing hyphen stays with token
            count += 1
        else:
            if self.match(MIN):
                self.skip()                     # double hyphen = dash
                self.prepend(ellyChar.SPC)      # put back space after dash
                self.prepend(MIN)               # put back second hyphen
            else:
                self.prepend(ellyChar.SPC)      # put back a space after single hyphen
            self.prepend(MIN)                   # put back first hyphen
            self.prepend(ellyChar.SPC)          # extra space before hyphen or dash

    ## make token from working area

    token = ellyToken.EllyToken(u''.join(chars))

    ## strip off trailing non-token chars from token and put back in buffer

    last = count - 1
    while last > 0:
        ch = chars[last]
        if ellyChar.isLetterOrDigit(ch) or ch in ( MIN , PLS , UNS ):
            break
        if ch in ( APO , APX ) and chars[last - 1] == 's':
            break                               # apostrophe after 's' stays
        self.prepend(ch)
        last -= 1

    excess = count - (last + 1)                 # how many chars were pushed back
    if excess > 0:
        token.shortenBy(excess,both=True)

    return token
def _aDay(self, ts):
    """
    parse a day number

    Accepts a 1- or 2-digit day no greater than 31, optionally followed
    by a hyphen and a second day number, or by an ordinal indicator
    (st, nd, rd, th). The chars of the day, including any hyphenated
    part, are appended to self._dy. When the text does not start with a
    digit, self.rewriteNumber() is tried first.

    arguments:
        self  -
        ts    - text stream as list of chars

    returns:
        total number of chars matched
    """

    if len(ts) == 0:
        return 0

    k = 0                   # running match count
    x = ts[0]               # last digit of day
    y = ''                  # first digit of a 2-digit day, otherwise empty
    if not ellyChar.isDigit(x):
        if not self.rewriteNumber(ts):
            return 0
        else:
            x = ts[0]       # stream now starts with rewritten number

    ls = len(ts)
    if ls == 1:
        if x == '0':
            return 0        # cannot have 0 as day
        self._dy.append(x)  # accept at end of input as possible date
        return 1
    elif not ellyChar.isDigit(ts[1]):
        k = 1
    elif x > '3':           # reject first digit bigger than '3'
        return 0
    else:
        y = x               # save first digit
        x = ts[1]           # this will be second digit
        if y == '3' and x > '1':    # reject day > 31
            return 0
        k = 2

    ls -= k                 # count of chars after day number
    if k == 2:
        self._dy.append(y)
    self._dy.append(x)

    if ls == 0:
        return k
    z = ts[k]
    if ellyChar.isDigit(z):
        return 0            # reject 3-digit day
    if z == '.' and ls > 1 and ellyChar.isDigit(ts[k + 1]):
        return 0            # reject digit after decimal point

    if ls >= 2:             # at least 2 chars to check after day number
        if z == u'-':
            if ellyChar.isDigit(ts[k + 1]):     # hyphen, digit match
                self._dy.append(z)
                self._dy.append(ts[k + 1])
                if ls == 2:                     # only 2 chars to check?
                    k += 2                      # add hyphen, digit to day
                elif ls == 3:                   # only 3 chars to check?
                    if not ellyChar.isLetterOrDigit(ts[k + 2]):
                        k += 2                  # add hyphen, digit to day
                    elif ellyChar.isDigit(ts[k + 2]):   # found second digit to add?
                        self._dy.append(ts[k + 2])      # if so, add to day string
                        k += 3
                elif not ellyChar.isLetterOrDigit(ts[k + 2]):   # more than 3 chars to check?
                    k += 2                      # if not, we are done
                elif ellyChar.isDigit(ts[k + 2]):       # check for second digit
                    if ls > 3 and ellyChar.isDigit(ts[k + 3]):
                        return 0
                    if ts[k + 1] > '3':         # check for valid day
                        return 0
                    if ts[k + 1] == '3' and ts[k + 2] > '1':
                        return 0
                    self._dy.append(ts[k + 2])
                    k += 3
                else:
                    return 0    # no other hyphen allowed in day
            else:
                return 0        #

    t = ts[k:]
    if len(t) == 0 or not ellyChar.isLetterOrDigit(t[0]):
        return k
    if ellyChar.isDigit(t[0]) or len(t) < 2:
        return 0

    # a letter follows: it must start the right ordinal indicator

    sx = t[0].lower() + t[1].lower()
    if y == '1':            # 10 through 19 all take 'th', including 11, 12, 13
        want = 'th'
    elif x == '1':
        want = 'st'
    elif x == '2':
        want = 'nd'
    elif x == '3':
        want = 'rd'
    else:
        want = 'th'         # default ordinal indicator
    if sx != want:
        return 0

    k += 2
    if len(ts) == k:        # check next char in stream
        return k            # if none, match succeeds
    elif ellyChar.isLetterOrDigit(ts[k]):
        return 0            # otherwise, match fails if next char is alphanumeric
    else:
        return k            # otherwise succeed
def build(self, inp):
    """
    build tree logic from definition reader input

    Each input line is split on single spaces after stripping and
    lowercasing. At least four elements are required: the affix comes
    first and the main action last, or second to last when a fifth
    element (an affix modifier) is present. The affix chars are entered
    into the tree rooted at self.indx, and the final node gets its
    match condition and an Action. Errors are tallied and reported
    together after all input has been read.

    arguments:
        self  -
        inp   - definition text for logic

    exceptions:
        TableFailure on error
    """

    if inp == None: return

    nerr = 0                        # error count

    # read in affixes and associated actions

    while True:

        line = inp.readline()       # next input line
        if line == '':              # check for EOF
            break

        modf = ''
        elem = line.strip().lower().split(' ')
        le = len(elem)
        if le < 4:
            nerr += 1
            print("** affix error: incomplete input", file=sys.stderr)
            print("* at: [", line, "]", file=sys.stderr)
            continue                # skip incomplete line
        if le > 4:                  # affix mod specified?
            modf = elem.pop()       # if so, get it

        do = elem.pop()             # note main action

        # get affix within definition line

        aff = list(elem.pop(0))     # affix as list of chars

        # check for proper form

        aff = self.sequence(aff)    # backward or forward matching?
        c = aff[0]                  # get first char to compare with
        aff = aff[1:]
        if (not ellyChar.isLetter(c)
                and c != '+'):      # affix must start with letter or '+'
            nerr += 1
            print("** affix error: must start with letter or '+'",
                  file=sys.stderr)
            print("* at: [", line, "]", file=sys.stderr)
            continue                # ignore line

        if not c in self.indx:      # node not already in tree index?
            self.indx[c] = Node()   # add new node

        node = self.indx[c]
        for a in aff:               # now check each successive char in affix
            if a in node.contn:
                node = node.contn[a]    # go to existing node if found
            else:
                new = Node()            # otherwise make new node
                node.contn[a] = new     # and insert into tree
                node = new              # and move down

        # at final node in tree logic

        # both numeric fields are converted inside the try so that a
        # malformed condition is reported the same way as a malformed
        # save count instead of escaping as an uncaught ValueError

        try:
            node.condn = int(elem.pop(0))   # condition for match
            nsave = 0 if len(elem) == 0 else int(elem.pop())
        except ValueError as e:
            print(e, file=sys.stderr)
            print("* at: [", line, "]", file=sys.stderr)
            continue                # ignore line

        resto = [Add]               # set to defaults
        recur = False               #

        mode = do[-1]               # kind of recursion
        rest = do[:-1]              # added chars to fill out root

        if mode == '?':
            node.condn = 1
            resto = [Fail]          # will generate fatal error
        elif ellyChar.isDigit(mode):
            nerr += 1
            print("* bad action mode=", mode, file=sys.stderr)
            continue
        else:
            if mode == ',':         # allow recursion?
                recur = True        # if so, change default
            if len(rest) == 1 and rest[0] == '&':
                resto = [RestorE]
            else:
                resto += list(rest)
            if self.addn != None:
                resto.insert(1, self.addn)  # insert AFTER first char of list

        # insert action

        node.actns = Action(self, nsave, resto, recur, modf)
        node.tag()

    if nerr > 0:
        print("**", nerr, "affix errors in all", file=sys.stderr)
        raise ellyException.TableFailure
def _matchN(self, ts):
    """
    apply logic for numeric only date recognition

    Scans up to Ln chars of digits separated by one or two '/' or '-'
    and splits them into components. On success month and day are
    saved as 2-char lists in self._mo and self._dy, and a year, when
    present, as a 4-char list in self._yr.

    arguments:
        self  -
        ts    - text stream as list of chars

    returns:
        total number of chars matched
    """

    lts = len(ts)
    if lts < Lm:
        return 0                # shortest date is 0/0
    if not ellyChar.isDigit(ts[0]):
        return 0

    n = Ln
    if n > lts:
        n = lts

    ss = []                     # substring to compare
    ns = 0                      # slash count

    k = 0
    while k < n:
        c = ts[k]
        if c == '/':
            ns += 1
        elif c == '-':
            ns += 1
            c = '/'             # treat hyphen as slash
        elif not ellyChar.isDigit(c):
            break
        ss.append(c)
        k += 1

    if k < Lm:
        return 0
    if ns != 1 and ns != 2:
        return 0

    if k < lts and ellyChar.isLetterOrDigit(ts[k]):
        return 0

    dt = ''.join(ss).split('/')
    dt0 = dt.pop(0)             # get first two date components
    dt1 = dt.pop(0)             #
    if len(dt0) == 4 or dt0[0] == '0':
        if ns == 1:
            return 0
        dt.append(dt0)          # put first component at end if it looks like year
        dt0 = dt1               # move month up
        # NOTE(review): pop() takes back the component just appended, so a
        # 4-digit first component always fails the day range check below;
        # pop(0) may have been intended - confirm before changing
        dt1 = dt.pop()          # move date up

    # adjacent or trailing separators leave an empty component; after
    # the swap above that can be the month as well as the day, so both
    # conversions are guarded and failure means no match

    try:
        m = int(dt0)
        d = int(dt1)
    except ValueError:
        return 0
    if m < 1 or m > 12:
        return 0                # check validity of month
    if d < 1 or d > 31:
        return 0                # check validity of day

    if ns == 2:
        y = dt.pop(0)           # if there is a year, process it also
        ly = len(y)
        if ly == 4:             # 4-digit year?
            s = y[0]
            if s != '1' and s != '2':
                return 0
            yls = list(y)
        elif ly == 2:           # 2-digit year gets a century prefix
            ix = 0 if y > self.ycur else 1
            yls = list(self.cent[ix] + y)
        else:
            return 0            # fail on any other number of year digits
        self._yr = yls          # handle year

    self._mo = list(dt0.zfill(2))   # handle month
    self._dy = list(dt1.zfill(2))   # handle day

    return k
def match ( patn , text , offs=0 , limt=None , nsps=0 ):

    """
    compare a pattern with wildcards to input text

    literal pattern chars are compared against text one at a time (also
    accepting the lowercased text char); on a mismatch the unmatched pattern
    element is interpreted as a wildcard or marker; when that fails too,
    the most recent unwinding record is used to back up and retry

    arguments:
        patn  - pattern to matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit for any matching (defaults to len(text))
        nsps  - number of spaces to match in pattern

    returns:
        bindings if match is successful, None otherwise; the returned list
        has the text offset reached by the match as element 0, followed by
        consolidated [ start , end ] binding pairs
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
            nsps   - saved count of spaces matched
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - of winding
            """
            self.kind = kind
            self.count = 0
            self.pats = 0
            self.txts = 0
            self.bnds = 1
            self.nsps = 0

        def __unicode__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self

            returns:
                attributes as array
            """
            # NOTE(review): unicode() is a Python 2 builtin; under Python 3
            # calling this method would raise NameError - confirm target version
            return ( '[kd=' + unicode(self.kind) +
                     ',ct=' + unicode(self.count) +
                     ',pa=' + unicode(self.pats) +
                     ',tx=' + unicode(self.txts) +
                     ',bd=' + unicode(self.bnds) +
                     ',ns=' + unicode(self.nsps) + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # four private functions using local variables of match() defined just above
    # (they read offs, mp, mbi, unj, mbd, unw through closure, so they always
    # see the current values held by the main loop)

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame

        the frame is set to start one char before the current offset, since
        the main loop has already advanced past the char being bound; frames
        already in mbd are reused in place

        arguments:
            ns  - optional initial span of text for binding

        returns:
            binding frame
        """
        os = offs - 1
        if ns == None: ns = 1     # by default, binding is to 1 char
        if mbi == len(mbd):       # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]             # next available record
        bf[0] = os                # set binding to range of chars
        bf[1] = os + ns           #
        return bf

    def _modify ( ):
        """
        set special tag for binding

        appends a third element to the current frame; the consolidation
        step at the end of match() treats frames longer than 2 as ones to
        keep separate from their neighbors

        arguments:
        """
        mbd[mbi].append(None)

    def _mark ( kind , nsp ):
        """
        set up for backing up pattern match

        records the current pattern index, text offset, binding index, and
        space count so that the main loop can restore them on failure

        arguments:
            kind  - 0=optional 1=* wildcard
            nsp   - number of spaces in pattern still unmatched

        returns:
            unwinding frame
        """
        if unj == len(unw):       # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]             # next available
        uf.kind = kind
        uf.count = 1
        uf.pats = mp
        uf.txts = offs
        uf.bnds = mbi
        uf.nsps = nsp
        return uf

    def _span ( typw , nsp=0 ):
        """
        count chars available for wildcard match

        arguments:
            typw  - wildcard
            nsp   - spaces to be matched in pattern

        returns:
            non-negative count if any match possible, otherwise -1
        """
        k = minMatch(patn[mp:])   # calculate min char count to match rest of pattern

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findExtendedBreak(text,offs,nsp)
        mx -= k                   # max span reduced by exclusion
        if mx < 0: return -1      # cannot match if max span < 0

        tfn = Matching[typw]      # matchup function for wildcard type

        nm = 0
        for i in range(mx):
            c = text[offs+i]      # next char in text from offset
            if not tfn(c): break  # stop when it fails to match
            nm += 1

        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False  # successful pattern match yet?

    if limt == None: limt = len(text)

    mp = 0           # pattern index
    ml = len(patn)   # pattern match limit
    last = ''        # last text char consumed, '' when text is exhausted

    while True:

        ## literally match as many next chars as possible

        while mp < ml:
            if offs >= limt:
                last = ''
            elif limt == 0:       # can be reached only when offs is negative
                break
            else:
                last = text[offs]
                offs += 1
            mc = patn[mp]
            if mc != last:
                if mc != last.lower():
                    # a hyphen in pattern may also match ' - ' in text
                    if mc == Hyphn and last == ' ' and limt - offs > 2:
                        if text[offs] != Hyphn or text[offs+1] != ' ':
                            break
                        offs += 2
                    else:
                        break
            mp += 1

        ## check whether mismatch is due to special pattern char

        if mp >= ml:        # past end of pattern?
            matched = True  # if so, match is made
            break

        tc = patn[mp]       # otherwise, get unmatched pattern element
        mp += 1             #

        if tc == cALL:      # a * wildcard?
            if last != '': offs -= 1   # give back char consumed by literal scan

            # NOTE(review): _span() can return -1 and this branch does not
            # test for it before using nm as a span - confirm this is intended
            nm = _span(cALL,nsps)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1     # get new binding record
            bf[0] = offs                 # bind from current offset
            offs += nm                   # move offset past end of span
            bf[1] = offs                 # bind to new offset
            uf = _mark(1,nsps); unj += 1 # get new unwinding record
            uf.count = nm                # can back up this many times on mismatch
            continue

        elif tc == cEND:    # end specification
            if last == '':
                continue
            elif last == '-':
                offs -= 1
                continue
            elif last in [ '.' , ',' ]:
                if offs == limt:         # punctuation was final char in range
                    offs -= 1
                    continue
                txc = text[offs]         # char following the punctuation
                if ellyChar.isWhiteSpace(txc) or txc in Trmls or txc in Grk:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' , ellyChar.HYPH ]:
                offs -= 1
                continue

        elif tc == cANY:    # alphanumeric wildcard?
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cCAN:    # nonalphanumeric wildcard?
            if last != ellyChar.AMP:
                if last == '' or not ellyChar.isLetterOrDigit(last):
                    _bind(); mbi += 1
                    continue

        elif tc == cDIG:    # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF:    # letter wildcard?
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cUPR:    # uppercase letter wildcard?
            if last != '' and ellyChar.isUpperCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cLWR:    # lowercase letter wildcard?
            if last != '' and ellyChar.isLowerCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC:    # space wildcard?
            if last != '' and ellyChar.isWhiteSpace(last):
                nsps -= 1                # one fewer space left to match
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cAPO:    # apostrophe wildcard?
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS:    # start of optional sequence
            mf = _bind(0); mbi += 1      # dummy record to block
            mf[0] = -1                   #   later binding consolidation
            if last != '':
                offs -= 1                # try for rematch
            m = mp                       # find corresponding EOS
            while m < ml:                #
                if patn[m] == cEOS: break
                m += 1
            else:                        # no EOS?
                m -= 1                   # if so, pretend there is one anyway
            uf = _mark(0,nsps); unj += 1 # for unwinding on any later match failure
            uf.pats = m + 1              # i.e. one char past next EOS
            uf.txts = offs               # start of text before optional match
            continue

        elif tc == cEOS:    # end of optional sequence
            if last != '':
                offs -= 1                # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:   # spanning wildcards
            if last != '':               # still more to match?
                offs -= 1

            nm = _span(tc,nsps)          # maximum match possible

            if nm == 0:                             # compensate for findExtendedBreak peculiarity
                if offs + 1 < limt and mp < ml:     # with closing ] or ) to be matched in pattern
                    if patn[mp] in Enc:             # from text input
                        nm += 1

            if nm >= 1:
                bf = _bind(nm); mbi += 1
                bf[0] = offs             # bind from current offset
                offs += nm               # move offset past end of span
                bf[1] = offs             # bind to new offset
                uf = _mark(1,nsps); unj += 1
                uf.count = nm - 1        # at least one char must be matched
                last = text[offs] if offs < limt else ''
                continue

        elif tc == '':
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True           # successful match
                break

        ## match failure: rewind to last match branch
        ##

        while unj > 0:               # try unwinding, if possible
            uf = unw[unj-1]          # get most recent unwinding record
            if uf.count <= 0:        # if available count is used up,
                unj -= 1             # go to next unwinding record
                continue
            uf.count -= 1            # decrement available count

            uf.txts -= uf.kind       # back up one char for scanning text input
            mp = uf.pats             # unwind pattern pointer
            offs = uf.txts           # unwind text input
            mbi = uf.bnds            # restore binding
            mbd[mbi-1][1] -= uf.kind # reduce span of binding if for wildcard
            nsps = uf.nsps           #
            break
        else:
            break                    # quit if unwinding is exhausted

    ##
    ## clean up on match mode or on no match possible
    ##

    if not matched: return None     # no bindings

    ## consolidate contiguous bindings for subsequent substitutions

    # NOTE(review): this walks every frame in the binding stack without
    # reference to mbi, so frames left over from an unwound attempt look
    # like they would be included - confirm

    mbdo = mbd
    lb = -1               # binding reference
    lbd = [ 0 , -1 ]      # sentinel value, not real binding
    mbd = [ lbd ]         # initialize with new offset after matching
    mbdo.pop(0)           # ignore empty binding
    while len(mbdo) > 0:  #
        bd = mbdo.pop(0)  # get next binding
        if len(bd) > 2:   # tagged by _modify(): keep as its own binding
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb = -1
        elif bd[0] < 0:   # check for optional match indicator here
            lb = -1       # if so, drop from new consolidated bindings
        elif lb == bd[0]: # check for binding continuous with previous
            lb = bd[1]    #
            lbd[1] = lb   # if so, combine with previous binding
        else:             #
            mbd.append(bd)  # otherwise, add new binding
            lbd = bd        #
            lb = bd[1]      #

    mbd[0] = offs         # replace start of bindings with text length matched

    return mbd            # consolidated bindings plus new offset
def match ( patn , text , offs=0 , limt=None ):

    """
    compare a pattern with wildcards to input text

    literal pattern chars are compared against lowercased text chars one at
    a time; on a mismatch the unmatched pattern element is interpreted as a
    wildcard or marker; when that fails too, the most recent unwinding
    record is used to back up and retry

    arguments:
        patn  - pattern to matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit of matching (defaults to len(text))

    returns:
        bindings if match is successful, None otherwise; the returned list
        has the text offset reached by the match as element 0, followed by
        consolidated [ start , end ] binding pairs
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - of winding
            """
            self.kind = kind
            self.count = 0
            self.pats = 0
            self.txts = 0
            self.bnds = 1

        def __unicode__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self

            returns:
                attributes as array
            """
            # NOTE(review): unicode() is a Python 2 builtin; under Python 3
            # calling this method would raise NameError - confirm target version
            return ( '[kd=' + unicode(self.kind) +
                     ',ct=' + unicode(self.count) +
                     ',pa=' + unicode(self.pats) +
                     ',tx=' + unicode(self.txts) +
                     ',bd=' + unicode(self.bnds) + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # three private functions using local variables of match()
    # (they read offs, mp, mbi, unj, mbd, unw through closure, so they always
    # see the current values held by the main loop)

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame

        the frame is set to start one char before the current offset, since
        the main loop has already advanced past the char being bound; frames
        already in mbd are reused in place

        arguments:
            ns  - optional initial span of text for binding

        returns:
            binding frame
        """
        os = offs - 1
        if ns == None: ns = 1     # by default, binding is to 1 char
        if mbi == len(mbd):       # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]             # next available record
        bf[0] = os                # set binding to range of chars
        bf[1] = os + ns           #
        return bf

    def _modify ( ):
        """
        set special tag for binding

        appends a third element to the current frame; the consolidation
        step at the end of match() treats frames longer than 2 as ones to
        keep separate from their neighbors

        arguments:
        """
        mbd[mbi].append(None)

    def _mark ( kind ):
        """
        set up for backing up pattern match

        records the current pattern index, text offset, and binding index
        so that the main loop can restore them on failure

        arguments:
            kind  - 0=optional 1=* wildcard

        returns:
            unwinding frame
        """
        if unj == len(unw):       # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]             # next available
        uf.kind = kind
        uf.count = 1
        uf.pats = mp
        uf.txts = offs
        uf.bnds = mbi
        return uf

    def _span ( typw ):
        """
        count chars available for wildcard match

        arguments:
            typw  - wildcard

        returns:
            non-negative count if any match possible, otherwise -1
        """
        k = minMatch(patn[mp:])   # calculate min char count to match rest of pattern

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findBreak(text,offs) - k # max span reduced by exclusion
        if mx < 0: return -1      # cannot match if max span < 0

        tfn = Matching[typw]      # char type matching a wildcard

        nm = 0
        for i in range(mx):
            c = text[offs+i]      # next char in text from offset
            if not tfn(c): break  # stop when it fails to match
            nm += 1

        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False  # successful pattern match?

    if limt == None: limt = len(text)

    mp = 0           # pattern index
    ml = len(patn)   # pattern match limit

    while True:

        ## literally match as many next chars as possible

        while mp < ml:
            if offs >= limt:
                last = ''                    # text exhausted
            else:
                last = text[offs].lower()    # comparison ignores case of text
                offs += 1
            if patn[mp] != last: break
            mp += 1

        ## check whether mismatch is due to special pattern char

        if mp >= ml:        # past end of pattern?
            matched = True  # if so, match is made
            break

        tc = patn[mp]       # otherwise, get unmatched pattern element
        mp += 1             #

        if tc == cALL:      # a * wildcard?
            if last != '': offs -= 1   # give back char consumed by literal scan

            # NOTE(review): _span() can return -1 and this branch does not
            # test for it before using nm as a span - confirm this is intended
            nm = _span(cALL)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1  # get new binding record
            bf[0] = offs              # bind from current offset
            offs += nm                # move offset past end of span
            bf[1] = offs              # bind to new offset
            uf = _mark(1); unj += 1   # get new unwinding record
            uf.count = nm             # can back up this many times on mismatch
            continue

        elif tc == cEND:    # end specification
            if last == '':
                continue
            elif last in [ '.' , ',' , '-' ]:
                if offs == limt:      # punctuation was final char in range
                    offs -= 1
                    continue
                txc = text[offs]      # char following the punctuation
                if ellyChar.isWhiteSpace(txc) or txc in Trmls:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' ]:
                offs -= 1
                continue

        elif tc == cANY:    # alphanumeric wildcard?
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cDIG:    # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF:    # letter wildcard?
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC:    # space wildcard?
            if last != '' and ellyChar.isWhiteSpace(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cAPO:    # apostrophe wildcard?
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS:    # start of optional sequence
            mf = _bind(0); mbi += 1      # dummy record to block
            mf[0] = -1                   #   later binding consolidation
            if last != '':
                offs -= 1                # try for rematch
            m = mp                       # find corresponding EOS
            while m < ml:                #
                if patn[m] == cEOS: break
                m += 1
            else:                        # no EOS?
                m -= 1                   # if so, pretend there is one anyway
            uf = _mark(0); unj += 1      # for unwinding on any later match failure
            uf.pats = m + 1              # i.e. one char past next EOS
            uf.txts = offs               # start of text before optional match
            continue

        elif tc == cEOS:    # end of optional sequence
            if last != '':
                offs -= 1                # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:   # spanning wildcards
            if last != '':               # still more to match?
                offs -= 1
                nm = _span(tc)           # maximum match possible

                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs         # bind from current offset
                    offs += nm           # move offset past end of span
                    bf[1] = offs         # bind to new offset
                    uf = _mark(1); unj += 1
                    uf.count = nm - 1    # at least one char must be matched
                    continue

        elif tc == '':
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True           # successful match
                break

        ## match failure: rewind to last match branch

        while unj > 0:               # try unwinding, if possible
            uf = unw[unj-1]          # get most recent unwinding record
            if uf.count <= 0:        # if available count is used up,
                unj -= 1             # go to next unwinding record
                continue
            uf.count -= 1            # decrement available count

            uf.txts -= uf.kind       # back up one char for scanning text input
            mp = uf.pats             # unwind pattern pointer
            offs = uf.txts           # unwind text input
            mbi = uf.bnds            # restore binding
            mbd[mbi-1][1] -= uf.kind # reduce span of binding if for wildcard
            break
        else:
            break                    # quit if unwinding is exhausted

    ##
    ## clean up on match mode or on no match possible
    ##

    if not matched: return None     # no bindings

    ## consolidate contiguous bindings for subsequent substitutions

    # NOTE(review): this walks every frame in the binding stack without
    # reference to mbi, so frames left over from an unwound attempt look
    # like they would be included - confirm

    mbdo = mbd
    lb = -1               # binding reference
    lbd = [ 0 , -1 ]      # sentinel value, not real binding
    mbd = [ lbd ]         # initialize with new offset after matching
    mbdo.pop(0)           # ignore empty binding
    while len(mbdo) > 0:  #
        bd = mbdo.pop(0)  # get next binding
        if len(bd) > 2:   # tagged by _modify(): keep as its own binding
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb = -1
        elif bd[0] < 0:   # check for optional match indicator here
            lb = -1       # if so, drop from new consolidated bindings
        elif lb == bd[0]: # check for binding continuous with previous
            lb = bd[1]    #
            lbd[1] = lb   # if so, combine with previous binding
        else:             #
            mbd.append(bd)  # otherwise, add new binding
            lbd = bd        #
            lb = bd[1]      #

    mbd[0] = offs         # replace start of bindings with text length matched

    return mbd            # consolidated bindings plus new offset
def _aDay ( self , ts ):

    """
    parse a day number

    accepts a 1- or 2-digit day of month no greater than 31, optionally
    followed by its English ordinal suffix (1st, 2nd, 3rd, 4th, 11th, ...);
    if ts does not start with a digit, self.rewriteNumber(ts) is tried
    first and must succeed; matched digits are stored in self._dy

    arguments:
        self  -
        ts    - text stream as list of chars

    returns:
        total number of chars matched, 0 on no match
    """

    if len(ts) == 0:
        return 0

    k = 0                         # running match count
    x = ts[0]                     # last digit of day seen so far
    tens = ''                     # first digit of a 2-digit day, if any

    if not ellyChar.isDigit(x):
        if not self.rewriteNumber(ts):
            return 0
        x = ts[0]                 # reread after rewriting of ts

    if len(ts) == 1:
        self._dy[0] = x           # accept at end of input as possible date
        return 1
    elif not ellyChar.isDigit(ts[1]):
        k = 1                     # single-digit day
    elif x > '3':                 # reject first digit bigger than '3'
        return 0
    else:
        tens = x                  # save first digit
        x = ts[1]                 # this known to be second digit
        if tens == '3' and x > '1':   # reject day > 31
            return 0
        lr = len(ts) - 2          # how many chars after possible date
        if lr > 0:
            z = ts[2]
            if ellyChar.isDigit(z):
                return 0          # reject 3-digit date
            if z == '.' and lr > 1 and ellyChar.isDigit(ts[3]):
                return 0          # reject 2 digits before decimal point
        self._dy[0] = tens
        k = 2
    self._dy[1] = x

    t = ts[k:]                    # whatever follows the digits

    if len(t) == 0 or not ellyChar.isLetterOrDigit(t[0]):
        return k                  # plain number with no suffix
    if ellyChar.isDigit(t[0]) or len(t) < 2:
        return 0

    sx = t[0].lower() + t[1].lower()

    # ordinal suffix depends on the last digit, except that 11, 12, and 13
    # take 'th' and not 'st', 'nd', 'rd'
    if tens == '1':
        expected = 'th'
    elif x == '1':
        expected = 'st'
    elif x == '2':
        expected = 'nd'
    elif x == '3':
        expected = 'rd'
    else:
        expected = 'th'
    if sx != expected:
        return 0

    k += 2                        # count the suffix as matched

    if len(ts) == k:              # check next char in stream
        return k                  # if none, match succeeds
    elif ellyChar.isLetterOrDigit(ts[k]):
        return 0                  # otherwise, match fails if next char is alphanumeric
    else:
        return k                  # otherwise succeed
def _rightside ( stb , txt ):

    """
    translate the action part of a clause into a list of semantic commands

    the clause text is consumed left to right in stages: an optional
    inheritance marker (*l or *r), an optional bracketed semantic feature
    specification, and an optional plausibility change (+, -, ++, +3, ...);
    a trailing space-delimited field after any ']' names a concept

    arguments:
        stb   - symbol table
        txt   - string input for single clause

    returns:
        action list on success, otherwise whatever _err() returns
    """

    actions = [ ]
    concept = ''                       # no concept unless one is given

    # split off a concept: the last space must come after the last ']' so
    # that a space inside a feature specification is not mistaken for it
    bracket = txt.rfind(']')
    space = txt.rfind(' ')
    if space > bracket:
        concept = txt[space:].strip().upper()
        txt = txt[:space]

    # inheritance from left or right phrase component
    if len(txt) > 1 and txt[0] == '*':
        side = txt[1]
        if side == 'l':
            actions.append([ semanticCommand.Clhr ])
        elif side == 'r':
            actions.append([ semanticCommand.Crhr ])
        else:
            return _err('bad inheritance')
        txt = txt[2:].strip()

    # semantic features to set for phrase
    if len(txt) > 3 and txt[0] == '[':
        close = txt.find(']')
        if close < 3:
            return _err('incomplete semantic features to set')
        try:
            spec = featureSpecification.FeatureSpecification(stb,txt[:close+1],semantic=True)
        except ellyException.FormatFailure:
            return _err('bad semantic features')
        actions.append([ semanticCommand.Csetf , spec.positive ])
        txt = txt[close+1:]

    # plausibility change: a sign alone, a sign with a number, or a run
    # of identical signs whose length is the magnitude
    change = 0
    if len(txt) > 0:
        sign = txt[0]
        if sign not in ( '+' , '-' ):
            return _err('plausibility must begin with + or -')
        if len(txt) == 1:
            change = 1
        elif ellyChar.isDigit(txt[1]):
            try:
                change = int(txt[1:])
            except ValueError:
                return _err('bad cognitive plausibility: ' + txt)
        elif txt[1] == sign:
            if any(ch != sign for ch in txt[2:]):
                return _err('must be all + or all -')
            change = len(txt)          # one unit per sign char
        else:
            return _err('cannot interpret clause: ' + txt)
        if sign == '-':
            change = -change

    if len(concept) > 0:
        actions.append([ semanticCommand.Csetc , concept ])
    actions.append([ semanticCommand.Cadd , change ])
    return actions
def match ( self , txt , pnc , ctx ):

    """
    compare a punctuation mark and its context with a pattern

    returns True at once when matchtoo() accepts the context or when the
    char right after the punctuation is ellyChar.THS; otherwise each
    pattern stored in self.lstg for the punctuation char is tried: its left
    part against the text just before the punctuation and the first element
    of its right part against the second char of ctx

    arguments:
        self  -
        txt   - list of text chars leading up to punctuation char
        pnc   - punctuation char
        ctx   - next chars after punctuation

    returns:
        True on match, False otherwise
    """

    if matchtoo(txt,pnc,ctx):     # exception by complex match?
        return True

    sep = ctx[0] if len(ctx) > 0 else ''
    if sep == ellyChar.THS:
        return True
    nxt = ctx[1] if len(ctx) > 1 else ''

    if pnc not in self.lstg:      # get stored patterns for punctuation
        return False

    lp = self.lstg[pnc]

    ltx = len(txt)                # current length of accumulated text so far

    # count trailing chars that are alphanumeric or embedded combining;
    # these are the only text chars made available for pattern matching
    nrg = 1
    while nrg <= ltx:
        c = txt[-nrg]
        if not ellyChar.isLetterOrDigit(c) and not ellyChar.isEmbeddedCombining(c):
            break
        nrg += 1
    nrg -= 1                      # end of range for all pattern matching

    # NOTE(review): when nrg is 0, txt[-0:] is the whole of txt and not an
    # empty slice - confirm whether patterns are meant to see all the text
    # in that case
    txt = txt[-nrg:]              # reset text to limit for matching
    ltx = len(txt)                # its new length

    for p in lp:                  # try matching each listed exception pattern

        if p.left is not None and len(p.left) > 0:

            pat = p.left
            star = pat[-1] == ellyWildcard.cALL
            n = len(pat)          # each pattern element matches one sequence char
            if star:              # except for a final wildcard *
                n -= 1
                if ltx < n:
                    continue      # cannot match pattern properly
                pat = pat[:-1]
                t = txt[:n]       # compare against start of text range
            else:
                if ltx < n:
                    continue      # cannot match pattern properly
                t = txt[-n:]      # compare against end of text range

            if not ellyWildcard.match(pat,t,0):
                continue

            k = ltx - n           # extra chars beyond any match
            if not star and k > 0:
                if ellyChar.isLetterOrDigit(txt[-n]):
                    c = txt[-n-1]
                    if ellyChar.isLetterOrDigit(c) or c == '&':
                        continue  # because break in text is required

        # left side is satisfied (or absent); now check right context

        rp = p.right
        if rp == [] or rp[0] == ellyWildcard.cALL:
            return True
        pcx = rp[0]
        if pcx == nxt:                     # check for specific char after possible stop
            return True
        elif pcx == ellyWildcard.cALF:     # check for alphabetic
            if ellyChar.isLetter(nxt):
                return True
        elif pcx == ellyWildcard.cDIG:     # check for numeric
            if ellyChar.isDigit(nxt):
                return True
        elif pcx == ellyWildcard.cUPR:     # check for upper case
            if ellyChar.isUpperCaseLetter(nxt):
                return True
        elif pcx == ellyWildcard.cLWR:     # check for lower case
            if ellyChar.isLowerCaseLetter(nxt):
                return True
        elif pcx == ellyWildcard.cCAN:     # check for non-alphanumeric
            # the wildcard stands for a char that is NOT a letter or digit,
            # including no char at all
            if nxt == '' or not ellyChar.isLetterOrDigit(nxt):
                return True

    return False