def matchtoo(txt, pnc, ctx):
    """
    complex checks - currently only for rightmost period of A.M. or P.M.

    The period is taken as part of a time expression when the text ends
    in " A.M" or " P.M" (any case), the next char is white space, and the
    char before the space is either a digit or the last letter of a
    spelled-out hour from "one" to "twelve".

    arguments:
        txt   - list of text chars leading up to punctuation char
        pnc   - punctuation char
        ctx   - list of chars in context after punctuation

    returns:
        True on match, False otherwise
    """
    ln = len(txt)
    nxt = ctx[0] if len(ctx) > 0 else ''
    if pnc != '.' or not ellyChar.isWhiteSpace(nxt) or ln < 5:
        return False
    if (txt[-1] not in ['M', 'm'] or txt[-2] != '.' or
            txt[-3] not in ['P', 'p', 'A', 'a'] or txt[-4] != ' '):
        return False

    ch = txt[-5]
    if ellyChar.isDigit(ch):        # only 1 digit will be checked here!
        return True                 # erring on the side of not to break sentence
    elif not ellyChar.isLetter(ch):
        return False

    #
    # the following code is needed only when number transforms are turned off
    #

    # scan backward over the letters of the word ending at txt[-5];
    # on exit, txt[-nn] is the char just before the word, or nn == ln + 1
    # when the word starts the text
    nn = 5
    while nn <= ln and ellyChar.isLetter(txt[-nn]):
        nn += 1

    nlet = nn - 5                   # letter count of word
    if nlet < 3 or nlet > 6:        # "one" to "eleven" in length
        return False
    if nn <= ln and txt[-nn] not in [' ', '-']:
        return False                # word must be delimited on the left

    wd = ''.join(txt[-nn + 1:-4]).lower()
    if wd not in ('one', 'two', 'three', 'four', 'five', 'six', 'seven',
                  'eight', 'nine', 'ten', 'eleven', 'twelve'):
        return False

    # a capital letter right after the white space suggests a new sentence
    return not (len(ctx) < 2 or ellyChar.isUpperCaseLetter(ctx[1]))
def matchtoo ( txt , pnc , ctx ):
    """
    complex checks - currently only for rightmost period of A.M. or P.M.

    The period is taken as part of a time expression when the text ends
    in " A.M" or " P.M" (any case), the next char is white space, and the
    char before the space is either a digit or the last letter of a
    spelled-out hour from "one" to "twelve".

    arguments:
        txt   - list of text chars leading up to punctuation char
        pnc   - punctuation char
        ctx   - list of chars in context after punctuation

    returns:
        True on match, False otherwise
    """
    ln = len(txt)
    nxt = ctx[0] if len(ctx) > 0 else ''
    if pnc != '.' or not ellyChar.isWhiteSpace(nxt) or ln < 5:
        return False
    if (txt[-1] not in ['M','m'] or txt[-2] != '.' or
            txt[-3] not in ['P','p','A','a'] or txt[-4] != ' '):
        return False

    ch = txt[-5]
    if ellyChar.isDigit(ch):        # only 1 digit will be checked here!
        return True                 # erring on the side of not to break sentence
    elif not ellyChar.isLetter(ch):
        return False

    #
    # the following code is needed only when number transforms are turned off
    #

    # scan backward over the letters of the word ending at txt[-5];
    # on exit, txt[-nn] is the char just before the word, or nn == ln + 1
    # when the word starts the text
    nn = 5
    while nn <= ln and ellyChar.isLetter(txt[-nn]):
        nn += 1

    nlet = nn - 5                   # letter count of word
    if nlet < 3 or nlet > 6:        # "one" to "eleven" in length
        return False
    if nn <= ln and txt[-nn] not in [ ' ' , '-' ]:
        return False                # word must be delimited on the left

    wd = ''.join(txt[-nn + 1:-4]).lower()
    if wd not in ( 'one' , 'two' , 'three' , 'four' , 'five' , 'six' , 'seven' ,
                   'eight' , 'nine' , 'ten' , 'eleven' , 'twelve' ):
        return False

    # a capital letter right after the white space suggests a new sentence
    return not (len(ctx) < 2 or ellyChar.isUpperCaseLetter(ctx[1]))
def _find ( self , cmpo , smpl=True ): """ lookup method with recursion arguments: self - cmpo - simple or compound component smpl - simple flag returns: -1 or -2 if not found, component type code >= 0 otherwise """ # print '_find:' , cmpo lcmp = len(cmpo) if lcmp == 0: return NON if cmpo in self.dictn: # full name component known? return self.dictn[cmpo] if lcmp == 1: return INI if ellyChar.isLetter(cmpo[0]) else NON if cmpo[-1] == '.': # component ends in '.'? if lcmp == 2: if ellyChar.isLetter(cmpo[0]): return INI return NON if smpl and lcmp > 4: # check component by parts? pre = cmpo[:2] suf = cmpo[-2:] # print 'pre=' , pre , 'suf=' , suf if pre in self.pres: # if not known, check for prefix match for p in self.pres[pre]: x = p[0] n = len(x) # print 'recursion=' , p[2] if (n < lcmp and cmpo[:n] == x): # prefix match found? if not p[2] or self._find(cmpo[n:]) > 0: return p[1] elif suf in self.posts: # last resort is check for suffix match for p in self.posts[suf]: x = p[0] n = len(x) # print 'recursion=' , p[2] if (n < lcmp and cmpo[-n:] == x): # suffix match found? if not p[2] or self._find(cmpo[:-n]) > 0: return p[1] return NON
def _find(self, cmpo, smpl=True): """ lookup method with recursion arguments: self - cmpo - simple or compound component smpl - simple flag returns: -1 or -2 if not found, component type code >= 0 otherwise """ # print '_find:' , cmpo lcmp = len(cmpo) if lcmp == 0: return NON if cmpo in self.dictn: # full name component known? return self.dictn[cmpo] if lcmp == 1: return INI if ellyChar.isLetter(cmpo[0]) else NON if cmpo[-1] == '.': # component ends in '.'? if lcmp == 2: if ellyChar.isLetter(cmpo[0]): return INI return NON if smpl and lcmp > 4: # check component by parts? pre = cmpo[:2] suf = cmpo[-2:] # print 'pre=' , pre , 'suf=' , suf if pre in self.pres: # if not known, check for prefix match for p in self.pres[pre]: x = p[0] n = len(x) # print 'recursion=' , p[2] if (n < lcmp and cmpo[:n] == x): # prefix match found? if not p[2] or self._find(cmpo[n:]) > 0: return p[1] elif suf in self.posts: # last resort is check for suffix match for p in self.posts[suf]: x = p[0] n = len(x) # print 'recursion=' , p[2] if (n < lcmp and cmpo[-n:] == x): # suffix match found? if not p[2] or self._find(cmpo[:-n]) > 0: return p[1] return NON
def _planAhead ( buf ):
    """
    check for possible problems in the next scan while context
    is still available and set flags if needed

    Looks at the start of the buffer, past any leading '(' and '"', for a
    word listed in _det followed by white space and then a word starting
    with an uppercase letter. When that is found, the module global
    _toscan is set to the buffer length remaining after the second word.

    arguments:
        buf   - buffer to be scanned

    NOTE(review): the nesting of the statements after the determiner
    check was reconstructed from a source with lost line breaks - confirm
    """

    global _toscan

    nsk = 0                     # total skip count
    lb = len(buf)
    if lb > 4:
        if buf[0] == '(':       # skip initial '('
            nsk += 1
            buf = buf[1:]
        if buf[0] == '"':       # skip initial '"'
            nsk += 1
            buf = buf[1:]
        lb -= nsk               # length without skipped chars

    nix = 0                     # scan count
    if lb > 8:                  # shorter buffers are left alone
        for chx in buf:         # go to first non-letter
            if not ellyChar.isLetter(chx):
                if ellyChar.isWhiteSpace(chx):
                    break       # must be space
                return          # any other non-letter means nothing to plan
            nix += 1

        sst = ''.join(buf[:nix]).lower()
        if not sst in _det:
            return              # must find determiner

        nix += 1                # skip space
        if ellyChar.isUpperCaseLetter(buf[nix]):
            nix += 1            # skip first letter
            buf = buf[nix:]     # nix keeps counting from the original start

            for ch in buf:      # go to next non-letter
                if not ellyChar.isLetter(ch):
                    if ellyChar.isWhiteSpace(ch):
                        break
                    return
                nix += 1

            # char count left in the original buffer after the chars scanned
            _toscan = lb + nsk - nix
def acronym ( buffr ):
    """
    recognize parenthesized introduction of acronym in text,
    i.e. '(' followed by letters only and then ')'

    arguments:
        buffr - current contents as list of chars

    returns:
        char count matched on success, 0 otherwise
    """
    limit = min(len(buffr), Lmax)           # never look past Lmax chars
    if limit < Lmin or buffr[0] != '(':
        return 0

    uppers = 0                              # uppercase count
    end = 1                                 # index just past last char read
    for pos in range(1, limit):
        ch = buffr[pos]
        end = pos + 1
        if ch == ')':
            break
        if not ellyChar.isLetter(ch):
            return 0
        if ellyChar.isUpperCaseLetter(ch):
            uppers += 1
    else:
        return 0                            # must have enclosing ')'

    # reject if too short or if uppercase count is under half the length
    if end < Lmin or end > 2*uppers:
        return 0
    # closing ')' must not run straight into more text
    if len(buffr) > end and ellyChar.isLetterOrDigit(buffr[end]):
        return 0
    return end
def getRules ( self , a ):
    """
    collect the macro rules that could apply to text starting with a given char

    arguments:
        self  -
        a     - first letter of current buffer contents (NOT space!)

    returns:
        a list of unpacked macro rules to try out
    """
    if a == '':
        return [ ]

    if not ellyChar.isLetterOrDigit(a):
        if ellyChar.isApostrophe(a):
            rules = self.apoWx              # apostrophe has its own list
        else:
            rules = self.index[0]           # slot 0 takes all other chars
            uniqueAdd(rules,self.anyWx)
    else:
        rules = self.index[ellyChar.toIndex(a)]
        if ellyChar.isLetter(a):
            wild = self.letWx
        else:
            wild = self.digWx
        uniqueAdd(rules,wild)               # NB: adds to the indexed list itself
        uniqueAdd(rules,self.anyWx)

    unpacked = [ ]
    for rule in rules:
        unpacked.append(rule.unpack())
    return unpacked
def normalize ( s ):
    """
    convert all non-ASCII nonalphanumeric in sequence to _ and
    consecutive white spaces to a single space char

    The sequence is rewritten in place and then shortened to the
    normalized length; nothing is returned.

    arguments:
        s     - input sequence to operate on, a mutable list of chars
    """
    spaced = False              # was last char kept a space?
    k = 0                       # next position to write, never ahead of read
    n = len(s)
    for i in range(n):
        x = s[i]
        if ellyChar.isLetter(x):
            spaced = False
        elif ellyChar.isWhiteSpace(x):
            if spaced:
                continue        # drop repeated white space
            x = ' '
            spaced = True
        elif ord(x) > 127:
            x = '_'
            spaced = False
        else:
            spaced = False
        s[k] = x
        k += 1
    del s[k:]                   # truncate caller's list; rebinding s would not
def normalize(self, s):
    """
    overrides method in parent class to replace every letter by _
    and to drop all white space; other chars are kept as they are

    arguments:
        self  -
        s     - Unicode string or char list to operate on

    returns:
        normalized sequence as a new list
    """
    kept = []
    for ch in s:
        if ellyChar.isLetter(ch):
            kept.append('_')
        elif not ellyChar.isWhiteSpace(ch):
            kept.append(ch)
    return kept
def normalize ( s ):
    """
    replace every char that is not a letter, white space, or text char
    by _ and reduce each run of white space to a single space

    arguments:
        s     - Unicode string or char list to operate on

    returns:
        normalized sequence as a new list
    """
    out = [ ]
    after_space = False         # was last char kept a space?
    for ch in s:
        if not ellyChar.isLetter(ch):
            if ellyChar.isWhiteSpace(ch):
                if not after_space:
                    out.append(' ')
                    after_space = True
                continue        # rest of white space run is dropped
            if not ellyChar.isText(ch):
                ch = '_'
        after_space = False
        out.append(ch)
    return out
def __init__ ( self , symtb , defr ):
    """
    initialization from a definition of the form

        ELEMENT ELEMENT ... : SYNTAX [SEMANTICS BIAS]

    Errors are reported through self._err() and end initialization
    early, leaving later attributes unset.

    arguments:
        self  -
        symtb - symbol table for interpreting syntax
        defr  - definition input string

    NOTE(review): nesting inside the element loop was reconstructed
    from a source with lost line breaks - confirm where x.lower() belongs
    """
    self._errcount = 0
    ru = defr.split(' : ')              # template elements : definition
    if len(ru) != 2:
        self._err('incomplete template',defr)
        return
    [ elems , defns ] = ru
    rw = elems.split(' ')
    if len(rw) < 2:                     # need at least two elements
        self._err('trivial template',defr)
        return
    le = [ ]                            # checked template elements
    for w in rw:
        x = w.strip()
        lx = len(x)
        if lx == 0:                     # happens with consecutive spaces
            self._err('null template element',defr)
            return
        if x[0] == '%':                 # element with '%' marker
            if lx > 1 and ellyChar.isLetter(x[1]):
                if lx > 2:
                    # NOTE(review): x[1] was just tested to be a letter,
                    # so this comparison with '*' looks always true - confirm
                    if x[1] != '*':
                        self._err('bad class ID',defr)
                        return
        x = x.lower()
        le.append(x)
    if self._errcount > 0: return
    self.listing = le
    de = defns.split(' ')
    lde = len(de)
    if lde != 1 and lde != 3:           # syntax only, or all three parts
        self._err('bad template definition',defr)
        return
    syns = de[0]
    sems = de[1] if lde > 1 else None
    try:
        spec = syntaxSpecification.SyntaxSpecification(symtb,syns)
        semf = featureSpecification.FeatureSpecification(symtb,sems,True)
    except ellyException.FormatFailure:
        self._err('bad definition' , defr)
        return
    self.lstg = le                      # same list as self.listing
    self.catg = spec.catg
    self.synf = spec.synf.positive
    self.semf = semf.positive
    self.bias = int(de[2]) if lde > 1 else 0    # int() may raise ValueError
def __init__ ( self , symtb , defr ):
    """
    initialization from a definition of the form

        ELEMENT ELEMENT ... : SYNTAX [SEMANTICS BIAS]

    Errors are reported through self._err() and end initialization
    early, leaving later attributes unset.

    arguments:
        self  -
        symtb - symbol table for interpreting syntax
        defr  - definition input string

    NOTE(review): nesting inside the element loop was reconstructed
    from a source with lost line breaks - confirm where x.lower() belongs
    """
    self._errcount = 0
    ru = defr.split(' : ')              # template elements : definition
    if len(ru) != 2:
        self._err('incomplete template',defr)
        return
    [ elems , defns ] = ru
    rw = elems.split(' ')
    if len(rw) < 2:                     # need at least two elements
        self._err('trivial template',defr)
        return
    le = [ ]                            # checked template elements
    for w in rw:
        x = w.strip()
        lx = len(x)
        if lx == 0:                     # happens with consecutive spaces
            self._err('null template element',defr)
            return
        if x[0] == '%':                 # element with '%' marker
            if lx > 1 and ellyChar.isLetter(x[1]):
                if lx > 2:
                    # NOTE(review): x[1] was just tested to be a letter,
                    # so this comparison with '*' looks always true - confirm
                    if x[1] != '*':
                        self._err('bad class ID',defr)
                        return
        x = x.lower()
        le.append(x)
    if self._errcount > 0: return
    self.listing = le
    de = defns.split(' ')
    lde = len(de)
    if lde != 1 and lde != 3:           # syntax only, or all three parts
        self._err('bad template definition',defr)
        return
    syns = de[0]
    sems = de[1] if lde > 1 else None
    try:
        spec = syntaxSpecification.SyntaxSpecification(symtb,syns)
        semf = featureSpecification.FeatureSpecification(symtb,sems,True)
    except ellyException.FormatFailure:
        self._err('bad definition' , defr)
        return
    self.lstg = le                      # same list as self.listing
    self.catg = spec.catg
    self.synf = spec.synf.positive
    self.semf = semf.positive
    self.bias = int(de[2]) if lde > 1 else 0    # int() may raise ValueError
def getRules(self, a):
    """
    collect the macro rules that could apply to text starting with a given char

    arguments:
        self  -
        a     - first letter of current buffer contents (NOT space!)

    returns:
        a list of unpacked macro rules to try out
    """
    if a == '':
        return []

    if not ellyChar.isLetterOrDigit(a):
        if ellyChar.isApostrophe(a):
            rules = self.apoWx              # apostrophe has its own list
        else:
            rules = self.index[0]           # slot 0 takes all other chars
            uniqueAdd(rules, self.anyWx)
    else:
        rules = self.index[ellyChar.toIndex(a)]
        if ellyChar.isLetter(a):
            wild = self.letWx
        else:
            wild = self.digWx
        uniqueAdd(rules, wild)              # NB: adds to the indexed list itself
        uniqueAdd(rules, self.anyWx)

    unpacked = []
    for rule in rules:
        unpacked.append(rule.unpack())
    return unpacked
def normalize(self, s):
    """
    replace every char that is not a letter, white space, or text char
    by _ and reduce each run of white space to a single space

    arguments:
        self  -
        s     - Unicode string or char list to operate on

    returns:
        normalized sequence as a new list
    """
    out = []
    after_space = False         # was last char kept a space?
    for ch in s:
        if not ellyChar.isLetter(ch):
            if ellyChar.isWhiteSpace(ch):
                if not after_space:
                    out.append(' ')
                    after_space = True
                continue        # rest of white space run is dropped
            if not ellyChar.isText(ch):
                ch = '_'
        after_space = False
        out.append(ch)
    return out
def divide ( self , word ):
    """
    apply inflectional analysis, including for -'s and -s'

    A possessive ending is removed from the word and SFX is put back
    into the buffer; otherwise the stemmer is applied and any suffixes
    it splits off are put back, last one first.

    arguments:
        self  -
        word  - ellyToken

    exceptions:
        StemmingError
    """
    size = word.getLength()
    # too short to divide, or already the product of a division
    if size < 3 or word.isAffix():
        return

    last = word.charAt(size-1)          # last two chars of word
    prev = word.charAt(size-2)

    if last == ESS and ( prev == APO or prev == APX ):
        drop = 2                        # -'S
    elif prev == ESS and ( last == APO or last == APX ):
        drop = 1                        # implied -'S
    else:
        drop = 0

    if drop > 0:
        word.shortenBy(drop)
        self.putSuffixBack(SFX)
        return

    if ellyChar.isLetter(word.charAt(0)):
        self.stemmer.apply(word)        # apply any inflectional stemmer

    if not word.isSplit():
        return
    pending = word.getSuffixes()
    while len(pending) > 0:
        self.putSuffixBack(pending.pop())
def divide(self, word):
    """
    apply inflectional analysis, including for -'s and -s'

    A possessive ending is removed from the word and SFX is put back
    into the buffer; otherwise the stemmer is applied and any suffixes
    it splits off are put back, last one first.

    arguments:
        self  -
        word  - ellyToken

    exceptions:
        StemmingError
    """
    size = word.getLength()
    # too short to divide, or already the product of a division
    if size < 3 or word.isAffix():
        return

    last = word.charAt(size - 1)        # last two chars of word
    prev = word.charAt(size - 2)

    if last == ESS and (prev == APO or prev == APX):
        drop = 2                        # -'S
    elif prev == ESS and (last == APO or last == APX):
        drop = 1                        # implied -'S
    else:
        drop = 0

    if drop > 0:
        word.shortenBy(drop)
        self.putSuffixBack(SFX)
        return

    if ellyChar.isLetter(word.charAt(0)):
        self.stemmer.apply(word)        # apply any inflectional stemmer

    if not word.isSplit():
        return
    pending = word.getSuffixes()
    while len(pending) > 0:
        self.putSuffixBack(pending.pop())
def isNewRule(s):
    """
    check whether an input line starts a new rule or procedure,
    i.e. has more than 2 chars and begins with a letter and then ':'

    arguments:
        s     - input line as string

    returns:
        True if new rule or procedure, False otherwise
    """
    if len(s) <= 2:
        return False
    return ellyChar.isLetter(s[0]) and s[1] == ':'
def isNewRule ( s ):
    """
    check whether an input line starts a new rule or procedure,
    i.e. has more than 2 chars and begins with a letter and then ':'

    arguments:
        s     - input line as string

    returns:
        True if new rule or procedure, False otherwise
    """
    if len(s) <= 2:
        return False
    return ellyChar.isLetter(s[0]) and s[1] == ':'
def alphc ( s ):
    """
    check that a sequence is non-empty and consists only of letters

    arguments:
        s     - list of chars

    returns:
        True if string is all alphabetic, False otherwise
    """
    if len(s) < 1:
        return False            # empty input does not count as alphabetic
    return all(ellyChar.isLetter(ch) for ch in s)
def alphc ( s ):
    """
    check that a sequence is non-empty and consists only of letters

    arguments:
        s     - list of chars

    returns:
        True if string is all alphabetic, False otherwise
    """
    if len(s) < 1:
        return False            # empty input does not count as alphabetic
    return all(ellyChar.isLetter(ch) for ch in s)
def _scan ( buffr ): """ count chars to first non-alphabetic char arguments: buffr - list of chars returns: number of letters """ n = 0 ln = len(buffr) while n < ln: if not ellyChar.isLetter(buffr[n]): break n += 1 return n
def divide ( self , word ):
    """
    apply inflectional analysis, including for -'s and -s'

    A possessive ending is recorded as the suffix APO+ESS on the word
    and removed from it; otherwise the stemmer is applied and any
    suffixes it splits off are prepended to the buffer, last one first.

    arguments:
        self  -
        word  - ellyToken

    exceptions:
        StemmingError
    """
    size = word.getLength()
    # too short to divide, or already the product of a division
    if size < 3 or word.isAffix():
        return

    last = word.charAt(size-1)          # last two chars of word
    prev = word.charAt(size-2)

    if last == u's' and prev == APO:
        drop = 2                        # -'s
    elif last == APO and prev == ESS:
        drop = 1                        # implied -'S
    else:
        drop = 0

    if drop > 0:
        word.addSuffix(APO+ESS)
        word.shortenBy(drop)
        return

    if ellyChar.isLetter(word.charAt(0)):
        self.stemmer.apply(word)        # apply any inflectional stemmer

    if not word.isSplit():
        return
    pending = word.getSuffixes()
    while len(pending) > 0:
        if self.atToken():              # keep suffix apart from next token
            self.prepend(ellyChar.SPC)
        self.prepend(pending.pop())
def timePeriod ( buffr ):
    """
    recognize time period in a day, made up of an optional word from
    _modifier, a word from _day, and an optional word from _period,
    each separated by a single space

    arguments:
        buffr - current contents as list of chars

    returns:
        char count matched on success, 0 otherwise
    """
    total = len(buffr)
    if total == 0 or not ellyChar.isLetter(buffr[0]):
        return 0

    k = _scan(buffr)                        # length of first word
    if k == total or buffr[k] != ' ':
        return 0

    n = 0                                   # chars matched so far
    if ''.join(buffr[:k]).lower() in _modifier:
        n = k + 1                           # modifier plus its space
        buffr = buffr[n:]

    k = _scan(buffr)                        # length of candidate day word
    if k < 6:
        return 0
    if ''.join(buffr[:k]).lower() not in _day:
        return 0
    buffr = buffr[k:]
    n += k

    # a day by itself counts only when a modifier came before it
    fallback = n if n > k else 0
    if len(buffr) < 5 or buffr[0] != ' ':
        return fallback

    m = _scan(buffr[1:])                    # length of candidate period word
    if ''.join(buffr[1:m+1]).lower() in _period:
        return n + m + 1
    return fallback
def timePeriod ( buffr ):
    """
    recognize time period in a day, made up of an optional word from
    _modifier, a word from _day, and an optional word from _period,
    each separated by a single space

    arguments:
        buffr - current contents as list of chars

    returns:
        char count matched on success, 0 otherwise
    """
    total = len(buffr)
    if total == 0 or not ellyChar.isLetter(buffr[0]):
        return 0

    k = _scan(buffr)                        # length of first word
    if k == total or buffr[k] != ' ':
        return 0

    n = 0                                   # chars matched so far
    if u''.join(buffr[:k]).lower() in _modifier:
        n = k + 1                           # modifier plus its space
        buffr = buffr[n:]

    k = _scan(buffr)                        # length of candidate day word
    if k < 6:
        return 0
    if u''.join(buffr[:k]).lower() not in _day:
        return 0
    buffr = buffr[k:]
    n += k

    # a day by itself counts only when a modifier came before it
    fallback = n if n > k else 0
    if len(buffr) < 5 or buffr[0] != ' ':
        return fallback

    m = _scan(buffr[1:])                    # length of candidate period word
    if u''.join(buffr[1:m+1]).lower() in _period:
        return n + m + 1
    return fallback
def _enstrg(self):
    """
    get chars from auxiliary buffer

    Deletes up to 100 chars from the buffer, then takes the deleted
    chars, keeps only the letters, and passes each one through
    ellyChar.Mapping followed by ellyChar.Unmapping.

    arguments:
        self

    returns:
        chars joined into a string
    """
    self.deleteCharsFromBuffer(100)
    removed = list(self.getDeletion())
    letters = [
        ellyChar.Unmapping[ellyChar.Mapping[ord(c)]]
        for c in removed
        if ellyChar.isLetter(c)
    ]
    return "".join(letters)
def getRules ( self , a ):
    """
    collect the macro rules that could apply to text starting with a given char

    arguments:
        self  -
        a     - first letter of current buffer contents (NOT space!)

    returns:
        a list of macro rules to try out, built as a new list
    """
    if a == '':
        return [ ]

    if not ellyChar.isLetterOrDigit(a):
        # slot 0 takes all chars that are not letters or digits
        return self.index[0] + self.anyWx

    slot = ellyChar.toIndex(a)
    if ellyChar.isLetter(a):
        wild = self.letWx
    else:
        wild = self.digWx
    return self.index[slot] + wild + self.anyWx
def build(self, inp):
    """
    build tree logic from definition reader input

    Each input line is lowercased and split on single spaces into

        AFFIX CONDITION NSAVE ACTION [MODIFIER]

    The chars of the affix, as reordered by self.sequence(), define a
    path through the tree from self.indx, and the node at the end of
    that path gets the condition and the action for the affix.

    arguments:
        self  -
        inp   - definition text for logic

    exceptions:
        TableFailure on error

    NOTE(review): nesting inside the final else-branch was reconstructed
    from a source with lost line breaks - confirm
    """
    if inp == None: return

    nerr = 0                                    # error count

    # read in affixes and associated actions

    while True:

        line = inp.readline()                   # next input line
        if line == '':                          # check for EOF
            break

        modf = ''
        elem = line.strip().lower().split(' ')
        le = len(elem)
        if le < 4:
            nerr += 1
            print("** affix error: incomplete input", file=sys.stderr)
            print("* at: [", line, "]", file=sys.stderr)
            continue                            # skip incomplete line
        if le > 4:                              # affix mod specified?
            modf = elem.pop()                   # if so, get it
        do = elem.pop()                         # note main action

        # get affix within definition line

        aff = list(elem.pop(0))                 # affix as list of chars

        # check for proper form

        aff = self.sequence(aff)                # backward or forward matching?
        c = aff[0]                              # get first char to compare with
        aff = aff[1:]

        if (not ellyChar.isLetter(c) and
                c != '+'):                      # affix must start with letter or '+'
            nerr += 1
            print("** affix error: must start with letter or '+'", file=sys.stderr)
            print("* at: [", line, "]", file=sys.stderr)
            continue                            # ignore line

        if not c in self.indx:                  # node not already in tree index?
            self.indx[c] = Node()               # add new node
        node = self.indx[c]

        for a in aff:                           # now check each successive char in affix
            if a in node.contn:
                node = node.contn[a]            # go to existing node if found
            else:
                new = Node()                    # otherwise make new node
                node.contn[a] = new             # and insert into tree
                node = new                      # and move down

        # at final node in tree logic

        # NOTE(review): this int() is outside the try below, so a
        # non-numeric condition raises ValueError out of build()
        node.condn = int(elem.pop(0))           # condition for match
        try:
            nsave = 0 if len(elem) == 0 else int(elem.pop())
        except ValueError as e:
            print(e, file=sys.stderr)
            print("* at: [", line, "]", file=sys.stderr)
            continue                            # ignore line, but nerr is not incremented

        resto = [Add]                           # set to defaults
        recur = False                           #

        mode = do[-1]                           # kind of recursion
        rest = do[:-1]                          # added chars to fill out root

        if mode == '?':
            node.condn = 1
            resto = [Fail]                      # will generate fatal error
        elif ellyChar.isDigit(mode):
            nerr += 1
            print("* bad action mode=", mode, file=sys.stderr)
            continue
        else:
            if mode == ',':                     # allow recursion?
                recur = True                    # if so, change default
            if len(rest) == 1 and rest[0] == '&':
                resto = [RestorE]
            else:
                resto += list(rest)
                if self.addn != None:
                    resto.insert(1, self.addn)  # insert AFTER first char of list

        # insert action

        node.actns = Action(self, nsave, resto, recur, modf)
        node.tag()

    # all lines are read before failure is reported
    if nerr > 0:
        print("**", nerr, "affix errors in all", file=sys.stderr)
        raise ellyException.TableFailure
def scan ( buffr ):
    """
    recognize personal names in text at current position

    Name components are taken one at a time from the buffer and looked
    up in the module name table _table; unknown components may still be
    accepted from the local context _cntxt or through infer(). The
    components of an accepted name are added to _cntxt.

    arguments:
        buffr - current contents as list of chars

    returns:
        char count matched on success, 0 otherwise

    NOTE(review): statement nesting was reconstructed from a source with
    lost line breaks, in particular inside the inner while loop - confirm
    """

    def doLook ( mth , itm ):
        """
        do lookup with specified method
        using global variables in Python 2.7.*

        Sets _typ to the lookup result; when a lookup fails for an item
        ending in '.', it is retried without the '.' and _nch is reduced
        by 1 on success.

        arguments:
            mth  - name table method
            itm  - string to look up
        """
        global _typ , _nch              # really need nonlocal
        _typ = mth(itm)
        if _typ < 0 and len(itm) > 3:   # if no match, check for final '.'
            if itm[-1] == '.':
                _typ = mth(itm[:-1])
                if _typ >= 0: _nch -= 1 # match without '.'

    global _typ , _nch
    global _toscan

    bln = len(buffr)
    if _table == None or bln < 2: return 0

    # scanning is held off while buffer is still longer than _toscan
    if _toscan > 0:
        if bln > _toscan:
            return 0
        else:
            _toscan = 0

    chx = buffr[0]
    if not ellyChar.isLetter(chx) and chx != '(' and chx != '"': return 0

    cmps = [ ]                          # name components this time
    ncmp = 0                            # number of components for current name
    ninf = 0                            # number inferred
    ntyp = len(nameTable.TYP)
    stat = [False]*ntyp                 # define state for getting personal name

    mlen = 0                            # last match length
    bix = 0                             # buffer index to advance in scanning
    _typ = -1
    while bix < bln:
        ltyp = -1                       # last match type
        _nch = _limit(buffr[bix:],mlen) # length of next possible name component
        if _nch == 0: return 0
        elm = _extract(buffr[bix:],_nch)    # get possible component as string
        sch = buffr[bix]
        enclosed = (sch == '(' or sch == '"')   # type of next element
        doLook(_table.lookUp,elm)       # look it up in saved name table
        if _typ < 0:
            if _typ == nameTable.REJ: return 0  # immediate rejection of any match
            if _typ == nameTable.STP: break     # stop any more matching
            if elm[-1] == '.':                  # drop any trailing '.'
                elm = elm[:-1]
                if not enclosed: _nch -= 1
            if enclosed:                        # enclosed element assumed to be name
                if not elm in _cntxt:
                    _cntxt.append(elm)          # make sure always to save in local context
                    ninf += 1                   # this is inferred!
            if elm in _cntxt:
                _typ = nameTable.XNM            # neutral name type to be noncommital
            if _typ < 0:
                tok = buffr[bix:bix + _nch]     # unknown token to check
                if infer(tok):
                    _typ = nameTable.XNM        # neutral name type inferred
                    if not _table.checkPhonetic(tok):
                        ninf += 1               # count inferred component if no phonetic support

        if nameTable.starts(_typ) and bix > 0:  # if component not at start of name,
            break                               # must stop name scan

        while _typ >= 0:                        # continue as long as match is viable
            ncmp += 1                           # count up component
            cmps.append(elm)                    # save component
            bix += _nch                         # move ahead in scan
            if _typ > 0:
                if stat[_typ]:                  # check for duplication of component type
                    if (ltyp >= 0 and
                        ltyp != _typ):          # allowed only if duplicate is consecutive
                        break
                mlen = bix                      # save index on actual match
                ltyp = _typ
                if nameTable.ends(_typ):        # if component marks end of name,
                    break                       # must stop name scan
                stat[_typ] = True               # update match state
            if bix == bln: break
            if ellyChar.isWhiteSpace(buffr[bix]):
                bix += 1                        # skip any space to start of next component
            _nch = _limit(buffr[bix:],mlen)     # length of next possible name component
            if _nch == 0: break
            elm = _extract(buffr[bix:],_nch)    # get possible next component as string
            doLook(_table.lookUpMore,elm)       # look it up in saved name table

        if _typ < 0:                            # while-loop terminated without break
            if ltyp < 0 or mlen == 0: break
            bix = mlen                          # restart at end of last match
            if bix == bln: break
            if ellyChar.isWhiteSpace(buffr[bix]):
                bix += 1                        # skip any space to start of next component
            continue
        break

    #
    #### additional constraints on acceptable personal name
    #

    if (ltyp == nameTable.CNJ or
        ltyp == nameTable.REL):         # a name cannot end with these types
        mlen -= _nch                    # have to drop them from any match
        if mlen == 0:
            return 0
        if ellyChar.isWhiteSpace(buffr[mlen-1]): mlen -= 1
        ncmp -= 1
        cmps.pop()

    if ncmp == 0:                       # nothing matched?
        _planAhead(buffr)               # check for possible problems in next scan
        return 0

    if ncmp == ninf: return 0           # name cannot be purely inferred

    if ncmp == 1:                       # single-component name must be known or contextual
        if (not stat[nameTable.SNG] and
            not cmps[0] in _cntxt):
            return 0

    expl = (stat[nameTable.PNM] or      # name must have a substantial component
            stat[nameTable.SNM] or
            stat[nameTable.XNM] or
            stat[nameTable.SNG])

    if (not expl and
        not (stat[nameTable.TTL] and    # or it could have just a title
             stat[nameTable.INI])):     # and an initial
        return 0

    #
    ####
    #

    for cmpo in cmps:                   # if whole name is OK,
        if not cmpo in _cntxt:          # remember all components
            _cntxt.append(cmpo)         # not already listed in context

    return mlen                         # will be > 0 on successful match
def simpleDeinflection(self, ss, ssp, ssl, mr):
    """
    handle matching of certain forms of English inflectional endings
    (override this method for other languages)

    The input chars from ssp up to ssl are compared with the endings
    -s, -ed, and -ing, allowing for the chars in mr that a pattern
    still expects, as with -ies and -ied for a pattern ending in y.
    The ending found is recorded in self.endg, which is '' when there
    is none.

    arguments:
        self  -
        ss    - input string of chars to scan for match
        ssp   - current position in input string
        ssl   - limit of matching in input
        mr    - next chars to look for in input

    returns:
        char count >= 0 on match, -1 otherwise

    NOTE(review): nesting was reconstructed from a source with lost
    line breaks - confirm
    """
    self.endg = ''                  # null inflection by default
    if len(mr) == 0 and ssp == ssl:
        finAPO(ss, ssp - 1)
        return 0                    # nothing left to match on either side
    if ssp < 2 or ss[ssp - 2] == ' ':
        return -1                   # no inflection right after a 1-char match

    ts = ss[ssp:]                   # where to look for inflection
    mc = ss[ssp - 1]                # last char matched
    lm = len(mr)
    if not ellyChar.isLetter(mc): return -1
    dss = ssl - ssp                 # count of input chars left before limit
    if dss == 0:                    # must handle special case here
        # NOTE(review): the first test above already returns when
        # lm == 0 and ssp == ssl, so this looks unreachable - confirm
        if lm == 0:
            finAPO(ss, ssp - 1)
            return 0
    elif dss == 1:                  # just a single letter left for inflection
        if lm != 0:
            return -1
        elif ts[0].lower() == 's':
            self.endg = '-s'
            finAPO(ss, ssp)
            return 1
        elif mc == 'e' and ts[0].lower() == 'd':    # -d after final e
            self.endg = '-ed'
            return 1
    elif dss == 2:                  # 2 letters for inflection
        if lm == 0 and ts[0].lower() == 'e':
            if ts[1].lower() == 'd':
                self.endg = '-ed'
                return 2
            elif ts[1].lower() == 's':
                self.endg = '-s'
                finAPO(ss, ssp + 1)
                return 2
    elif dss == 3:                  # 3 letters for inflection
        if ts[0].lower() == 'i':
            if ts[1].lower() == 'e':
                if lm == 1 and mr[0] == 'y':        # y changed to -ies or -ied
                    if ts[2].lower() == 's':
                        self.endg = '-s'
                        return 3
                    elif ts[2].lower() == 'd':
                        self.endg = '-ed'
                        return 3
            elif ts[1].lower() == 'n' and ts[2].lower() == 'g':
                if lm == 0 or lm == 1 and mr[0] == 'e':     # final e dropped before -ing
                    self.endg = '-ing'
                    return 3
        # final letter doubled before -ed
        if lm == 0 and ts[0].lower() == mc and ts[1].lower() == 'e' and ts[2].lower() == 'd':
            self.endg = '-ed'
            return 3
    elif dss == 4:                  # 4 letters for inflection
        # final letter doubled before -ing
        # NOTE(review): ts[1] is compared without lower(), unlike the
        # other chars here - confirm whether that is intended
        if lm == 0 and ts[0].lower() == mc and ts[1] == 'i' and ts[2].lower() == 'n' and ts[3].lower() == 'g':
            self.endg = '-ing'
            return 4
        if ts[0].lower() == 'y' and ts[1].lower() == 'i' and ts[2].lower() == 'n' and ts[3].lower() == 'g':
            if lm == 2 and mr[0] == 'i' and mr[1] == 'e':   # ie changed to -ying
                self.endg = '-ing'
                return 4
    return -1                       # something other than inflection found
def simpleDeinflection ( self , ss , ssp , ssl , mr ):
    """
    handle matching of certain forms of English inflectional endings
    (override this method for other languages)

    The input chars from ssp up to ssl are compared with the endings
    -s, -ed, and -ing, allowing for the chars in mr that a pattern
    still expects, as with -ies and -ied for a pattern ending in y.
    The ending found is recorded in self.endg, which is '' when there
    is none.

    arguments:
        self  -
        ss    - input string of chars to scan for match
        ssp   - current position in input string
        ssl   - limit of matching in input
        mr    - next chars to look for in input

    returns:
        char count >= 0 on match, -1 otherwise

    NOTE(review): nesting was reconstructed from a source with lost
    line breaks - confirm
    """
    self.endg = ''                  # null inflection by default
    if len(mr) == 0 and ssp == ssl:
        finAPO(ss,ssp-1)
        return 0                    # nothing left to match on either side
    if ssp < 2 or ss[ssp-2] == ' ':
        return -1                   # no inflection right after a 1-char match

    ts = ss[ssp:]                   # where to look for inflection
    mc = ss[ssp-1]                  # last char matched
    lm = len(mr)
    if not ellyChar.isLetter(mc): return -1
    dss = ssl - ssp                 # count of input chars left before limit
    if dss == 0:                    # must handle special case here
        # NOTE(review): the first test above already returns when
        # lm == 0 and ssp == ssl, so this looks unreachable - confirm
        if lm == 0:
            finAPO(ss,ssp-1)
            return 0
    elif dss == 1:                  # just a single letter left for inflection
        if lm != 0:
            return -1
        elif ts[0].lower() == 's':
            self.endg = '-s'
            finAPO(ss,ssp)
            return 1
        elif mc == 'e' and ts[0].lower() == 'd':    # -d after final e
            self.endg = '-ed'
            return 1
    elif dss == 2:                  # 2 letters for inflection
        if lm == 0 and ts[0].lower() == 'e':
            if ts[1].lower() == 'd':
                self.endg = '-ed'
                return 2
            elif ts[1].lower() == 's':
                self.endg = '-s'
                finAPO(ss,ssp+1)
                return 2
    elif dss == 3:                  # 3 letters for inflection
        if ts[0].lower() == 'i':
            if ts[1].lower() == 'e':
                if lm == 1 and mr[0] == 'y':        # y changed to -ies or -ied
                    if ts[2].lower() == 's':
                        self.endg = '-s'
                        return 3
                    elif ts[2].lower() == 'd':
                        self.endg = '-ed'
                        return 3
            elif ts[1].lower() == 'n' and ts[2].lower() == 'g':
                if lm == 0 or lm == 1 and mr[0] == 'e':     # final e dropped before -ing
                    self.endg = '-ing'
                    return 3
        # final letter doubled before -ed
        if lm == 0 and ts[0].lower() == mc and ts[1].lower() == 'e' and ts[2].lower() == 'd':
            self.endg = '-ed'
            return 3
    elif dss == 4:                  # 4 letters for inflection
        # final letter doubled before -ing
        # NOTE(review): ts[1] is compared without lower(), unlike the
        # other chars here - confirm whether that is intended
        if lm == 0 and ts[0].lower() == mc and ts[1] == 'i' and ts[2].lower() == 'n' and ts[3].lower() == 'g':
            self.endg = '-ing'
            return 4
        if ts[0].lower() == 'y' and ts[1].lower() == 'i' and ts[2].lower() == 'n' and ts[3].lower() == 'g':
            if lm == 2 and mr[0] == 'i' and mr[1] == 'e':   # ie changed to -ying
                self.endg = '-ing'
                return 4
    return -1                       # something other than inflection found
def __init__ ( self , inpr ):

    """
    define table from text input

    input lines starting with '=' are phonetic entries; all other lines
    must have the form  "name component : type"

    arguments:
        self  -
        inpr  - EllyDefinitionReader

    throws:
        TableFailure on table definition failure
    """

    self.pres  = { }   # components ending in '-' or '+', keyed on first 2 chars
    self.posts = { }   # components starting with '-' or '+', keyed on last 2 chars
    self.dictn = { }   # full name components mapped to their type codes
    self.phone = [ ]   # phonetic entries
    self.compn = ''
    self._nerr = 0     # error count checked after all input has been read

    while True:
        lin = inpr.readline().lower()    # ignore capitalization
        if len(lin) == 0: break          # empty read ends input

        if lin[0] == '=':                # phonetic entry?
            lin = lin[1:]                # if so, remove marker
            if len(lin) == 0:            # bare '=' used to raise IndexError
                self._err(lne='=')
                continue
            first = ''
            if lin[0] == 'a':            # vowel is first?
                first = 'a'              # if so, keep it in lowercase
                lin = lin[1:]
            self.phone.append(first + lin.upper())   # rest is uppercase
            continue

        lins = lin.strip().split(':')
        if len(lins) != 2:               # type definition must have two parts
            self._err(lne=lin)
            continue

        typ = lins[1].strip()            # get component type
        if typ not in TYP:
            self._err('bad name component type',lin)
            continue
        cod = TYP[typ]

        els = lins[0].strip().split(' ') # name component
        lim = len(els)

        if lim == 1:                     # single-element component
            cmpo = els[0]
            if len(cmpo) == 0:           # empty component used to raise IndexError
                self._err(lne=lin)
                continue
            chf = cmpo[0]                # first char of component
            chl = cmpo[-1]               # last char
            if chf == '-' or chf == '+':
                if not ellyChar.isLetter(chl) or len(cmpo) < 3:
                    self._err('bad end of name',lin)
                    continue
                dky = cmpo[-2:]          # dictionary key is 2 chars only
                if dky not in self.posts:
                    self.posts[dky] = [ ]
                self.posts[dky].append([ cmpo[1:] , cod , (chf == '+') ])
            elif chl == '-' or chl == '+':
                if not ellyChar.isLetter(chf) or len(cmpo) < 3:
                    self._err('bad start of name',lin)
                    continue
                dky = cmpo[:2]           # dictionary key is 2 chars only
                if dky not in self.pres:
                    self.pres[dky] = [ ]
                self.pres[dky].append([ cmpo[:-1] , cod , (chl == '+') ])
            else:
                self.dictn[cmpo] = cod
                if chl == '.':           # if ending with '.', also save without
                    self.dictn[cmpo[:-1]] = cod
            continue

        # multi-element component: every leading subsequence of elements gets
        # a CND placeholder unless already defined, so that only the full
        # component can be detected as a redefinition

        for nix in range(1,lim + 1):
            cmpo = ' '.join(els[0:nix])  # first nix elements
            if cmpo not in self.dictn:
                self.dictn[cmpo] = CND

        if self.dictn[cmpo] != CND:      # full component already has a type
            self._err('name component redefined',lin)
            continue
        self.dictn[cmpo] = cod           # put into table

    if self._nerr > 0:
        # written with sys.stderr.write() so that the report works
        # under both Python 2 and Python 3
        sys.stderr.write('** ' + str(self._nerr) + ' name errors in all\n')
        sys.stderr.write('name table definition FAILed\n')
        raise ellyException.TableFailure
def apply(self, token, extra=None):

    """
    apply inflectional stemming logic against token

    self.table[0] is the ending handled by this logic, stored last char
    first; the rest of the table is a sequence of opcodes and operands
    interpreted below. on a match, token.root is replaced by the stemmed
    result

    arguments:
        self  -
        token - input token
        extra - extra token char for any restoration

    returns:
        status code: isMTCH, isNOTM, or doMORE

    exceptions:
        StemmingError
    """

    if len(self.table) < 2:            # check for empty table
        return isNOTM                  # if so, no match

    word = token.root                  # list of letters in token word
    m = len(word)                      # end of word
    seq = self.table[0]                # suffix to match
    n = len(seq)                       # ending length to match
    if n >= m:
        return isNOTM                  # word not long enough for ending
    msh = m - n                        # length of word without ending

    # check that table is right one for word ending

    for ix in range(n):
        if word[m - 1 - ix] != seq[ix]:
            return isNOTM

    last = seq[-1] if n > 0 else extra # save last removed letter
    word = word[:msh]                  # copy of word up to removed suffix
    if not ellyChar.isLetter(word[-1]):
        return isNOTM

    it = 1                             # stemming logic index
    while True:                        # advance through logic until success or failure

        opcode = self.table[it]        # next operation code to interpret
        it += 1

        if opcode < 0:                 # YE(S) on match with possible modifications

            word = token.root[:msh]    # word without ending
            nm = YE - opcode           # get removal count from opcode
            if nm < 0:                 # any special restoration?
                if last is None:
                    print('FATAL stemming logic error', file=sys.stderr)
                    sys.stdout.flush()
                    sys.exit(1)
                word.append(last)      # negative count restores last removed letter
            else:
                while nm > 0:          # otherwise drop additional letters
                    if len(word) == 0:
                        print('FATAL stemming logic error', file=sys.stderr)
                        sys.stdout.flush()
                        sys.exit(1)
                    last = word.pop()
                    nm -= 1
            word.extend(self.table[it])  # append more chars
            token.root = word            # replace token with stemmed result
            return isMTCH                # success flag

        elif opcode == NO:             # no match
            return isNOTM

        elif opcode == IF:             # enter logic block if a char sequence matches
            seq = self.table[it + 1]   # chars to match, last char first
            sln = len(seq)
            if sln > len(word):        # enough chars to match?
                it += self.table[it]   # if not, skip over block of logic
            else:
                k = 0
                while k < sln and word[-k - 1] == seq[k]:
                    k += 1
                if k < sln:            # any characters unmatched?
                    it += self.table[it]  # if so, skip over block of logic
                else:
                    it += 2            # otherwise, enter logic block
                    word = word[:-sln] # and drop matched chars from working word

        elif opcode == IS:             # check whether next character is in a specified set
            if len(word) <= 0:         # any letters left in word?
                it += self.table[it]   # if not, skip over block
                continue
            chs = self.table[it + 1]   # get character set
            if chs.find(word[-1]) < 0: # word character in set?
                it += self.table[it]   # if not, skip block
            else:
                it += 2                # if so, enter block

        elif opcode < Nlen:            # check length of word
            k = self.table[it + 1]     # comparison length
            if opcode == LT:           # set match flag for type of comparison
                match = (m < k)
            elif opcode == GT:
                match = (m > k)
            elif opcode == EQ:
                match = (m == k)
            elif opcode == NE:
                match = (m != k)
            else:
                return isNOTM
            if not match:              # if no match, skip block
                it += self.table[it]
            else:                      # otherwise, go into logic of block
                it += 2

        elif opcode == MO:             # continue to another logic table
            token.root = token.root[:msh]
            return doMORE              # let other table figure out what to do

        elif opcode == VO:             # look for CVC pattern at end of stemming
                                       # and possibly restore -E
            word = token.root[:msh]    # strip ending from end of word
            me = len(word) - 2         # at possible vowel in stemming result
                                       # last char assumed to be consonant
            if me < 0 or ellyChar.isStrongConsonant(word[me]):
                token.root = word
                return isMTCH
            me -= 1                    # vowel found; now check for consonant
            if me < 0 or ellyChar.isStrictVowel(word[me]):
                token.root = word      # FIX: root was left with its ending here,
                return isMTCH          # unlike every other isMTCH return
            if me <= 0 or word[me] != 'u' or word[me - 1] == 'q':
                word.append('e')       # put back -E
            token.root = word
            return isMTCH

        else:
            return isNOTM

    raise ellyException.StemmingError  # not reachable: loop above exits only by return
def _limit ( buffr , hstry ):

    """
    get length of next possible name component in buffer

    a component may be preceded by a comma (only when something has been
    matched already), and may be a short name enclosed in parentheses,
    in double quotes, or in both

    arguments:
        buffr - list of chars
        hstry - how much matched already

    returns:
        number of chars in continuation of last component, 0 for no next component
    """

    lnb = len(buffr)
    if lnb == 0: return 0

    bix = 0            # scan position in buffer
    quot = False       # indicate component starting with "
    parn = False       #                             with (
    cmma = False       #                             with ,

    if buffr[0] == ',':                  # handle possible leading comma
        if hstry == 0 or lnb < 4:
            return 0
        bix += 1
        if ellyChar.isWhiteSpace(buffr[1]):
            bix += 1
        cmma = True

    if buffr[bix] == '(':                # handle short name in parentheses
        bix += 1
        parn = True
    if bix < lnb and buffr[bix] == '"':  # handle short name in double quotes
        bix += 1                         # (bound check: buffer may be just '(')
        quot = True

    if parn or quot:                     # enclosed component must be closed properly
        while bix < lnb:                 # collect letters for name
            chx = buffr[bix]
            if ellyChar.isWhiteSpace(chx):
                break
            elif not quot and parn and chx == ')':
                return bix + 1           # add trailing parenthesis
            elif quot and chx == '"':
                if bix + 1 < lnb and parn and buffr[bix+1] == ')':
                    return bix + 2       # add trailing quote and parenthesis
                elif not parn:
                    return bix + 1       # add trailing quote only
                else:
                    return 0             # no match
            elif chx == '.':
                return bix + 1           # add trailing period
            elif not ellyChar.isLetter(chx):
                break                    # unrecognizable char for name
            bix += 1
        return 0                         # no closure

    while bix < lnb:                     # collect letters for unenclosed name
        chx = buffr[bix]
        if chx == "'":                   # apostrophe stops name before space or 'S
            if bix + 2 < lnb:
                chn = buffr[bix+1]
                if ellyChar.isWhiteSpace(chn):
                    break
                if chn == 's' and not ellyChar.isLetter(buffr[bix+2]):
                    break
        elif not ellyChar.isLetter(chx):
            if chx == '.':               # period is kept as part of component
                bix += 1
            break
        bix += 1

    if bix == lnb:
        return bix                       # running out of chars means match

    # getting here means that more text follows limit
    # and so we may have to pick up extra chars here

    chx = buffr[bix]
    if ellyChar.isWhiteSpace(chx) or chx == "'":
        return bix                       # component can be terminated by space or (')
    elif chx == ',':
        return bix + 1 if cmma else bix  # include comma only when sequence starts with comma
    elif ellyChar.isLetter(chx):
        return bix                       # or letter, implying previous char was '.'
    else:
        return 0                         # failure to find name limit
def match(self, txt, pnc, ctx):

    """
    compare a punctuation mark and its context with a pattern

    the stored patterns for the punctuation char are tried in order; each
    has a left part matched against the text just before the punctuation
    and a right part matched against the second char of the context

    arguments:
        self  -
        txt   - list of text chars leading up to punctuation char
        pnc   - punctuation char
        ctx   - next chars after punctuation

    returns:
        True on match, False otherwise
    """

    if matchtoo(txt, pnc, ctx):        # exception by complex match?
        return True

    sep = ctx[0] if len(ctx) > 0 else ''
    if sep == ellyChar.THS:
        return True
    nxt = ctx[1] if len(ctx) > 1 else ''

    if pnc not in self.lstg:           # get stored patterns for punctuation
        return False
    lp = self.lstg[pnc]

    ltx = len(txt)                     # current length of accumulated text so far

    ntr = 1                            # count trailing run of letters and digits
    while ntr <= ltx:
        if not ellyChar.isLetterOrDigit(txt[-ntr]):
            break
        ntr += 1

    nrg = ntr                          # extend run over embedded combining chars
    while nrg <= ltx:
        c = txt[-nrg]
        if not ellyChar.isLetterOrDigit(c) and not ellyChar.isEmbeddedCombining(c):
            break
        nrg += 1
    nrg -= 1                           # end of range for all pattern matching

    txt = txt[-nrg:]                   # reset text to limit for matching
    ltx = len(txt)                     # its new length

    for p in lp:                       # try matching each listed exception pattern

        if p.left is not None and len(p.left) > 0:

            pat = p.left
            star = pat[-1] == ellyWildcard.cALL
            n = len(pat)               # each pattern element matches one sequence char
            if star:                   # except for a final wildcard *
                n -= 1
                if ltx < n:
                    continue           # cannot match pattern properly
                pat = pat[:-1]
                t = txt[:n]
            else:
                if ltx < n:
                    continue           # cannot match pattern properly
                t = txt[-n:]

            if not ellyWildcard.match(pat, t, 0):
                continue

            k = ltx - n                # extra chars beyond any match
            if not star and k > 0:
                if ellyChar.isLetterOrDigit(txt[-n]):
                    c = txt[-n - 1]
                    if ellyChar.isLetterOrDigit(c) or c == '&':
                        continue       # because break in text is required

        rp = p.right
        if rp == [] or rp[0] == ellyWildcard.cALL:
            return True
        pcx = rp[0]
        if pcx == nxt:                 # check for specific char after possible stop
            return True
        elif pcx == ellyWildcard.cALF: # check for alphabetic
            if ellyChar.isLetter(nxt):
                return True
        elif pcx == ellyWildcard.cDIG: # check for numeric
            if ellyChar.isDigit(nxt):
                return True
        elif pcx == ellyWildcard.cUPR: # check for upper case
            if ellyChar.isUpperCaseLetter(nxt):
                return True
        elif pcx == ellyWildcard.cLWR: # check for lower case
            if ellyChar.isLowerCaseLetter(nxt):
                return True
        elif pcx == ellyWildcard.cCAN: # check for non-alphanumeric
            # FIX: this branch used to test isLetter(nxt), a copy of the
            # alphabetic check, which is the opposite of its stated purpose
            if not ellyChar.isLetterOrDigit(nxt):
                return True

    return False                       # no matches
def match(self, token):

    """
    compare an Elly token against a tree and possibly modify that token
    after a match

    each pass walks the tree along the token sequence, remembers every
    node with an action, and then applies the action of the deepest
    remembered node whose condition is satisfied; passes repeat while
    the applied action asks for recursion

    arguments:
        self  -
        token - input Elly token

    returns:
        True on match, False otherwise
    """

    matched = False                    # success flag
    again = True                       # recursion flag

    while again:                       # repeat comparisons while flag is True

        root = token.root
        if len(root) < 3:
            break                      # stop if token root is too short

        adjust = 0 if ellyChar.isLetter(root[0]) else -2
        seq = self.sequence(root) + [Bound]   # token sequence plus sentinel
        if seq[0] not in self.indx:    # first char must be in tree index
            return matched

        limit = len(seq) + adjust      # sequence length = maximum possible match

        # go down tree, saving each node with an action along with its level

        candidates = []
        node = self.indx[seq[0]]
        level = 0
        while True:
            if node.actns is not None:
                candidates.append((node, level))
            level += 1
            if level == limit:
                break                  # compared to end of token
            ch = seq[level]
            if ch not in node.contn:
                break                  # quit on mismatch
            node = node.contn[ch]

        # try saved nodes from most recent back to first

        for node, level in reversed(candidates):
            nom = level + 1
            uch = seq[nom] if nom < limit else '_'   # first unmatched char
            con = node.condn           # node condition for accepting match
            nln = limit - nom
            nln += node.delta(nln)     # chars expected after action
            if con != 0 and nln < 3:   # this must leave at least 2 letters
                continue               # plus sentinel!
            if con == 0:               # accept match with no action
                return matched
            if con == 1:               # unconditionally accept
                break
            if con == 2 and uch != '|' and not ellyChar.isVowel(uch):
                break                  # first unmatched is consonant
            if con == 3 and not ellyChar.isStrictVowel(uch):
                break                  # first unmatched is consonant or U
        else:
            return matched             # no acceptable match on stack

        matched = True                 # note acceptable match
        self.rewrite(token, nom, node) # take action for longest accepted match
        again = node.actns.recur       # update recursion flag

    return matched
def _limit ( buffr , hstry ):

    """
    get length of next possible name component in buffer

    a component may be preceded by a comma (only when something has been
    matched already), and may be a short name enclosed in parentheses,
    in double quotes, or in both

    arguments:
        buffr - list of chars
        hstry - how much matched already

    returns:
        number of chars in continuation of last component, 0 for no next component
    """

    lnb = len(buffr)
    if lnb == 0: return 0

    bix = 0            # scan position in buffer
    quot = False       # indicate component starting with "
    parn = False       #                             with (
    cmma = False       #                             with ,

    if buffr[0] == ',':                  # handle possible leading comma
        if hstry == 0 or lnb < 4:
            return 0
        bix += 1
        if ellyChar.isWhiteSpace(buffr[1]):
            bix += 1
        cmma = True

    if buffr[bix] == '(':                # handle short name in parentheses
        bix += 1
        parn = True
    if bix < lnb and buffr[bix] == '"':  # handle short name in double quotes
        bix += 1                         # (bound check: buffer may be just '(')
        quot = True

    if parn or quot:                     # enclosed component must be closed properly
        while bix < lnb:                 # collect letters for name
            chx = buffr[bix]
            if ellyChar.isWhiteSpace(chx):
                break
            elif not quot and parn and chx == ')':
                return bix + 1           # add trailing parenthesis
            elif quot and chx == '"':
                if bix + 1 < lnb and parn and buffr[bix+1] == ')':
                    return bix + 2       # add trailing quote and parenthesis
                elif not parn:
                    return bix + 1       # add trailing quote only
                else:
                    return 0             # no match
            elif chx == '.':
                return bix + 1           # add trailing period
            elif not ellyChar.isLetter(chx):
                break                    # unrecognizable char for name
            bix += 1
        return 0                         # no closure

    while bix < lnb:                     # collect letters for unenclosed name
        chx = buffr[bix]
        if chx == "'":                   # apostrophe stops name before space or 'S
            if bix + 2 < lnb:
                chn = buffr[bix+1]
                if ellyChar.isWhiteSpace(chn):
                    break
                if chn == 's' and not ellyChar.isLetter(buffr[bix+2]):
                    break
        elif not ellyChar.isLetter(chx):
            if chx == '.':               # period is kept as part of component
                bix += 1
            break
        bix += 1

    if bix == lnb:
        return bix                       # running out of chars means match

    # getting here means that more text follows limit
    # and so we may have to pick up extra chars here

    chx = buffr[bix]
    if ellyChar.isWhiteSpace(chx) or chx == "'":
        return bix                       # component can be terminated by space or (')
    elif chx == ',':
        return bix + 1 if cmma else bix  # include comma only when sequence starts with comma
    elif ellyChar.isLetter(chx):
        return bix                       # or letter, implying previous char was '.'
    else:
        return 0                         # failure to find name limit
def build ( self , inp ):

    """
    build tree logic from definition reader input

    each input line defines one affix as space-separated fields:
        affix  condition  [ count ]  action  [ modifier ]
    where the last char of the action field selects the kind of recursion

    arguments:
        self  -
        inp   - definition text for logic

    exceptions:
        NOTE(review): the original header promises TableFailure on error,
        but no raise is visible in this method; bad lines are reported
        on stderr and skipped - confirm against the rest of the class
    """

    if inp is None: return

    def complain ( msg , line ):
        # report a bad definition line on stderr; uses write() instead of
        # the print statement so that it runs under Python 2 and Python 3
        sys.stderr.write(msg + '\n')
        sys.stderr.write('* at: [ ' + line + ' ]\n')

    nerr = 0                          # error count (tallied only; see NOTE above)

    # read in affixes and associated actions

    while True:

        line = inp.readline()         # next input line
        if line == u'':               # check for EOF
            break

        modf = ''
        elem = line.strip().lower().split(' ')
        le = len(elem)
        if le < 4:
            nerr += 1
            complain('** affix error: incomplete input',line)
            continue                  # skip incomplete line
        if le > 4:                    # affix mod specified?
            modf = elem.pop()         # if so, get it
        do = elem.pop()               # note main action

        # get affix within definition line

        aff = list(elem.pop(0))       # affix as list of chars
        aff = self.sequence(aff)      # backward or forward matching?

        # check for proper form; an empty affix used to raise IndexError

        if len(aff) == 0 or not ellyChar.isLetter(aff[0]):
            nerr += 1
            complain('** affix error: must start with letter',line)
            continue                  # ignore line

        c = aff[0]                    # get first char to compare with
        aff = aff[1:]

        if c not in self.indx:        # node not already in tree index?
            self.indx[c] = Node()     # add new node

        node = self.indx[c]
        for a in aff:                 # now check each successive char in affix
            if a in node.contn:
                node = node.contn[a]  # go to existing node if found
            else:
                new = Node()          # otherwise make new node
                node.contn[a] = new   # and insert into tree
                node = new            # and move down

        # at final node in tree logic; both numeric fields are now parsed
        # inside the try so that a bad condition is reported, not fatal

        try:
            node.condn = int(elem.pop(0))    # condition for match
            nsave = 0 if len(elem) == 0 else int(elem.pop())
        except ValueError as e:
            nerr += 1
            complain(str(e),line)
            continue                  # ignore line

        resto = [ Add ]               # set to defaults
        recur = False

        mode = do[-1]                 # kind of recursion
        rest = do[:-1]                # added chars to fill out root

        if mode == u'?':
            node.condn = 1
            resto = [ Fail ]          # will generate fatal error
        else:
            if mode == ',':           # allow recursion?
                recur = True          # if so, change default
            if len(rest) == 1 and rest[0] == '&':
                resto = [ RestorE ]
            else:
                resto += list(rest)
                if self.addn is not None:
                    resto.insert(1,self.addn)   # insert AFTER first char of list

        # insert action

        node.actns = Action(self,nsave,resto,recur,modf)
        node.tag()
def _lookUpNext(self):

    """
    look up possible next segments in input buffer by various means,
    keeping tokens only for the LONGEST segment

    the result of _scanText() is compared with the extent of the first
    component found by findBreak(); the longer one decides how the next
    token is taken from the buffer

    arguments:
        self

    returns:
        True on successful lookup, False otherwise

    exceptions:
        ParseOverflow
    """

    self.sbu.skipSpaces()              # skip leading spaces
    chars = self.sbu.buffer
    if len(chars) == 0:                # check for end of input
        return False                   # if so, done

    if self.trs is not None:           # preanalysis of number expressions
        self.trs.rewriteNumber(chars)

    self.sbu.expand()                  # apply macro substitutions
    chars = self.sbu.buffer
    if len(chars) == 0:
        return True                    # macros can empty out buffer

    brk = self.sbu.findBreak()         # find extent of first component for lookup
    if brk == 0:
        brk = 1                        # must have at least one char in token
    if brk + 1 < len(chars) and chars[brk] == '+' and chars[brk + 1] == ' ':
        brk += 1                       # recognize possible prefix

    scan = self._scanText(brk)         # text matching in various ways
    longest = scan[0]                  # overall maximum match length
    vocab = scan[1]                    # any vocabulary element matched
    suffix = scan[2]                   # any suffix removed in matching

    if brk < longest or (brk == longest and suffix != ''):

        # next word cannot produce token as long as already seen

        if len(vocab) > 0:             # any vocabulary matches?
            self.sbu.skip(longest)     # if so, they supersede
            if suffix != '':           # handle any suffix removal
                self.sbu.prepend(list(suffix))
        else:
            vocab = self.sbu.extract(longest)

        tok = ellyToken.EllyToken(vocab)
        self.ctx.addTokenToListing(tok)
        if suffix != '' and not ellyChar.isApostrophe(suffix[1]):
            tok.dvdd = True            # must note suffix removal for token!
        return True

    head = self.sbu.buffer[:brk]
    cap = ellyChar.isUpperCaseLetter(head[0])
    text = ''.join(head)
    found = self.ptr.createPhrasesFromDictionary(text.lower(), False, cap)
    if (not found and brk > longest and brk > 1
            and ellyChar.isEmbeddedCombining(text[-1])):
        brk -= 1                       # retry lookup without trailing combining char
        text = text[:-1]
        found = self.ptr.createPhrasesFromDictionary(text.lower(), False, cap)

    if found or longest > 0:           # match found in dictionary or by text scan

        if not found:
            brk = longest              # if by text scan, token length is scan length
            text = text[:brk]
        self.sbu.skip(brk)
        tok = ellyToken.EllyToken(text[:brk])

        if len(suffix) > 1:            # change token to show suffix properly
            mark = suffix[1]           # first char in suffix after '-'
            root = tok.root            # this is a list!
            pos = -1                   # start at last char in token
            while root[pos] != mark:
                pos -= 1
            tok.root = root[:len(root) + pos]   # root without suffix
            self.sbu.prepend(suffix)   # restore suffix to input for processing
        elif self.sbu.peek() == '-':   # no suffix; if hyphen next, need to separate it
            self.sbu.skip()
            if ellyChar.isLetter(self.sbu.peek()):
                self.sbu.prepend(' ')
            self.sbu.prepend('-')

        self.ctx.addTokenToListing(tok)   # add token to listing for sentence
        return True

    lead = text[0]                     # special hyphen check
    if lead == '-' and brk > 1:
        if self.ptr.createPhrasesFromDictionary(lead, False, False):
            tok = ellyToken.EllyToken(lead)   # treat hyphen as token
            self.ctx.addTokenToListing(tok)   # add it to token list
            self.sbu.skip()                   # remove from input
            return True

    tok = self._extractToken(longest)  # single-word matching with analysis and lookup
    if tok is None:                    # if no match, we are done and will return
        return longest != 0            # still success if _scanText() found something

    if self.ptr.lastph is not None:
        self.ptr.lastph.lens = tok.getLength()

    self.ctx.addTokenToListing(tok)    # add token to listing for sentence
    return True                        # successful lookup
def _lookUpNext ( self ):

    """
    look up possible next segments in input buffer by various means,
    keeping tokens only for the LONGEST segment

    the result of _scanText() is compared with the extent of the first
    component found by findBreak(); the longer one decides how the next
    token is taken from the buffer

    arguments:
        self

    returns:
        True on successful lookup, False otherwise

    exceptions:
        ParseOverflow
    """

    self.sbu.skipSpaces()              # skip leading spaces
    buf = self.sbu.buffer
    if len(buf) == 0:                  # check for end of input
        return False                   # if so, done

    if self.trs is not None:           # preanalysis of number expressions
        self.trs.rewriteNumber(buf)

    self.sbu.expand()                  # apply macro substitutions
    buf = self.sbu.buffer
    if len(buf) == 0:
        return True                    # macros can empty out buffer

    span = self.sbu.findBreak()        # find extent of first component for lookup
    if span == 0:
        span = 1                       # must have at least one char in token
    if span + 1 < len(buf) and buf[span] == '+' and buf[span+1] == ' ':
        span += 1                      # recognize possible prefix

    res = self._scanText(span)         # text matching in various ways
    top = res[0]                       # overall maximum match length
    elems = res[1]                     # any vocabulary element matched
    sfx = res[2]                       # any suffix removed in matching

    if span < top or (span == top and sfx != ''):

        # next word cannot produce token as long as already seen

        if len(elems) > 0:             # any vocabulary matches?
            self.sbu.skip(top)         # if so, they supersede
            if sfx != '':              # handle any suffix removal
                self.sbu.prepend(list(sfx))
        else:
            elems = self.sbu.extract(top)

        tkn = ellyToken.EllyToken(elems)
        self.ctx.addTokenToListing(tkn)
        if sfx != '' and not ellyChar.isApostrophe(sfx[1]):
            tkn.dvdd = True            # must note suffix removal for token!
        return True

    first = self.sbu.buffer[:span]
    cap = ellyChar.isUpperCaseLetter(first[0])
    wrd = u''.join(first)
    found = self.ptr.createPhrasesFromDictionary(wrd.lower(),False,cap)
    if (not found and span > top and span > 1
            and ellyChar.isEmbeddedCombining(wrd[-1])):
        span -= 1                      # retry lookup without trailing combining char
        wrd = wrd[:-1]
        found = self.ptr.createPhrasesFromDictionary(wrd.lower(),False,cap)

    if found or top > 0:               # match found in dictionary or by text scan

        if not found:
            span = top                 # if by text scan, token length is scan length
            wrd = wrd[:span]
        self.sbu.skip(span)
        tkn = ellyToken.EllyToken(wrd[:span])

        if len(sfx) > 1:               # change token to show suffix properly
            sch = sfx[1]               # first char in suffix after '-'
            rt = tkn.root              # this is a list!
            at = -1                    # start at last char in token
            while rt[at] != sch:
                at -= 1
            tkn.root = rt[:len(rt) + at]   # root without suffix
            self.sbu.prepend(sfx)      # restore suffix to input for processing
        elif self.sbu.peek() == '-':   # no suffix; if hyphen next, need to separate it
            self.sbu.skip()
            if ellyChar.isLetter(self.sbu.peek()):
                self.sbu.prepend(' ')
            self.sbu.prepend('-')

        self.ctx.addTokenToListing(tkn)   # add token to listing for sentence
        return True

    hyph = wrd[0]                      # special hyphen check
    if hyph == '-' and span > 1:
        if self.ptr.createPhrasesFromDictionary(hyph,False,False):
            tkn = ellyToken.EllyToken(hyph)   # treat hyphen as token
            self.ctx.addTokenToListing(tkn)   # add it to token list
            self.sbu.skip()                   # remove from input
            return True

    tkn = self._extractToken(top)      # single-word matching with analysis and lookup
    if tkn is None:                    # if no match, we are done and will return
        return top != 0                # still success if _scanText() found something

    if self.ptr.lastph is not None:
        self.ptr.lastph.lens = tkn.getLength()

    self.ctx.addTokenToListing(tkn)    # add token to listing for sentence
    return True                        # successful lookup
def match ( self , token ):

    """
    run an Elly token through the tree and rewrite it for every
    accepted match, repeating while the last action asks for recursion

    arguments:
        self  -
        token - input Elly token

    returns:
        True if any match was acted on, False otherwise
    """

    matched = False                     # set once an action has been taken
    again = True                        # recursion flag
    while again:

        root = token.root
        if len(root) < 3:               # too short to work on
            break

        adjust = 0 if ellyChar.isLetter(root[0]) else -2
        seq = self.sequence(root) + [ Bound ]   # chars to match plus sentinel
        first = seq[0]
        if first not in self.indx:      # no tree branch for first char
            return matched
        node = self.indx[first]         # where to start in tree
        limit = len(seq) + adjust       # longest match possible
        stack = [ ]                     # nodes with actions seen on the way down
        level = 0

        # walk down the tree as far as the sequence allows
        while True:
            if node.actns != None:
                stack.append([ node , level ])
            level += 1
            if level == limit:
                break
            nxt = seq[level]
            if nxt not in node.contn:
                break
            node = node.contn[nxt]

        # try candidate matches, longest first
        accepted = False
        while stack:
            node , depth = stack.pop()
            nom = depth + 1
            uch = seq[nom] if nom < limit else u'_'   # first unmatched char
            con = node.condn                          # acceptance condition
            rest = limit - nom
            rest += node.delta(rest)                  # chars expected after action

            if con != 0 and rest < 3:   # must leave 2 letters plus sentinel
                continue

            if con == 0:                # match requiring no action: all done
                return matched
            if con == 1:                # unconditional
                accepted = True
            elif con == 2:              # next char must be consonant
                accepted = uch != '|' and not ellyChar.isVowel(uch)
            elif con == 3:              # next char must be consonant or U
                accepted = not ellyChar.isStrictVowel(uch)

            if accepted:
                break

        if not accepted:                # nothing acceptable on stack
            return matched

        matched = True
        self.rewrite(token,nom,node)    # apply action of accepted node
        again = node.actns.recur        # does action want another pass?

    return matched
def simpleDeinflection(self, ss, ssp, ssl, mr):
    """
    handle matching of certain forms of English inflectional endings
    (override this method appropriately for other languages)

    self.endg is reset to '' on entry and set to the ending recognized
    ('-s', '-ed', '-ing' or "-'s") when one is found; for apostrophe
    endings the chars of ss at ssp may be rewritten in place

    arguments:
        self  -
        ss    - input chars to scan for match (item assignment is used,
                so this must be a mutable sequence)
        ssp   - current position in input string
        ssl   - limit of matching in input
        mr    - list of chars to look for next in input

    returns:
        inflection char count >= 0 on match, -1 otherwise
    """

    self.endg = ''                       # null inflection by default
    if len(mr) == 0 and ssp == ssl:
        return 0
    if ssp < 2 or ss[ssp - 2] == ' ':    # need 2 chars of the word already matched
        return -1

    ts = ss[ssp:]                        # where to look for inflection
    mc = ss[ssp - 1]                     # last char matched
    lm = len(mr)
    if not ellyChar.isLetter(mc):
        return -1

    dss = ssl - ssp                      # count of extra input chars
    if dss == 0:                         # no more chars in input
        if lm == 0:                      # check for exact match
            return 0
    elif dss == 1:                       # just one char left in input
        if lm != 0:                      # make sure all of pattern matched
            return -1
        elif ts[0] in APOs:
            if mc == 's':                # case of S'
                self.endg = "-'s"
                return 1
            else:
                return 0
        elif ts[0].lower() == 's':
            self.endg = '-s'             # assume extra input S is for plural
            return 1
        elif mc == 'e' and ts[0].lower() == 'd':
            self.endg = '-ed'            # an E was last matched char
            return 1
        elif ts[0] == '.':
            return 0                     # but no inflection
    elif dss == 2:                       # 2 extra chars
        if lm == 0:
            if ts[0].lower() == 'e':
                if ts[1].lower() == 'd':
                    self.endg = '-ed'    # E and D must be inflection
                    return 2
                elif ts[1].lower() == 's':
                    self.endg = '-s'     # assume E is extra
                    return 2
            elif ts[0] in APOs and ts[1].lower() == 's':
                ss[ssp] = "'"            # normalization just in case
                self.endg = "-'s"
                return 2
            elif ts[1] in APOs and ts[0].lower() == 's':
                ss[ssp] = "'"            # reverse letters in next input
                ss[ssp + 1] = "s"
                self.endg = '-s'
                return 0
    elif dss == 3:                       # 3 extra chars
        if ts[0].lower() == 'i':
            if ts[1].lower() == 'e':     # -IES or -IED replacing a final Y
                if lm == 1 and mr[0] == 'y':
                    if ts[2].lower() == 's':
                        self.endg = '-s'
                        return 3
                    elif ts[2].lower() == 'd':
                        self.endg = '-ed'
                        return 3
            elif ts[1].lower() == 'n' and ts[2].lower() == 'g':
                if lm == 0 or lm == 1 and mr[0] == 'e':
                    self.endg = '-ing'
                    return 3
        # doubled last char followed by -ED
        if (lm == 0 and ts[0].lower() == mc and
                ts[1].lower() == 'e' and ts[2].lower() == 'd'):
            self.endg = '-ed'
            return 3
    elif dss == 4:                       # 4 extra chars
        # doubled last char followed by -ING; every letter is compared
        # case-insensitively, as in all the other ending tests
        if (lm == 0 and ts[0].lower() == mc and ts[1].lower() == 'i' and
                ts[2].lower() == 'n' and ts[3].lower() == 'g'):
            self.endg = '-ing'
            return 4
        # -YING replacing a final IE
        if (ts[0].lower() == 'y' and ts[1].lower() == 'i' and
                ts[2].lower() == 'n' and ts[3].lower() == 'g'):
            if lm == 2 and mr[0] == 'i' and mr[1] == 'e':
                self.endg = '-ing'
                return 4

    return -1                            # extra chars not inflection
def scan ( buffr ):

    """
    recognize personal names in text at current position

    Name components are taken from the front of the buffer one at a time
    and classified by table lookup, by the local context list _cntxt, or
    by inference.  Scanning state is shared through the module globals
    _typ (type of the latest component), _nch (its char count) and
    _toscan (a buffer length at or below which scanning may resume).

    arguments:
        buffr - current contents as list of chars

    returns:
        char count matched on success, 0 otherwise
    """

    def doLook ( mth , itm ):
        """
        do lookup with specified method using global variables in Python 2.7.*

        the result is left in _typ; if the lookup only succeeds after a
        final '.' is dropped, _nch is reduced by 1

        arguments:
            mth  - name table method
            itm  - string to look up
        """
        global _typ , _nch              # really need nonlocal
        _typ = mth(itm)
        if _typ < 0 and len(itm) > 3:   # if no match, check for final '.'
            if itm[-1] == '.':
                _typ = mth(itm[:-1])
                if _typ >= 0: _nch -= 1 # match without '.'

    global _typ , _nch
    global _toscan

    bln = len(buffr)
    if _table == None or bln < 2: return 0  # no table or too little text

    if _toscan > 0:                     # scanning currently suppressed?
        if bln > _toscan:
            return 0
        else:
            _toscan = 0

    chx = buffr[0]
    if not ellyChar.isLetter(chx) and chx != '(' and chx != '"': return 0

    cmps = [ ]                          # name components this time
    ncmp = 0                            # number of components for current name
    ninf = 0                            # number inferred
    ntyp = len(nameTable.TYP)
    stat = [False]*ntyp                 # define state for getting personal name
    mlen = 0                            # last match length
    bix = 0                             # buffer index to advance in scanning

    _typ = -1

    while bix < bln:
        ltyp = -1                       # last match type
        _nch = _limit(buffr[bix:],mlen) # length of next possible name component
        if _nch == 0:
            return 0
        elm = _extract(buffr[bix:],_nch)    # get possible component as string
        sch = buffr[bix]
        enclosed = (sch == '(' or sch == '"')   # type of next element

        doLook(_table.lookUp,elm)       # look it up in saved name table
        if _typ < 0:
            if _typ == nameTable.REJ: return 0  # immediate rejection of any match
            if _typ == nameTable.STP: break     # stop any more matching
            if elm[-1] == '.':                  # drop any trailing '.'
                elm = elm[:-1]
                if not enclosed: _nch -= 1
            if enclosed:                        # enclosed element assumed to be name
                if not elm in _cntxt: _cntxt.append(elm)    # make sure always to save in local context
                ninf += 1                       # this is inferred!
            if elm in _cntxt:
                _typ = nameTable.XNM            # neutral name type to be noncommital
            if _typ < 0:
                tok = buffr[bix:bix + _nch]     # unknown token to check
                if infer(tok):
                    _typ = nameTable.XNM        # neutral name type inferred
                    if not _table.checkPhonetic(tok):
                        ninf += 1               # count inferred component if no phonetic support

        if nameTable.starts(_typ) and bix > 0:  # if component not at start of name,
            break                               # must stop name scan

        while _typ >= 0:                # continue as long as match is viable
            ncmp += 1                   # count up component
            cmps.append(elm)            # save component
            bix += _nch                 # move ahead in scan
            if _typ > 0:
                if stat[_typ]:          # check for duplication of component type
                    if (ltyp >= 0 and
                        ltyp != _typ):  # allowed only if duplicate is consecutive
                        break
                mlen = bix              # save index on actual match
                ltyp = _typ
                if nameTable.ends(_typ):    # if component marks end of name,
                    break                   # must stop name scan
                stat[_typ] = True       # update match state

            if bix == bln: break
            if ellyChar.isWhiteSpace(buffr[bix]): bix += 1  # skip any space to start of next component
            _nch = _limit(buffr[bix:],mlen) # length of next possible name component
            if _nch == 0: break
            elm = _extract(buffr[bix:],_nch)    # get possible next component as string
            doLook(_table.lookUpMore,elm)   # look it up in saved name table

        if _typ < 0:                    # while-loop terminated without break
            if ltyp < 0 or mlen == 0: break
            bix = mlen                  # restart at end of last match
            if bix == bln: break
            if ellyChar.isWhiteSpace(buffr[bix]): bix += 1  # skip any space to start of next component
            continue

        break

    #
    #### additional constraints on acceptable personal name
    #

    if (ltyp == nameTable.CNJ or
        ltyp == nameTable.REL):         # a name cannot end with these types
        mlen -= _nch                    # have to drop them from any match
        if mlen == 0: return 0
        if ellyChar.isWhiteSpace(buffr[mlen-1]): mlen -= 1
        ncmp -= 1
        cmps.pop()

    if ncmp == 0:                       # nothing matched?
        _planAhead(buffr)               # check for possible problems in next scan
        return 0

    if ncmp == ninf: return 0           # name cannot be purely inferred

    if ncmp == 1:                       # single-component name must be known or contextual
        if (not stat[nameTable.SNG] and
            not cmps[0] in _cntxt):
            return 0

    expl = (stat[nameTable.PNM] or      # name must have a substantial component
            stat[nameTable.SNM] or
            stat[nameTable.XNM] or
            stat[nameTable.SNG])

    if (not expl and
        not (stat[nameTable.TTL] and    # or it could have just a title
             stat[nameTable.INI])):     # and an initial
        return 0

    #
    ####
    #

    for cmpo in cmps:                   # if whole name is OK,
        if not cmpo in _cntxt:          # remember all components
            _cntxt.append(cmpo)         # not already listed in context

    return mlen                         # will be > 0 on successful match
def _matchAN ( self , ts ):

    """
    apply logic for alphanumeric date recognition

    Three layouts are tried in order:
        MONTH DAY[,] [YEAR[,]]  or  MONTH YEAR
        DAY [of] MONTH[,] [YEAR[,]]
        YEAR alone
    Recognition of the parts is delegated to self._aMonth(), self._aDay()
    and self._aYear(), each returning a char count (0 for no match).

    arguments:
        self  -
        ts    - text stream as list of chars

    returns:
        total number of chars matched, 0 if no date is found
    """

    t = ts
    tl = len(ts)
    comma = False                   # comma seen after day or month?

    k = self._aMonth(t)             # look for month to start date string
    if k > 0:
        if k == tl: return 0
        if not ellyChar.isWhiteSpace(t[k]): return 0
        k += 1                      # skip space after month
        if k == tl: return 0
        t = t[k:]
        k = self._aDay(t)           # look for day of month
        if k == 0:
            self._dy = [ ]
            k = self._aYear(t)      # look for year immediately following
            if k > 0:
                return tl - len(t) + k
            else:
                return 0
        tl = len(t)                 # _aDay may have rewritten alphabetic day
        t = t[k:]
        if len(t) == 0:             # no year
            return len(ts) - tl + k
        if t[0] == u',':            # look for comma after day
            t = t[1:]               # if found, remove and note
            comma = True
        # tl now holds only the length of what followed the month, so the
        # count for a stream used up by separators has to be len(ts),
        # which is also what the final formula below gives for empty t
        if len(t) == 0: return len(ts)
        if ellyChar.isWhiteSpace(t[0]): t = t[1:]
        if len(t) == 0: return len(ts)
        k = self._aYear(t)          # look for year
        lnt = len(t)
        if comma and k < lnt and t[k] == ',': k += 1    # remove comma after year if paired
        return len(ts) - len(t) + k

    k = self._aDay(t)               # look for day of month to start date string
    if k == 0:
        self._dy = [ ]
    elif k > 0 and k < tl:          # cannot be just bare number by itself
        tl = len(ts)                # _aDay may have rewritten alphabetic day
        t = t[k:]
        if (k > 2 and len(t) > 2 and t[0] == u' ' and
            t[1].upper() == 'O' and t[2].upper() == 'F'):
            t = t[3:]               # to handle day reference like '4th of'
        if len(t) == 0: return 0
        if not ellyChar.isWhiteSpace(t[0]): return 0
        t = t[1:]
        k = self._aMonth(t)         # look for month
        if k == 0: return 0
        t = t[k:]
        if len(t) == 0: return tl
        ntl = tl - len(t)           # chars matched through end of month
        nd = 0                      # separators skipped after month
        if t[0] == u',':            # look for comma after month
            t = t[1:]
            if len(t) == 0: return tl
            nd += 1
            comma = True
        if ellyChar.isWhiteSpace(t[0]):
            t = t[1:]
            if len(t) == 0: return tl
            nd += 1
        k = self._aYear(t)          # look for year
        if k > 0:
            if comma and k < len(t) and t[k] == ',': k += 1
            return ntl + k + nd     # full date found
        else:
            # only day and month found: ntl was taken before the separators
            # were skipped, so nothing is to be subtracted from it
            return ntl

    k = self._aYear(t)              # look for year only
    if k > 0:
        if k == tl:
            return k
        elif not ellyChar.isLetter(t[k]) and t[k] != '-':
            return k

    return 0                        # nothing found
def __init__(self, inpr):
    """
    define table from text input

    Each input line is lowercased and read as one of
        =PHONETIC          a phonetic entry, saved in self.phone
        COMPONENT : TYPE   a name component with its type
    A single-word component may begin or end with '-' or '+' to define
    an ending or a beginning of a name instead of a whole word.

    Problems are reported through self._err(), defined outside this
    block (presumably it counts them in self._nerr -- confirm there).

    arguments:
        self  -
        inpr  - EllyDefinitionReader

    throws:
        TableFailure on table definition failure
    """

    self.pres = {}      # name beginnings, keyed by their first 2 chars
    self.posts = {}     # name endings, keyed by their last 2 chars
    self.dictn = {}     # name component -> type code
    self.phone = []     # phonetic entries
    self.compn = ''
    self._nerr = 0

    while True:
        lin = inpr.readline().lower()   # ignore capitalization
        if len(lin) == 0:
            break

        if lin[0] == '=':               # phonetic entry?
            lin = lin[1:]               # if so, remove marker
            if len(lin) == 0:           # nothing after marker: report it
                self._err('empty phonetic entry', '=')   # instead of failing on lin[0]
                continue
            first = ''
            if lin[0] == 'a':           # vowel is first?
                first = 'a'             # if so, remove it
                lin = lin[1:]
            pho = first + lin.upper()   # combine any vowel with uppercase rest
            self.phone.append(pho)      # save in phonetic list
            continue

        lins = lin.strip().split(':')
        if len(lins) != 2:              # type definition must have two parts
            self._err(lne=lin)
            continue
        typ = lins[1].strip()           # get component type
        if not typ in TYP:
            self._err('bad name component type', lin)
            continue
        cod = TYP[typ]

        els = lins[0].strip().split(' ')    # name component
        lim = len(els)
        if lim == 1:
            cmpo = els[0]
            if len(cmpo) == 0:          # nothing before ':' to define
                self._err('missing name component', lin)
                continue
            chf = cmpo[0]               # first char of component
            chl = cmpo[-1]              # last char
            if chf == '-' or chf == '+':
                if not ellyChar.isLetter(chl) or len(cmpo) < 3:
                    self._err('bad end of name', lin)
                    continue
                dky = cmpo[-2:]         # dictionary key is 2 chars only
                if not dky in self.posts:
                    self.posts[dky] = []
                self.posts[dky].append([cmpo[1:], cod, (chf == '+')])
            elif chl == '-' or chl == '+':
                if not ellyChar.isLetter(chf) or len(cmpo) < 3:
                    self._err('bad start of name', lin)
                    continue
                dky = cmpo[:2]          # dictionary key is 2 chars only
                if not dky in self.pres:
                    self.pres[dky] = []
                self.pres[dky].append([cmpo[:-1], cod, (chl == '+')])
            else:
                self.dictn[cmpo] = cod
                if cmpo[-1] == '.':     # if ending with '.', also save without
                    self.dictn[cmpo[:-1]] = cod
            continue

        # multi-word component: enter every leading part as conditional
        # so that a lookup can proceed one word at a time
        for nix in range(1, lim + 1):
            cmpo = ' '.join(els[0:nix])     # first nix elements
            if cmpo not in self.dictn:
                self.dictn[cmpo] = CND
        if self.dictn[cmpo] != CND:         # full component already typed?
            self._err('name component redefined', lin)
            continue
        self.dictn[cmpo] = cod              # put into table

    if self._nerr > 0:
        # write() gives the same text as the Python 2 print statement
        # and also works under Python 3
        sys.stderr.write('** ' + str(self._nerr) + ' name errors in all\n')
        sys.stderr.write('name table definition FAILed\n')
        raise ellyException.TableFailure
def match ( patn , text , offs=0 , limt=None , nsps=0 ):

    """
    compare a pattern with wildcards to input text

    Literal pattern chars are compared one at a time (also against the
    lowercased text char); on a mismatch the pattern element is handled
    as a wildcard, and if that fails too, matching backs up through the
    saved unwinding records until none is left.

    arguments:
        patn  - pattern to matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit for any matching
        nsps  - number of spaces to match in pattern

    returns:
        bindings if match is successful, None otherwise

        the bindings are a list with the text offset reached by the match
        as element 0, followed by [start, end] offset pairs for wildcards
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
            nsps   - saved count of spaces matched
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - of winding
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1
            self.nsps  = 0

        def __unicode__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self
            returns:
                attributes as array
            """
            return ( '[kd=' + unicode(self.kind)  +
                     ',ct=' + unicode(self.count) +
                     ',pa=' + unicode(self.pats)  +
                     ',tx=' + unicode(self.txts)  +
                     ',bd=' + unicode(self.bnds)  +
                     ',ns=' + unicode(self.nsps)  + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # four private functions using local variables of match() defined just above
    # (they read offs, mp, mbi and unj at call time; the callers advance
    #  mbi and unj themselves right after each call)
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame

        the frame is bound to ns chars starting at the char before offs

        arguments:
            ns  - optional initial span of text for binding
        returns:
            binding frame
        """
        os = offs - 1
        if ns == None: ns = 1       # by default, binding is to 1 char
        if mbi == len(mbd):         # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]               # next available record
        bf[0] = os                  # set binding to range of chars
        bf[1] = os + ns             #
        return bf

    def _modify ( ):
        """
        set special tag for binding

        a third element is added to the current frame; frames of length 3
        are kept separate when bindings are consolidated at the end

        arguments:
        """
        mbd[mbi].append(None)

    def _mark ( kind , nsp ):
        """
        set up for backing up pattern match

        arguments:
            kind  - 0=optional 1=* wildcard
            nsp   - number of spaces in pattern still unmatched
        returns:
            unwinding frame
        """
        if unj == len(unw):         # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]               # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp
        uf.txts  = offs
        uf.bnds  = mbi
        uf.nsps  = nsp
        return uf

    def _span ( typw , nsp=0 ):
        """
        count chars available for wildcard match

        arguments:
            typw  - wildcard
            nsp   - spaces to be matched in pattern
        returns:
            non-negative count if any match possible, otherwise -1
        """
        k = minMatch(patn[mp:])     # calculate min char count to match rest of pattern

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findExtendedBreak(text,offs,nsp)
        mx -= k                     # max span reduced by exclusion
        if mx < 0: return -1        # cannot match if max span < 0

        tfn = Matching[typw]        # matchup function for wildcard type

        nm = 0
        for i in range(mx):
            c = text[offs+i]        # next char in text from offset
            if not tfn(c): break    # stop when it fails to match
            nm += 1

        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False     # successful pattern match yet?

    if limt == None: limt = len(text)

    mp = 0              # pattern index
    ml = len(patn)      # pattern match limit
    last = ''           # text char taken for the latest comparison, '' at limit

    while True:

        ## literally match as many next chars as possible

        while mp < ml:
            if offs >= limt:
                last = ''
            elif limt == 0:
                break
            else:
                last = text[offs]
                offs += 1
            mc = patn[mp]
            if mc != last:
                if mc != last.lower():
                    # a pattern hyphen may also match ' - ' in text
                    if mc == Hyphn and last == ' ' and limt - offs > 2:
                        if text[offs] != Hyphn or text[offs+1] != ' ':
                            break
                        offs += 2
                    else:
                        break
            mp += 1

        ## check whether mismatch is due to special pattern char

        if mp >= ml:        # past end of pattern?
            matched = True  # if so, match is made
            break

        tc = patn[mp]       # otherwise, get unmatched pattern element
        mp += 1             #

        if tc == cALL:      # a * wildcard?
            if last != '': offs -= 1

            nm = _span(cALL,nsps)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1    # get new binding record
            bf[0] = offs                # bind from current offset
            offs += nm                  # move offset past end of span
            bf[1] = offs                # bind to new offset
            uf = _mark(1,nsps); unj += 1    # get new unwinding record
            uf.count = nm               # can back up this many times on mismatch
            continue

        elif tc == cEND:    # end specification
            if last == '':
                continue
            elif last == '-':
                offs -= 1
                continue
            elif last in [ '.' , ',' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]
                if ellyChar.isWhiteSpace(txc) or txc in Trmls or txc in Grk:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' , ellyChar.HYPH ]:
                offs -= 1
                continue

        elif tc == cANY:    # alphanumeric wildcard?
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cCAN:    # nonalphanumeric wildcard?
            if last != ellyChar.AMP:
                if last == '' or not ellyChar.isLetterOrDigit(last):
                    _bind(); mbi += 1
                    continue

        elif tc == cDIG:    # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF:    # letter wildcard?
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cUPR:    # uppercase letter wildcard?
            if last != '' and ellyChar.isUpperCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cLWR:    # lowercase letter wildcard?
            if last != '' and ellyChar.isLowerCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC:    # space wildcard?
            if last != '' and ellyChar.isWhiteSpace(last):
                nsps -= 1
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cAPO:    # apostrophe wildcard?
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS:    # start of optional sequence
            mf = _bind(0); mbi += 1     # dummy record to block
            mf[0] = -1                  #   later binding consolidation
            if last != '':
                offs -= 1               # try for rematch
            m = mp                      # find corresponding EOS
            while m < ml:               #
                if patn[m] == cEOS: break
                m += 1
            else:                       # no EOS?
                m -= 1                  # if so, pretend there is one anyway
            uf = _mark(0,nsps); unj += 1    # for unwinding on any later match failure
            uf.pats = m + 1             # i.e. one char past next EOS
            uf.txts = offs              # start of text before optional match
            continue

        elif tc == cEOS:    # end of optional sequence
            if last != '':
                offs -= 1               # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:    # spanning wildcards
            if last != '':              # still more to match?
                offs -= 1

                nm = _span(tc,nsps)     # maximum match possible

                if nm == 0:                             # compensate for findExtendedBreak peculiarity
                    if offs + 1 < limt and mp < ml:     # with closing ] or ) to be matched in pattern
                        if patn[mp] in Enc:             # from text input
                            nm += 1

                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs        # bind from current offset
                    offs += nm          # move offset past end of span
                    bf[1] = offs        # bind to new offset
                    uf = _mark(1,nsps); unj += 1
                    uf.count = nm - 1   # at least one char must be matched
                    last = text[offs] if offs < limt else ''
                    continue

        elif tc == '':
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True          # successful match
                break

        ## match failure: rewind to last match branch
        ##

        while unj > 0:                  # try unwinding, if possible
            uf = unw[unj-1]             # get most recent unwinding record
            if uf.count <= 0:           # if available count is used up,
                unj -= 1                # go to next unwinding record
                continue
            uf.count -= 1               # decrement available count

            uf.txts -= uf.kind          # back up one char for scanning text input
            mp = uf.pats                # unwind pattern pointer
            offs = uf.txts              # unwind text input
            mbi = uf.bnds               # restore binding
            mbd[mbi-1][1] -= uf.kind    # reduce span of binding if for wildcard
            nsps = uf.nsps              #
            break
        else:
            break                       # quit if unwinding is exhausted

    ##
    ## clean up on match mode or on no match possible
    ##

    if not matched: return None         # no bindings

    ## consolidate contiguous bindings for subsequent substitutions

    mbdo = mbd
    lb  = -1               # binding reference
    lbd = [ 0 , -1 ]       # sentinel value, not real binding
    mbd = [ lbd ]          # initialize with new offset after matching
    mbdo.pop(0)            # ignore empty binding
    while len(mbdo) > 0:   #
        bd = mbdo.pop(0)   # get next binding
        if len(bd) > 2:    # frame tagged by _modify(): keep it by itself
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb = -1
        elif bd[0] < 0:    # check for optional match indicator here
            lb = -1        # if so, drop from new consolidated bindings
        elif lb == bd[0]:  # check for binding continuous with previous
            lb = bd[1]     #
            lbd[1] = lb    # if so, combine with previous binding
        else:              #
            mbd.append(bd) # otherwise, add new binding
            lbd = bd       #
            lb = bd[1]     #

    mbd[0] = offs          # replace start of bindings with text length matched

    return mbd             # consolidated bindings plus new offset
def match ( self , txt , pnc , ctx ):

    """
    compare a punctuation mark and its context with a pattern

    The patterns are taken from self.lstg[pnc]; each has a left part to
    be matched against the text just before the punctuation and a right
    part whose first element is checked against the second char of the
    context after it.

    arguments:
        self  -
        txt   - list of text chars leading up to punctuation char
        pnc   - punctuation char
        ctx   - next chars after punctuation

    returns:
        True on match, False otherwise
    """

    if matchtoo(txt,pnc,ctx):       # exception by complex match?
        return True

    sep = ctx[0] if len(ctx) > 0 else ''
    if sep == ellyChar.THS:
        return True
    nxt = ctx[1] if len(ctx) > 1 else ''

    if not pnc in self.lstg:        # get stored patterns for punctuation
        return False
    lp = self.lstg[pnc]

    ltx = len(txt)                  # current length of accumulated text so far

    nrg = 1                         # first count back over letters and digits
    while nrg <= ltx:
        if not ellyChar.isLetterOrDigit(txt[-nrg]):
            break
        nrg += 1
    while nrg <= ltx:               # then also over embedded combining chars
        c = txt[-nrg]
        if not ellyChar.isLetterOrDigit(c) and not ellyChar.isEmbeddedCombining(c):
            break
        nrg += 1
    nrg -= 1                        # end of range for all pattern matching

    # NOTE(review): when nrg is 0, txt[-0:] keeps the whole text instead
    # of none of it -- confirm that this is intended
    txt = txt[-nrg:]                # reset text to limit for matching
    ltx = len(txt)                  # its new length

    for p in lp:                    # try matching each listed exception pattern

        if p.left != None and len(p.left) > 0:

            pat = p.left
            star = pat[-1] == ellyWildcard.cALL
            n = len(pat)            # each pattern element matches one sequence char
            if star:                # except for a final wildcard *
                n -= 1
                if ltx < n:
                    continue        # cannot match pattern properly
                pat = pat[:-1]
                t = txt[:n]
            else:
                if ltx < n:
                    continue        # cannot match pattern properly
                t = txt[-n:]

            if not ellyWildcard.match(pat,t,0):
                continue

            k = ltx - n             # extra chars beyond any match
            if not star and k > 0:
                if ellyChar.isLetterOrDigit(txt[-n]):
                    c = txt[-n-1]
                    if ellyChar.isLetterOrDigit(c) or c == '&':
                        continue    # because break in text is required

        rp = p.right
        if rp == [] or rp[0] == ellyWildcard.cALL:
            return True
        pcx = rp[0]
        if pcx == nxt:                      # check for specific char after possible stop
            return True
        elif pcx == ellyWildcard.cALF:      # check for alphabetic
            if ellyChar.isLetter(nxt):
                return True
        elif pcx == ellyWildcard.cDIG:      # check for numeric
            if ellyChar.isDigit(nxt):
                return True
        elif pcx == ellyWildcard.cUPR:      # check for upper case
            if ellyChar.isUpperCaseLetter(nxt):
                return True
        elif pcx == ellyWildcard.cLWR:      # check for lower case
            if ellyChar.isLowerCaseLetter(nxt):
                return True
        elif pcx == ellyWildcard.cCAN:      # check for non-alphanumeric
            # this used to test for a letter, the exact opposite of what
            # the wildcard stands for; no next char at all also qualifies
            if nxt == '' or not ellyChar.isLetterOrDigit(nxt):
                return True

    return False                    # no matches
def _matchAN ( self , ts ):

    """
    apply logic for alphanumeric date recognition

    Three layouts are tried in order:
        MONTH DAY[,] [YEAR]
        DAY [of] MONTH[,] [YEAR]
        YEAR alone
    Recognition of the parts is delegated to self._aMonth(), self._aDay()
    and self._aYear(), each returning a char count (0 for no match).

    arguments:
        self  -
        ts    - text stream as list of chars

    returns:
        total number of chars matched, 0 if no date is found
    """

    t = ts
    tl = len(ts)

    k = self._aMonth(t)             # look for month to start date string
    if k > 0:
        if k == tl: return 0
        if not ellyChar.isWhiteSpace(t[k]): return 0
        k += 1                      # skip space after month
        if k == tl: return 0
        t = t[k:]
        k = self._aDay(t)           # look for day of month
        if k == 0: return 0
        tl = len(ts)                # _aDay may have rewritten alphabetic day
        t = t[k:]
        # month and day ending the stream are a match of the whole stream,
        # just as when only a comma or a space follows the day
        if len(t) == 0: return tl
        if t[0] == u',': t = t[1:]  # look for comma after day
        if len(t) == 0: return tl
        if ellyChar.isWhiteSpace(t[0]): t = t[1:]
        if len(t) == 0: return tl
        k = self._aYear(t)          # look for year
        return tl - len(t) + k

    k = self._aDay(t)               # look for day of month to start date string
    if k > 0 and k < tl:            # cannot be just bare number by itself
        tl = len(ts)                # _aDay may have rewritten alphabetic day
        t = t[k:]
        if (k > 2 and len(t) > 2 and t[0] == u' ' and
            t[1].upper() == 'O' and t[2].upper() == 'F'):
            t = t[3:]               # to handle day reference like '4th of'
        if len(t) == 0: return 0
        if not ellyChar.isWhiteSpace(t[0]): return 0
        t = t[1:]
        k = self._aMonth(t)         # look for month
        if k == 0: return 0
        t = t[k:]
        if len(t) == 0: return tl
        ntl = tl - len(t)           # chars matched through end of month
        nd = 0                      # separators skipped after month
        if t[0] == u',':            # look for comma after month
            t = t[1:]
            if len(t) == 0: return tl
            nd += 1
        if ellyChar.isWhiteSpace(t[0]):
            t = t[1:]
            if len(t) == 0: return tl
            nd += 1
        k = self._aYear(t)          # look for year
        if k > 0:
            return ntl + k + nd     # full date found
        else:
            # only day and month found: ntl was taken before the separators
            # were skipped, so nothing is to be subtracted from it
            return ntl

    k = self._aYear(t)              # look for year only
    if k > 0:
        if k == tl:
            return k
        elif not ellyChar.isLetter(t[k]) and t[k] != '-':
            return k

    return 0                        # nothing found
def _matchAN(self, ts):
    """
    recognize an alphanumeric date at the start of a text stream

    four forms are tried in order:
        month day[,] [year[,]]
        month year
        day [of] month[,] [year[,]]
        year by itself

    arguments:
        self  -
        ts    - text stream as list of chars

    returns:
        total number of chars matched, 0 when no date is recognized
    """

    rest = ts
    total = len(ts)
    paired = False                      # set when a comma precedes the year

    # forms starting with a month
    n = self._aMonth(rest)
    if n > 0:
        if n == total or not ellyChar.isWhiteSpace(rest[n]):
            return 0
        n += 1                          # step over space after month
        if n == total:
            return 0
        rest = rest[n:]

        n = self._aDay(rest)
        if n == 0:                      # no day, so try for year right after month
            self._dy = []
            n = self._aYear(rest)
            if n > 0:
                return total - len(rest) + n
            return 0

        afterMonth = len(rest)          # length of text from day onward
        rest = rest[n:]
        if len(rest) == 0:
            return len(ts) - afterMonth + n

        if rest[0] == u',':             # optional comma after day
            rest = rest[1:]
            paired = True
        # NOTE(review): the next two returns give the length of the text
        # after the month, not a total match count -- confirm this is intended
        if len(rest) == 0:
            return afterMonth
        if ellyChar.isWhiteSpace(rest[0]):
            rest = rest[1:]
        if len(rest) == 0:
            return afterMonth

        n = self._aYear(rest)
        if paired and n < len(rest) and rest[n] == ',':
            n += 1                      # take comma after year if paired with earlier one
        return len(ts) - len(rest) + n

    # form starting with a day of month
    n = self._aDay(rest)
    if n == 0:
        self._dy = []
    elif 0 < n < total:                 # cannot be just bare number by itself
        total = len(ts)                 # recomputed, as _aDay may have rewritten alphabetic day
        rest = rest[n:]

        followedByOf = (n > 2 and len(rest) > 2 and rest[0] == u' ' and
                        rest[1].upper() == 'O' and rest[2].upper() == 'F')
        if followedByOf:
            rest = rest[3:]             # day reference like '4th of'

        if len(rest) == 0 or not ellyChar.isWhiteSpace(rest[0]):
            return 0
        rest = rest[1:]

        n = self._aMonth(rest)          # month has to follow
        if n == 0:
            return 0
        rest = rest[n:]
        if len(rest) == 0:
            return total

        upToMonth = total - len(rest)   # chars matched through month
        skipped = 0                     # separator chars between month and year

        if rest[0] == u',':             # optional comma after month
            rest = rest[1:]
            if len(rest) == 0:
                return total
            skipped += 1
            paired = True
        if ellyChar.isWhiteSpace(rest[0]):
            rest = rest[1:]
            if len(rest) == 0:
                return total
            skipped += 1

        n = self._aYear(rest)
        if n > 0:
            if paired and n < len(rest) and rest[n] == ',':
                n += 1                  # take comma after year if paired with earlier one
            return upToMonth + n + skipped   # full date found
        # NOTE(review): separators are subtracted here, not ignored -- confirm
        return upToMonth - skipped           # only month and day of date found

    # year only, which must not run into a letter or a hyphen
    n = self._aYear(rest)
    if n > 0 and (n == total or
                  (not ellyChar.isLetter(rest[n]) and rest[n] != '-')):
        return n

    return 0                            # nothing found
def lookUp ( self , chrs , keyl ):

    """
    look for terms in vocabulary at current text position

    only the longest matches are kept: a longer match discards all
    shorter ones found so far, and equally long matches accumulate

    arguments:
        self  -
        chrs  - text char list
        keyl  - number of initial chars to use as a DB key

    returns:
        list of Result objects for the longest matches, possibly empty
    """

    res = [ ]                        # result list initially empty
    rln = 0                          # longest match length seen so far

    if len(chrs) == 0 or keyl < 1:
        return res                   # nothing to look up

    strg = toKey(chrs[:keyl])        # DB key from first keyl chars

    vs = self._getDB(strg)           # candidate vocabulary entries for key
    if vs is None or len(vs) == 0:
        return res

    lm = len(chrs)                   # total length of text for lookup

    for v in vs:                     # look at possible vocabulary matches

        if v.length() > lm:          # entry is longer than available text
            continue

        nm = self.doMatchUp(v.chs,chrs)
        if nm == 0 or nm < rln:      # no match or shorter than current best
            continue

        if rln < nm:                 # longer match than before?
            res = [ ]                # if so, start new result list
            rln = nm                 # and raise minimum match length

        # self.endg is read only after doMatchUp() has run for this entry
        res.append(Result(v,nm,self.endg))

    return res                       # surviving matches
def match ( patn , text , offs=0 , limt=None ):

    """
    compare a pattern with wildcards to input text

    literal pattern chars are compared against lowercased text chars;
    other pattern elements are interpreted as wildcards, with backtracking
    through saved unwinding frames when a later part of the pattern fails

    arguments:
        patn  - pattern to be matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit of matching, defaulting to length of text

    returns:
        on success, a list whose first element is the text offset reached
        and whose other elements are [start,end] wildcard bindings;
        None otherwise
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - of winding
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1

        def __unicode__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self

            returns:
                attributes formatted as a single string
            """
            return ( '[kd=' + unicode(self.kind)  +
                     ',ct=' + unicode(self.count) +
                     ',pa=' + unicode(self.pats)  +
                     ',tx=' + unicode(self.txts)  +
                     ',bd=' + unicode(self.bnds)  + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # four private functions reading local variables of match()
    # (they only read mbi, unj, mp, offs; the main loop updates those)
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame

        the frame is preset to start one char before the current offset,
        i.e. at the text char just consumed by the main loop

        arguments:
            ns  - optional initial span of text for binding

        returns:
            binding frame
        """
        os = offs - 1
        if ns == None: ns = 1       # by default, binding is to 1 char
        if mbi == len(mbd):         # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]               # next available record
        bf[0] = os                  # set binding to range of chars
        bf[1] = os + ns             #
        return bf

    def _modify ( ):
        """
        set special tag for binding

        a third element is appended to the current frame; the final
        consolidation keeps frames with more than two elements separate

        arguments:
        """
        mbd[mbi].append(None)

    def _mark ( kind ):
        """
        set up for backing up pattern match

        arguments:
            kind  - 0=optional 1=* wildcard

        returns:
            unwinding frame recording current pattern, text, and binding indices
        """
        if unj == len(unw):         # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]               # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp
        uf.txts  = offs
        uf.bnds  = mbi
        return uf

    def _span ( typw ):
        """
        count chars available for wildcard match

        arguments:
            typw  - wildcard

        returns:
            non-negative count if any match possible, otherwise -1
        """
        k = minMatch(patn[mp:])     # calculate min char count to match rest of pattern

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findBreak(text,offs) - k  # max span reduced by exclusion
        if mx < 0: return -1        # cannot match if max span < 0

        tfn = Matching[typw]        # char type matching a wildcard

        nm = 0
        for i in range(mx):
            c = text[offs+i]        # next char in text from offset
            if not tfn(c): break    # stop when it fails to match
            nm += 1

        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False  # successful pattern match?

    if limt == None: limt = len(text)

    mp = 0           # pattern index
    ml = len(patn)   # pattern match limit

    while True:

        ## literally match as many next chars as possible

        while mp < ml:
            if offs >= limt:
                last = ''           # empty string marks end of text
            else:
                last = text[offs].lower()
                offs += 1           # offs is now one past the char in last
            if patn[mp] != last: break
            mp += 1

        ## check whether mismatch is due to special pattern char

        if mp >= ml:        # past end of pattern?
            matched = True  # if so, match is made
            break

        tc = patn[mp]       # otherwise, get unmatched pattern element
        mp += 1             #

        if tc == cALL:      # a * wildcard?

            if last != '': offs -= 1    # put back char consumed by literal loop

            nm = _span(cALL)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1    # get new binding record
            bf[0] = offs                # bind from current offset
            offs += nm                  # move offset past end of span
            bf[1] = offs                # bind to new offset
            uf = _mark(1); unj += 1     # get new unwinding record
            uf.count = nm               # can back up this many times on mismatch
            continue

        elif tc == cEND:    # end specification
            if last == '':
                continue
            elif last in [ '.' , ',' , '-' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]    # char after the punctuation
                if ellyChar.isWhiteSpace(txc) or txc in Trmls:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' ]:
                offs -= 1
                continue

        elif tc == cANY:    # alphanumeric wildcard?
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cDIG:    # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF:    # letter wildcard?
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC:    # space wildcard?
            if last != '' and ellyChar.isWhiteSpace(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cAPO:    # apostrophe wildcard?
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS:    # start of optional pattern segment
            mf = _bind(0); mbi += 1      # dummy record to block
            mf[0] = -1                   #   later binding consolidation
            if last != '':
                offs -= 1                # try for rematch
            m = mp                       # find corresponding EOS
            while m < ml:                #
                if patn[m] == cEOS: break
                m += 1
            else:                        # no EOS?
                m -= 1                   # if so, pretend there is one anyway
            uf = _mark(0); unj += 1      # for unwinding on any later match failure
            uf.pats = m + 1              # i.e. one char past next EOS
            uf.txts = offs               # start of text before optional match
            continue

        elif tc == cEOS:    # end of optional pattern segment
            if last != '':
                offs -= 1                # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:    # spanning wildcards
            if last != '':               # still more to match?
                offs -= 1

                nm = _span(tc)           # maximum match possible

                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs         # bind from current offset
                    offs += nm           # move offset past end of span
                    bf[1] = offs         # bind to new offset
                    uf = _mark(1); unj += 1
                    uf.count = nm - 1    # at least one char must be matched
                    continue

        elif tc == '':
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True           # successful match
                break

        ## match failure: rewind to last match branch
        ## (reached whenever a branch above did not continue or break)

        while unj > 0:                   # try unwinding, if possible
            uf = unw[unj-1]              # get most recent unwinding record
            if uf.count <= 0:            # if available count is used up,
                unj -= 1                 # go to next unwinding record
                continue
            uf.count -= 1                # decrement available count

            uf.txts -= uf.kind           # back up one char for scanning text input
            mp = uf.pats                 # unwind pattern pointer
            offs = uf.txts               # unwind text input
            mbi = uf.bnds                # restore binding
            mbd[mbi-1][1] -= uf.kind     # reduce span of binding if for wildcard
            break
        else:
            break                        # quit if unwinding is exhausted

    ##
    ## clean up on match mode or on no match possible
    ##

    if not matched: return None     # no bindings

    ## consolidate contiguous bindings for subsequent substitutions

    mbdo = mbd
    lb  = -1               # binding reference
    lbd = [ 0 , -1 ]       # sentinel value, not real binding
    mbd = [ lbd ]          # initialize with new offset after matching
    mbdo.pop(0)            # ignore empty binding
    while len(mbdo) > 0:   #
        bd = mbdo.pop(0)   # get next binding
        if len(bd) > 2:    # tagged by _modify(): strip tag, keep separate
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb = -1
        elif bd[0] < 0:    # check for optional match indicator here
            lb = -1        # if so, drop from new consolidated bindings
        elif lb == bd[0]:  # check for binding continuous with previous
            lb = bd[1]     #
            lbd[1] = lb    # if so, combine with previous binding
        else:              #
            mbd.append(bd) # otherwise, add new binding
            lbd = bd       #
            lb = bd[1]     #

    mbd[0] = offs          # replace start of bindings with text length matched

    return mbd             # consolidated bindings plus new offset
def match ( patn , text , offs=0 , limt=None , nsps=0 ):

    """
    compare a pattern with wildcards to input text

    literal pattern chars are compared against text chars, also accepting
    the lowercased text char; other pattern elements are interpreted as
    wildcards, with backtracking through saved unwinding frames when a
    later part of the pattern fails

    arguments:
        patn  - pattern to be matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit for any matching, defaulting to length of text
        nsps  - number of spaces to match in pattern

    returns:
        on success, a list whose first element is the text offset reached
        and whose other elements are [start,end] wildcard bindings;
        None otherwise
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
            nsps   - saved count of spaces matched
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - of winding
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1
            self.nsps  = 0

        def __str__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self

            returns:
                attributes formatted as a single string
            """
            return ( '[kd=' + str(self.kind)  +
                     ',ct=' + str(self.count) +
                     ',pa=' + str(self.pats)  +
                     ',tx=' + str(self.txts)  +
                     ',bd=' + str(self.bnds)  +
                     ',ns=' + str(self.nsps)  + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # four private functions using local variables of match() defined just above
    # (they only read mbi, unj, mp, offs; the main loop updates those)
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame

        the frame is preset to start one char before the current offset,
        i.e. at the text char just consumed by the main loop

        arguments:
            ns  - optional initial span of text for binding

        returns:
            binding frame
        """
        os = offs - 1
        if ns == None: ns = 1       # by default, binding is to 1 char
        if mbi == len(mbd):         # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]               # next available record
        bf[0] = os                  # set binding to range of chars
        bf[1] = os + ns             #
        return bf

    def _modify ( ):
        """
        set special tag for binding

        a third element is appended to the current frame; the final
        consolidation keeps frames with more than two elements separate

        arguments:
        """
        mbd[mbi].append(None)

    def _mark ( kind , nsp ):
        """
        set up for backing up pattern match

        arguments:
            kind  - 0=optional 1=* wildcard
            nsp   - number of spaces in pattern still unmatched

        returns:
            unwinding frame recording current pattern, text, binding, and space state
        """
        if unj == len(unw):         # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]               # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp
        uf.txts  = offs
        uf.bnds  = mbi
        uf.nsps  = nsp
        return uf

    def _span ( typw , nsp=0 ):
        """
        count chars available for wildcard match

        arguments:
            typw  - wildcard
            nsp   - spaces to be matched in pattern

        returns:
            non-negative count if any match possible, otherwise -1
        """
        k = minMatch(patn[mp:])     # calculate min char count to match rest of pattern

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findExtendedBreak(text,offs,nsp)
        mx -= k                     # max span reduced by exclusion
        if mx < 0: return -1        # cannot match if max span < 0

        tfn = Matching[typw]        # matchup function for wildcard type

        nm = 0
        for i in range(mx):
            c = text[offs+i]        # next char in text from offset
            if not tfn(c): break    # stop when it fails to match
            nm += 1

        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False  # successful pattern match yet?

    if limt == None: limt = len(text)

    mp = 0           # pattern index
    ml = len(patn)   # pattern match limit
    last = ''        # most recent text char taken; empty string marks end of text

    while True:

        ## literally match as many next chars as possible

        while mp < ml:
            if offs >= limt:
                last = ''
            elif limt == 0:
                break
            else:
                last = text[offs]
                offs += 1           # offs is now one past the char in last
            mc = patn[mp]
            if mc != last:
                if mc != last.lower():
                    # a pattern hyphen may match ' - ' in the text when
                    # more than 2 chars remain before the limit
                    if mc == Hyphn and last == ' ' and limt - offs > 2:
                        if text[offs] != Hyphn or text[offs+1] != ' ':
                            break
                        offs += 2
                    else:
                        break
            mp += 1

        ## check whether mismatch is due to special pattern char

        if mp >= ml:        # past end of pattern?
            matched = True  # if so, match is made
            break

        tc = patn[mp]       # otherwise, get unmatched pattern element
        mp += 1             #

        if tc == cALL:      # a * wildcard?

            if last != '': offs -= 1    # put back char consumed by literal loop

            nm = _span(cALL,nsps)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1    # get new binding record
            bf[0] = offs                # bind from current offset
            offs += nm                  # move offset past end of span
            bf[1] = offs                # bind to new offset
            uf = _mark(1,nsps); unj += 1    # get new unwinding record
            uf.count = nm               # can back up this many times on mismatch
            continue

        elif tc == cEND:    # end specification
            if last == '':
                continue
            elif last == '-':
                offs -= 1
                continue
            elif last in [ '.' , ',' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]    # char after the punctuation
                if ellyChar.isWhiteSpace(txc) or txc in Trmls or txc in Grk:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' , ellyChar.HYPH ]:
                offs -= 1
                continue
            elif not ellyChar.isText(last):
                offs -= 1
                continue

        elif tc == cANY:    # alphanumeric wildcard?
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cCAN:    # nonalphanumeric wildcard?
            if last != ellyChar.AMP:
                if last == '' or not ellyChar.isLetterOrDigit(last):
                    _bind(); mbi += 1
                    continue

        elif tc == cDIG:    # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF:    # letter wildcard?
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cUPR:    # uppercase letter wildcard?
            if last != '' and ellyChar.isUpperCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cLWR:    # lowercase letter wildcard?
            if last != '' and ellyChar.isLowerCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC:    # space wildcard?
            if last != '' and ellyChar.isWhiteSpace(last):
                nsps -= 1   # one fewer space left to match in pattern
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cAPO:    # apostrophe wildcard?
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS:    # start of optional pattern segment
            mf = _bind(0); mbi += 1      # dummy record to block
            mf[0] = -1                   #   later binding consolidation
            if last != '':
                offs -= 1                # try for rematch
            m = mp                       # find corresponding EOS
            while m < ml:                #
                if patn[m] == cEOS: break
                m += 1
            else:                        # no EOS?
                m -= 1                   # if so, pretend there is one anyway
            uf = _mark(0,nsps); unj += 1 # for unwinding on any later match failure
            uf.pats = m + 1              # i.e. one char past next EOS
            uf.txts = offs               # start of text before optional match
            continue

        elif tc == cEOS:    # end of optional pattern segment
            if last != '':
                offs -= 1                # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:    # spanning wildcards
            if last != '':               # still more to match?
                offs -= 1

                nm = _span(tc,nsps)      # maximum match possible

                if nm == 0:                             # compensate for findExtendedBreak peculiarity
                    if offs + 1 < limt and mp < ml:     # with closing ] or ) to be matched in pattern
                        if patn[mp] in Enc:             # from text input
                            nm += 1

                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs         # bind from current offset
                    offs += nm           # move offset past end of span
                    bf[1] = offs         # bind to new offset
                    uf = _mark(1,nsps); unj += 1
                    uf.count = nm - 1    # at least one char must be matched
                    last = text[offs] if offs < limt else ''
                    continue

        elif tc == '':
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True           # successful match
                break

        ## match failure: rewind to last match branch
        ## (reached whenever a branch above did not continue or break)

        while unj > 0:                   # try unwinding, if possible
            uf = unw[unj-1]              # get most recent unwinding record
            if uf.count <= 0:            # if available count is used up,
                unj -= 1                 # go to next unwinding record
                continue
            uf.count -= 1                # decrement available count

            uf.txts -= uf.kind           # back up one char for scanning text input
            mp = uf.pats                 # unwind pattern pointer
            offs = uf.txts               # unwind text input
            mbi = uf.bnds                # restore binding
            mbd[mbi-1][1] -= uf.kind     # reduce span of binding if for wildcard
            nsps = uf.nsps               # restore saved space count
            break
        else:
            break                        # quit if unwinding is exhausted

    ##
    ## clean up on match mode or on no match possible
    ##

    if not matched: return None     # no bindings

    ## consolidate contiguous bindings for subsequent substitutions

    mbdo = mbd
    lb  = -1               # binding reference
    lbd = [ 0 , -1 ]       # sentinel value, not real binding
    mbd = [ lbd ]          # initialize with new offset after matching
    mbdo.pop(0)            # ignore empty binding
    while len(mbdo) > 0:   #
        bd = mbdo.pop(0)   # get next binding
        if len(bd) > 2:    # tagged by _modify(): strip tag, keep separate
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb = -1
        elif bd[0] < 0:    # check for optional match indicator here
            lb = -1        # if so, drop from new consolidated bindings
        elif lb == bd[0]:  # check for binding continuous with previous
            lb = bd[1]     #
            lbd[1] = lb    # if so, combine with previous binding
        else:              #
            mbd.append(bd) # otherwise, add new binding
            lbd = bd       #
            lb = bd[1]     #

    mbd[0] = offs          # replace start of bindings with text length matched

    return mbd             # consolidated bindings plus new offset
def lookUp(self, chrs, keyl):
    """
    look for terms in vocabulary at current text position

    only the longest matches are kept: a longer match discards all
    shorter ones found so far, and equally long matches accumulate;
    when several survive and some have an empty suffx attribute,
    only those are returned

    arguments:
        self  -
        chrs  - text char list
        keyl  - number of initial chars to use as a DB key

    returns:
        list of Result objects for the longest matches, possibly empty
    """

    res = []                         # result list initially empty
    rln = 0                          # longest match length seen so far

    if len(chrs) == 0 or keyl < 1:
        return res                   # nothing to look up

    if ellyConfiguration.language == 'ZH':
        strg = toKeyZH(chrs[0])      # key from first char only
    else:
        strg = toKey(chrs[:keyl])    # key from first keyl chars

    vs = self._getDB(strg)           # candidate vocabulary entries for key
    if vs is None or len(vs) == 0:
        return res

    lm = len(chrs)                   # total length of text for lookup

    for v in vs:                     # look at possible vocabulary matches

        if v.length() > lm:          # entry is longer than available text
            continue

        nm = self.doMatchUp(v.chs, chrs)
        if nm == 0 or nm < rln:      # no match or shorter than current best
            continue

        if rln < nm:                 # longer match than before?
            res = []                 # if so, start new result list
            rln = nm                 # and raise minimum match length

        # self.endg is read only after doMatchUp() has run for this entry
        res.append(Result(v, nm, self.endg))

    # special case where term with and without inflection is in vocabulary:
    # keep only the results with no suffix left over
    if len(res) > 1:
        full = [r for r in res if len(r.suffx) == 0]
        if len(full) > 0:
            return full

    return res                       # surviving matches
def apply ( self , token , extra=None ):

    """
    apply inflectional stemming logic against token

    self.table[0] is the word ending to remove, stored in reverse order
    (its first char is compared with the last char of the word); the
    rest of the table is a sequence of opcodes with operands. a
    conditional opcode is followed by a skip offset and one operand:
    on failure the offset is added to the table index, on success the
    index advances by 2 into the conditional block

    arguments:
        self  -
        token - input token; its root attribute (list of chars) is
                replaced when stemming succeeds
        extra - extra token char for any restoration

    returns:
        status code: isMTCH, isNOTM, or doMORE

    exceptions:
        StemmingError
    """

    def _fatal ( ):
        # write with sys.stderr.write() so that the diagnostic is actually
        # printed under both Python 2 and Python 3
        sys.stderr.write('FATAL stemming logic error\n')
        sys.stdout.flush()
        sys.exit(1)

    if len(self.table) < 2:          # check for empty table
        return isNOTM                # if so, no match

    it = 0                           # stemming logic index
    word = token.root                # list of letters in token word
    m = len(word)                    # full length of word
    seq = self.table[it]             # suffix to match
    it += 1                          #
    n = len(seq)                     # ending length to match
    if n >= m:                       #
        return isNOTM                # word not long enough for ending
    msh = m - n                      # length of word without ending

    # check that table is right one for word ending, going backward in word
    for ix in range(n):
        if word[m - 1 - ix] != seq[ix]:
            return isNOTM

    # interpret table logic

    last = seq[-1] if n > 0 else extra   # last removed letter, for any restoration
    word = word[:msh]                # copy of word up to removed suffix

    if not ellyChar.isLetter(word[-1]):
        return isNOTM

    while True:                      # advance through logic until success or failure

        opcode = self.table[it]      # next operation code to interpret
        it += 1                      #

        if opcode < 0:               # YE(S) on match with possible modifications

            # word satisfies conditions for ending removal

            word = token.root[:msh]  # word without ending
            nm = YE - opcode         # get removal count from opcode

            if nm < 0:               # any special restoration?
                if last is None:
                    _fatal()
                word.append(last)    # negative count restores last removed letter
            else:
                while nm > 0:        # otherwise drop additional letters
                    if len(word) == 0:
                        _fatal()
                    last = word.pop()
                    nm -= 1

            word.extend(self.table[it])  # append more chars
            token.root = word        # replace token with stemmed result
            return isMTCH            # success flag

        elif opcode == NO:           # no match
            return isNOTM

        elif opcode == IF:           # enter logic block if a char sequence matches
            seq = self.table[it+1]   # sequence to match backward from end of word
            sln = len(seq)
            if sln > len(word):      # enough chars to match?
                it += self.table[it] # if not, skip over block of logic
            else:
                k = 0
                while k < sln and word[-k-1] == seq[k]:
                    k += 1
                if k < sln:          # any characters unmatched?
                    it += self.table[it]  # if so, skip over block of logic
                else:
                    it += 2          # otherwise, enter logic block
                    word = word[:-sln]    # and drop matched chars from word

        elif opcode == IS:           # check whether next character is in a specified set
            if len(word) <= 0:       # any letters left in word?
                it += self.table[it] # if not, skip over block
                continue
            chs = self.table[it+1]   # get character set
            c = word[-1]
            if chs.find(c) < 0:      # word character in set?
                it += self.table[it] # if not, skip block
            else:
                it += 2              # if so, enter block

        elif opcode < Nlen:          # check length of word
            k = self.table[it+1]     # comparison length; m is full original length
            if opcode == LT:
                ok = (m <  k)
            elif opcode == GT:
                ok = (m >  k)
            elif opcode == EQ:
                ok = (m == k)
            elif opcode == NE:
                ok = (m != k)
            else:
                return isNOTM
            if not ok:               # if no match, skip block
                it += self.table[it]
            else:                    # otherwise, go into logic of block
                it += 2

        elif opcode == MO:           # continue to another logic table
            token.root = token.root[:msh]
            return doMORE            # let other table figure out what to do

        elif opcode == VO:           # look for CVC pattern at end of stemming
                                     # and possibly restore -E
            word = token.root[:msh]  # strip ending from end of word
            me = len(word) - 2       # at possible vowel in stemming result
                                     # last char assumed to be consonant
            if me < 0 or ellyChar.isStrongConsonant(word[me]):
                token.root = word
                return isMTCH
            me -= 1                  # vowel found; now check for consonant
            if me < 0 or ellyChar.isStrictVowel(word[me]):
                # NOTE(review): token.root is not updated on this path,
                # unlike every other isMTCH return -- confirm intended
                return isMTCH
            if me <= 0 or word[me] != u'u' or word[me - 1] == u'q':
                word.append(u'e')    # put back -E
            token.root = word
            return isMTCH

        else:                        # unrecognized opcode
            return isNOTM

    raise ellyException.StemmingError