def normalize ( self , s ):

    """
    convert all unrecognizable input chars to _ and collapse any run of
    consecutive white space into a single plain space

    arguments:
        self  -
        s     - Unicode string or char list to operate on

    returns:
        normalized sequence as list of chars
    """

    out = [ ]             # normalized output collected here
    inGap = False         # True just after emitting a space

    for ch in s:
        if ellyChar.isLetter(ch):         # letters pass through unchanged
            inGap = False
            out.append(ch)
        elif ellyChar.isWhiteSpace(ch):   # white space collapses to one space
            if inGap:
                continue                  # drop extra spaces in a run
            inGap = True
            out.append(' ')
        elif ellyChar.isText(ch):         # other recognized text chars kept
            inGap = False
            out.append(ch)
        else:                             # everything unrecognizable becomes _
            inGap = False
            out.append('_')

    return out
def normalize ( s ):

    """
    convert all unrecognizable input chars to _ and collapse any run of
    consecutive white space into a single plain space

    arguments:
        s  - Unicode string or char list to operate on

    returns:
        normalized sequence as list of chars
    """

    result = [ ]              # normalized chars
    lastWasSpace = False      # was the previous output char a space?
    k = 0
    lim = len(s)

    while k < lim:            # scan input by explicit index
        c = s[k]
        k += 1
        if ellyChar.isLetter(c):
            lastWasSpace = False          # letter kept as is
        elif ellyChar.isWhiteSpace(c):
            if lastWasSpace:
                continue                  # collapse white space run
            c = ' '
            lastWasSpace = True
        elif not ellyChar.isText(c):
            c = '_'                       # replace unrecognizable char
            lastWasSpace = False
        else:
            lastWasSpace = False          # other text char kept as is
        result.append(c)

    return result
def minMatch ( patn ):

    """
    compute minimum number of chars matched by pattern

    arguments:
        patn  - pattern with possible Elly wildcards

    returns:
        minimum count of chars matched
    """

    count = 0             # minimum chars accumulated
    optional = False      # inside an optional subsequence?

    for pc in patn:
        if pc == ellyChar.SPC:            # space in pattern will stop scan
            if not optional:
                break
        elif ellyChar.isText(pc):         # ordinary text char is counted
            if not optional:
                count += 1
        elif pc == cSOS:                  # optional start code
            optional = True
        elif pc == cEOS:                  # optional end code
            optional = False
        elif pc == cALL:                  # ALL (*) wildcard matches zero or more
            pass
        elif pc == cEND:                  # END code matches nothing
            pass
        else:                             # any other wildcard matches at least one
            count += 1

    return count
def minMatch ( patn ):

    """
    compute minimum number of chars matched by pattern

    arguments:
        patn  - pattern with possible Elly wildcards

    returns:
        minimum count of chars matched
    """

    k = 0                 # running minimum
    j = 0                 # pattern scan index
    lim = len(patn)
    skipping = False      # within an optional [ ... ] section?

    while j < lim:
        pc = patn[j]
        j += 1
        if pc == ellyChar.SPC:        # a space outside an optional part ends scan
            if not skipping:
                break
        elif ellyChar.isText(pc):     # plain text char adds one to the minimum
            if not skipping:
                k += 1
        elif pc == cSOS:              # entering optional section
            skipping = True
        elif pc == cEOS:              # leaving optional section
            skipping = False
        elif pc in ( cALL , cEND ):   # * and $ can match nothing at all
            pass
        else:                         # any other wildcard must match one char
            k += 1

    return k
def _store ( self , defs , nowarn ):

    """
    put macro substitutions into table with indexing by first char of pattern

    arguments:
        self    -
        defs    - list of macro definitions as strings
        nowarn  - whether to turn warnings off

    exceptions:
        TableFailure on error
    """

    while True:
        l = defs.readline()                    # next macro rule
#       print ( "rule input=" , l )
        if len(l) == 0: break                  # EOF check
        dl = definitionLine.DefinitionLine(l,False)
        left = dl.left                         # pattern to be matched
        tail = dl.tail                         # transformation to apply to match
        if left == None or tail == None:
            self._err(l=l)                     # report incomplete rule
            continue
        mp = ellyWildcard.convert(left)        # encode wildcards in pattern
        if mp == None:
            self._err('bad wildcards',l)
            continue
        pe = mp[-1]
        if pe != ellyWildcard.cALL and pe != ellyWildcard.cEND:
            mp += ellyWildcard.cEND            # pattern must end in $ if it does not end in *
        if not _checkBindings(mp,tail):
            self._err('bad bindings in substitution',l)
            continue
        if not nowarn and not _checkExpansion(mp,tail):
            self._err('substitution longer than original string',l,0)
        r = [ mp , tail ]                      # rule = coded pattern plus rewrite
#       print ( "rule =" , [ left , tail ] )
        pat = r[0]                             # get coded pattern
        if pat == None:
            self._err('no pattern',l)
            continue
        c = pat[0]                             # first char of pattern

        # check type to see how to index rule

#       print ( 'c=' , ord(c) )
        p = pat
        while c == ellyWildcard.cSOS:          # optional sequence at start?
            k = p.find(ellyWildcard.cEOS)      # if so, find the end of sequence
            if k < 0 or k == 1: break          # if no end or empty sequence, stop
            k += 1
            if k == len(pat): break            # should be something after sequence
            m = ellyChar.toIndex(pat[1])       # index by first char of optional sequence
            self.index[m].append(r)            # (must be non-wildcard)
            p = p[k:]                          # move up in pattern
            c = p[0]                           # but check for another optional sequence

        if c == ellyWildcard.cSOS:
            self._err(l=l)
            continue                           # bad sequence, skip this rule

#       print ( 'c=' , ord(c) )
        if ellyChar.isLetterOrDigit(c):        # check effective first char of pattern
            m = ellyChar.toIndex(c)
            self.index[m].append(r)            # add to index under alphanumeric char
        elif ellyChar.isText(c):
            self.index[0].append(r)            # add to index under punctuation
        elif not c in ellyWildcard.Matching:
            if c == ellyWildcard.cEND:
                # FIX: these were Python 2 'print >>' statements, which are
                # syntax errors under Python 3; converted to print() calls
                # consistent with the other _store variant in this file
                print('** macro warning: pattern can have empty match', file=sys.stderr)
                print('* at [', l, ']', file=sys.stderr)
            else:
                dc = '=' + str(ord(c) - ellyWildcard.X)
                self._err('bad wildcard code', dc)
                continue
        elif c == ellyWildcard.cANY or c == ellyWildcard.cALL:
            self.anyWx.append(r)               # under general wildcards
        elif c == ellyWildcard.cCAN:
            self.index[0].append(r)            # under punctuation
        elif c == ellyWildcard.cDIG or c == ellyWildcard.cSDG:
            self.digWx.append(r)               # under digit wildcards
        elif c == ellyWildcard.cSAN:
            self.digWx.append(r)               # under both digit and
            self.letWx.append(r)               # letter wildcards
        elif c == ellyWildcard.cSPC or c == ellyWildcard.cEND:
            self._err('bad wildcard in context',l)
            continue                           # wildcards unacceptable here
        else:
            self.letWx.append(r)               # everything else under letter wildcard

        self.count += 1                        # count up macro substitution

    if self._errcount > 0:
        # FIX: converted from Python 2 'print >>' statements to print()
        print('**', self._errcount, 'macro errors in all', file=sys.stderr)
        print('macro table definition FAILed', file=sys.stderr)
        raise ellyException.TableFailure
def read(self):

    """
    get next char from input stream with filtering

    arguments:
        self

    returns:
        single Unicode char on success, null string otherwise
    """

#   print ( 'reading: buf=' , self.buf )
    while True:

        if not self._reload():            # check if buffer empty and reload if needed
            return END                    # return EOF if no more chars available

#       print ( 'buf=' , self.buf )
        c = self.buf.pop(0)               # next raw char in buffer

        if c == SHYP:                     # ignore soft hyphen
            if len(self.buf) > 0:
                if self.buf[0] == SP:     # also drop a space just after it
                    c = self.buf.pop(0)
            continue

        if not ellyChar.isText(c):        # unrecognizable Elly char?
#           print ( 'c=' , '{0:04x}'.format(ord(c)) )
            if ellyChar.isCJK(c):
                if ellyConfiguration.language != 'ZH':
                    c = '_'               # special handling for non-Chinese input
            elif not c in [u'\uff0c', u'\u3002']:
#               print ( 'replace' , c , 'with NBSP' )
                c = NBSP                  # by default, replace with no-break space

        lc = self._lc                     # copy saved last char
        self._lc = c                      # set new last char

        if c == HYPH:                     # special treatment for isolated hyphens
            if spc(lc) and spc(self.peek()):
                c = DASH
            break
        elif c == '.':                    # check for ellipsis
            bb = self.buf
            bl = len(bb)
#           print ( 'bl=' , bl , 'bb=' , bb )
            if bl >= 2 and bb[0] == '.' and bb[1] == '.':
                self.buf = bb[2:]
                c = ELLP
            elif bl >= 4 and bb[0] == ' ' and bb[1] == '.' and bb[2] == ' ' and bb[3] == '.':
                self.buf = bb[4:]
                c = ELLP
            break
        elif c == RSQm:                   # check for single quote
            nc = self.peek()              # look at next char
            if nc == RSQm:                # doubling of single quote?
                self.buf.pop(0)           # if so, combine two single quotes
                c = RDQm                  # into one double quote
            break                         # FIX: break was missing here, so a
                                          # non-doubled single quote fell out of
                                          # the chain and was silently dropped
        elif not ellyChar.isWhiteSpace(c):
            if ellyChar.isWhiteSpace(lc):
                self._cap = ellyChar.isUpperCaseLetter(c)
            break
        elif c == CR:                     # always ignore
            continue
        elif c == NL:                     # special handling of \n
            nc = self.peek()              # look at next char
            while nc == CR:
                self.buf.pop(0)           # skip over CR's
                nc = self.peek()
            if lc != NL and nc == NL:
                self.buf.pop(0)           # special case when NL can be returned
                break
            if nc == NL:                  # NL followed NL?
                while nc == NL or nc == CR:
                    self.buf.pop(0)       # ignore subsequent new line chars
                    nc = self.peek()
            elif nc == END or ellyChar.isWhiteSpace(nc):
                continue                  # NL followed by space is ignored
            elif nc == u'.' or nc == u'-':
                pass
            else:
                c = SP                    # convert NL to SP if not before another NL
        else:
            c = SP                        # otherwise, convert white space to plain space

        self._cap = False                 # NOTE(review): exact placement of this
                                          # reset was ambiguous in the original
                                          # layout — confirm against history
        if not ellyChar.isWhiteSpace(lc): # preceding char was not white space?
            break                         # if so, keep one space in stream

    return c                              # next filtered char
def _store ( self , defs , nowarn ):

    """
    put macro substitutions into table with indexing by first char of pattern

    arguments:
        self    -
        defs    - list of macro definitions as strings
        nowarn  - whether to turn warnings off

    exceptions:
        TableFailure on error
    """

#   print ( defs.linecount() , 'lines' )
    while True:
        l = defs.readline()                   # next macro rule
#       print ( "rule input=" , l )
        if len(l) == 0: break                 # EOF check
        dl = definitionLine.DefinitionLine(l,False)
        left = dl.left                        # pattern to be matched
        tail = dl.tail                        # transformation to apply to match
#       print ( 'dl.left=' , left )
        if left == None or tail == None:
            self._err(l=l)                    # report missing part of rule
            continue
        if left.find(' ') >= 0:               # pattern side of macro rule
            ms = 'pattern in macro contains spaces'
            self._err(s=ms,l=l,d=1)           # cannot contain any space chars
            continue
        lefts = list(left)
#       print ( 'left=' , lefts )
        nspm = ellyWildcard.numSpaces(lefts)  # count of spaces encoded in pattern
        pat = ellyWildcard.convert(left)      # get pattern with encoded wildcards
        if pat == None:
            self._err('bad wildcards',l)
            continue
#       print ( 'pat=' , ellyWildcard.deconvert(pat) , 'len=' , len(pat) )
        pe = pat[-1]
        if not pe in [ ellyWildcard.cALL , ellyWildcard.cEND , ellyWildcard.cSPC ]:
            pat += ellyWildcard.cEND          # pattern must end in $ if it does not end in * or _
        if not _checkBindings(pat,tail):
            self._err('bad bindings in substitution',l)
            continue
        if not nowarn and not _checkExpansion(pat,tail):
            self._err('substitution may be longer than original string',l,0)
#       print ( "rule =" , [ left , nspm , tail ] )
        if pat == None:
            self._err('no pattern',l)
            continue
        r = Rule(pat,nspm,tail)               # rule record for indexing
        c = pat[0]                            # first char of pattern

        # check type to see how to index rule

        p = pat
        while c == ellyWildcard.cSOS:         # optional sequence at start?
            if not cEOS in p: break
            k = p.index(cEOS)                 # if so, find the end of sequence
            if k < 0 or k == 1: break         # if no end or empty sequence, stop
            k += 1
            if k == len(pat): break           # should be something after sequence
            m = ellyChar.toIndex(pat[1])      # index by first char of optional sequence
            self.index[m].append(r)           # (must be non-wildcard)
            p = p[k:]                         # move up in pattern
            c = p[0]                          # but check for another optional sequence

        if c == ellyWildcard.cSOS:
            self._err(l=l)
            continue                          # bad sequence, skip this rule

#       print ( 'c=' , ord(c) )
        if ellyChar.isLetterOrDigit(c):       # check effective first char of pattern
            m = ellyChar.toIndex(c)
            self.index[m].append(r)           # add to index under alphanumeric char
        elif ellyChar.isText(c):
            self.index[0].append(r)           # add to index under punctuation
        elif not c in ellyWildcard.Matching:
            if c == ellyWildcard.cEND:
                # FIX: these were Python 2 'print >>' statements, which are
                # syntax errors under Python 3; converted to print() calls
                # consistent with the py3 _store variant in this file
                print('** macro warning: pattern can have empty match', file=sys.stderr)
                print('* at [', l, ']', file=sys.stderr)
            else:
                dc = '=' + str(ord(c) - ellyWildcard.X)
                self._err('bad wildcard code', dc)
                continue
        elif c == ellyWildcard.cANY or c == ellyWildcard.cALL:
            self.anyWx.append(r)              # under general wildcards
        elif c == ellyWildcard.cCAN:
            self.index[0].append(r)           # under punctuation
        elif c == ellyWildcard.cDIG or c == ellyWildcard.cSDG:
            self.digWx.append(r)              # under digit wildcards
        elif c == ellyWildcard.cSAN:
            self.digWx.append(r)              # under both digit and
            self.letWx.append(r)              # letter wildcards
        elif c == ellyWildcard.cAPO:          # right single quote or apostrophe
            self.apoWx.append(r)              #
        elif c == ellyWildcard.cSPC or c == ellyWildcard.cEND:
            self._err('bad wildcard in context',l)
            continue                          # wildcards unacceptable here
        else:
            self.letWx.append(r)              # everything else under letter wildcard

        self.count += 1                       # count up macro substitution

#   print ( 'count=' , self.count )
    if self._errcount > 0:
        # FIX: converted from Python 2 'print >>' statements to print()
        print('**', self._errcount, 'macro errors in all', file=sys.stderr)
        print('macro table definition FAILed', file=sys.stderr)
        raise ellyException.TableFailure
def match ( patn , text , offs=0 , limt=None , nsps=0 ):

    """
    compare a pattern with wildcards to input text

    arguments:
        patn  - pattern to matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit for any matching
        nsps  - number of spaces to match in pattern

    returns:
        bindings if match is successful, None otherwise
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
            nsps   - saved count of spaces matched
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - of winding
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1
            self.nsps  = 0

        def __str__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self

            returns:
                attributes as array
            """
            return ( '[kd=' + str(self.kind)  +
                     ',ct=' + str(self.count) +
                     ',pa=' + str(self.pats)  +
                     ',tx=' + str(self.txts)  +
                     ',bd=' + str(self.bnds)  +
                     ',ns=' + str(self.nsps)  + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # four private functions using local variables of match() defined just above
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame

        arguments:
            ns  - optional initial span of text for binding

        returns:
            binding frame
        """
#       print ( "binding:",offs,ns )
        os = offs - 1                 # binding starts at char before current offset
        if ns == None: ns = 1         # by default, binding is to 1 char
        if mbi == len(mbd):           # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]                 # next available record
        bf[0] = os                    # set binding to range of chars
        bf[1] = os + ns               #
        return bf

    def _modify ( ):
        """
        set special tag for binding

        arguments:

        """
        mbd[mbi].append(None)         # extra None element marks binding as special

    def _mark ( kind , nsp ):
        """
        set up for backing up pattern match

        arguments:
            kind  - 0=optional 1=* wildcard
            nsp   - number of spaces in pattern still unmatched

        returns:
            unwinding frame
        """
        if unj == len(unw):           # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]                 # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp                 # snapshot current match state
        uf.txts  = offs
        uf.bnds  = mbi
        uf.nsps  = nsp
        return uf

    def _span ( typw , nsp=0 ):
        """
        count chars available for wildcard match

        arguments:
            typw  - wildcard
            nsp   - spaces to be matched in pattern

        returns:
            non-negative count if any match possible, otherwise -1
        """
#       print ( "_span: typw=" , '{:04x}'.format(ord(typw)) , deconvert(typw) )
#       print ( "_span: txt @",offs,"pat @",mp,"nsp=",nsp )
        k = minMatch(patn[mp:])       # calculate min char count to match rest of pattern
#       print ( "exclude=",k,"chars from possible span for rest of pattern" )

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findExtendedBreak(text,offs,nsp)
#       print ( mx,"chars available to scan" )
        mx -= k                       # max span reduced by exclusion
        if mx < 0: return -1          # cannot match if max span < 0
        tfn = Matching[typw]          # matchup function for wildcard type
#       print ( "text at",offs,"maximum wildcard match=",mx )
        nm = 0
        for i in range(mx):
            c = text[offs+i]          # next char in text from offset
#           print ( 'span c=' , c )
            if not tfn(c): break      # stop when it fails to match
            nm += 1
#       print ( "maximum wildcard span=",nm )
        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False                   # successful pattern match yet?

    if limt == None: limt = len(text)

#   print ( 'starting match, limt=',limt,text[offs:limt],":",patn )
#   print ( 'nsps=' , nsps )
    mp = 0                            # pattern index
    ml = len(patn)                    # pattern match limit
    last = ''                         # most recent text char scanned

    while True:

        ## literally match as many next chars as possible

#       print ( '---- loop mp=' , mp , 'ml=' , ml )
        while mp < ml:
            if offs >= limt:
#               print ( "offs=",offs,"limt=",limt )
                last = ''             # no more text, but keep scanning pattern
            elif limt == 0:
                break
            else:
                last = text[offs]
                offs += 1
#           print ( 'patn=' , patn )
            mc = patn[mp]
#           print ( 'matching last=' , last , 'at' , offs , 'against' , mc )
            if mc != last:
                if mc != last.lower():
                    if mc == Hyphn and last == ' ' and limt - offs > 2:
                        # pattern hyphen may match " - " in the text
#                       print ( 'hyphen special matching, limt=', limt , 'offs=' , offs )
#                       print ( 'text[offs:]=' , text[offs:] )
                        if text[offs] != Hyphn or text[offs+1] != ' ':
                            break
                        offs += 2
                    else:
#                       print ( 'no special matching of hyphen' )
                        break
#           print ( 'matched @mp=' , mp )
            mp += 1

        ## check whether mismatch is due to special pattern char

#       print ( 'pat @',mp,"<",ml )
#       print ( "txt @",offs,'<',limt,'last=',last )
#       print ( '@',offs,text[offs:] )

        if mp >= ml:                  # past end of pattern?
            matched = True            # if so, match is made
            break

        tc = patn[mp]                 # otherwise, get unmatched pattern element
        mp += 1                       #
#       print ( "tc=",'{:04x}'.format(ord(tc)),deconvert(tc) )

        if tc == cALL:                # a * wildcard?
#           print ( "ALL last=< " + last + " >" )
            if last != '': offs -= 1

            nm = _span(cALL,nsps)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1  # get new binding record
            bf[0] = offs              # bind from current offset
            offs += nm                # move offset past end of span
            bf[1] = offs              # bind to new offset
#           print ( "offs=",offs,'nm=',nm )
            uf = _mark(1,nsps); unj += 1  # get new unwinding record
            uf.count = nm             # can back up this many times on mismatch
            continue

        elif tc == cEND:              # end specification
#           print ( "END $:",last )
            if last == '':
                continue
            elif last == '-':
                offs -= 1
                continue
            elif last in [ '.' , ',' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]
                if ellyChar.isWhiteSpace(txc) or txc in Trmls or txc in Grk:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' , ellyChar.HYPH ]:
                offs -= 1
                continue
            elif not ellyChar.isText(last):
                offs -= 1
                continue

        elif tc == cANY:              # alphanumeric wildcard?
#           print ( "ANY:",last,offs )
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cCAN:              # nonalphanumeric wildcard?
#           print ( 'at cCAN' )
            if last != ellyChar.AMP:
                if last == '' or not ellyChar.isLetterOrDigit(last):
                    _bind(); mbi += 1
                    continue

        elif tc == cDIG:              # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF:              # letter wildcard?
#           print ( "ALF:",last,offs )
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cUPR:              # uppercase letter wildcard?
#           print ( "UPR:",last,'@',offs )
            if last != '' and ellyChar.isUpperCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cLWR:              # lowercase letter wildcard?
#           print ( "LWR:",last,'@',offs )
            if last != '' and ellyChar.isLowerCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC:              # space wildcard?
#           print ( "SPC:","["+last+"]" )
            if last != '' and ellyChar.isWhiteSpace(last):
                nsps -= 1             # one less space still to match
                _bind(); _modify(); mbi += 1
                continue
#           print ( 'NO space' )

        elif tc == cAPO:              # apostrophe wildcard?
#           print ( "APO: last=" , last )
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS:              # start of optional sequence?
#           print ( "SOS" )
#           print ( last,'@',offs )
            mf = _bind(0); mbi += 1   # dummy record to block
            mf[0] = -1                # later binding consolidation
            if last != '': offs -= 1  # try for rematch
            m = mp                    # find corresponding EOS
            while m < ml:             #
                if patn[m] == cEOS: break
                m += 1
            else:                     # no EOS?
                m -= 1                # if so, pretend there is one anyway
            uf = _mark(0,nsps); unj += 1  # for unwinding on any later match failure
            uf.pats = m + 1           # i.e. one char past next EOS
            uf.txts = offs            # start of text before optional match
            continue

        elif tc == cEOS:              # end of optional sequence?
#           print ( "EOS" )
            if last != '': offs -= 1  # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:  # spanning wildcard?
#           print ( 'spanning wildcard, offs=' , offs , 'last=(' + last + ')' )
            if last != '':            # still more to match?
                offs -= 1
#               print ( 'nsps=' , nsps )
#               print ( '@' , offs , text )
                nm = _span(tc,nsps)   # maximum match possible
#               print ( 'spanning=' , nm )
                if nm == 0:           # compensate for findExtendedBreak peculiarity
                    if offs + 1 < limt and mp < ml:   # with closing ] or ) to be matched in pattern
                        if patn[mp] in Enc:           # from text input
                            nm += 1
#               print ( 'spanning=' , nm )
                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs      # bind from current offset
                    offs += nm        # move offset past end of span
                    bf[1] = offs      # bind to new offset
                    uf = _mark(1,nsps); unj += 1
                    uf.count = nm - 1 # at least one char must be matched
#                   print ( 'offs=' , offs )
                    last = text[offs] if offs < limt else ''
                    continue
#           print ( 'fail tc=' , deconvert(tc) )

        elif tc == '':                # empty pattern element
                                      # NOTE(review): presumably a sentinel when
                                      # patn is a char list — confirm with caller
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True        # successful match
                break

        ## match failure: rewind to last match branch
        ##

#       print ( "fail - unwinding" , unj )

        while unj > 0:                # try unwinding, if possible
#           print ( "unw:",unj )
            uf = unw[unj-1]           # get most recent unwinding record
#           print ( uf )
            if uf.count <= 0:         # if available count is used up,
                unj -= 1              # go to next unwinding record
                continue
            uf.count -= 1             # decrement available count
            uf.txts -= uf.kind        # back up one char for scanning text input
            mp = uf.pats              # unwind pattern pointer
            offs = uf.txts            # unwind text input
            mbi = uf.bnds             # restore binding
            mbd[mbi-1][1] -= uf.kind  # reduce span of binding if for wildcard
            nsps = uf.nsps            #
            break
        else:
#           print ( "no unwinding" )
            break                     # quit if unwinding is exhausted
#       print ( 'cnt=' , uf.count , 'off=' , offs )

    ##
    ## clean up on match mode or on no match possible
    ##

#   print ( "matched=",matched )

    if not matched: return None       # no bindings

#   print ( text,offs )

    ## consolidate contiguous bindings for subsequent substitutions

#   print ( "BEFORE consolidating consecutive bindings" )
#   print ( "bd:",len(mbd) )
#   print ( mbd[0] )
#   print ( '----' )
#   for b in mbd[1:]:
#       print ( b )

    mbdo = mbd
    lb  = -1                          # binding reference
    lbd = [ 0 , -1 ]                  # sentinel value, not a real binding
    mbd = [ lbd ]                     # initialize with new offset after matching
    mbdo.pop(0)                       # ignore empty binding
    while len(mbdo) > 0:              #
        bd = mbdo.pop(0)              # get next binding
        if len(bd) > 2:               # special (tagged) binding kept as is
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb  = -1
        elif bd[0] < 0:               # check for optional match indicator here
            lb = -1                   # if so, drop from new consolidated bindings
        elif lb == bd[0]:             # check for binding continuous with previous
            lb = bd[1]                #
            lbd[1] = lb               # if so, combine with previous binding
        else:                         #
            mbd.append(bd)            # otherwise, add new binding
            lbd = bd                  #
            lb  = bd[1]               #

    mbd[0] = offs                     # replace start of bindings with text length matched

#   print ( "AFTER" )
#   print ( "bd:",len(mbd) )
#   print ( mbd[0] )
#   print ( '----' )
#   for b in mbd[1:]:
#       print ( b )

    return mbd                        # consolidated bindings plus new offset
def _store(self, defs, nowarn):

    """
    put macro substitutions into table with indexing by first char of pattern

    arguments:
        self    -
        defs    - list of macro definition as strings
        nowarn  - whether to turn warnings off

    exceptions:
        TableFailure on error
    """

#   print ( defs.linecount() , 'lines' )
    while True:
        l = defs.readline()                   # next macro rule
#       print ( "rule input=" , l )
        if len(l) == 0: break                 # EOF check
        dl = definitionLine.DefinitionLine(l, False)
        left = dl.left                        # pattern to be matched
        tail = dl.tail                        # transformation to apply to match
#       print ( 'dl.left=' , left )
        if left == None or tail == None:
            self._err(l=l)                    # report missing part of rule
            continue
        if left.find(' ') >= 0:               # pattern side of macro rule
            ms = 'pattern in macro contains spaces'
            self._err(s=ms, l=l, d=1)         # cannot contain any space chars
            continue
        lefts = list(left)
#       print ( 'left=' , lefts )
        nspm = ellyWildcard.numSpaces(lefts)  # how many spaces encoded in pattern
        pat = ellyWildcard.convert(
            left)                             # get pattern with encoded wildcards
        if pat == None:
            self._err('bad wildcards', l)
            continue
#       print ( 'pat=' , ellyWildcard.deconvert(pat) , 'len=' , len(pat) )
#       print ( 'pat=' , list(pat) )
        pe = pat[-1]
        if not pe in [ ellyWildcard.cALL, ellyWildcard.cEND, ellyWildcard.cSPC ]:
            pat += ellyWildcard.cEND          # pattern must end in $ if it does not end in * or _
        if not _checkBindings(pat, tail):
            self._err('bad bindings in substitution', l)
            continue
        if not nowarn and not _checkExpansion(pat, tail):
            self._err('substitution may be longer than original string', l, 0)
#       print ( "rule =" , [ left , nspm , tail ] )
        if pat == None:
            self._err('no pattern', l)
            continue
        r = Rule(pat, nspm, tail)             # rule record for indexing below
        c = pat[0]                            # first char of pattern

        # check type to see how to index rule

#       print ( 'c=' , ellyWildcard.deconvert(c) , ', pat=' , ellyWildcard.deconvert(pat) )
        p = pat
        while c == ellyWildcard.cSOS:         # optional sequence at start?
            if not cEOS in p: break
            k = p.index(cEOS)                 # if so, find the end of sequence
            if k < 0 or k == 1: break         # if no end or empty sequence, stop
            k += 1
            if k == len(pat): break           # should be something after sequence
            m = ellyChar.toIndex(
                pat[1])                       # index by first char of optional sequence
            self.index[m].append(r)           # (must be non-wildcard)
            p = p[k:]                         # move up in pattern
            c = p[0]                          # but check for another optional sequence

        if c == ellyWildcard.cSOS:
            self._err(l=l)
            continue                          # bad sequence, skip this rule

#       print ( 'c=' , ord(c) )
        if ellyChar.isLetterOrDigit(
                c):                           # check effective first char of pattern
            m = ellyChar.toIndex(c)
            self.index[m].append(r)           # add to index under alphanumeric char
        elif ellyChar.isText(c):
            self.index[0].append(r)           # add to index under punctuation
        elif not c in ellyWildcard.Matching:
            if c == ellyWildcard.cEND:
                print('** macro warning: pattern can have empty match', file=sys.stderr)
                print('* at [', l, ']', file=sys.stderr)
            else:
                dc = '=' + str(ord(c) - ellyWildcard.X)
                self._err('bad wildcard code', dc)
                continue
        elif c == ellyWildcard.cANY or c == ellyWildcard.cALL:
            self.anyWx.append(r)              # under general wildcards
        elif c == ellyWildcard.cCAN:
            self.index[0].append(r)           # under punctuation
        elif c == ellyWildcard.cDIG or c == ellyWildcard.cSDG:
            self.digWx.append(r)              # under digit wildcards
        elif c == ellyWildcard.cSAN:
            self.digWx.append(r)              # under both digit and
            self.letWx.append(r)              # letter wildcards
        elif c == ellyWildcard.cAPO:          # right single quote or apostrophe
            self.apoWx.append(r)              #
        elif c == ellyWildcard.cSPC or c == ellyWildcard.cEND:
            self._err('bad wildcard in context', l)
            continue                          # wildcards unacceptable here
        else:
            self.letWx.append(r)              # everything else under letter wildcard

        self.count += 1                       # count up macro substitution

#   print ( 'count=' , self.count )
    if self._errcount > 0:
        print(self._errcount, 'macro errors in all', file=sys.stderr)
        print('macro table definition FAILed', file=sys.stderr)
        raise ellyException.TableFailure
def read ( self ):

    """
    get next char from input stream with filtering

    arguments:
        self

    returns:
        single Unicode char on success, null string otherwise
    """

#   print ( 'reading: buf=' , self.buf )
    while True:

        if not self._reload():            # check if buffer empty and reload if needed
            return END                    # return EOF if no more chars available

#       print ( 'buf=' , self.buf )
        c = self.buf.pop(0)               # next raw char in buffer

        if c == SHYP:                     # ignore soft hyphen
            if len(self.buf) > 0:
                if self.buf[0] == SP:     # also drop a space just after it
                    c = self.buf.pop(0)
            continue

        if not ellyChar.isText(c):        # unrecognizable Elly char?
#           print ( 'c=' , '{0:04x}'.format(ord(c)) )
            if ellyChar.isCJK(c):
                c = '_'                   # special handling for Chinese
            else:
#               print ( 'replace' , c , 'with NBSP' )
                c = NBSP                  # by default, replace with no-break space

        lc = self._lc                     # copy saved last char
        self._lc = c                      # set new last char

        if c == HYPH:                     # special treatment for isolated hyphens
            if spc(lc) and spc(self.peek()):
                c = DASH
            break
        elif c == '.':                    # check for ellipsis
            bb = self.buf
            bl = len(bb)
#           print ( 'bl=' , bl , 'bb=' , bb )
            if bl >= 2 and bb[0] == '.' and bb[1] == '.':
                self.buf = bb[2:]
                c = ELLP
            elif bl >= 4 and bb[0] == ' ' and bb[1] == '.' and bb[2] == ' ' and bb[3] == '.':
                self.buf = bb[4:]
                c = ELLP
            break
        elif c == RSQm:                   # check for single quote
            nc = self.peek()              # look at next char
            if nc == RSQm:                # doubling of single quote?
                self.buf.pop(0)           # if so, combine two single quotes
                c = RDQm                  # into one double quote
            break                         # FIX: break was missing here, so a
                                          # non-doubled single quote fell out of
                                          # the chain and was silently dropped
        elif not ellyChar.isWhiteSpace(c):
            if ellyChar.isWhiteSpace(lc):
                self._cap = ellyChar.isUpperCaseLetter(c)
            break
        elif c == CR:                     # always ignore
            continue
        elif c == NL:                     # special handling of \n
            nc = self.peek()              # look at next char
            while nc == CR:
                self.buf.pop(0)           # skip over CR's
                nc = self.peek()
            if lc != NL and nc == NL:
                self.buf.pop(0)           # special case when NL can be returned
                break
            if nc == NL:                  # NL followed NL?
                while nc == NL or nc == CR:
                    self.buf.pop(0)       # ignore subsequent new line chars
                    nc = self.peek()
            elif nc == END or ellyChar.isWhiteSpace(nc):
                continue                  # NL followed by space is ignored
            elif nc == u'.' or nc == u'-':
                pass
            else:
                c = SP                    # convert NL to SP if not before another NL
        else:
            c = SP                        # otherwise, convert white space to plain space

        self._cap = False                 # NOTE(review): exact placement of this
                                          # reset was ambiguous in the original
                                          # layout — confirm against history
        if not ellyChar.isWhiteSpace(lc): # preceding char was not white space?
            break                         # if so, keep one space in stream

    return c                              # next filtered char
def read ( self ):

    """
    get next char from input stream with filtering

    arguments:
        self

    returns:
        single Unicode char on success, null string otherwise
    """

    while True:

        # refill the buffer when drained; END signals input exhausted

        if not self._reload():
            return END

        ch = self.buf.pop(0)              # take next raw char from buffer

        if not ellyChar.isText(ch):       # anything unrecognizable becomes
            ch = NBSP                     # a no-break space

        prev = self._lc                   # char returned on previous call
        self._lc = ch                     # record current one for next call

        if ch == HYPH:                    # hyphen surrounded by spaces
            if spc(prev) and spc(self.peek()):
                ch = DASH                 # is promoted to a dash
            break
        elif not ellyChar.isWhiteSpace(ch):
            break                         # ordinary char passes through
        elif ch == CR:
            continue                      # carriage returns always dropped
        elif ch == NL:                    # newline needs look-ahead
            ahead = self.peek()
            while ahead == CR:            # discard CR's following the newline
                self.buf.pop(0)
                ahead = self.peek()
            if prev != NL and ahead == NL:
                self.buf.pop(0)           # first NL of a pair can be returned
                break
            if ahead == NL:               # run of newlines?
                while ahead == NL or ahead == CR:
                    self.buf.pop(0)       # swallow the entire run
                    ahead = self.peek()
            elif ahead == END or ellyChar.isWhiteSpace(ahead):
                continue                  # NL before white space is dropped
            elif ahead == u'.' or ahead == u'-':
                pass                      # NL kept before period or hyphen
            else:
                ch = SP                   # otherwise NL acts as a plain space
        else:
            ch = SP                       # any other white space becomes plain space

        if not ellyChar.isWhiteSpace(prev):
            break                         # keep one space after a non-space char

    return ch                             # next filtered char