def _planAhead ( buf ):

    """
    check for possible problems in the next scan while context is still
    available and set flags if needed

    looks for a leading word found in _det followed by a space and a
    capitalized word; when that shape is seen, the module-level _toscan
    is set from the buffer length and the number of chars scanned

    arguments:
        buf   - buffer to be scanned

    returns:
        nothing; the only effect is a possible assignment to _toscan
    """

    # NOTE(review): nesting below was reconstructed from whitespace-collapsed
    # source -- confirm against version control

    global _toscan

    nsk = 0                     # total skip count
    lb = len(buf)
    if lb > 4:
        if buf[0] == '(':       # skip initial '('
            nsk += 1
            buf = buf[1:]
        if buf[0] == '"':       # skip initial '"'
            nsk += 1
            buf = buf[1:]
        lb -= nsk               # length left after skipped chars

    nix = 0                     # scan count
    if lb > 8:
        for chx in buf:         # go to first non-letter
            if not ellyChar.isLetter(chx):
                if ellyChar.isWhiteSpace(chx):
                    break       # must be space
                return          # any other non-letter ends the check
            nix += 1

        sst = ''.join(buf[:nix]).lower()
        if not sst in _det:
            return              # must find determiner

        nix += 1                # skip space
        # NOTE(review): buf[nix] is not range-checked here -- confirm callers
        # never pass a buffer ending right after the first word
        if ellyChar.isUpperCaseLetter(buf[nix]):
            nix += 1            # skip first letter
            buf = buf[nix:]     # continue scan in rest of buffer

            for ch in buf:      # go to next non-letter
                if not ellyChar.isLetter(ch):
                    if ellyChar.isWhiteSpace(ch):
                        break
                    return
                nix += 1

            _toscan = lb + nsk - nix
def _extract ( buf , nch ): """ get next possible name components at current position arguments: buf - current contents as list of chars nch - char count to work with returns: component string if found, otherwise '' """ if nch == 0: return '' lrw = buf[:nch] # list of chars in possible component # print ( 'lwr=' , lrw ) if lrw[0] == ',': if nch == 1 or not ellyChar.isWhiteSpace(lrw[1]): return '' lrw.pop(0) lrw.pop(0) # print ( 'lrw=' , lrw ) if lrw[0] == '(': lrw.pop() # remove any pair of parentheses before lookup lrw.pop(0) # print ( 'lrw=' , lrw ) if len(lrw) > 2 and lrw[0] == '"' and lrw[-1] == '"': lrw.pop() # remove any pair of double quotes before lookup lrw.pop(0) if len(lrw) > 0 and lrw[-1] == ',': lrw.pop() # print ( 'lrw=' , lrw ) return ''.join(lrw) # possible name component as string
def normalize ( s ):

    """
    build a cleaned-up copy of a char sequence: each run of white space
    becomes one plain space and each char that is neither letter, white
    space, nor recognized text becomes '_'

    arguments:
        s     - Unicode string or char list to operate on

    returns:
        normalized sequence as a new list of chars
    """

    out = [ ]
    after_space = False         # was the last char kept a space?
    for ch in s:
        if ellyChar.isLetter(ch):
            after_space = False
        elif ellyChar.isWhiteSpace(ch):
            if after_space:
                continue        # drop repeated white space
            ch = ' '
            after_space = True
        else:
            after_space = False
            if not ellyChar.isText(ch):
                ch = '_'        # unrecognizable char
        out.append(ch)
    return out
def findClose ( self , opn , cls ):

    """
    scan the start of the input buffer for the closing bracket matching
    an already seen opening bracket, allowing for nested pairs

    arguments:
        self  -
        opn   - opening bracket
        cls   - closing bracket to look for

    returns:
        offset in stream if found, -1 otherwise
    """

    buf = self.buf
    limit = min(len(buf),NLM)   # lookahead never goes past NLM chars
    depth = 0                   # nested opening brackets still unclosed
    for pos in range(limit):
        ch = buf[pos]
        if ch == opn:
            depth += 1          # its closing bracket must be skipped
            continue
        if ch != cls:
            continue
        if depth > 0:
            depth -= 1
        elif pos + 1 == limit or ellyChar.isWhiteSpace(buf[pos+1]):
            return pos          # closure must end the lookahead or precede space
    return -1
def normalize(self, s):
    """
    produce a cleaned-up copy of input chars: runs of white space
    collapse to one plain space and chars that are not letters, white
    space, or recognized text turn into '_'

    arguments:
        self  -
        s     - Unicode string or char list to operate on

    returns:
        normalized sequence as a new list of chars
    """

    result = []
    prev_blank = False          # True while inside a run of white space
    for ch in s:
        letter = ellyChar.isLetter(ch)
        blank = not letter and ellyChar.isWhiteSpace(ch)
        if blank:
            if prev_blank:
                continue        # only first white space of a run is kept
            ch = ' '
        elif not letter and not ellyChar.isText(ch):
            ch = '_'
        prev_blank = blank
        result.append(ch)
    return result
def findClose(self, opn, cls):
    """
    search the head of the input buffer for a closing bracket, skipping
    over any complete nested bracket pairs on the way

    arguments:
        self  -
        opn   - opening bracket
        cls   - closing bracket to look for

    returns:
        offset in stream if found, -1 otherwise
    """

    stop = len(self.buf) if len(self.buf) <= NLM else NLM  # lookahead limit
    pending = 0                 # closing brackets owed to nested openings
    for at in range(stop):
        here = self.buf[at]
        if here == opn:
            pending += 1
        elif here == cls:
            if pending:
                pending -= 1
            elif at + 1 == stop or ellyChar.isWhiteSpace(self.buf[at + 1]):
                return at       # offset for closure
    return -1
def _extract ( buf , nch ): """ get next possible name components at current position arguments: buf - current contents as list of chars nch - char count to work with returns: component string if found, otherwise '' """ if nch == 0: return '' lrw = buf[:nch] # list of chars in possible component # print 'lwr=' , lrw if lrw[0] == ',': if nch == 1 or not ellyChar.isWhiteSpace(lrw[1]): return '' lrw.pop(0) lrw.pop(0) # print 'lrw=' , lrw if lrw[0] == '(': lrw.pop() # remove any pair of parentheses before lookup lrw.pop(0) # print 'lrw=' , lrw if len(lrw) > 2 and lrw[0] == '"' and lrw[-1] == '"': lrw.pop() # remove any pair of double quotes before lookup lrw.pop(0) if len(lrw) > 0 and lrw[-1] == ',': lrw.pop() # print 'lrw=' , lrw return u''.join(lrw) # possible name component as string
def normalize ( s ):

    """
    convert all non-ASCII nonalphanumeric in sequence to _ and consecutive
    white spaces to a single space char

    the sequence is compacted in place; chars left over at the end after
    white space has been collapsed are deleted

    arguments:
        s     - input list of chars to operate on, modified in place

    returns:
        nothing
    """

    spaced = False              # was the last char kept a space?
    k = 0                       # next position to write
    n = len(s)
    for i in range(n):          # k never passes i, so reads stay valid
        x = s[i]
        if ellyChar.isLetter(x):
            spaced = False
        elif ellyChar.isWhiteSpace(x):
            if spaced:
                continue        # drop repeated white space
            x = ' '
            spaced = True
        elif ord(x) > 127:
            x = '_'
            spaced = False
        else:
            spaced = False
        s[k] = x
        k += 1
    del s[k:]                   # truncate caller's list; rebinding s would not
def normalize(self, s):
    """
    override of the parent class method: every letter is replaced by '_',
    white space is removed, and all other chars are kept as they are

    arguments:
        self  -
        s     - Unicode string or char list to operate on

    returns:
        normalized sequence as a new list of chars
    """

    out = []
    for ch in s:
        if ellyChar.isLetter(ch):
            out.append('_')
        elif not ellyChar.isWhiteSpace(ch):
            out.append(ch)
    return out
def matchtoo(txt, pnc, ctx):
    """
    complex checks - currently only for rightmost period of A.M. or P.M.

    the text must end in ' A.M' or ' P.M' (either case) and be preceded
    by a digit or by a number word from 'one' to 'twelve'

    arguments:
        txt   - list of text chars leading up to punctuation char
        pnc   - punctuation char
        ctx   - list of chars in context after punctuation

    returns:
        True on match, False otherwise
    """

    ln = len(txt)
    nxt = ctx[0] if len(ctx) > 0 else ''
    if pnc != '.' or not ellyChar.isWhiteSpace(nxt) or ln < 5:
        return False
    if (txt[-1] not in ('M', 'm') or txt[-2] != '.' or
            txt[-3] not in ('P', 'p', 'A', 'a') or txt[-4] != ' '):
        return False
    ch = txt[-5]                # last char before ' A.M' or ' P.M'
    if ellyChar.isDigit(ch):    # only 1 digit will be checked here!
        return True             # erring on the side of not to break sentence
    if not ellyChar.isLetter(ch):
        return False

    #
    # the following code is needed only when number transforms are turned off
    #

    nn = 6                      # scan back over letters of the word at txt[-5]
    while nn <= ln and ellyChar.isLetter(txt[-nn]):
        nn += 1
    nw = nn - 5                 # letter count of word; txt[-5] is its last
    if nw < 3 or nw > 6:        # 'one' is shortest, 'eleven' is longest
        return False
    if nn <= ln and txt[-nn] not in (' ', '-'):
        return False            # word must start text or follow space or hyphen
    wd = ''.join(txt[-nn + 1:-4]).lower()
    if wd not in ('one', 'two', 'three', 'four', 'five', 'six', 'seven',
                  'eight', 'nine', 'ten', 'eleven', 'twelve'):
        return False
    if len(ctx) < 2 or ellyChar.isUpperCaseLetter(ctx[1]):
        return False            # capital after the space suggests new sentence
    return True
def spc(c):
    """
    test whether a char counts as a space, with the null string also
    treated as one

    arguments:
        c     - single char

    returns:
        True if white space or null, False otherwise
    """

    if c == '':
        return True
    return ellyChar.isWhiteSpace(c)
def spc ( c ):

    """
    space test that also accepts the null string

    arguments:
        c     - single char

    returns:
        True if white space or null, False otherwise
    """

    return True if c == '' else ellyChar.isWhiteSpace(c)
def matchtoo ( txt , pnc , ctx ):

    """
    complex checks - currently only for rightmost period of A.M. or P.M.

    the text must end in ' A.M' or ' P.M' (either case) and be preceded
    by a digit or by a number word from 'one' to 'twelve'

    arguments:
        txt   - list of text chars leading up to punctuation char
        pnc   - punctuation char
        ctx   - list of chars in context after punctuation

    returns:
        True on match, False otherwise
    """

    ln = len(txt)
    nxt = ctx[0] if len(ctx) > 0 else ''
    if pnc != '.' or not ellyChar.isWhiteSpace(nxt) or ln < 5:
        return False
    if (txt[-1] not in ('M','m') or txt[-2] != '.' or
            txt[-3] not in ('P','p','A','a') or txt[-4] != ' '):
        return False
    ch = txt[-5]                # last char before ' A.M' or ' P.M'
    if ellyChar.isDigit(ch):    # only 1 digit will be checked here!
        return True             # erring on the side of not to break sentence
    if not ellyChar.isLetter(ch):
        return False

    #
    # the following code is needed only when number transforms are turned off
    #

    nn = 6                      # scan back over letters of the word at txt[-5]
    while nn <= ln and ellyChar.isLetter(txt[-nn]):
        nn += 1
    nw = nn - 5                 # letter count of word; txt[-5] is its last
    if nw < 3 or nw > 6:        # 'one' is shortest, 'eleven' is longest
        return False
    if nn <= ln and txt[-nn] not in ( ' ' , '-' ):
        return False            # word must start text or follow space or hyphen
    wd = ''.join(txt[-nn + 1:-4]).lower()
    if wd not in ( 'one' , 'two' , 'three' , 'four' , 'five' , 'six' , 'seven' ,
                   'eight' , 'nine' , 'ten' , 'eleven' , 'twelve' ):
        return False
    if len(ctx) < 2 or ellyChar.isUpperCaseLetter(ctx[1]):
        return False            # capital after the space suggests new sentence
    return True
def _reload ( self ): """ refill input line buffer and compute indentation arguments: self returns: True on success if buffer has at least one char, False otherwise """ # print '_reload' if len(self.buf) > 0: return True # no refilling needed if self._eof: return False # must return immediately on previous EOF while len(self.buf) == 0: # print 'get more text' try: if self._prmpt: sys.stdout.write('>> ') s = self.inp.readline() # new text line to add if len(s) == 0: # print '**EOF' self._eof = True return False # EOF s = s.decode('utf8') # to tell Python how to interpret input string # print 'raw s=' , s except IOError: print >> sys.stderr , '** char stream ERROR' return False # treat read failure as empty line k = 0 while k < len(s): # count leading white space chars if s[k] == NL: break # but stop at end of line if not ellyChar.isWhiteSpace(s[k]): break k += 1 self._in = k # save indentation level s = s[k:] self.buf = list(s) # put unindented text into buffer # print 'k=' , k , ', s=' , '"' + s + '"' # print self.buf if k > 0 and ellyConfiguration.noteIndentation: self.buf.insert(0,NL) # if noted, indentation will break sentence # print 'len=' , len(self.buf) if len(self.buf) > 0: # if no usable input, stop return True return False
def append(self, text):
    """
    add chars to end of buffer, with a space put in as separator when
    neither the buffer end nor the new text start supplies one

    arguments:
        self  -
        text  - text to append, string or list of chars; empty text
                leaves the buffer unchanged
    """

    if not isinstance(text, list):
        text = list(text)           # get new text as list if not already
    if len(text) == 0:
        return                      # nothing to add; text[0] below would fail
    if len(self.buffer) > 0:
        if not ellyChar.isWhiteSpace(self.buffer[-1]) and text[0] != ' ':
            self.buffer.append(' ') # put in space separator if needed
    self.buffer.extend(text)        # add new text
def atSpace(self):
    """
    report whether the buffer begins with a white space char

    arguments:
        self

    returns:
        True if found, False otherwise (including for an empty buffer)
    """

    return len(self.buffer) > 0 and ellyChar.isWhiteSpace(self.buffer[0])
def append ( self , text ):

    """
    add chars to end of buffer, with a space put in as separator when
    neither the buffer end nor the new text start supplies one

    arguments:
        self  -
        text  - text to append, string or list of chars; empty text
                leaves the buffer unchanged
    """

    if not isinstance(text,list):   # get new text as list
        text = list(text)
    if len(text) == 0:
        return                      # nothing to add; text[0] below would fail
    if len(self.buffer) > 0:
        if not ellyChar.isWhiteSpace(self.buffer[-1]) and text[0] != ' ':
            self.buffer.append(' ') # put in space separator if needed
    self.buffer.extend(text)        # add new text
def atSpace ( self ):

    """
    check whether the first char of the buffer is white space

    arguments:
        self

    returns:
        True if found, False otherwise (including for an empty buffer)
    """

    return ellyChar.isWhiteSpace(self.buffer[0]) if len(self.buffer) > 0 else False
def skipSpaces(self):
    """
    drop any run of white space chars from the front of the buffer and
    then call self._reset(); an empty buffer is left untouched

    arguments:
        self
    """

    total = len(self.buffer)
    if total == 0:
        return None
    first = total               # offset of first non-space; all spaces so far
    for pos in range(total):
        if not ellyChar.isWhiteSpace(self.buffer[pos]):
            first = pos
            break
    self.buffer = self.buffer[first:]
    self._reset()
def skipSpaces ( self ):

    """
    remove leading white space chars from the buffer and then call
    self._reset(); nothing is done for an empty buffer

    arguments:
        self
    """

    if len(self.buffer) == 0:
        return None
    skip = 0                    # count of leading white space chars
    for ch in self.buffer:
        if not ellyChar.isWhiteSpace(ch):
            break
        skip += 1
    self.buffer = self.buffer[skip:]
    self._reset()
def compile ( name , stb , defn , stem=None ):

    """
    static method to create an Elly vocabulary database from text file input

    each input line of the form  term : definition  is parsed into syntax
    info, optional semantic features, optional plausibility with concept,
    and the rest of the definition, and is then stored as one UTF-8 record
    under a lower case key taken from the first word of the term

    arguments:
        name  - for new BSDDB database
        stb   - Elly symbol table
        defn  - Elly definition reader for vocabulary
        stem  - optional stemmer for indexing

    exceptions:
        TableFailure on error
    """

    # NOTE(review): nesting below was reconstructed from whitespace-collapsed
    # source -- confirm against version control
    # NOTE(review): the except FormatFailure handler suggests _err() raises
    # that exception, which would make each 'continue' after _err() a
    # safeguard only -- confirm in _err()

    global nerr                         # error count is kept at module level
    nerr = 0

    if stb == None :
        print >> sys.stderr, 'no symbol table'
        raise ellyException.TableFailure
    if db == None :
        print >> sys.stderr, 'no Python db package'
        raise ellyException.TableFailure
    try:
        zfs = FSpec(stb,'[$]',True).positive.hexadecimal(False)
    except ellyException.FormatFailure: # should never need this
        print >> sys.stderr , 'unexpected failure with zero features'
        raise ellyException.TableFailure
                                        # zfs = hexadecimal for all features off

    tsave = ''                          # original term
    dsave = ''                          #          definition

    try:
        filn = name + vocabulary        # where to put vocabulary database
        try:
            os.remove(filn)             # delete the file if it exists
        except OSError:
            print >> sys.stderr , 'no' , filn

        dbs = db.DB()                   # create new database
        dbs.set_flags(db.DB_DUP)        # keys may identify multiple records
        dbs.open(filn,None,db.DB_HASH,db.DB_CREATE) # open new database file

        r = None                        # for error reporting

        while True:                     # process vocabulary records

            try:
                r = defn.readline()               # next definition
                if len(r) == 0: break             # stop on EOF
                if r[0] == '#': continue          # skip comment line

                k = r.find(':')                   # look for first ':'
                if k < 0:
                    tsave = r
                    dsave = None
                    _err()                        # report error and quit entry
                    continue

                t = r[:k].strip()                 # term to go into dictionary
                d = r[k+1:].strip()               # its definition
                tsave = t                         # save for any error reporting
                dsave = d                         #

                if len(t) == 0 or len(d) == 0:
                    _err()                        # quit on missing parts
                    continue
                c = t[0]                          # term must start with letter,
                                                  # digit, '.', or '"'
                if not ellyChar.isLetterOrDigit(c) and c != '.' and c != '"':
                    _err('bad term')
                    continue

                n = toIndex(t)                    # get part of term to index
                if n == 0:
                    _err()                        # quit on bad term
                    continue
                w = t[:n]                         # first word of term to define
                if stem != None:
                    try:
                        w = stem.simplify(w)      # reduce for lookup key
                    except ellyException.StemmingError:
                        _err('bad stemming logic')
                        continue
                lcw = lcAN(w)                     # convert to ASCII lower case

                ns = syntaxSpecification.scan(d)  # find extent of syntax info
                # NOTE(review): no 'continue' here, unlike other _err() calls
                if ns <= 0: _err('bad syntax specification')
                syn = d[:ns]                      # syntax info as string
                d = d[ns:].strip()                # rest of definition

                try:
                    ss = SSpec(stb,syn)           # decode syntax info to get
                except ellyException.FormatFailure:
                    _err('malformed syntax specification')
                    continue
                cat = str(ss.catg)                          #   syntax category
                syf = ss.synf.positive.hexadecimal(False)   #   syntactic flags

                smf = zfs                         # initialize defaults for
                pb = '0'                          #   cognitive semantics
                cn = '-'                          #

                if len(d) > 1:                    # check for cognitive semantics
                    x = d[0]
                    if x == '[' or x == '0' or x == '-':  # semantic features?
                        if x != '[':              # a '0' or '-' means to take default
                            if len(d) == 1 or d[1] != ' ':
                                _err('missing semantic features')
                                continue
                            d = d[2:].strip()     # skip over
                        else:
                            ns = featureSpecification.scan(d) # look for ']' of features
                            if ns < 0:
                                _err()
                                continue
                            sem = d[:ns]          # get semantic features
                            d = d[ns:].strip()    # skip over
                            try:
                                fs = FSpec(stb,sem,True)
                            except ellyException.FormatFailure:
                                _err('bad semantic features')
                                continue
                            smf = fs.positive.hexadecimal(False) # convert to hex

                        ld = len(d)               # plausibility must follow features
                        if ld == 0:
                            _err('missing plausibility')
                            continue
                        np = 0
                        x = d[np]
                        if x == '+' or x == '-':
                            np += 1               # take any plus or minus sign
                        while np < ld:            # and successive digits
                            if ellyChar.isDigit(d[np]): np += 1
                            else: break
                        if np == 0:
                            _err('missing plausibility')
                            continue
                        pb = d[:np]               # plausibility bias
                        d = d[np:]
                        ld = len(d)
                        if ld > 1:                # any more to process?
                            c = d[0]              # get next char after bias
                            d = d[1:]             # advance scan
                            ld -= 1
                            if c == '/':          # check for explicit concept
                                np = 0
                                while np < ld:    # get extent of concept
                                    if ellyChar.isWhiteSpace(d[np]): break
                                    np += 1
                                if np == 0:
                                    _err('missing concept for plausibility')
                                    continue
                                cn = d[:np]       # extract concept
                                d = d[np:]
                            elif c != ' ':
                                _err()            # signal bad format
                                continue
                        elif ld > 0:
                            _err()                # unidentifiable trailing text
                            continue

                d = d.strip()                     # rest of definition
                if len(d) > 0 and d[-1] == '=':   # ends in '=': must be just '='
                    if len(d) == 1 or d[0] != '=':
                        _err('incomplete definition')
                        continue

                ld = [ ]                          # for normalizing definition
                k = 0                             # count spaces removed
                sd = ''                           # previous char seen
                for cd in d:                      # scan all chars in translation
                    if cd == ' ':
                        if sd == '=' or sd == ',' or sd == ' ':
                            k += 1                # drop space after '=', ',', ' '
                            sd = cd
                            continue
                    elif cd == '=' or cd == ',':  # no spaces before '=' or ','
                        if sd == ' ':
                            k += 1
                            ld.pop()
                        if cd == ',':
                            if sd == '=':
                                _err('missing translation')
                            cd = '#'              # format for PICK operation
                        elif cd == '=' and sd == '=':
                            print >> sys.stderr , '** WARNING \'=\' followed by \'=\''
                            print >> sys.stderr , '* at [' , tsave , ']'
                    sd = cd
                    ld.append(cd)                 # add char to reformatted definition

                if k > 0:
                    d = ''.join(ld)               # definition with spaces removed

                vrc = [ t , ':' , cat , syf , smf , pb , cn ] # start BdB data record
                vss = u' '.join(vrc)              # convert to string
                vss += u' ' + d                   # fill out record with rest of input
                rss = vss.encode('utf8')          # convert to UTF-8

            except ellyException.FormatFailure:   # bad entry: show it, go to next
                print >> sys.stderr , '* at [' , tsave ,
                if dsave != None:
                    print >> sys.stderr , ':' , dsave ,
                print >> sys.stderr , ']'
                continue

            dbs.put(lcw,rss)                      # save in database

        dbs.close()                               # clean up

    except StandardError , e:                     # catch any other errors
        print >> sys.stderr , '**' , e
        print >> sys.stderr , '* at' , r
        nerr += 1
def read(self): """ get next char from input stream with filtering arguments: self returns: single Unicode char on success, null string otherwise """ # print 'reading: buf=' , self.buf while True: if not self._reload( ): # check if buffer empty and reload if needed return END # return EOF if no more chars available # print 'buf=' , self.buf c = self.buf.pop(0) # next raw char in buffer if c == SHYP: # ignore soft hyphen if len(self.buf) > 0: if self.buf[0] == SP: c = self.buf.pop(0) continue if not ellyChar.isText(c): # unrecognizable Elly char? # print 'c=' , '{0:04x}'.format(ord(c)) if ellyChar.isCJK(c): if ellyConfiguration.language != 'ZH': c = '_' # special handling for non-Chinese input elif not c in [u'\uff0c', u'\u3002']: # print 'replace' , c , 'with NBSP' c = NBSP # by default, replace with no-break space lc = self._lc # copy saved last char # print 'lc=' , ord(lc) self._lc = c # set new last char # if c == "'": # print 'apostrophe' , self.buf # print 'c=' , '<' + c + '>' if c == HYPH: # special treatment for isolated hyphens if spc(lc) and spc(self.peek()): c = DASH break elif c == '.': # check for ellipsis bb = self.buf bl = len(bb) # print 'bl=' , bl , 'bb=' , bb if bl >= 2 and bb[0] == '.' and bb[1] == '.': self.buf = bb[2:] c = ELLP elif bl >= 4 and bb[0] == ' ' and bb[1] == '.' and bb[ 2] == ' ' and bb[3] == '.': self.buf = bb[4:] c = ELLP break elif c == RSQm: # check for single quote # print 'at single quote' nc = self.peek() # look at next char # print 'next=' , nc if nc == RSQm: # doubling of single quote? 
self.buf.pop(0) # if so, combine two single quotes c = RDQm # into one double quote elif not ellyChar.isWhiteSpace(c): if ellyChar.isWhiteSpace(lc): self._cap = ellyChar.isUpperCaseLetter(c) break elif c == CR: # always ignore continue elif c == NL: # special handling of \n # print 'got NL' nc = self.peek() # look at next char while nc == CR: self.buf.pop(0) # skip over CR's nc = self.peek() # print "lc= '" + lc + "'" if lc != NL and nc == NL: self.buf.pop(0) # special case when NL can be returned break if nc == NL: # NL followed NL? while nc == NL or nc == CR: self.buf.pop(0) # ignore subsequent new line chars nc = self.peek() elif nc == END or ellyChar.isWhiteSpace(nc): continue # NL followed by space is ignored elif nc == u'.' or nc == u'-': pass else: # print 'NL to SP, lc=' , ord(lc) c = SP # convert NL to SP if not before another NL else: # print 'lc=' , ord(lc) , 'c=' , ord(c) c = SP # otherwise, convert white space to plain space self._cap = False if not ellyChar.isWhiteSpace( lc): # preceding char was not white space? # print 'return SP' break # if so, keep space in stream return c # next filtered char
def match ( self , txt , pnc , nxt ):

    """
    compare a punctuation mark and its context with a pattern

    the stored patterns for the punctuation are tried in order; a pattern
    matches when its left part, if any, matches the end of the lowercased
    text and its right part accepts the char after the punctuation

    arguments:
        self  -
        txt   - list of text chars up to and including punctuation char
        pnc   - punctuation char
        nxt   - single char after punctuation

    returns:
        True on match, False otherwise
    """

    self.noteBracketing(pnc)    # just in case this is bracketing

    if not pnc in self.lstg:    # get stored patterns for punctuation
        return False

    lp = self.lstg[pnc]

    txl = txt[-self.maxl:] if len(txt) > self.maxl else txt

    # must be a real list for len() and slicing below; map() would give
    # only an iterator under Python 3
    txs = [ x.lower() for x in txl ]    # actual left context for matching

    lt = len(txs)               # its length

    for p in lp:                # try matching each pattern

        if p.left is not None:

            n = len(p.left)     # assume each pattern element must match one sequence char
            if n > lt:
                continue        # fail immediately because of impossibility of match
            if n < lt and ellyChar.isLetterOrDigit(txs[-n-1]):
                continue        # fail because of text to match is after alphanumeric
            t = txs if n == lt else txs[-n:]
            if not ellyWildcard.match(p.left,t,0):
                continue

        if p.right == u'' or p.right == nxt:  # check for specific char after possible stop
            return True
        if p.right == ellyWildcard.cCAN:      # check for nonalphanumeric
            if nxt == u'' or not ellyChar.isLetterOrDigit(nxt):
                return True
        if p.right == ellyWildcard.cSPC:      # check for white space
            if nxt == u'' or nxt == u' ' or nxt == u'\n':
                return True
        if p.right == u'.':                   # check for any punctuation
            if not ellyChar.isLetterOrDigit(nxt) and not ellyChar.isWhiteSpace(nxt):
                return True

    return False
def getNext(self):
    """
    extract next sentence for Elly translation from input stream

    chars are read from self.inp into a sentence buffer until NL, a char
    in Hards, or stop punctuation not covered by a stop exception, with
    extra handling for table delimiters, quotes, bracketing, dashes,
    ellipses, and numbers ending in a decimal point

    arguments:
        self

    returns:
        list of chars for next sentence on success, None on empty stream
    """

    # NOTE(review): nesting below was reconstructed from whitespace-collapsed
    # source -- confirm against version control

    self.resetBracketing()
    inBrkt = False              # inside bracketing?

    nspc = 0                    # set space count

    sent = []                   # list buffer to fill

    x = self.inp.read()
    if x == SP:                 # skip one leading space
        x = self.inp.read()

    if x == END:                # EOF check
        return None

    c = END                     # reset
    lc = END

    self.inp.unread(x, SP)      # put first char back to restore input

    # fill sentence buffer up to next stop punctuation in input

    nAN = 0                     # alphanumeric count in sentence

    while True:

        x = self.inp.read()     # next input char

        if x == END:            # handle any EOF
            break

        # check for table delimiters in text

        if len(sent) == 0:
            if x == '.' or x == '-':        # look for multiple '.' or '-'
                while True:                 # scan up to end of current buffering
                    y = self.inp.read()     #
                    if y != x and y != SP:  # no more delimiter chars or spaces?
                        self.inp.unread(y)  # if so, done
                        break               #
                continue                    # ignore everything seen so far

        ####################################################
        # accumulate chars and count alphanumeric and spaces
        ####################################################

        lc = c                  # last char
        c = x                   # current char
        nc = self.inp.peek()    # next char
        if ellyChar.isWhiteSpace(nc): nc = SP

        if lc == SP or lc == END:   # normalize chars for proper bracketing
            if x == SQuo:           #
                x = LSQm            # a SQuo preceded by a space becomes LSQm
            elif x == DQuo:         #
                x = LDQm            # a DQuo preceded by a space becomes LDQm
        if nc == SP or nc == END:   #
            if x == SQuo:           # a SQuo followed by a space becomes RSQm
                x = RSQm            #
            elif x == DQuo:         # a DQuo followed by a space becomes RDQm
                x = RDQm            #
        elif not ellyChar.isLetterOrDigit(nc):
            if x == SQuo:           # a SQuo followed by nonalphanumeric becomes RSQm
                x = RSQm            #
            elif x == DQuo:         # a DQuo followed by nonalphanumeric becomes RDQm
                x = RDQm            #
        elif ellyChar.isWhiteSpace(c) and inBrkt:
            nspc += 1

        svBrkt = inBrkt
        inBrkt = self.checkBracketing(x)    # do bracketing check with modified chars
        if svBrkt and not inBrkt: nspc = 0  # space count restarts on leaving bracketing

        sent.append(c)          # put original char into sentence buffer
        if ellyChar.isLetterOrDigit(c):
            nAN += 1
            continue            # if alphanumeric, just add to sentence

        if c == SP:
            continue            # if space, just add to sentence

        # NL will break a sentence

        if c == NL:
            sent.pop()          # remove from sentence chars
            break

        # certain Unicode punctuation will always break

        if c in Hards:
            break

        # char was not alphanumeric or space
        # look for stop punctuation exception

        cx = self.inp.preview() # for context of match call

        if c in Stops and len(cx) > 0 and cx[0] == SP:
            if self.stpx.match(sent[:-1], c, cx):
                if self.drop:
                    sent.pop()  # remove punctuation char from sentence
                    lc = SP
                continue

        # handle any nonstandard punctuation

        exoticPunctuation.normalize(c, self.inp)

        # check for dash

        if c == '-':
            d = self.inp.read()
            if d == '-':
                while True:     # take all the hyphens that follow
                    d = self.inp.read()
                    if d != '-': break
                sent.append(c)
            self.inp.unread(d)
            continue

        # check for sentence break on punctuation

        if c in QUOs or c in RBs:

            # special check for single or double quotes or
            # bracketing, which can immediately follow stop
            # punctuation for current sentence

            if not inBrkt:
                z = self.inp.read()
                if self.shortBracketing(sent, z):
                    break
                self.inp.unread(z)
                if z == END or ellyChar.isWhiteSpace(z) and lc in Stops:
                    if nAN > 1:
                        break
            elif c in QUOs and lc in Stops:
                z = self.inp.read()
                if z in RBs:
                    sent.append(z)
                    y = self.inp.read()
                    if y in Stops:
                        sent.append(y)
                    elif not ellyChar.isWhiteSpace(y):
                        self.inp.unread(y)
                    inBrkt = False
                    break
                elif z in QUOs:
                    sent.append(z)
                    inBrkt = False
                    break
                self.inp.unread(z)
            continue

        elif not c in Stops:
            continue

        else:
            d = self.inp.read() # char after stop punctuation

            if d == None: d = '!'

            if c == '.' and c == d:
                if self.inp.peek() != c:    # look for third '.' in ellipsis
                    self.inp.unread(d)      # if none, keep only first '.'
                else:
                    self.inp.skip()         # found ellipsis
                    sent.append(d)          # complete it in sentence buffer
                    sent.append(d)          #
                    x = self.inp.peek()     # look at char after ellipsis
                    if ellyChar.isCombining(x):
                        sent.append(SP)     # if part of token, put in space as separator
                continue

            if c == ELLP:
                if ellyChar.isUpperCaseLetter(d):
                    self.inp.unread(d)      # super special case of bad punctuation
                    self.inp.unread(' ')    # put in implied period and space
                    self.inp.unread('.')    #

            # special check for multiple stops

            if d in Stops:
                while True:
                    d = self.inp.read()
                    if not d in Stops: break
                self.inp.unread(d)
                if not ellyChar.isWhiteSpace(d):
                    d = SP                  # make rightside context for stop

            # special check for blank or null after stops

            elif d != END and not ellyChar.isWhiteSpace(d):
                if self.shortBracketing(sent, d): break
                if d in self._cl and self._cl[d] == 1:
                    dn = self.inp.peek()
                    if ellyChar.isWhiteSpace(dn):
                        sent.append(d)
                        break
                self.inp.unread(d)
                continue                    # if no match for lookahead, put back

            elif d != END:
                self.inp.unread(d)

            # check special case of number ending in decimal point

            if c == '.':
                ixb = len(sent) - 2         # index of char just before the '.'
                ixn = ixb + 1
                cxn = ''
                while ixn > 0:              # scan back over digits
                    ixn -= 1
                    cxn = sent[ixn]
                    if not ellyChar.isDigit(cxn): break
                if ixn < ixb and cxn in [' ', '-', '+']:
                    prvw = self.inp.preview()
                    if len(prvw) > 1 and not ellyChar.isUpperCaseLetter(prvw[1]):
                        continue

            # final check: is sentence long enough?

            if inBrkt:
                if c in [':', ';'] or nspc < 3:
                    sent.append(d)
                    self.inp.skip()
                    nspc -= 1
                    continue

            cx = self.inp.peek()
            if cx == None: cx = '!!'

            if nAN > 1:                     # break only after 2 or more alphanumerics
                break

    if sent == ['\u2026']:      # special case of sentence
        return list("-.-")      # with lone ellipsis
    elif len(sent) > 0 or self.last != END:
        return sent
    else:
        return None
def match ( patn , text , offs=0 , limt=None , nsps=0 ):

    """
    compare a pattern with wildcards to input text

    Literal pattern chars are matched first; a text char also matches when
    its lowercase form equals the pattern char. On a mismatch, the pattern
    element is interpreted as a wildcard or control code. If that fails as
    well, the most recent unwinding record with backups left is used to
    retry; the match fails when no such record remains.

    arguments:
        patn  - pattern to be matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit for any matching (len(text) when None)
        nsps  - number of spaces to match in pattern

    returns:
        on success, a list whose element 0 is the text offset reached by the
        match and whose other elements are [start,end] binding ranges;
        None otherwise
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
            nsps   - saved count of spaces matched
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - of winding
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1
            self.nsps  = 0

        def __str__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self

            returns:
                attributes formatted as a single bracketed string
            """
            return ( '[kd=' + str(self.kind)  +
                     ',ct=' + str(self.count) +
                     ',pa=' + str(self.pats)  +
                     ',tx=' + str(self.txts)  +
                     ',bd=' + str(self.bnds)  +
                     ',ns=' + str(self.nsps)  + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # four private functions using local variables of match() defined just above
    # (they only read offs, mp, mbi and unj; the caller advances mbi and unj)

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame

        arguments:
            ns  - optional initial span of text for binding

        returns:
            binding frame
        """
        os = offs - 1               # offs has already moved past the char just read
        if ns == None: ns = 1       # by default, binding is to 1 char
        if mbi == len(mbd):         # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]               # next available record
        bf[0] = os                  # set binding to range of chars
        bf[1] = os + ns             #
        return bf

    def _modify ( ):
        """
        set special tag for binding

        The tag is an extra None element on the current frame; the
        consolidation loop at the end of match() recognizes such frames by
        their length, strips the tag, and never merges them with neighbors.

        arguments:
        """
        mbd[mbi].append(None)

    def _mark ( kind , nsp ):
        """
        set up for backing up pattern match

        arguments:
            kind  - 0=optional 1=* wildcard
            nsp   - number of spaces in pattern still unmatched

        returns:
            unwinding frame
        """
        if unj == len(unw):         # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]               # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp
        uf.txts  = offs
        uf.bnds  = mbi
        uf.nsps  = nsp
        return uf

    def _span ( typw , nsp=0 ):
        """
        count chars available for wildcard match

        arguments:
            typw  - wildcard
            nsp   - spaces to be matched in pattern

        returns:
            non-negative count if any match possible, otherwise -1
        """
        k = minMatch(patn[mp:])     # calculate min char count to match rest of pattern

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findExtendedBreak(text,offs,nsp)
        mx -= k                     # max span reduced by exclusion
        if mx < 0: return -1        # cannot match if max span < 0

        tfn = Matching[typw]        # matchup function for wildcard type

        nm = 0
        for i in range(mx):
            c = text[offs+i]        # next char in text from offset
            if not tfn(c): break    # stop when it fails to match
            nm += 1

        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False  # successful pattern match yet?

    if limt == None: limt = len(text)

    mp = 0           # pattern index
    ml = len(patn)   # pattern match limit
    last = ''        # text char most recently read, '' at the limit

    while True:

        ## literally match as many next chars as possible

        while mp < ml:
            if offs >= limt:
                last = ''
            elif limt == 0:
                break
            else:
                last = text[offs]
                offs += 1
            mc = patn[mp]
            if mc != last:
                if mc != last.lower():
                    # a hyphen in the pattern may also match ' - ' in text
                    if mc == Hyphn and last == ' ' and limt - offs > 2:
                        if text[offs] != Hyphn or text[offs+1] != ' ':
                            break
                        offs += 2
                    else:
                        break
            mp += 1

        ## check whether mismatch is due to special pattern char

        if mp >= ml:        # past end of pattern?
            matched = True  # if so, match is made
            break

        tc = patn[mp]       # otherwise, get unmatched pattern element
        mp += 1             #

        if tc == cALL:      # a * wildcard?
            if last != '': offs -= 1      # put back the char that failed literal match

            nm = _span(cALL,nsps)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1      # get new binding record
            bf[0] = offs                  # bind from current offset
            offs += nm                    # move offset past end of span
            bf[1] = offs                  # bind to new offset
            uf = _mark(1,nsps); unj += 1  # get new unwinding record
            uf.count = nm                 # can back up this many times on mismatch
            continue

        elif tc == cEND:    # end specification
            # each accepting case puts back the char just read before continuing
            if last == '':
                continue
            elif last == '-':
                offs -= 1
                continue
            elif last in [ '.' , ',' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]
                if ellyChar.isWhiteSpace(txc) or txc in Trmls or txc in Grk:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' , ellyChar.HYPH ]:
                offs -= 1
                continue
            elif not ellyChar.isText(last):
                offs -= 1
                continue

        elif tc == cANY:    # alphanumeric wildcard?
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cCAN:    # nonalphanumeric wildcard?
            if last != ellyChar.AMP:
                if last == '' or not ellyChar.isLetterOrDigit(last):
                    _bind(); mbi += 1
                    continue

        elif tc == cDIG:    # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF:    # letter wildcard?
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cUPR:    # uppercase letter wildcard?
            if last != '' and ellyChar.isUpperCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cLWR:    # lowercase letter wildcard?
            if last != '' and ellyChar.isLowerCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC:    # space wildcard?
            if last != '' and ellyChar.isWhiteSpace(last):
                nsps -= 1
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cAPO:    # apostrophe wildcard?
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS:    # start of optional sequence
            mf = _bind(0); mbi += 1      # dummy record to block
            mf[0] = -1                   #   later binding consolidation
            if last != '':
                offs -= 1                # try for rematch
            m = mp                       # find corresponding EOS
            while m < ml:                #
                if patn[m] == cEOS: break
                m += 1
            else:                        # no EOS?
                m -= 1                   # if so, pretend there is one anyway
            uf = _mark(0,nsps); unj += 1 # for unwinding on any later match failure
            uf.pats = m + 1              # i.e. one char past next EOS
            uf.txts = offs               # start of text before optional match
            continue

        elif tc == cEOS:    # end of optional sequence
            if last != '':
                offs -= 1                # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:
            if last != '':               # still more to match?
                offs -= 1

                nm = _span(tc,nsps)      # maximum match possible

                if nm == 0:                             # compensate for findExtendedBreak peculiarity
                    if offs + 1 < limt and mp < ml:     # with closing ] or ) to be matched in pattern
                        if patn[mp] in Enc:             # from text input
                            nm += 1

                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs                 # bind from current offset
                    offs += nm                   # move offset past end of span
                    bf[1] = offs                 # bind to new offset
                    uf = _mark(1,nsps); unj += 1
                    uf.count = nm - 1            # at least one char must be matched
                    last = text[offs] if offs < limt else ''
                    continue

        elif tc == '':
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True           # successful match
                break

        ## match failure: rewind to last match branch

        while unj > 0:                   # try unwinding, if possible
            uf = unw[unj-1]              # get most recent unwinding record
            if uf.count <= 0:            # if available count is used up,
                unj -= 1                 # go to next unwinding record
                continue
            uf.count -= 1                # decrement available count
            uf.txts -= uf.kind           # back up one char for scanning text input
            mp = uf.pats                 # unwind pattern pointer
            offs = uf.txts               # unwind text input
            mbi = uf.bnds                # restore binding
            mbd[mbi-1][1] -= uf.kind     # reduce span of binding if for wildcard
            nsps = uf.nsps               #
            break
        else:
            break                        # quit if unwinding is exhausted

    ##
    ## clean up on match mode or on no match possible
    ##

    if not matched: return None     # no bindings

    ## consolidate contiguous bindings for subsequent substitutions
    # NOTE(review): unwinding lowers mbi without truncating mbd, so frames
    # allocated by an abandoned branch look like they are still walked
    # here - confirm that this is intended

    mbdo = mbd
    lb  = -1               # binding reference
    lbd = [ 0 , -1 ]       # sentinel value, not real binding
    mbd = [ lbd ]          # initialize with new offset after matching
    mbdo.pop(0)            # ignore empty binding
    while len(mbdo) > 0:   #
        bd = mbdo.pop(0)   # get next binding
        if len(bd) > 2:    # tagged by _modify(): keep separate, drop the tag
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb = -1
        elif bd[0] < 0:    # check for optional match indicator here
            lb = -1        # if so, drop from new consolidated bindings
        elif lb == bd[0]:  # check for binding continuous with previous
            lb = bd[1]     #
            lbd[1] = lb    # if so, combine with previous binding
        else:              #
            mbd.append(bd) # otherwise, add new binding
            lbd = bd       #
            lb = bd[1]     #

    mbd[0] = offs          # replace start of bindings with text length matched

    return mbd             # consolidated bindings plus new offset
def getNext ( self ):

    """
    extract next sentence for Elly translation from input stream

    Chars are read from self.inp one at a time and collected until stop
    punctuation, a new line, or end of input ends the sentence. Quotation
    marks are normalized only for the bracketing check; the chars stored
    in the sentence are the original ones.

    arguments:
        self

    returns:
        list of chars for next sentence on success, None on empty stream
    """

    self.resetBracketing()
    sent = [ ]                       # list buffer to fill

    x = self.inp.read()
    if x == SP:                      # skip a single leading space
        x = self.inp.read()

    if x == END:                     # EOF check
        return None

    c  = END                         # reset
    lc = END
    self.inp.unread(x,SP)            # put first char back to restore input

    # fill sentence buffer up to next stop punctuation in input

    nAN = 0                          # alphanumeric count in sentence

    while True:

        x = self.inp.read()          # next input char

        if x == END:                 # handle any EOF
            break

        # check for table delimiters in text

        if len(sent) == 0:
            if x == u'.' or x == u'-':      # look for multiple '.' or '-'
                while True:                 # scan up to end of current buffering
                    y = self.inp.read()     #
                    if y != x and y != SP:  # no more delimiter chars or spaces?
                        self.inp.unread(y)  # if so, done
                        break               #
                continue                    # ignore everything seen so far

        #########################################
        # accumulate chars and count alphanumeric
        #########################################

        lc = c                       # previous char
        c  = x                       # current char as read
        nc = self.inp.peek()         # next char, without consuming it
        if ellyChar.isWhiteSpace(nc): nc = SP

        if lc == SP or lc == END: # normalize chars for proper bracketing
            if x == SQuo:         #
                x = LSQm          # a SQuo preceded by a space becomes LSQm
            elif x == DQuo:       #
                x = LDQm          # a DQuo preceded by a space becomes LDQm
        if nc == SP or nc == END: #
            if x == SQuo:         # a SQuo followed by a space becomes RSQm
                x = RSQm          #
            elif x == DQuo:       # a DQuo followed by a space becomes RDQm
                x = RDQm          #
        elif not ellyChar.isLetterOrDigit(nc):
            if x == SQuo:         # a SQuo followed by nonalphanumeric becomes RSQm
                x = RSQm          #
            elif x == DQuo:       # a DQuo followed by nonalphanumeric becomes RDQm
                x = RDQm          #

        inBrkt = self.checkBracketing(x)    # do bracket checking with modified chars

        sent.append(c)                      # but buffer original chars

        if ellyChar.isLetterOrDigit(c):
            nAN += 1
            continue                        # if alphanumeric, just add to sentence

        if c == SP:
            continue                        # if space, just add to sentence

        # NL will break a sentence

        if c == NL:
            sent.pop()                      # remove from sentence chars
            break

        # char was not alphanumeric or space
        # look for stop punctuation exception

        z = self.inp.peek()  # for context of match call

        if c in Stops and self.stpx.match(sent[:-1],c,z):
            if self.drop:
                sent.pop()   # remove punctuation char from sentence
                lc = SP
            continue

        # handle any nonstandard punctuation

        exoticPunctuation.normalize(c,self.inp)

        # check for dash

        if c == u'-':
            d = self.inp.read()
            if d == u'-':                   # run of hyphens is kept as just two
                while True:
                    d = self.inp.read()
                    if d != u'-': break
                sent.append(c)
            self.inp.unread(d)
            continue

        # check for sentence break on punctuation

        if c in QUOs or c in RBs:

            # special check for single or double quotes or
            # bracketing, which can immediately follow stop
            # punctuation for current sentence

            if not inBrkt:
                z = self.inp.read()
                if self.shortBracketing(sent,z):
                    break
                self.inp.unread(z)
                if z == END or ellyChar.isWhiteSpace(z) and lc in Stops:
                    if nAN > 1:
                        break
            continue

        elif not c in Stops or inBrkt:
            continue

        else:
            d = self.inp.read()             # char after the stop punctuation

            if d == None: d = u'!'

            # ellipsis check

            if c == u'.' and c == d:
                if self.inp.peek() != c: # look for third '.' in ellipsis
                    self.inp.unread(c)   # if none, keep only first '.'
                else:
                    self.inp.skip()      # found ellipsis
                    sent.append(d)       # complete it in sentence buffer
                    sent.append(d)       #
                    x = self.inp.peek()  # look at char after ellipsis
                    if ellyChar.isCombining(x):
                        sent.append(SP)  # if part of token, put in space as separator
                continue

            # special check for multiple stops

            if d in Stops:
                while True:
                    d = self.inp.read()
                    if not d in Stops: break
                self.inp.unread(d)
                if not ellyChar.isWhiteSpace(d):
                    d = SP               # make rightside context for stop

            # special check for blank or null after stops

            elif d != END and not ellyChar.isWhiteSpace(d):
                if self.shortBracketing(sent,d): break
                self.inp.unread(d)
                continue                 # if no match for lookahead, put back

            elif d != END:
                self.inp.unread(d)

        # final check: is sentence long enough?

        cx = self.inp.peek()
        if cx == None: cx = u'!!'
        if nAN > 1 and not inBrkt:       # need at least two alphanumerics, outside brackets
            break

    if len(sent) > 0 or self.last != END:
        return sent
    else:
        return None
def _matchAN ( self , ts ):

    """
    recognize an alphanumeric date at the start of a char list

    Three forms are tried in this order: month first (month day [, year]
    or month year), day first (day [of] month [, year]), and a year alone.

    arguments:
        self  -
        ts    - text stream as list of chars

    returns:
        total number of chars matched, 0 when nothing is recognized
    """

    rest = ts
    total = len(ts)
    sawComma = False

    # ---- month first ----

    n = self._aMonth(rest)
    if n > 0:
        if n == total or not ellyChar.isWhiteSpace(rest[n]):
            return 0
        n += 1                             # skip space after month
        if n == total:
            return 0
        rest = rest[n:]
        n = self._aDay(rest)               # look for day of month
        if n == 0:
            self._dy = [ ]
            n = self._aYear(rest)          # look for year immediately following
            return total - len(rest) + n if n > 0 else 0
        total = len(rest)                  # _aDay may have rewritten alphabetic day
        rest = rest[n:]
        if not rest:
            return len(ts) - total + n
        if rest[0] == u',':                # look for comma after day
            rest = rest[1:]                # if found, remove and note
            sawComma = True
        # NOTE(review): the two returns below give the length left after
        # the month, not a count measured from the start of ts - confirm
        if not rest:
            return total
        if ellyChar.isWhiteSpace(rest[0]):
            rest = rest[1:]
        if not rest:
            return total
        n = self._aYear(rest)              # look for year
        if sawComma and n < len(rest) and rest[n] == ',':
            n += 1                         # remove comma after year if paired
        return len(ts) - len(rest) + n

    # ---- day first ----

    n = self._aDay(rest)
    if n == 0:
        self._dy = [ ]
    elif 0 < n < total:                    # cannot be just bare number by itself
        total = len(ts)                    # _aDay may have rewritten alphabetic day
        rest = rest[n:]
        hasOf = (n > 2 and len(rest) > 2 and rest[0] == u' '
                 and rest[1].upper() == 'O' and rest[2].upper() == 'F')
        if hasOf:
            rest = rest[3:]                # to handle day reference like '4th of'
        if not rest:
            return 0
        if not ellyChar.isWhiteSpace(rest[0]):
            return 0
        rest = rest[1:]
        n = self._aMonth(rest)             # look for month
        if n == 0:
            return 0
        rest = rest[n:]
        if not rest:
            return total
        upto = total - len(rest)           # chars up to end of month
        skipped = 0                        # separators dropped before year
        if rest[0] == u',':                # look for comma after month
            rest = rest[1:]
            if not rest:
                return total
            skipped += 1
            sawComma = True
        if ellyChar.isWhiteSpace(rest[0]):
            rest = rest[1:]
            if not rest:
                return total
            skipped += 1
        n = self._aYear(rest)              # look for year
        if n > 0:
            if sawComma and n < len(rest) and rest[n] == ',':
                n += 1
            return upto + n + skipped      # full date found
        # NOTE(review): subtracting the skipped separators gives less than
        # the day-and-month length - confirm this is what callers expect
        return upto - skipped              # only month and day of date found

    # ---- year only ----

    n = self._aYear(rest)
    if n > 0:
        if n == total:
            return n
        elif not ellyChar.isLetter(rest[n]) and rest[n] != '-':
            return n
    return 0                               # nothing found
def _matchAN(self, ts):
    """
    recognize an alphanumeric date at the start of a char list

    Three forms are tried in this order: month first (month day [, year]
    or month year), day first (day [of] month [, year]), and a year alone.

    arguments:
        self  -
        ts    - text stream as list of chars

    returns:
        total number of chars matched, 0 when nothing is recognized
    """

    rest = ts
    total = len(ts)
    sawComma = False

    # ---- month first ----

    n = self._aMonth(rest)
    if n > 0:
        if n == total or not ellyChar.isWhiteSpace(rest[n]):
            return 0
        n += 1                             # skip space after month
        if n == total:
            return 0
        rest = rest[n:]
        n = self._aDay(rest)               # look for day of month
        if n == 0:
            self._dy = []
            n = self._aYear(rest)          # look for year immediately following
            return total - len(rest) + n if n > 0 else 0
        total = len(rest)                  # _aDay may have rewritten alphabetic day
        rest = rest[n:]
        if not rest:
            return len(ts) - total + n
        if rest[0] == u',':                # look for comma after day
            rest = rest[1:]                # if found, remove and note
            sawComma = True
        # NOTE(review): the two returns below give the length left after
        # the month, not a count measured from the start of ts - confirm
        if not rest:
            return total
        if ellyChar.isWhiteSpace(rest[0]):
            rest = rest[1:]
        if not rest:
            return total
        n = self._aYear(rest)              # look for year
        if sawComma and n < len(rest) and rest[n] == ',':
            n += 1                         # remove comma after year if paired
        return len(ts) - len(rest) + n

    # ---- day first ----

    n = self._aDay(rest)
    if n == 0:
        self._dy = []
    elif 0 < n < total:                    # cannot be just bare number by itself
        total = len(ts)                    # _aDay may have rewritten alphabetic day
        rest = rest[n:]
        hasOf = (n > 2 and len(rest) > 2 and rest[0] == u' '
                 and rest[1].upper() == 'O' and rest[2].upper() == 'F')
        if hasOf:
            rest = rest[3:]                # to handle day reference like '4th of'
        if not rest:
            return 0
        if not ellyChar.isWhiteSpace(rest[0]):
            return 0
        rest = rest[1:]
        n = self._aMonth(rest)             # look for month
        if n == 0:
            return 0
        rest = rest[n:]
        if not rest:
            return total
        upto = total - len(rest)           # chars up to end of month
        skipped = 0                        # separators dropped before year
        if rest[0] == u',':                # look for comma after month
            rest = rest[1:]
            if not rest:
                return total
            skipped += 1
            sawComma = True
        if ellyChar.isWhiteSpace(rest[0]):
            rest = rest[1:]
            if not rest:
                return total
            skipped += 1
        n = self._aYear(rest)              # look for year
        if n > 0:
            if sawComma and n < len(rest) and rest[n] == ',':
                n += 1
            return upto + n + skipped      # full date found
        # NOTE(review): subtracting the skipped separators gives less than
        # the day-and-month length - confirm this is what callers expect
        return upto - skipped              # only month and day of date found

    # ---- year only ----

    n = self._aYear(rest)
    if n > 0:
        if n == total:
            return n
        elif not ellyChar.isLetter(rest[n]) and rest[n] != '-':
            return n
    return 0                               # nothing found
def _limit ( buffr , hstry ):

    """
    get length of next possible name component in buffer

    A component may be introduced by a comma (accepted only when something
    has been matched already and at least 4 chars are available) and may be
    enclosed in parentheses and/or double quotes. An enclosed component
    counts only if it is properly closed or ends in a period.

    arguments:
        buffr - list of chars
        hstry - how much matched already

    returns:
        number of chars in continuation of last component, 0 for no next component
    """

    lnb = len(buffr)
    if lnb == 0: return 0

    bix  = 0
    quot = False   # indicate component starting with "
    parn = False   #                             with (
    cmma = False   #                             with ,

    if buffr[0] == ',':               # handle possible leading comma
        if hstry == 0 or lnb < 4:
            return 0
        bix += 1
        if ellyChar.isWhiteSpace(buffr[1]): bix += 1
        cmma = True

    # bix is 0 here, or at most 2 with lnb >= 4, so this index is in range
    if buffr[bix] == '(':             # handle short name in parentheses
        bix += 1
        parn = True

    # a buffer holding only '(' leaves bix == lnb, so the index must be
    # checked before looking for a quote (used to raise IndexError)
    if bix < lnb and buffr[bix] == '"':   # handle short name in double quotes
        bix += 1
        quot = True

    if parn or quot:
        while bix < lnb:              # collect letters for name
            chx = buffr[bix]
            if ellyChar.isWhiteSpace(chx):
                break
            elif not quot and parn and chx == ')':
                return bix + 1        # add trailing parenthesis
            elif quot and chx == '"':
                if bix + 1 < lnb and parn and buffr[bix+1] == ')':
                    return bix + 2    # add trailing quote and parenthesis
                elif not parn:
                    return bix + 1    # add trailing quote only
                else:
                    return 0          # no match
            elif chx == '.':
                return bix + 1        # add trailing period
            elif not ellyChar.isLetter(chx):
                break                 # unrecognizable char for name
            bix += 1
        return 0                      # enclosure was never closed
    else:
        while bix < lnb:
            chx = buffr[bix]          # collect letters for name
            if chx == "'":
                # stop at an apostrophe followed by space, or by an 's'
                # that is not itself followed by a letter
                if bix + 2 < lnb:
                    chn = buffr[bix+1]
                    if ellyChar.isWhiteSpace(chn): break
                    if chn == 's' and not ellyChar.isLetter(buffr[bix+2]): break
            elif not ellyChar.isLetter(chx):
                if chx == '.': bix += 1   # a period is kept as part of component
                break
            bix += 1

    if bix == lnb:
        return bix                    # running out of chars means match
    else:
        # getting here means that more text follows limit
        # and so we may have to pick up extra chars here

        chx = buffr[bix]
        if ellyChar.isWhiteSpace(chx) or chx == "'":
            return bix                # component can be terminated by space or (')
        elif chx == ',':
            if cmma:
                return bix + 1        # or comma when sequence starts with comma
            else:
                return bix            # when there is no starting comma
        elif ellyChar.isLetter(chx):
            return bix                # or letter, implying previous char was '.'
        else:
            return 0                  # failure to find name limit
def read ( self ):

    """
    get next char from input stream with filtering

    Unrecognizable chars become NBSP, an isolated hyphen becomes DASH,
    CR is dropped, runs of white space collapse to a single SP, and NL is
    passed through only in the cases handled below.

    arguments:
        self

    returns:
        single Unicode char on success, END when no more chars are available
    """

    while True:
        if not self._reload():          # check if buffer empty and reload if needed
            return END                  # return EOF if no more chars available

        ch = self.buf.pop(0)            # next raw char in buffer
        if not ellyChar.isText(ch):     # unrecognizable Elly char?
            ch = NBSP                   # if so, replace with no-break space

        prev = self._lc                 # copy saved last char
        self._lc = ch                   # set new last char

        if ch == HYPH:                  # special treatment for isolated hyphens
            if spc(prev) and spc(self.peek()):
                ch = DASH
            return ch

        if not ellyChar.isWhiteSpace(ch):
            return ch

        if ch == CR:                    # always ignore
            continue

        if ch == NL:                    # special handling of \n
            nxt = self.peek()           # look at next char
            while nxt == CR:
                self.buf.pop(0)         # skip over CR's
                nxt = self.peek()
            if prev != NL and nxt == NL:
                self.buf.pop(0)         # special case when NL can be returned
                return ch
            if nxt == NL:               # NL followed NL?
                while nxt == NL or nxt == CR:
                    self.buf.pop(0)     # ignore subsequent new line chars
                    nxt = self.peek()
            elif nxt == END or ellyChar.isWhiteSpace(nxt):
                continue                # NL followed by space is ignored
            elif not (nxt == u'.' or nxt == u'-'):
                ch = SP                 # convert NL to SP if not before another NL
        else:
            ch = SP                     # otherwise, convert white space to plain space

        if not ellyChar.isWhiteSpace(prev):  # preceding char was not white space?
            return ch                        # if so, keep space in stream
def getNext ( self ):

    """
    extract next sentence for Elly translation from input stream

    Chars are read from self.inp one at a time and collected until stop
    punctuation, a new line, or end of input ends the sentence. Quotation
    marks are normalized only for the bracketing check; the chars stored
    in the sentence are the original ones. A sentence consisting of a lone
    Unicode ellipsis is returned as the chars of "-.-".

    arguments:
        self

    returns:
        list of chars for next sentence on success, None on empty stream
    """

    self.resetBracketing()
    inBrkt = False                   # result of latest bracketing check

    nspc = 0                         # set space count

    sent = [ ]                       # list buffer to fill

    x = self.inp.read()
    if x == SP:                      # skip a single leading space
        x = self.inp.read()

    if x == END:                     # EOF check
        return None

    c  = END                         # reset
    lc = END
    self.inp.unread(x,SP)            # put first char back to restore input

    # fill sentence buffer up to next stop punctuation in input

    nAN = 0                          # alphanumeric count in sentence

    while True:

        x = self.inp.read()          # next input char

        if x == END:                 # handle any EOF
            break

        # check for table delimiters in text

        if len(sent) == 0:
            if x == u'.' or x == u'-':      # look for multiple '.' or '-'
                while True:                 # scan up to end of current buffering
                    y = self.inp.read()     #
                    if y != x and y != SP:  # no more delimiter chars or spaces?
                        self.inp.unread(y)  # if so, done
                        break               #
                continue                    # ignore everything seen so far

        ####################################################
        # accumulate chars and count alphanumeric and spaces
        ####################################################

        lc = c                       # previous char
        c  = x                       # current char as read
        nc = self.inp.peek()         # next char, without consuming it
        if ellyChar.isWhiteSpace(nc): nc = SP

        if lc == SP or lc == END: # normalize chars for proper bracketing
            if x == SQuo:         #
                x = LSQm          # a SQuo preceded by a space becomes LSQm
            elif x == DQuo:       #
                x = LDQm          # a DQuo preceded by a space becomes LDQm
        if nc == SP or nc == END: #
            if x == SQuo:         # a SQuo followed by a space becomes RSQm
                x = RSQm          #
            elif x == DQuo:       # a DQuo followed by a space becomes RDQm
                x = RDQm          #
        elif not ellyChar.isLetterOrDigit(nc):
            if x == SQuo:         # a SQuo followed by nonalphanumeric becomes RSQm
                x = RSQm          #
            elif x == DQuo:       # a DQuo followed by nonalphanumeric becomes RDQm
                x = RDQm          #
        elif ellyChar.isWhiteSpace(c) and inBrkt:
            nspc += 1             # count spaces seen while inside bracketing

        svBrkt = inBrkt
        inBrkt = self.checkBracketing(x)    # do bracketing check with modified chars
        if svBrkt and not inBrkt: nspc = 0  # bracketing just ended: restart count

        sent.append(c)                      # put original char into sentence buffer

        if ellyChar.isLetterOrDigit(c):
            nAN += 1
            continue                        # if alphanumeric, just add to sentence

        if c == SP:
            continue                        # if space, just add to sentence

        # NL will break a sentence

        if c == NL:
            sent.pop()                      # remove from sentence chars
            break

        # char was not alphanumeric or space
        # look for stop punctuation exception

        cx = self.inp.preview()  # for context of match call

        if c in Stops and len(cx) > 0 and cx[0] == SP:
            if self.stpx.match(sent[:-1],c,cx):
                if self.drop:
                    sent.pop()   # remove punctuation char from sentence
                    lc = SP
                continue

        # handle any nonstandard punctuation

        exoticPunctuation.normalize(c,self.inp)

        # check for dash

        if c == u'-':
            d = self.inp.read()
            if d == u'-':                   # run of hyphens is kept as just two
                while True:
                    d = self.inp.read()
                    if d != u'-': break
                sent.append(c)
            self.inp.unread(d)
            continue

        # check for sentence break on punctuation

        if c in QUOs or c in RBs:

            # special check for single or double quotes or
            # bracketing, which can immediately follow stop
            # punctuation for current sentence

            if not inBrkt:
                z = self.inp.read()
                if self.shortBracketing(sent,z):
                    break
                self.inp.unread(z)
                if z == END or ellyChar.isWhiteSpace(z) and lc in Stops:
                    if nAN > 1:
                        break
            elif c in QUOs and lc in Stops:
                # quote right after a stop while bracketing is still open
                z = self.inp.read()
                if z in RBs:
                    sent.append(z)
                    y = self.inp.read()
                    if y in Stops:
                        sent.append(y)
                    elif not ellyChar.isWhiteSpace(y):
                        self.inp.unread(y)
                    inBrkt = False
                    break
                elif z in QUOs:
                    sent.append(z)
                    inBrkt = False
                    break
                self.inp.unread(z)
            continue

        elif not c in Stops:
            continue

        else:
            d = self.inp.read()             # char after the stop punctuation

            if d == None: d = u'!'

            # ellipsis check

            if c == u'.' and c == d:
                if self.inp.peek() != c: # look for third '.' in ellipsis
                    self.inp.unread(d)   # if none, keep only first '.'
                else:
                    self.inp.skip()      # found ellipsis
                    sent.append(d)       # complete it in sentence buffer
                    sent.append(d)       #
                    x = self.inp.peek()  # look at char after ellipsis
                    if ellyChar.isCombining(x):
                        sent.append(SP)  # if part of token, put in space as separator
                continue

            if c == ELLP:
                # NOTE(review): d is pushed back here and control then
                # continues to the checks below, which may push d back a
                # second time - confirm this is intended
                if ellyChar.isUpperCaseLetter(d):
                    self.inp.unread(d)   # super special case of bad punctuation
                    self.inp.unread(' ') # put in implied period and space
                    self.inp.unread('.') #

            # special check for multiple stops

            if d in Stops:
                while True:
                    d = self.inp.read()
                    if not d in Stops: break
                self.inp.unread(d)
                if not ellyChar.isWhiteSpace(d):
                    d = SP               # make rightside context for stop

            # special check for blank or null after stops

            elif d != END and not ellyChar.isWhiteSpace(d):
                if self.shortBracketing(sent,d): break
                if d in self._cl and self._cl[d] == 1:
                    dn = self.inp.peek()
                    if ellyChar.isWhiteSpace(dn):
                        sent.append(d)
                        break
                self.inp.unread(d)
                continue                 # if no match for lookahead, put back

            elif d != END:
                self.inp.unread(d)

            # check special case of number ending in decimal point

            if c == '.':
                ixb = len(sent) - 2      # index of char just before the '.'
                ixn = ixb + 1
                cxn = ''
                while ixn > 0:           # scan back over digits before the '.'
                    ixn -= 1
                    cxn = sent[ixn]
                    if not ellyChar.isDigit(cxn): break
                if ixn < ixb and cxn in [ ' ' , '-' , '+' ]:
                    prvw = self.inp.preview()
                    if len(prvw) > 1 and not ellyChar.isUpperCaseLetter(prvw[1]):
                        continue

        # final check: is sentence long enough?

        if inBrkt:
            # still inside bracketing: keep going after ':' or ';' or
            # while fewer than 3 spaces have been counted
            if c in [ ':' , ';' ] or nspc < 3:
                sent.append(d)
                self.inp.skip()
                nspc -= 1
                continue

        cx = self.inp.peek()
        if cx == None: cx = u'!!'
        if nAN > 1:                      # need at least two alphanumerics
            break

    if sent == [ u'\u2026' ]:   # special case of sentence
        return list("-.-")      # with lone ellipsis
    elif len(sent) > 0 or self.last != END:
        return sent
    else:
        return None
def read ( self ): """ get next char from input stream with filtering arguments: self returns: single Unicode char on success, null string otherwise """ # print 'reading: buf=' , self.buf while True: if not self._reload(): # check if buffer empty and reload if needed return END # return EOF if no more chars available # print 'buf=' , self.buf c = self.buf.pop(0) # next raw char in buffer if c == SHYP: # ignore soft hyphen if len(self.buf) > 0: if self.buf[0] == SP: c = self.buf.pop(0) continue if not ellyChar.isText(c): # unrecognizable Elly char? # print 'c=' , '{0:04x}'.format(ord(c)) if ellyChar.isCJK(c): c = '_' # special handling for Chinese else: # print 'replace' , c , 'with NBSP' c = NBSP # by default, replace with no-break space lc = self._lc # copy saved last char # print 'lc=' , ord(lc) self._lc = c # set new last char # if c == "'": # print 'apostrophe' , self.buf # print 'c=' , '<' + c + '>' if c == HYPH: # special treatment for isolated hyphens if spc(lc) and spc(self.peek()): c = DASH break elif c == '.': # check for ellipsis bb = self.buf bl = len(bb) # print 'bl=' , bl , 'bb=' , bb if bl >= 2 and bb[0] == '.' and bb[1] == '.': self.buf = bb[2:] c = ELLP elif bl >= 4 and bb[0] == ' ' and bb[1] == '.' and bb[2] == ' ' and bb[3] == '.': self.buf = bb[4:] c = ELLP break elif c == RSQm: # check for single quote # print 'at single quote' nc = self.peek() # look at next char # print 'next=' , nc if nc == RSQm: # doubling of single quote? 
self.buf.pop(0) # if so, combine two single quotes c = RDQm # into one double quote elif not ellyChar.isWhiteSpace(c): if ellyChar.isWhiteSpace(lc): self._cap = ellyChar.isUpperCaseLetter(c) break elif c == CR: # always ignore continue elif c == NL: # special handling of \n # print 'got NL' nc = self.peek() # look at next char while nc == CR: self.buf.pop(0) # skip over CR's nc = self.peek() # print "lc= '" + lc + "'" if lc != NL and nc == NL: self.buf.pop(0) # special case when NL can be returned break if nc == NL: # NL followed NL? while nc == NL or nc == CR: self.buf.pop(0) # ignore subsequent new line chars nc = self.peek() elif nc == END or ellyChar.isWhiteSpace(nc): continue # NL followed by space is ignored elif nc == u'.' or nc == u'-': pass else: # print 'NL to SP, lc=' , ord(lc) c = SP # convert NL to SP if not before another NL else: # print 'lc=' , ord(lc) , 'c=' , ord(c) c = SP # otherwise, convert white space to plain space self._cap = False if not ellyChar.isWhiteSpace(lc): # preceding char was not white space? # print 'return SP' break # if so, keep space in stream return c # next filtered char
def _reload(self):
    """
    make sure the input line buffer has something to read

    A buffer holding two or more chars, or one non-space char, is left alone.
    A buffer holding a single white space char is emptied and refilled from
    the next input line(s), with that space put back in front of the new
    text. Leading white space of each line read is stripped and its length
    saved as the indentation level.

    arguments:
        self

    returns:
        True on success if buffer has at least one char, False otherwise
    """
    pending = len(self.buf)
    if pending > 1:
        return True                      # plenty left, no refilling needed

    held = ''                            # lone trailing space to carry over
    if pending == 1:
        only = self.buf[0]
        if not ellyChar.isWhiteSpace(only):
            return True                  # one real char left, no refilling yet
        held = only
        self.buf = []                    # refill to get chars after that space

    if self._eof:
        return False                     # EOF was already seen earlier

    while len(self.buf) == 0:
        try:
            if self._prmpt:
                sys.stdout.write('>> ')
            line = self.inp.readline()   # next raw UTF-8 line
            if len(line) == 0:
                self._eof = True         # remember EOF for later calls
                return False             # any held space is dropped
            line = line.decode('utf8')   # convert UTF8 to Unicode string
        except IOError:
            print >> sys.stderr, '** char stream ERROR'
            return False

        indent = 0                       # count leading white space chars
        for ch in line:
            if ch == NL or not ellyChar.isWhiteSpace(ch):
                break                    # stop at end of line or first text char
            indent += 1

        self._in = indent                # save indentation level
        self.buf = list(line[indent:])   # unindented text goes into buffer
        if indent > 0 and ellyConfiguration.noteIndentation:
            self.buf.insert(0, NL)       # noted indentation breaks a sentence

    # loop can only end with usable input in the buffer
    if held != '':
        self.buf.insert(0, held)         # restore saved space char
    return True
def build(name, stb, defn):
    """
    create an Elly vocabulary database from text file input

    Each definition line read from defn has the form
        TERM : SYNTAX [SEMANTIC-FEATURES PLAUSIBILITY[/CONCEPT]] [TRANSLATION]
    and is turned into one (key, record) row of table Vocab in a new
    SQLite database file named name + vocabulary. Lines with format errors
    are reported on stderr and skipped; the database is committed only when
    no error was counted.

    arguments:
        name  - for new SQLite database
        stb   - Elly symbol table
        defn  - Elly definition reader for vocabulary

    exceptions:
        TableFailure on error

    side effects:
        resets and updates module global nerr; deletes any existing database
        file of the same name; exits the process on an SQLite insert failure
    """
    global nerr
    nerr = 0

    cdb = None  # SQLite db connection
    cur = None  # SQLite db cursor

    if stb is None:
        print('no symbol table', file=sys.stderr)
        raise ellyException.TableFailure

    try:
        zfs = FSpec(stb, '[$]', True).positive.hexadecimal(False)
    except ellyException.FormatFailure:  # should never need this
        print('unexpected failure with zero features', file=sys.stderr)
        raise ellyException.TableFailure
    # zfs = hexadecimal for all features off

    tsave = ''  # original term
    dsave = ''  # definition

    try:
        filn = name + vocabulary  # where to put vocabulary database
        try:
            os.remove(filn)  # delete the file if it exists
        except OSError:
            print('no', filn, file=sys.stderr)  # if no such file, warn but proceed

        #### SQLite DB operations
        ####
        try:
            cdb = dbs.connect(filn)  # create new database
            cur = cdb.cursor()
            cur.execute("CREATE TABLE Vocab(Keyx TEXT, Defn TEXT)")
            cdb.commit()
        except dbs.Error as e:
            print(e, file=sys.stderr)
            raise ellyException.TableFailure  # give up on any database failure
        #
        ####

        r = None  # for error reporting

        while True:  # process vocabulary definition records

            try:  # for catching FormatFailure exception
                r = defn.readline()  # next definition
                if len(r) == 0:
                    break  # stop on EOF

                r = definitionLine.normalize(r)

                k = r.find(' : ')  # look for first ' : '
                if k < 0:
                    tsave = r
                    dsave = None
                    _err()  # report error and quit entry

                t = r[:k].strip()      # term to go into dictionary
                d = r[k + 2:].strip()  # its definition
                tsave = t              # save for any error reporting
                dsave = d

                if len(t) == 0 or len(d) == 0:
                    _err()  # quit on missing parts

                if ellyConfiguration.language == 'ZH':  # special key for Chinese
                    wky = toKeyZH(t[0])
                else:
                    c = t[0]
                    if not ellyChar.isLetterOrDigit(c) and c not in initChr:
                        _err('bad term')

                    n = delimitKey(t)  # get part of term to index
                    if n <= 0:
                        _err()  # quit on bad term
                    wky = toKey(t[:n])  # key part of term to define

                ns = syntaxSpecification.scan(d)  # find extent of syntax info
                if ns <= 0:
                    _err('bad syntax specification')
                if not d[ns:] == '' and d[ns] != ' ':
                    _err('trailing chars in syntax specification')

                syn = d[:ns]        # syntax info as string
                d = d[ns:].strip()  # rest of definition

                try:
                    ss = SSpec(stb, syn)  # decode syntax info
                except ellyException.FormatFailure:
                    _err('malformed syntax specification')
                cat = str(ss.catg)       # syntax category
                cid = _smfchk[ss.catg]   # associated semantic feature ID
                syf = ss.synf.positive.hexadecimal(False)  # syntactic flags

                smf = zfs  # initialize defaults for
                pb = '0'   # cognitive semantics
                cn = conceptualHierarchy.NOname

                if len(d) > 1:  # check for cognitive semantics
                    x = d[0]
                    if x == '[' or x == '0' or x == '-':  # semantic features?
                        if x != '[':  # a '0' or '-' means to take default
                            if len(d) == 1 or d[1] != ' ':
                                _err('missing semantic features')
                            d = d[2:].strip()  # skip over
                        else:
                            ns = featureSpecification.scan(d)  # look for ']' of features
                            if ns < 0:
                                _err()
                            sem = d[:ns]        # get semantic features
                            d = d[ns:].strip()  # skip over for subsequent processing
                            sid = sem[1]        # feature ID
                            if sid != cid:
                                # first ID seen for a category is recorded;
                                # any later different ID is an error
                                if cid is not None:
                                    _err('inconsistent semantic feature id')
                                _smfchk[ss.catg] = sid
                            try:
                                fs = FSpec(stb, sem, True)
                            except ellyException.FormatFailure:
                                _err('bad semantic features')
                            smf = fs.positive.hexadecimal(False)  # convert to hex

                        ld = len(d)
                        if ld == 0:
                            _err('missing plausibility')
                        np = 0
                        x = d[np]
                        if x == '+' or x == '-':
                            np += 1  # take any plus or minus sign
                        while np < ld:  # and successive digits
                            if ellyChar.isDigit(d[np]):
                                np += 1
                            else:
                                break
                        if np == 0:
                            _err('missing plausibility')
                        pb = d[:np]  # plausibility bias
                        d = d[np:]
                        ld = len(d)
                        if ld > 1:  # any more to process?
                            c = d[0]   # get next char after bias
                            d = d[1:]  # advance scan
                            ld -= 1
                            if c == '/':  # check for explicit concept
                                np = 0
                                while np < ld:  # get extent of concept
                                    if ellyChar.isWhiteSpace(d[np]):
                                        break
                                    np += 1
                                if np == 0:
                                    _err('missing concept for plausibility')
                                cn = d[:np]  # extract concept
                                d = d[np:]
                            elif c != ' ':
                                _err()  # signal bad format
                        elif ld > 0:
                            _err()  # unidentifiable trailing text
                    elif d[0] != '(':
                        # anything else must look like WORD=... ; skip the
                        # word part and require '=' right after it
                        dd = d
                        # FIX: stop at end of string instead of indexing past
                        # it, so an all-alphanumeric remainder is reported as
                        # a format error rather than raising IndexError
                        while len(dd) > 0 and ellyChar.isLetterOrDigit(dd[0]):
                            dd = dd[1:]
                        if len(dd) == 0 or dd[0] != '=':
                            _err()

                d = d.strip()  # rest of definition
                if len(d) > 0 and d[-1] == '=':
                    if len(d) == 1 or d[0] != '=':
                        _err('incomplete definition')

                ld = []  # for normalizing definition

                k = 0    # count spaces removed
                sd = ''  # previous char seen
                for cd in d:  # scan all chars in translation
                    if cd == ' ':
                        if sd == '=' or sd == ',' or sd == ' ':
                            k += 1
                            sd = cd
                            continue
                    elif cd == '=' or cd == ',':  # no spaces before '=' or ','
                        if sd == ' ':
                            k += 1
                            ld.pop()
                        if cd == ',':
                            if sd == '=':
                                _err('missing translation')
                            cd = '#'  # format for PICK operation
                        elif cd == '=' and sd == '=':
                            print('** WARNING \'=\' followed by \'=\'', file=sys.stderr)
                            print('* at [', tsave, ']', file=sys.stderr)

                    sd = cd
                    ld.append(cd)  # add char to reformatted definition

                if k > 0:
                    d = ''.join(ld)  # definition with spaces removed

                vrc = [t, '=:', cat, syf, smf, pb, cn]  # start data record
                vss = ' '.join(vrc)  # convert to string
                vss += ' ' + d       # fill out record with rest of input

            except ellyException.FormatFailure:  # will catch exceptions from _err()
                print('* at [', tsave, end=' ', file=sys.stderr)
                if dsave is not None:
                    print(':', dsave, end=' ', file=sys.stderr)
                print(']', file=sys.stderr)
                continue  # skip rest of processing this rule

            #### SQLite DB operation
            ####
            try:
                sql = "INSERT INTO Vocab VALUES(?,?)"
                cur.execute(sql, (wky, vss))
            except dbs.Error as e:
                print('FATAL', e, file=sys.stderr)
                sys.exit(1)
            #
            ####

        #### SQLite DB operations
        ####
        if nerr == 0:
            cdb.commit()  # keep inserted records only on an error-free run
        cdb.close()       # clean up
        #
        ####

    # NOTE(review): Error is not defined in this block; confirm the module
    # provides it, otherwise this handler itself raises NameError
    except Error as e:  # catch any other errors
        print('**', e, file=sys.stderr)
        print('* at', r, file=sys.stderr)
        nerr += 1

    if nerr > 0:
        print('**', nerr, 'vocabulary table errors in all', file=sys.stderr)
        print('* compilation FAILed', file=sys.stderr)
        if cdb is not None:  # FIX: failure may have happened before connecting
            cdb.close()      # discard any changes
        raise ellyException.TableFailure
def scan ( buffr ):

    """
    recognize personal names in text at current position

    walks buffr component by component, using _limit() to size the next
    candidate and _extract() to get it as a string, and classifies each one
    through the saved name table (_table), the local context list (_cntxt),
    or infer(); the accumulated component types are then checked against
    the constraints at the end of this function

    arguments:
        buffr - current contents as list of chars

    returns:
        char count matched on success, 0 otherwise

    side effects:
        uses module globals _typ and _nch as scratch variables shared with
        the nested doLook(); may clear _toscan; may append to _cntxt; calls
        _planAhead(buffr) when nothing at all matched
    """

    def doLook ( mth , itm ):
        """
        do lookup with specified method using global variables in Python 2.7.*

        sets global _typ to the lookup result; when there is no match and itm
        is longer than 3 chars and ends in '.', retries without the '.' and,
        on success, shortens global _nch by one

        arguments:
            mth  - name table method
            itm  - string to look up
        """
        global _typ , _nch    # really need nonlocal
        _typ = mth(itm)
        if _typ < 0 and len(itm) > 3:   # if no match, check for final '.'
            if itm[-1] == '.':
                _typ = mth(itm[:-1])
                if _typ >= 0:
                    _nch -= 1           # match without '.'

    global _typ , _nch
    global _toscan

    bln = len(buffr)
    if _table == None or bln < 2: return 0

    # _toscan is set by _planAhead(); while the buffer is still longer than
    # that mark, no scan is attempted
    if _toscan > 0:
        if bln > _toscan:
            return 0
        else:
            _toscan = 0

    chx = buffr[0]
    if not ellyChar.isLetter(chx) and chx != '(' and chx != '"': return 0

    cmps = [ ]                         # name components this time
    ncmp = 0                           # number of components for current name
    ninf = 0                           # number inferred

    ntyp = len(nameTable.TYP)
    stat = [False]*ntyp                # define state for getting personal name

    mlen = 0                           # last match length
    bix = 0                            # buffer index to advance in scanning

    _typ = -1

    while bix < bln:

        ltyp = -1                      # last match type

        _nch = _limit(buffr[bix:],mlen)  # length of next possible name component
        if _nch == 0: return 0
        elm = _extract(buffr[bix:],_nch)       # get possible component as string
        sch = buffr[bix]
        enclosed = (sch == '(' or sch == '"')  # type of next element

        doLook(_table.lookUp,elm)      # look it up in saved name table
        if _typ < 0:
            if _typ == nameTable.REJ:
                return 0               # immediate rejection of any match
            if _typ == nameTable.STP:
                break                  # stop any more matching
            if elm[-1] == '.':         # drop any trailing '.'
                elm = elm[:-1]
                if not enclosed: _nch -= 1
            if enclosed:               # enclosed element assumed to be name
                if not elm in _cntxt:
                    _cntxt.append(elm) # make sure always to save in local context
                    ninf += 1          # this is inferred!
            if elm in _cntxt:
                _typ = nameTable.XNM   # neutral name type to be noncommital

        if _typ < 0:
            tok = buffr[bix:bix + _nch]  # unknown token to check
            if infer(tok):
                _typ = nameTable.XNM   # neutral name type inferred
                if not _table.checkPhonetic(tok):
                    ninf += 1          # count inferred component if no phonetic support

        if nameTable.starts(_typ) and bix > 0: # if component not at start of name,
            break                              # must stop name scan

        while _typ >= 0:               # continue as long as match is viable

            ncmp += 1                  # count up component
            cmps.append(elm)           # save component
            bix += _nch                # move ahead in scan
            if _typ > 0:
                if stat[_typ]:         # check for duplication of component type
                    if (ltyp >= 0 and ltyp != _typ): # allowed only if duplicate is consecutive
                        break
            mlen = bix                 # save index on actual match
            ltyp = _typ
            if nameTable.ends(_typ):   # if component marks end of name,
                break                  # must stop name scan
            stat[_typ] = True          # update match state

            if bix == bln: break
            if ellyChar.isWhiteSpace(buffr[bix]):
                bix += 1               # skip any space to start of next component
            _nch = _limit(buffr[bix:],mlen)  # length of next possible name component
            if _nch == 0: break
            elm = _extract(buffr[bix:],_nch) # get possible next component as string
            doLook(_table.lookUpMore,elm)    # look it up in saved name table

        if _typ < 0:                   # while-loop terminated without break
            if ltyp < 0 or mlen == 0: break
            bix = mlen                 # restart at end of last match
            if bix == bln: break
            if ellyChar.isWhiteSpace(buffr[bix]):
                bix += 1               # skip any space to start of next component
            continue

        break

#
#### additional constraints on acceptable personal name
#

    if (ltyp == nameTable.CNJ or ltyp == nameTable.REL): # a name cannot end with these types
        mlen -= _nch                   # have to drop them from any match
        if mlen == 0: return 0
        if ellyChar.isWhiteSpace(buffr[mlen-1]): mlen -= 1
        ncmp -= 1
        cmps.pop()

    if ncmp == 0:                      # nothing matched?
        _planAhead(buffr)              # check for possible problems in next scan
        return 0

    if ncmp == ninf:
        return 0                       # name cannot be purely inferred

    if ncmp == 1:                      # single-component name must be known or contextual
        if (not stat[nameTable.SNG] and not cmps[0] in _cntxt):
            return 0

    expl = (stat[nameTable.PNM] or     # name must have a substantial component
            stat[nameTable.SNM] or
            stat[nameTable.XNM] or
            stat[nameTable.SNG])

    if (not expl and
        not (stat[nameTable.TTL] and   # or it could have just a title
             stat[nameTable.INI])):    # and an initial
        return 0

#
####

    for cmpo in cmps:                  # if whole name is OK,
        if not cmpo in _cntxt:         # remember all components
            _cntxt.append(cmpo)        # not already listed in context

    return mlen                        # will be > 0 on successful match
def compile ( name , stb , defn ):

    """
    create an Elly vocabulary database from text file input

    each definition line read from defn has the form
        TERM : SYNTAX [SEMANTIC-FEATURES PLAUSIBILITY[/CONCEPT]] [TRANSLATION]
    and becomes one (key, record) row of table Vocab in a new SQLite
    database file named name + vocabulary; lines with format errors are
    reported on stderr and skipped

    arguments:
        name  - for new SQLite database
        stb   - Elly symbol table
        defn  - Elly definition reader for vocabulary

    exceptions:
        TableFailure on error

    side effects:
        resets and updates module global nerr; deletes any existing database
        file of the same name; exits the process on an SQLite insert failure
    """

    global nerr
    nerr = 0

    cdb = None  # SQLite db connection
    cur = None  # SQLite db cursor

    if stb is None :
        print >> sys.stderr, 'no symbol table'
        raise ellyException.TableFailure

    try:
        zfs = FSpec(stb,'[$]',True).positive.hexadecimal(False)
    except ellyException.FormatFailure: # should never need this
        print >> sys.stderr , 'unexpected failure with zero features'
        raise ellyException.TableFailure
    # zfs = hexadecimal for all features off

    tsave = ''                                # original term
    dsave = ''                                # definition

    try:
        filn = name + vocabulary              # where to put vocabulary database
        try:
            os.remove(filn)                   # delete the file if it exists
        except OSError:
            print >> sys.stderr , 'no' , filn # if no such file, warn but proceed

        #### SQLite
        ####
        try:
            cdb = dbs.connect(filn)           # create new database
            cur = cdb.cursor()
            cur.execute("CREATE TABLE Vocab(Keyx TEXT, Defn TEXT)")
            cdb.commit()
        except dbs.Error as e:
            print >> sys.stderr , e
            raise ellyException.TableFailure  # give up on any database failure
        #
        ####

        r = None                              # for error reporting

        while True:                           # process vocabulary definition records

            try:                              # for catching FormatFailure exception
                r = defn.readline()           # next definition
                if len(r) == 0: break         # stop on EOF

                k = r.find(':')               # look for first ':'
                if k < 0:
                    tsave = r
                    dsave = None
                    _err()                    # report error and quit entry

                t = r[:k].strip()             # term to go into dictionary
                d = r[k+1:].strip()           # its definition
                tsave = t                     # save for any error reporting
                dsave = d                     #

                if len(t) == 0 or len(d) == 0:
                    _err()                    # quit on missing parts
                c = t[0]
                if not ellyChar.isLetterOrDigit(c) and c != '.' and c != '"':
                    _err('bad term')

                n = delimitKey(t)             # get part of term to index
                if n <= 0:
                    _err()                    # quit on bad term
                wky = toKey(t[:n])            # key part of term to define

                ns = syntaxSpecification.scan(d) # find extent of syntax info
                if ns <= 0: _err('bad syntax specification')

                syn = d[:ns]                  # syntax info as string
                d = d[ns:].strip()            # rest of definition

                try:
                    ss = SSpec(stb,syn)       # decode syntax info
                except ellyException.FormatFailure:
                    _err('malformed syntax specification')
                cat = str(ss.catg)            # syntax category
                syf = ss.synf.positive.hexadecimal(False) # syntactic flags

                smf = zfs                     # initialize defaults for
                pb = '0'                      #   cognitive semantics
                cn = conceptualHierarchy.NOname

                if len(d) > 1:                # check for cognitive semantics
                    x = d[0]
                    if x == '[' or x == '0' or x == '-': # semantic features?
                        if x != '[':          # a '0' or '-' means to take default
                            if len(d) == 1 or d[1] != ' ':
                                _err('missing semantic features')
                            d = d[2:].strip() # skip over
                        else:
                            ns = featureSpecification.scan(d) # look for ']' of features
                            if ns < 0:
                                _err()
                            sem = d[:ns]       # get semantic features
                            d = d[ns:].strip() # skip over
                            try:
                                fs = FSpec(stb,sem,True)
                            except ellyException.FormatFailure:
                                _err('bad semantic features')
                            smf = fs.positive.hexadecimal(False) # convert to hex

                        ld = len(d)
                        if ld == 0:
                            _err('missing plausibility')
                        np = 0
                        x = d[np]
                        if x == '+' or x == '-':
                            np += 1           # take any plus or minus sign
                        while np < ld:        # and successive digits
                            if ellyChar.isDigit(d[np]): np += 1
                            else: break
                        if np == 0:
                            _err('missing plausibility')
                        pb = d[:np]           # plausibility bias
                        d = d[np:]
                        ld = len(d)
                        if ld > 1:            # any more to process?
                            c = d[0]          # get next char after bias
                            d = d[1:]         # advance scan
                            ld -= 1
                            if c == '/':      # check for explicit concept
                                np = 0
                                while np < ld: # get extent of concept
                                    if ellyChar.isWhiteSpace(d[np]): break
                                    np += 1
                                if np == 0:
                                    _err('missing concept for plausibility')
                                cn = d[:np]   # extract concept
                                d = d[np:]
                            elif c != ' ':
                                _err()        # signal bad format
                        elif ld > 0:
                            _err()            # unidentifiable trailing text

                d = d.strip()                 # rest of definition
                if len(d) > 0 and d[-1] == '=':
                    if len(d) == 1 or d[0] != '=':
                        _err('incomplete definition')

                ld = [ ]                      # for normalizing definition

                k = 0                         # count spaces removed
                sd = ''                       # previous char seen
                for cd in d:                  # scan all chars in translation
                    if cd == ' ':
                        if sd == '=' or sd == ',' or sd == ' ':
                            k += 1
                            sd = cd
                            continue
                    elif cd == '=' or cd == ',': # no spaces before '=' or ','
                        if sd == ' ':
                            k += 1
                            ld.pop()
                        if cd == ',':
                            if sd == '=':
                                _err('missing translation')
                            cd = '#'          # format for PICK operation
                        elif cd == '=' and sd == '=':
                            print >> sys.stderr , '** WARNING \'=\' followed by \'=\''
                            print >> sys.stderr , '*  at [' , tsave , ']'

                    sd = cd
                    ld.append(cd)             # add char to reformatted definition

                if k > 0:
                    d = ''.join(ld)           # definition with spaces removed

                vrc = [ t , ':' , cat , syf , smf , pb , cn ] # start data record
                vss = u' '.join(vrc)          # convert to string
                vss += u' ' + d               # fill out record with rest of input

            except ellyException.FormatFailure:
                print >> sys.stderr , '*  at [' , tsave ,
                if dsave is not None:
                    print >> sys.stderr , ':' , dsave ,
                print >> sys.stderr , ']'
                continue                      # skip rest of processing

            #### SQLite
            ####
            try:
                sql = "INSERT INTO Vocab VALUES(?,?)"
                cur.execute(sql,(wky,vss))
            except dbs.Error as e:
                print >> sys.stderr , 'FATAL' , e
                sys.exit(1)
            #
            ####

        # FIX: the outer try had no handler and the inserted rows were never
        # committed nor the connection closed; finish up the same way build()
        # does in this file
        #### SQLite
        ####
        if nerr == 0:
            cdb.commit()                      # keep records only on error-free run
        cdb.close()                           # clean up
        #
        ####

    except Exception as e:                    # catch any other errors
        print >> sys.stderr , '**' , e
        print >> sys.stderr , '*  at' , r
        nerr += 1

    if nerr > 0:
        print >> sys.stderr , '**' , nerr , 'vocabulary table errors in all'
        print >> sys.stderr , '*  compilation FAILed'
        if cdb is not None:                   # may have failed before connecting
            cdb.close()                       # discard any changes
        raise ellyException.TableFailure
def match ( patn , text , offs=0 , limt=None , nsps=0 ):

    """
    compare a pattern with wildcards to input text

    literal pattern chars are compared one by one against text (also accepting
    the lowercased text char); on a mismatch the pattern char is interpreted
    as a wildcard or marker, and if that fails too, matching backs up through
    the saved unwinding records until they are exhausted

    arguments:
        patn  - pattern to matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit for any matching (defaults to len(text))
        nsps  - number of spaces to match in pattern

    returns:
        bindings if match is successful, None otherwise; the bindings list
        has the text offset reached by the match in element 0, followed by
        [start, end] offset pairs for wildcard matches, with contiguous
        ones merged
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
            nsps   - saved count of spaces matched
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - of winding
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1
            self.nsps  = 0

        def __unicode__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self

            returns:
                attributes formatted as a single bracketed string
            """
            return ( '[kd=' + unicode(self.kind)  +
                     ',ct=' + unicode(self.count) +
                     ',pa=' + unicode(self.pats)  +
                     ',tx=' + unicode(self.txts)  +
                     ',bd=' + unicode(self.bnds)  +
                     ',ns=' + unicode(self.nsps)  + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # four private functions using local variables of match() defined just above
    # (they only read mbi, unj, mp, and offs; callers do the incrementing)
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame

        arguments:
            ns  - optional initial span of text for binding

        returns:
            binding frame, preset to the ns chars starting at offs - 1
        """
        os = offs - 1         # offs is already past the char just read
        if ns == None: ns = 1 # by default, binding is to 1 char
        if mbi == len(mbd):   # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]         # next available record
        bf[0] = os            # set binding to range of chars
        bf[1] = os + ns       #
        return bf

    def _modify ( ):
        """
        set special tag for binding

        appends None as a third element to the current binding frame; the
        consolidation at the end of match() keeps such frames separate

        arguments:
        """
        mbd[mbi].append(None)

    def _mark ( kind , nsp ):
        """
        set up for backing up pattern match

        arguments:
            kind  - 0=optional 1=* wildcard
            nsp   - number of spaces in pattern still unmatched

        returns:
            unwinding frame holding current pattern, text, and binding indices
        """
        if unj == len(unw): # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]       # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp
        uf.txts  = offs
        uf.bnds  = mbi
        uf.nsps  = nsp
        return uf

    def _span ( typw , nsp=0 ):
        """
        count chars available for wildcard match

        arguments:
            typw  - wildcard
            nsp   - spaces to be matched in pattern

        returns:
            non-negative count if any match possible, otherwise -1
        """
        k = minMatch(patn[mp:])   # calculate min char count to match rest of pattern

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findExtendedBreak(text,offs,nsp)
        mx -= k                   # max span reduced by exclusion
        if mx < 0: return -1      # cannot match if max span < 0

        tfn = Matching[typw]      # matchup function for wildcard type

        nm = 0
        for i in range(mx):
            c = text[offs+i]      # next char in text from offset
            if not tfn(c): break  # stop when it fails to match
            nm += 1

        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False  # successful pattern match yet?

    if limt == None: limt = len(text)

    mp = 0           # pattern index
    ml = len(patn)   # pattern match limit
    last = ''        # text char most recently read, '' at limit

    while True:

        ## literally match as many next chars as possible

        while mp < ml:
            if offs >= limt:
                last = ''
            elif limt == 0:
                break
            else:
                last = text[offs]
                offs += 1
            mc = patn[mp]
            if mc != last:
                if mc != last.lower():
                    # pattern hyphen may match ' - ' in text
                    if mc == Hyphn and last == ' ' and limt - offs > 2:
                        if text[offs] != Hyphn or text[offs+1] != ' ':
                            break
                        offs += 2
                    else:
                        break

            mp += 1

        ## check whether mismatch is due to special pattern char

        if mp >= ml:        # past end of pattern?
            matched = True  # if so, match is made
            break

        tc = patn[mp]       # otherwise, get unmatched pattern element
        mp += 1             #

        if tc == cALL:      # a * wildcard?
            if last != '': offs -= 1     # put back the char just read

            nm = _span(cALL,nsps)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1     # get new binding record
            bf[0] = offs                 # bind from current offset
            offs += nm                   # move offset past end of span
            bf[1] = offs                 # bind to   new     offset
            uf = _mark(1,nsps); unj += 1 # get new unwinding record
            uf.count = nm                # can back up this many times on mismatch
            continue

        elif tc == cEND: # end specification
            if last == '':
                continue
            elif last == '-':
                offs -= 1
                continue
            elif last in [ '.' , ',' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]
                if ellyChar.isWhiteSpace(txc) or txc in Trmls or txc in Grk:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' , ellyChar.HYPH ]:
                offs -= 1
                continue

        elif tc == cANY: # alphanumeric wildcard?
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cCAN: # nonalphanumeric wildcard?
            if last != ellyChar.AMP:
                if last == '' or not ellyChar.isLetterOrDigit(last):
                    _bind(); mbi += 1
                    continue

        elif tc == cDIG: # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF: # letter wildcard?
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cUPR: # uppercase letter wildcard?
            if last != '' and ellyChar.isUpperCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cLWR: # lowercase letter wildcard?
            if last != '' and ellyChar.isLowerCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC: # space wildcard?
            if last != '' and ellyChar.isWhiteSpace(last):
                nsps -= 1
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cAPO: # apostrophe wildcard?
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS: # start of optional sequence
            mf = _bind(0); mbi += 1      # dummy record to block
            mf[0] = -1                   #   later binding consolidation
            if last != '':
                offs -= 1                # try for rematch
            m = mp                       # find corresponding EOS
            while m < ml:                #
                if patn[m] == cEOS: break
                m += 1
            else:                        # no EOS?
                m -= 1                   # if so, pretend there is one anyway
            uf = _mark(0,nsps); unj += 1 # for unwinding on any later match failure
            uf.pats = m + 1              # i.e. one char past next EOS
            uf.txts = offs               # start of text before optional match
            continue

        elif tc == cEOS: # end of optional sequence
            if last != '':
                offs -= 1                # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL: # spanning wildcards
            if last != '':               # still more to match?
                offs -= 1
                nm = _span(tc,nsps)      # maximum match possible

                if nm == 0:                             # compensate for findExtendedBreak peculiarity
                    if offs + 1 < limt and mp < ml:     # with closing ] or ) to be matched in pattern
                        if patn[mp] in Enc:             # from text input
                            nm += 1

                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs      # bind from current offset
                    offs += nm        # move offset past end of span
                    bf[1] = offs      # bind to   new     offset
                    uf = _mark(1,nsps); unj += 1
                    uf.count = nm - 1 # at least one char must be matched
                    last = text[offs] if offs < limt else ''
                    continue

        elif tc == '':
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True        # successful match
                break

        ## match failure: rewind to last match branch
        ##

        while unj > 0:               # try unwinding, if possible
            uf = unw[unj-1]          # get most recent unwinding record
            if uf.count <= 0:        # if available count is used up,
                unj -= 1             # go to next unwinding record
                continue
            uf.count -= 1            # decrement available count
            uf.txts -= uf.kind       # back up one char for scanning text input
            mp = uf.pats             # unwind pattern pointer
            offs = uf.txts           # unwind text input
            mbi = uf.bnds            # restore binding
            mbd[mbi-1][1] -= uf.kind # reduce span of binding if for wildcard
            nsps = uf.nsps           # restore count of spaces to match
            break
        else:
            break                    # quit if unwinding is exhausted

    ##
    ## clean up on match mode or on no match possible
    ##

    if not matched: return None     # no bindings

    ## consolidate contiguous bindings for subsequent substitutions

    mbdo = mbd
    lb  = -1               # binding reference
    lbd = [ 0 , -1 ]       # sentinel value, not real binding
    mbd = [ lbd ]          # initialize with new offset after matching
    mbdo.pop(0)            # ignore empty binding
    while len(mbdo) > 0:   #
        bd = mbdo.pop(0)   # get next binding
        if len(bd) > 2:    # frame tagged by _modify(): strip tag, keep separate
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb = -1
        elif bd[0] < 0:    # check for optional match indicator here
            lb = -1        # if so, drop from new consolidated bindings
        elif lb == bd[0]:  # check for binding continuous with previous
            lb = bd[1]     #
            lbd[1] = lb    # if so, combine with previous binding
        else:              #
            mbd.append(bd) # otherwise, add new binding
            lbd = bd       #
            lb = bd[1]     #

    mbd[0] = offs          # replace start of bindings with text length matched

    return mbd             # consolidated bindings plus new offset
def _reload ( self ):

    """
    refill input line buffer and compute indentation

    Lines are read from self.inp until the buffer holds at least one char.
    Leading white space on each line read is removed and its length is saved
    in self._in. When only a single white space char was left in the buffer,
    it is set aside and put back in front of the newly read text.

    arguments:
        self

    returns:
        True on success if buffer has at least one char, False otherwise
        (EOF seen earlier, EOF on this read, or a read error)
    """

    bex = ''                            # space char saved from end of buffer
    bcn = len(self.buf)
    if bcn > 1:
        return True                     # no refilling needed
    elif bcn == 1:
        if ellyChar.isWhiteSpace(self.buf[0]):
            bex = self.buf[0]           # special case when only space char left
            self.buf = [ ]              # refill to get chars after that space
        else:
            return True                 # no refilling yet

    if self._eof:
        return False                    # must return immediately on previous EOF

    while len(self.buf) == 0:

        try:                            # read in next line from input stream
            if self._prmpt:
                sys.stdout.write('>> ')
            s = self.inp.readline()     # new text line to add
            if len(s) == 0:
                self._eof = True
                return False            # EOF; any saved space char is dropped
            if isinstance(s,bytes):     # decode only raw UTF-8 bytes; a stream
                s = s.decode('utf8')    # already yielding text is left alone
        except IOError:
            # write() instead of the Python 2 only 'print >>' chevron form
            sys.stderr.write('** char stream ERROR\n')
            return False                # treat read failure as empty line

        k = 0
        ls = len(s)
        while k < ls:                   # count leading white space chars
            if s[k] == NL: break        # but stop at end of line
            if not ellyChar.isWhiteSpace(s[k]): break
            k += 1

        self._in = k                    # save indentation level
        s = s[k:]
        self.buf = list(s)              # put unindented text into buffer

        if k > 0 and ellyConfiguration.noteIndentation:
            self.buf.insert(0,NL)       # if noted, indentation will break sentence

        if len(self.buf) > 0:           # if usable input, stop filling
            if bex != '':               # but restore any saved space char from buffer
                self.buf.insert(0,bex)
            return True

    return False                        # cannot refill, ignore trailing space char
def _limit ( buffr , hstry ):

    """
    get length of next possible name component in buffer

    A component may be introduced by a comma (only when something was already
    matched), and may be enclosed in parentheses and/or double quotes. An
    enclosed component must be closed without intervening white space.

    arguments:
        buffr - list of chars
        hstry - how much matched already

    returns:
        number of chars in continuation of last component, 0 for no next component
    """

    lnb = len(buffr)
    if lnb == 0: return 0

    bix = 0
    quot = False                 # indicate component starting with "
    parn = False                 #                             with (
    cmma = False                 #                             with ,

    if buffr[0] == ',':          # handle possible leading comma
        if hstry == 0 or lnb < 4:
            return 0             # comma cannot start a name nor precede a tiny one
        bix += 1
        if ellyChar.isWhiteSpace(buffr[1]): bix += 1
        cmma = True

    if buffr[bix] == '(':        # handle short name in parentheses
        bix += 1
        parn = True

    # a buffer that is just '(' leaves bix == lnb here, so check the bound
    # before looking for an opening quote
    if bix < lnb and buffr[bix] == '"':  # handle short name in double quotes
        bix += 1
        quot = True

    if parn or quot:

        while bix < lnb:         # collect letters for enclosed name
            chx = buffr[bix]
            if ellyChar.isWhiteSpace(chx):
                break
            elif not quot and parn and chx == ')':
                return bix + 1   # add trailing parenthesis
            elif quot and chx == '"':
                if bix + 1 < lnb and parn and buffr[bix+1] == ')':
                    return bix + 2   # add trailing quote and parenthesis
                elif not parn:
                    return bix + 1   # add trailing quote only
                else:
                    return 0         # quote inside ( without closing )
            elif chx == '.':
                return bix + 1   # add trailing period
            elif not ellyChar.isLetter(chx):
                break            # unrecognizable char for name
            bix += 1

        return 0                 # no closure of enclosed component

    while bix < lnb:             # collect letters for unenclosed name
        chx = buffr[bix]
        if chx == "'":
            # apostrophe ends the component when followed by space
            # or by a possessive 's; otherwise it is part of the name
            if bix + 2 < lnb:
                chn = buffr[bix+1]
                if ellyChar.isWhiteSpace(chn):
                    break
                if chn == 's' and not ellyChar.isLetter(buffr[bix+2]):
                    break
        elif not ellyChar.isLetter(chx):
            if chx == '.':
                bix += 1         # keep a period ending the component
            break
        bix += 1

    if bix == lnb:
        return bix               # running out of chars means match

    # more text follows the limit, so check what terminates the component
    chx = buffr[bix]
    if ellyChar.isWhiteSpace(chx) or chx == "'":
        return bix               # component can be terminated by space or (')
    elif chx == ',':
        if cmma:
            return bix + 1       # or comma when sequence starts with comma
        else:
            return bix           # when there is no starting comma
    elif ellyChar.isLetter(chx):
        return bix               # or letter, implying previous char was '.'
    else:
        return 0                 # failure to find name limit
def _matchAN ( self , ts ):

    """
    apply logic for alphanumeric date recognition

    Two layouts are tried: month first ("<month> <day>[,] [<year>]") and
    day first ("<day> [of] <month>[,] [<year>]"). Month, day, and year are
    recognized by self._aMonth, self._aDay, and self._aYear, each returning
    a char count with 0 meaning no match.

    arguments:
        self  -
        ts    - text stream as list of chars

    returns:
        total number of chars matched, 0 if no date found
    """

    t = ts
    tl = len(ts)

    k = self._aMonth(t)              # look for month to start date string
    if k > 0:

        if k == tl: return 0
        if not ellyChar.isWhiteSpace(t[k]): return 0
        k += 1                       # skip space after month
        if k == tl: return 0
        t = t[k:]

        k = self._aDay(t)            # look for day of month
        if k == 0: return 0
        tl = len(ts)                 # _aDay may have rewritten alphabetic day
        t = t[k:]
        if len(t) == 0: return 0

        if t[0] == u',': t = t[1:]   # look for comma after day
        if len(t) == 0: return tl
        if ellyChar.isWhiteSpace(t[0]): t = t[1:]
        if len(t) == 0: return tl

        k = self._aYear(t)           # look for year
        return tl - len(t) + k

    else:

        k = self._aDay(t)            # look for day of month to start date string
        if k == 0 or k == tl: return 0
        tl = len(ts)                 # _aDay may have rewritten alphabetic day
        t = t[k:]

        if (k > 2 and len(t) > 2 and
            t[0] == u' ' and t[1].upper() == 'O' and t[2].upper() == 'F'):
            t = t[3:]                # to handle day reference like '4th of'

        if len(t) == 0: return 0
        if not ellyChar.isWhiteSpace(t[0]): return 0
        t = t[1:]

        k = self._aMonth(t)          # look for month
        if k == 0: return 0
        t = t[k:]
        if len(t) == 0: return tl

        ntl = tl - len(t)            # char count through end of month,
                                     # not including any separators after it
        nd = 0                       # count of separators before year
        if t[0] == u',':             # look for comma after month
            t = t[1:]
            if len(t) == 0: return tl
            nd += 1
        if ellyChar.isWhiteSpace(t[0]):
            t = t[1:]
            if len(t) == 0: return tl
            nd += 1

        k = self._aYear(t)           # look for year
        if k > 0:
            return ntl + k + nd      # full date found
        else:
            # only month and day of date found; ntl already stops at the end
            # of the month, so subtracting nd here would cut into the month
            return ntl
def match ( patn , text , offs=0 , limt=None ):

    """
    compare a pattern with wildcards to input text

    Pattern chars are compared one by one with lowercased text chars. When a
    pattern element does not match literally, it is interpreted as a wildcard
    or other special element. Multi-char wildcards and optional segments leave
    an unwinding record so that a later failure can back up and retry with a
    shorter span or without the optional segment.

    arguments:
        patn  - pattern to be matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit of matching, defaulting to length of text

    returns:
        bindings if match is successful, None otherwise; on success, element 0
        of the returned list is the text offset reached by matching and the
        remaining elements are [ start , end ] offset pairs for wildcards
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - kind of unwinding
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1

        def __unicode__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self

            returns:
                attributes formatted as a bracketed string
            """
            return ( '[kd=' + unicode(self.kind) + ',ct=' + unicode(self.count) + ',pa=' + unicode(self.pats) + ',tx=' + unicode(self.txts) + ',bd=' + unicode(self.bnds) + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # four private functions reading local variables of match()
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame

        The frame at index mbi is reused if it exists already; the caller
        is responsible for incrementing mbi afterward. The frame starts at
        offs - 1, the char most recently consumed by literal matching.

        arguments:
            ns  - optional initial span of text for binding

        returns:
            binding frame
        """
        os = offs - 1
        if ns == None: ns = 1     # by default, binding is to 1 char
        if mbi == len(mbd):       # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]             # next available record
        bf[0] = os                # set binding to range of chars
        bf[1] = os + ns           #
        return bf

    def _modify ( ):
        """
        set special tag for binding

        A third element is appended to the current frame; the consolidation
        step at the end of match() keeps any such frame separate.

        arguments:
        """
        mbd[mbi].append(None)

    def _mark ( kind ):
        """
        set up for backing up pattern match

        The current pattern index, text offset, and binding index are saved.
        The caller is responsible for incrementing unj afterward.

        arguments:
            kind  - 0=optional 1=* wildcard

        returns:
            unwinding frame
        """
        if unj == len(unw):       # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]             # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp
        uf.txts  = offs
        uf.bnds  = mbi
        return uf

    def _span ( typw ):
        """
        count chars available for wildcard match

        arguments:
            typw  - wildcard

        returns:
            non-negative count if any match possible, otherwise -1
        """
        k = minMatch(patn[mp:])   # calculate min char count to match rest of pattern

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findBreak(text,offs) - k   # max span reduced by exclusion
        if mx < 0: return -1      # cannot match if max span < 0

        tfn = Matching[typw]      # char type matching a wildcard

        nm = 0
        for i in range(mx):
            c = text[offs+i]      # next char in text from offset
            if not tfn(c): break  # stop when it fails to match
            nm += 1

        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False   # successful pattern match?

    if limt == None: limt = len(text)

    mp = 0            # pattern index
    ml = len(patn)    # pattern match limit

    while True:

        ## literally match as many next chars as possible

        while mp < ml:
            if offs >= limt:
                last = ''                  # no more text to match
            else:
                last = text[offs].lower()  # text char is consumed even on mismatch
                offs += 1
            if patn[mp] != last: break
            mp += 1

        ## check whether mismatch is due to special pattern char

        if mp >= ml:           # past end of pattern?
            matched = True     # if so, match is made
            break

        tc = patn[mp]          # otherwise, get unmatched pattern element
        mp += 1                #

        if tc == cALL:         # a * wildcard?

            if last != '': offs -= 1   # put back char consumed by literal matching

            # NOTE(review): a -1 result from _span() is used here as is - confirm intended
            nm = _span(cALL)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1   # get new binding record
            bf[0] = offs               # bind from current offset
            offs += nm                 # move offset past end of span
            bf[1] = offs               # bind to new offset
            uf = _mark(1); unj += 1    # get new unwinding record
            uf.count = nm              # can back up this many times on mismatch
            continue

        elif tc == cEND:       # end specification
            if last == '':
                continue
            elif last in [ '.' , ',' , '-' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]
                if ellyChar.isWhiteSpace(txc) or txc in Trmls:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' ]:
                offs -= 1
                continue

        elif tc == cANY:       # alphanumeric wildcard?
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cDIG:       # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF:       # letter wildcard?
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC:       # space wildcard?
            if last != '' and ellyChar.isWhiteSpace(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cAPO:       # apostrophe wildcard?
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS:       # start of optional segment
            mf = _bind(0); mbi += 1      # dummy record to block
            mf[0] = -1                   #   later binding consolidation
            if last != '': offs -= 1     # try for rematch
            m = mp                       # find corresponding EOS
            while m < ml:                #
                if patn[m] == cEOS: break
                m += 1
            else:                        # no EOS?
                m -= 1                   # if so, pretend there is one anyway
            uf = _mark(0); unj += 1      # for unwinding on any later match failure
            uf.pats = m + 1              # i.e. one char past next EOS
            uf.txts = offs               # start of text before optional match
            continue

        elif tc == cEOS:       # end of optional segment
            if last != '': offs -= 1     # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:
            if last != '':               # still more to match?
                offs -= 1
                nm = _span(tc)           # maximum match possible
                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs         # bind from current offset
                    offs += nm           # move offset past end of span
                    bf[1] = offs         # bind to new offset
                    uf = _mark(1); unj += 1
                    uf.count = nm - 1    # at least one char must be matched
                    continue

        elif tc == '':
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True           # successful match
                break

        ## match failure: rewind to last match branch

        while unj > 0:                   # try unwinding, if possible
            uf = unw[unj-1]              # get most recent unwinding record
            if uf.count <= 0:            # if available count is used up,
                unj -= 1                 # go to next unwinding record
                continue
            uf.count -= 1                # decrement available count

            uf.txts -= uf.kind           # back up one char for scanning text input
            mp = uf.pats                 # unwind pattern pointer
            offs = uf.txts               # unwind text input
            mbi = uf.bnds                # restore binding
            mbd[mbi-1][1] -= uf.kind     # reduce span of binding if for wildcard
            break
        else:
            break                        # quit if unwinding is exhausted

    ##
    ## clean up on match mode or on no match possible
    ##

    if not matched: return None          # no bindings

    ## consolidate contiguous bindings for subsequent substitutions

    # NOTE(review): every frame in mbd is consolidated, including any at
    # index >= mbi left over from an unwound branch - confirm intended

    mbdo = mbd
    lb  = -1                   # binding reference
    lbd = [ 0 , -1 ]           # sentinel value, not real binding
    mbd = [ lbd ]              # initialize with new offset after matching
    mbdo.pop(0)                # ignore empty binding
    while len(mbdo) > 0:       #
        bd = mbdo.pop(0)       # get next binding
        if len(bd) > 2:        # frame tagged by _modify() is always kept apart
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb = -1
        elif bd[0] < 0:        # check for optional match indicator here
            lb = -1            # if so, drop from new consolidated bindings
        elif lb == bd[0]:      # check for binding continuous with previous
            lb = bd[1]         #
            lbd[1] = lb        # if so, combine with previous binding
        else:                  #
            mbd.append(bd)     # otherwise, add new binding
            lbd = bd           #
            lb = bd[1]         #

    mbd[0] = offs              # replace start of bindings with text length matched

    return mbd                 # consolidated bindings plus new offset
def scan ( buffr ):

    """
    recognize personal names in text at current position

    Successive name components are delimited with _limit(), extracted with
    _extract(), and looked up in the module name table _table. Components
    not in the table may still be accepted if enclosed in parentheses or
    quotes, if already in the module context list _cntxt, or if infer()
    accepts them. The components of an accepted name are added to _cntxt.

    arguments:
        buffr - current contents as list of chars

    returns:
        char count matched on success, 0 otherwise
    """

    def doLook ( mth , itm ):
        """
        do lookup with specified method
        using global variables in Python 2.7.*

        The result is left in global _typ; when a lookup succeeds only after
        a final '.' is dropped, global _nch is reduced by 1.

        arguments:
            mth   - name table method
            itm   - string to look up
        """
        global _typ , _nch     # really need nonlocal
        _typ = mth(itm)
        if _typ < 0 and len(itm) > 3:   # if no match, check for final '.'
            if itm[-1] == '.':
                _typ = mth(itm[:-1])
                if _typ >= 0: _nch -= 1 # match without '.'

    global _typ , _nch
    global _toscan

    bln = len(buffr)
    if _table == None or bln < 2: return 0

    if _toscan > 0:            # set by _planAhead() to suppress scanning
        if bln > _toscan:      #   while buffer is still longer than this
            return 0
        else:
            _toscan = 0

    chx = buffr[0]
    if not ellyChar.isLetter(chx) and chx != '(' and chx != '"': return 0

    cmps = [ ]                 # name components this time
    ncmp = 0                   # number of components for current name
    ninf = 0                   # number inferred
    ntyp = len(nameTable.TYP)
    stat = [False]*ntyp        # define state for getting personal name
    mlen = 0                   # last match length
    bix = 0                    # buffer index to advance in scanning

    _typ = -1

    while bix < bln:
        ltyp = -1              # last match type

        _nch = _limit(buffr[bix:],mlen)     # length of next possible name component
        if _nch == 0: return 0

        elm = _extract(buffr[bix:],_nch)    # get possible component as string
        sch = buffr[bix]
        enclosed = (sch == '(' or sch == '"')  # type of next element

        doLook(_table.lookUp,elm)           # look it up in saved name table

        if _typ < 0:
            if _typ == nameTable.REJ: return 0  # immediate rejection of any match
            if _typ == nameTable.STP: break     # stop any more matching
            if elm[-1] == '.':              # drop any trailing '.'
                elm = elm[:-1]
                if not enclosed: _nch -= 1
            if enclosed:                    # enclosed element assumed to be name
                # NOTE(review): confirm that the inferred count goes up only
                # when the element is newly added to context
                if not elm in _cntxt:
                    _cntxt.append(elm)      # make sure always to save in local context
                    ninf += 1               # this is inferred!
            if elm in _cntxt:
                _typ = nameTable.XNM        # neutral name type to be noncommital
            if _typ < 0:
                tok = buffr[bix:bix + _nch] # unknown token to check
                if infer(tok):
                    _typ = nameTable.XNM    # neutral name type inferred
                    if not _table.checkPhonetic(tok):
                        ninf += 1           # count inferred component if no phonetic support

        if nameTable.starts(_typ) and bix > 0:  # if component not at start of name,
            break                               # must stop name scan

        while _typ >= 0:                    # continue as long as match is viable
            ncmp += 1                       # count up component
            cmps.append(elm)                # save component
            bix += _nch                     # move ahead in scan
            if _typ > 0:
                if stat[_typ]:              # check for duplication of component type
                    if (ltyp >= 0 and ltyp != _typ):  # allowed only if duplicate is consecutive
                        break
            # NOTE(review): confirm that the next four statements apply
            # to every viable type and not just to _typ > 0
            mlen = bix                      # save index on actual match
            ltyp = _typ
            if nameTable.ends(_typ):        # if component marks end of name,
                break                       # must stop name scan
            stat[_typ] = True               # update match state
            if bix == bln: break
            if ellyChar.isWhiteSpace(buffr[bix]): bix += 1  # skip any space to start of next component
            _nch = _limit(buffr[bix:],mlen) # length of next possible name component
            if _nch == 0: break
            elm = _extract(buffr[bix:],_nch)  # get possible next component as string
            doLook(_table.lookUpMore,elm)   # look it up in saved name table

        if _typ < 0:                        # while-loop terminated without break
            if ltyp < 0 or mlen == 0: break
            bix = mlen                      # restart at end of last match
            if bix == bln: break
            if ellyChar.isWhiteSpace(buffr[bix]): bix += 1  # skip any space to start of next component
            continue

        break

    #
    #### additional constraints on acceptable personal name
    #

    if (ltyp == nameTable.CNJ or ltyp == nameTable.REL):  # a name cannot end with these types
        mlen -= _nch                        # have to drop them from any match
        if mlen == 0: return 0
        if ellyChar.isWhiteSpace(buffr[mlen-1]): mlen -= 1
        ncmp -= 1
        cmps.pop()

    if ncmp == 0:                           # nothing matched?
        _planAhead(buffr)                   # check for possible problems in next scan
        return 0

    if ncmp == ninf: return 0               # name cannot be purely inferred

    if ncmp == 1:                           # single-component name must be known or contextual
        if (not stat[nameTable.SNG] and not cmps[0] in _cntxt): return 0

    expl = (stat[nameTable.PNM] or          # name must have a substantial component
            stat[nameTable.SNM] or
            stat[nameTable.XNM] or
            stat[nameTable.SNG])

    if (not expl and
        not (stat[nameTable.TTL] and        # or it could have just a title
             stat[nameTable.INI])):         #   and an initial
        return 0

    #
    ####
    #

    for cmpo in cmps:                       # if whole name is OK,
        if not cmpo in _cntxt:              # remember all components
            _cntxt.append(cmpo)             # not already listed in context

    return mlen                             # will be > 0 on successful match
def match ( self , txt , pnc , nxt ):

    """
    compare a punctuation mark and its context with a pattern

    Each pattern stored for the punctuation char in self.lstg is tried in
    turn. A pattern has an optional left context (None for no constraint),
    which is matched against the lowercased end of txt, limited to
    self.maxl chars, and a right context checked against nxt.

    arguments:
        self  -
        txt   - list of text chars up to and including punctuation char
        pnc   - punctuation char
        nxt   - single char after punctuation

    returns:
        True on match, False otherwise
    """

    if pnc not in self.lstg:      # get stored patterns for punctuation
        return False
    lp = self.lstg[pnc]

    txl = txt[-self.maxl:] if len(txt) > self.maxl else txt

    # build a real list, since its length and slices are needed below;
    # map() would give only an iterator under Python 3
    txs = [ x.lower() for x in txl ]  # actual left context for matching
    lt = len(txs)                     # its length

    for p in lp:                  # try matching each pattern

        if p.left is not None:

            n = len(p.left)       # assume each pattern element must match one sequence char
            if n > lt:
                continue          # fail immediately because of impossibility of match
            t = txs if n == lt else txs[-n:]
            if not ellyWildcard.match(p.left,t,0):
                continue          # no left match
            if n < lt and ellyChar.isLetterOrDigit(t[0]):
                if ellyChar.isLetterOrDigit(txs[-n-1]):
                    continue      # fail because of no break in text

        if p.right == []:
            return True           # no constraint on right context
        pcx = p.right[0]
        if pcx == nxt:            # check for specific char after possible stop
            return True
        if pcx == ellyWildcard.cCAN:   # check for nonalphanumeric
            if nxt == u'' or not ellyChar.isLetterOrDigit(nxt):
                return True
        if pcx == ellyWildcard.cSPC:   # check for white space
            if nxt == u'' or nxt == u' ' or nxt == u'\n':
                return True
        # NOTE(review): p.right is compared with [] and indexed above, but with
        # a string here; if it is always a list, this branch can never be
        # taken and pcx may have been meant - confirm before changing
        if p.right == u'.':       # check for any punctuation
            if not ellyChar.isLetterOrDigit(nxt) and not ellyChar.isWhiteSpace(nxt):
                return True

    return False
def getNext ( self ):

    """
    extract next sentence for Elly translation from input stream

    Chars are read from self.inp and collected until a new line, the end
    of input, or stop punctuation not covered by a stop exception in
    self.stpx. A stop ends the sentence only when more than one
    alphanumeric char has been collected.

    arguments:
        self

    returns:
        list of chars for next sentence on success, None on empty stream
    """

    sent = [ ]                # list buffer to fill
    parenstop = False         # initially, parentheses will NOT stop sentence

    c = self.inp.read()
    if c == SP:
        c = self.inp.read()   # look past a single leading space

    if c == END:              # EOF check
        return None

    self.inp.unread(c,SP)     # put back char just read

    # fill sentence buffer up to next stop punctuation

    nAN = 0                   # alphanumeric count in sentence

    while True:

        x = self.inp.read()   # next input char

        if x == END:          # handle any EOF
            break

        # check for table delimiters in text

        if len(sent) == 0:
            if x == u'.' or x == u'-':      # look for multiple '.' or '-'
                while True:                 # scan up to end of current buffering
                    y = self.inp.read()     #
                    if y != x and y != SP:  # no more delimiter chars or spaces?
                        self.inp.unread(y)  # if so, done
                        break               #
                continue                    # ignore everything seen so far

        #########################################
        # accumulate chars and count alphanumeric
        #########################################

        c = x
        sent.append(c)
        if ellyChar.isLetterOrDigit(c):
            nAN += 1
            continue                        # if alphanumeric, just add to sentence

        if c == SP:
            continue                        # if space, just add to sentence

        # NL will break a sentence

        if c == NL:
            sent.pop()                      # remove from sentence chars
            break

        # char was not alphanumeric or space
        # look for stop punctuation exception

        z = self.inp.peek()                 # for context of match call

        if self.stpx.match(sent[:-1],c,z):
            if self.drop:
                sent.pop()                  # remove punctuation char from sentence
            continue

        # handle any nonstandard punctuation

        exoticPunctuation.normalize(c,self.inp)

        # handle parentheses as possible stop

        if nAN == 0 and self.stpx.inBracketing():
            parenstop = True
        elif parenstop and not self.stpx.inBracketing():
            break                           # treat as stop

        # check for dash

        if c == u'-':
            d = self.inp.read()
            if d == u'-':
                while True:                 # skip any more '-' in run
                    d = self.inp.read()
                    if d != u'-': break
                sent.append(c)              # run of '-' is kept as two
            self.inp.unread(d)
            continue

        # check for sentence break on punctuation

        if not c in Stops:
            continue

        else:
            d = self.inp.read()

            if d == None: d = u'!'

            # ellipsis check

            if c == u'.' and c == d:
                if self.inp.peek() != c:    # look for third '.' in ellipsis
                    self.inp.unread(c)      # if none, keep only first '.'
                else:
                    self.inp.skip()         # found ellipsis
                    sent.append(d)          # complete it in sentence buffer
                    sent.append(d)          #
                    x = self.inp.peek()     # look at char after ellipsis
                    if ellyChar.isCombining(x):
                        sent.append(' ')    # if part of token, put in space as separator
                continue

            # special check for multiple stops

            if d in Stops:
                while True:                 # skip over the whole run of stops;
                    d = self.inp.read()     # breaking on "d in Stops" instead would
                    if not d in Stops: break  # discard text up to the next stop
                self.inp.unread(d)
                if not ellyChar.isWhiteSpace(d):
                    d = u' '

            # break sentence except when in parentheses

            elif d in RBs:
                if not self.stpx.inBracketing():
                    break
                else:
                    if self.drop:
                        sent.pop()
                    self.inp.unread(d)
                    continue

            # special check for single or double quotes, which should
            # be included with current sentence after stop punctuation

            elif d in QUOs:
                x = self.inp.peek()
                if x == END or ellyChar.isWhiteSpace(x):
                    sent.append(d)
                    break
                else:
                    self.inp.unread(SP)
                    continue

            # special check for blank or null after stops

            elif d != END and not ellyChar.isWhiteSpace(d):
                sent.append(d)
                continue

            # if no match for lookahead, put back

            elif d != '':
                self.inp.unread(d)

            # final check: is sentence long enough?

            cx = self.inp.peek()
            if cx == None: cx = u'!!'
            if nAN > 1:
                break

    if len(sent) > 0 or self.last != END:
        return sent
    else:
        return None