def atToken(self):
    """
    check whether buffer starts with a char that can belong to a token

    arguments:
        self

    returns:
        False on empty buffer; True if first char is '-' or '+';
        otherwise whatever ellyChar.isCombining() reports for first char
    """

    if len(self.buffer) > 0:
        first = self.buffer[0]
        # '-' or '+' may mark a suffix or a prefix
        return first in ('-', '+') or ellyChar.isCombining(first)
    return False
def atToken ( self ):
    """
    check whether buffer starts with a char that can belong to a token

    arguments:
        self

    returns:
        False on empty buffer; True if first char is '-' or '+';
        otherwise whatever ellyChar.isCombining() reports for first char
    """

    if len(self.buffer) > 0:
        first = self.buffer[0]
        # '-' or '+' may mark a suffix or a prefix
        return first in (u'-', u'+') or ellyChar.isCombining(first)
    return False
def getNext ( self ):
    """
    extract next sentence for Elly translation from input stream

    chars are read one at a time from self.inp and collected in a list
    until end of input, a newline, or punctuation that is accepted as
    the end of a sentence

    arguments:
        self

    returns:
        list of chars for next sentence on success, None on empty stream
    """

    self.resetBracketing()

    sent = [ ]                          # list buffer to fill

    x = self.inp.read()
    if x == SP:                         # pass over one leading space
        x = self.inp.read()

    if x == END:                        # EOF check
        return None

    c = END                             # reset current char
    lc = END                            # and the char before it

    self.inp.unread(x,SP)               # put first char back to restore input

    # fill sentence buffer up to next stop punctuation in input

    nAN = 0                             # alphanumeric count in sentence

    while True:

        x = self.inp.read()             # next input char

        if x == END:                    # handle any EOF
            break

        # check for table delimiters in text

        if len(sent) == 0:
            if x == u'.' or x == u'-':          # look for multiple '.' or '-'
                while True:                     # scan up to end of current buffering
                    y = self.inp.read()         #
                    if y != x and y != SP:      # no more delimiter chars or spaces?
                        self.inp.unread(y)      # if so, done
                        break                   #
                continue                        # ignore everything seen so far

        #########################################
        # accumulate chars and count alphanumeric
        #########################################

        lc = c                          # previous char
        c = x                           # current char as read
        nc = self.inp.peek()            # lookahead char, left in input
        if ellyChar.isWhiteSpace(nc):
            nc = SP                     # any whitespace lookahead is treated as SP

        # x becomes the directional form of a quote for bracket checking;
        # c keeps the original char for the sentence buffer

        if lc == SP or lc == END:       # normalize chars for proper bracketing
            if x == SQuo:               #
                x = LSQm                # a SQuo preceded by a space becomes LSQm
            elif x == DQuo:             #
                x = LDQm                # a DQuo preceded by a space becomes LDQm
        if nc == SP or nc == END:       #
            if x == SQuo:               # a SQuo followed by a space becomes RSQm
                x = RSQm                #
            elif x == DQuo:             # a DQuo followed by a space becomes RDQm
                x = RDQm                #
        elif not ellyChar.isLetterOrDigit(nc):
            if x == SQuo:               # a SQuo followed by nonalphanumeric becomes RSQm
                x = RSQm                #
            elif x == DQuo:             # a DQuo followed by nonalphanumeric becomes RDQm
                x = RDQm                #

        inBrkt = self.checkBracketing(x)    # do bracket checking with modified chars

        sent.append(c)                      # but buffer original chars

        if ellyChar.isLetterOrDigit(c):
            nAN += 1
            continue                        # if alphanumeric, just add to sentence

        if c == SP:
            continue                        # if space, just add to sentence

        # NL will break a sentence

        if c == NL:
            sent.pop()                      # remove from sentence chars
            break

        # char was not alphanumeric or space
        # look for stop punctuation exception

        z = self.inp.peek()                 # for context of match call

        if c in Stops and self.stpx.match(sent[:-1],c,z):
            if self.drop:
                sent.pop()                  # remove punctuation char from sentence
                # NOTE(review): lc is reassigned from c at the top of the next
                # iteration before it is read, so this store looks ineffective
                lc = SP
            continue

        # handle any nonstandard punctuation

        exoticPunctuation.normalize(c,self.inp)

        # check for dash

        if c == u'-':
            d = self.inp.read()
            if d == u'-':                   # doubled hyphen
                while True:                 # pass over any further hyphens
                    d = self.inp.read()
                    if d != u'-':
                        break
                sent.append(c)              # keep just two hyphens in sentence
            self.inp.unread(d)              # put back first non-hyphen char
            continue

        # check for sentence break on punctuation

        if c in QUOs or c in RBs:

            # special check for single or double quotes or
            # bracketing, which can immediately follow stop
            # punctuation for current sentence

            if not inBrkt:
                z = self.inp.read()
                if self.shortBracketing(sent,z):
                    break
                self.inp.unread(z)
                # note precedence: z == END or (whitespace z and lc in Stops)
                if z == END or ellyChar.isWhiteSpace(z) and lc in Stops:
                    if nAN > 1:
                        break
            continue

        elif not c in Stops or inBrkt:      # no break on non-stop or inside bracketing
            continue

        else:
            d = self.inp.read()             # char after stop punctuation

            if d == None:
                d = u'!'

            # ellipsis check

            if c == u'.' and c == d:
                if self.inp.peek() != c:    # look for third '.' in ellipsis
                    self.inp.unread(c)      # if none, keep only first '.'
                else:
                    self.inp.skip()         # found ellipsis
                    sent.append(d)          # complete it in sentence buffer
                    sent.append(d)          #
                    x = self.inp.peek()     # look at char after ellipsis
                    if ellyChar.isCombining(x):
                        sent.append(SP)     # if part of token, put in space as separator
                    continue

            # special check for multiple stops

            if d in Stops:
                while True:                 # pass over the run of stop chars
                    d = self.inp.read()
                    if not d in Stops:
                        break
                self.inp.unread(d)          # put back first char after the run
                if not ellyChar.isWhiteSpace(d):
                    d = SP                  # make rightside context for stop

            # special check for blank or null after stops

            elif d != END and not ellyChar.isWhiteSpace(d):
                if self.shortBracketing(sent,d):
                    break
                self.inp.unread(d)          # no space after punctuation
                continue

            # if no match for lookahead, put back

            elif d != END:
                self.inp.unread(d)

        # final check: is sentence long enough?

        cx = self.inp.peek()                # NOTE(review): cx is not read by any live code here
        if cx == None:
            cx = u'!!'
        if nAN > 1 and not inBrkt:          # need at least 2 alphanumerics, outside bracketing
            break

    if len(sent) > 0 or self.last != END:
        return sent
    else:
        return None
def _getRaw ( self ):
    """
    obtain next raw token from buffer

    leading spaces are skipped, the length of the next token is worked
    out from its first char and the next separator, that many chars are
    extracted, and trailing chars that cannot end a token are put back
    into the buffer

    a hyphen or comma at the very end of the buffer is taken as an
    ordinary separator; the lookahead for a following digit is only done
    when there is a following char (this used to raise IndexError)

    arguments:
        self

    returns:
        EllyToken on success, None otherwise
    """

    self.skipSpaces()
    ln = len(self.buffer)
    if ln == 0:
        return None

    ## get length of next token and if it has
    ## initial - or +, check for word fragment

    k = 0                               # number of chars for next token

    if self.match(MIN):                 # check for hyphen
        if self.match(DSH):             # it is a dash when doubled
            k = 2
        else:
            k = self.find(separators,1)
    elif self.match(PLS):               # check for elly prefix
        k = self.find(separators,1)
    elif self.match(DOT):               # check for period
        if self.match(ELP):             # it is ellipsis when tripled
            k = 3
        else:
            k = 1
    elif not ellyChar.isCombining(self.buffer[0]):
        k = 1                           # if next char cannot start a token, take it as a token
    else:
        k = self.find(separators)
        if k < 0:                       # break a token at next separator
            k = ln
        while k < ln:                   # look at separator if it exists
            x = self.buffer[k]
            if x != MIN and x != COM:
                break                   # only a hyphen or comma is not an absolute break
            if k + 1 >= ln or not ellyChar.isDigit(self.buffer[k+1]):
                break                   # accept hyphen or comma if NOT followed by digit
            k = self.find(separators,k+2)   # otherwise, look for another separator
            if k < 0:
                k = ln

    ## if token not delimited, take rest of buffer as
    ## will fit into token working area

    if k < 0:
        k = ln

    buf = self.extract(k)               # get k characters

    ## special check for - next in buffer after extraction

    if self.match(MIN):                         # hyphen immediately following?
        self.skip()                             # if so, take it
        if self.atSpace():                      # when followed by space
            buf.append(MIN)                     # append hyphen to candidate token
            k += 1
        else:
            if not self.match(MIN):             # when not followed by another hyphen
                self.prepend(ellyChar.SPC)      # put back a space
            else:
                self.skip()                     # double hyphen = dash
                self.prepend(ellyChar.SPC)      # put back space after dash
                self.prepend(MIN)               # put back second hyphen
            self.prepend(MIN)                   # put back first
            self.prepend(ellyChar.SPC)          # put extra space before hyphen or dash

    ## fill preallocated token for current position from working area

    to = ellyToken.EllyToken(u''.join(buf))

    ## strip off trailing non-token chars from token and put back in buffer

    km = k - 1
    while km > 0:                       # first char of token is never stripped
        x = buf[km]
        if ellyChar.isLetterOrDigit(x) or x == MIN or x == PLS or x == UNS:
            break
        if x == APO and buf[km - 1] == 's':
            break                       # keep an apostrophe right after 's'
        self.prepend(x)
        km -= 1
    km += 1                             # km is now the count of chars kept
    if km < k:
        to.shortenBy(k - km,both=True)

    return to
def getNext(self):
    """
    extract next sentence for Elly translation from input stream

    chars are read one at a time from self.inp and collected in a list
    until end of input, a newline, a char in Hards, or punctuation that
    is accepted as the end of a sentence

    arguments:
        self

    returns:
        list of chars for next sentence on success, None on empty stream
    """

    self.resetBracketing()
    inBrkt = False                      # bracketing status from last check
    nspc = 0                            # set space count

    sent = []                           # list buffer to fill

    x = self.inp.read()
    if x == SP:                         # pass over one leading space
        x = self.inp.read()

    if x == END:                        # EOF check
        return None

    c = END                             # reset current char
    lc = END                            # and the char before it

    self.inp.unread(x, SP)              # put first char back to restore input

    # fill sentence buffer up to next stop punctuation in input

    nAN = 0                             # alphanumeric count in sentence

    while True:

        x = self.inp.read()             # next input char

        if x == END:                    # handle any EOF
            break

        # check for table delimiters in text

        if len(sent) == 0:
            if x == '.' or x == '-':            # look for multiple '.' or '-'
                while True:                     # scan up to end of current buffering
                    y = self.inp.read()         #
                    if y != x and y != SP:      # no more delimiter chars or spaces?
                        self.inp.unread(y)      # if so, done
                        break                   #
                continue                        # ignore everything seen so far

        ####################################################
        # accumulate chars and count alphanumeric and spaces
        ####################################################

        lc = c                          # previous char
        c = x                           # current char as read
        nc = self.inp.peek()            # lookahead char, left in input
        if ellyChar.isWhiteSpace(nc):
            nc = SP                     # any whitespace lookahead is treated as SP

        # x becomes the directional form of a quote for bracket checking;
        # c keeps the original char for the sentence buffer

        if lc == SP or lc == END:       # normalize chars for proper bracketing
            if x == SQuo:               #
                x = LSQm                # a SQuo preceded by a space becomes LSQm
            elif x == DQuo:             #
                x = LDQm                # a DQuo preceded by a space becomes LDQm
        if nc == SP or nc == END:       #
            if x == SQuo:               # a SQuo followed by a space becomes RSQm
                x = RSQm                #
            elif x == DQuo:             # a DQuo followed by a space becomes RDQm
                x = RDQm                #
        elif not ellyChar.isLetterOrDigit(nc):
            if x == SQuo:               # a SQuo followed by nonalphanumeric becomes RSQm
                x = RSQm                #
            elif x == DQuo:             # a DQuo followed by nonalphanumeric becomes RDQm
                x = RDQm                #
        elif ellyChar.isWhiteSpace(c) and inBrkt:
            nspc += 1                   # count whitespace seen inside bracketing

        svBrkt = inBrkt
        inBrkt = self.checkBracketing(x)    # do bracketing check with modified chars
        if svBrkt and not inBrkt:
            nspc = 0                        # bracketing just ended, restart count

        sent.append(c)                      # put original char into sentence buffer

        if ellyChar.isLetterOrDigit(c):
            nAN += 1
            continue                        # if alphanumeric, just add to sentence

        if c == SP:
            continue                        # if space, just add to sentence

        # NL will break a sentence

        if c == NL:
            sent.pop()                      # remove from sentence chars
            break

        # certain Unicode punctuation will always break

        if c in Hards:
            break

        # char was not alphanumeric or space
        # look for stop punctuation exception

        cx = self.inp.preview()             # for context of match call

        if c in Stops and len(cx) > 0 and cx[0] == SP:
            if self.stpx.match(sent[:-1], c, cx):
                if self.drop:
                    sent.pop()              # remove punctuation char from sentence
                    # NOTE(review): lc is reassigned from c at the top of the next
                    # iteration before it is read, so this store looks ineffective
                    lc = SP
                continue

        # handle any nonstandard punctuation

        exoticPunctuation.normalize(c, self.inp)

        # check for dash

        if c == '-':
            d = self.inp.read()
            if d == '-':                    # doubled hyphen
                while True:                 # pass over any further hyphens
                    d = self.inp.read()
                    if d != '-':
                        break
                sent.append(c)              # keep just two hyphens in sentence
            self.inp.unread(d)              # put back first non-hyphen char
            continue

        # check for sentence break on punctuation

        if c in QUOs or c in RBs:

            # special check for single or double quotes or
            # bracketing, which can immediately follow stop
            # punctuation for current sentence

            if not inBrkt:
                z = self.inp.read()
                if self.shortBracketing(sent, z):
                    break
                self.inp.unread(z)
                # note precedence: z == END or (whitespace z and lc in Stops)
                if z == END or ellyChar.isWhiteSpace(z) and lc in Stops:
                    if nAN > 1:
                        break
            elif c in QUOs and lc in Stops:     # stop + quote while still in bracketing
                z = self.inp.read()
                if z in RBs:                    # stop + quote + right bracket
                    sent.append(z)
                    y = self.inp.read()
                    if y in Stops:
                        sent.append(y)          # take a stop right after the bracket
                    elif not ellyChar.isWhiteSpace(y):
                        self.inp.unread(y)      # whitespace here is not put back
                    inBrkt = False
                    break
                elif z in QUOs:                 # stop + quote + quote
                    sent.append(z)
                    inBrkt = False
                    break
                self.inp.unread(z)
            continue

        elif not c in Stops:
            continue

        else:
            d = self.inp.read()             # char after stop punctuation

            if d == None:
                d = '!'

            # ellipsis check

            if c == '.' and c == d:
                if self.inp.peek() != c:    # look for third '.' in ellipsis
                    self.inp.unread(d)      # if none, keep only first '.'
                else:
                    self.inp.skip()         # found ellipsis
                    sent.append(d)          # complete it in sentence buffer
                    sent.append(d)          #
                    x = self.inp.peek()     # look at char after ellipsis
                    if ellyChar.isCombining(x):
                        sent.append(SP)     # if part of token, put in space as separator
                    continue

            if c == ELLP:                   # found Unicode ellipsis
                if ellyChar.isUpperCaseLetter(d):
                    self.inp.unread(d)      # super special case of bad punctuation
                    self.inp.unread(' ')    # put in implied period and space
                    self.inp.unread('.')    #

            # special check for multiple stops

            if d in Stops:
                while True:                 # pass over the run of stop chars
                    d = self.inp.read()
                    if not d in Stops:
                        break
                self.inp.unread(d)          # put back first char after the run
                if not ellyChar.isWhiteSpace(d):
                    d = SP                  # make rightside context for stop

            # special check for blank or null after stops

            elif d != END and not ellyChar.isWhiteSpace(d):
                if self.shortBracketing(sent, d):
                    break
                if d in self._cl and self._cl[d] == 1:
                    # NOTE(review): self._cl is defined outside this method;
                    # presumably maps closing bracket chars to a code - confirm
                    dn = self.inp.peek()
                    if ellyChar.isWhiteSpace(dn):
                        sent.append(d)
                        break
                self.inp.unread(d)          # no space after punctuation
                continue

            # if no match for lookahead, put back

            elif d != END:
                self.inp.unread(d)

        # check special case of number ending in decimal point

        if c == '.':
            ixb = len(sent) - 2             # index of char just before the '.'
            ixn = ixb + 1
            cxn = ''
            while ixn > 0:                  # scan back over digits before the '.'
                ixn -= 1
                cxn = sent[ixn]
                if not ellyChar.isDigit(cxn):
                    break
            # at least one digit was passed and number follows space or sign
            if ixn < ixb and cxn in [' ', '-', '+']:
                prvw = self.inp.preview()
                if len(prvw) > 1 and not ellyChar.isUpperCaseLetter(prvw[1]):
                    continue                # no break unless uppercase follows

        # final check: is sentence long enough?

        if inBrkt:
            # inside bracketing, ':' or ';' or a low space count will not
            # break; d is taken into sentence and next input char skipped
            if c in [':', ';'] or nspc < 3:
                sent.append(d)
                self.inp.skip()
                nspc -= 1
                continue

        cx = self.inp.peek()                # NOTE(review): cx is not read by any live code after this
        if cx == None:
            cx = '!!'
        if nAN > 1:                         # need at least 2 alphanumerics to break
            break

    if sent == ['\u2026']:                  # special case of sentence
        return list("-.-")                  # with lone ellipsis
    elif len(sent) > 0 or self.last != END:
        return sent
    else:
        return None
def getNext ( self ):
    """
    extract next sentence for Elly translation from input stream

    chars are read one at a time from self.inp and collected in a list
    until end of input, a newline, or punctuation that is accepted as
    the end of a sentence

    arguments:
        self

    returns:
        list of chars for next sentence on success, None on empty stream
    """

    self.resetBracketing()
    inBrkt = False                      # bracketing status from last check
    nspc = 0                            # set space count

    sent = [ ]                          # list buffer to fill

    x = self.inp.read()
    if x == SP:                         # pass over one leading space
        x = self.inp.read()

    if x == END:                        # EOF check
        return None

    c = END                             # reset current char
    lc = END                            # and the char before it

    self.inp.unread(x,SP)               # put first char back to restore input

    # fill sentence buffer up to next stop punctuation in input

    nAN = 0                             # alphanumeric count in sentence

    while True:

        x = self.inp.read()             # next input char

        if x == END:                    # handle any EOF
            break

        # check for table delimiters in text

        if len(sent) == 0:
            if x == u'.' or x == u'-':          # look for multiple '.' or '-'
                while True:                     # scan up to end of current buffering
                    y = self.inp.read()         #
                    if y != x and y != SP:      # no more delimiter chars or spaces?
                        self.inp.unread(y)      # if so, done
                        break                   #
                continue                        # ignore everything seen so far

        ####################################################
        # accumulate chars and count alphanumeric and spaces
        ####################################################

        lc = c                          # previous char
        c = x                           # current char as read
        nc = self.inp.peek()            # lookahead char, left in input
        if ellyChar.isWhiteSpace(nc):
            nc = SP                     # any whitespace lookahead is treated as SP

        # x becomes the directional form of a quote for bracket checking;
        # c keeps the original char for the sentence buffer

        if lc == SP or lc == END:       # normalize chars for proper bracketing
            if x == SQuo:               #
                x = LSQm                # a SQuo preceded by a space becomes LSQm
            elif x == DQuo:             #
                x = LDQm                # a DQuo preceded by a space becomes LDQm
        if nc == SP or nc == END:       #
            if x == SQuo:               # a SQuo followed by a space becomes RSQm
                x = RSQm                #
            elif x == DQuo:             # a DQuo followed by a space becomes RDQm
                x = RDQm                #
        elif not ellyChar.isLetterOrDigit(nc):
            if x == SQuo:               # a SQuo followed by nonalphanumeric becomes RSQm
                x = RSQm                #
            elif x == DQuo:             # a DQuo followed by nonalphanumeric becomes RDQm
                x = RDQm                #
        elif ellyChar.isWhiteSpace(c) and inBrkt:
            nspc += 1                   # count whitespace seen inside bracketing

        svBrkt = inBrkt
        inBrkt = self.checkBracketing(x)    # do bracketing check with modified chars
        if svBrkt and not inBrkt:
            nspc = 0                        # bracketing just ended, restart count

        sent.append(c)                      # put original char into sentence buffer

        if ellyChar.isLetterOrDigit(c):
            nAN += 1
            continue                        # if alphanumeric, just add to sentence

        if c == SP:
            continue                        # if space, just add to sentence

        # NL will break a sentence

        if c == NL:
            sent.pop()                      # remove from sentence chars
            break

        # char was not alphanumeric or space
        # look for stop punctuation exception

        cx = self.inp.preview()             # for context of match call

        if c in Stops and len(cx) > 0 and cx[0] == SP:
            if self.stpx.match(sent[:-1],c,cx):
                if self.drop:
                    sent.pop()              # remove punctuation char from sentence
                    # NOTE(review): lc is reassigned from c at the top of the next
                    # iteration before it is read, so this store looks ineffective
                    lc = SP
                continue

        # handle any nonstandard punctuation

        exoticPunctuation.normalize(c,self.inp)

        # check for dash

        if c == u'-':
            d = self.inp.read()
            if d == u'-':                   # doubled hyphen
                while True:                 # pass over any further hyphens
                    d = self.inp.read()
                    if d != u'-':
                        break
                sent.append(c)              # keep just two hyphens in sentence
            self.inp.unread(d)              # put back first non-hyphen char
            continue

        # check for sentence break on punctuation

        if c in QUOs or c in RBs:

            # special check for single or double quotes or
            # bracketing, which can immediately follow stop
            # punctuation for current sentence

            if not inBrkt:
                z = self.inp.read()
                if self.shortBracketing(sent,z):
                    break
                self.inp.unread(z)
                # note precedence: z == END or (whitespace z and lc in Stops)
                if z == END or ellyChar.isWhiteSpace(z) and lc in Stops:
                    if nAN > 1:
                        break
            elif c in QUOs and lc in Stops:     # stop + quote while still in bracketing
                z = self.inp.read()
                if z in RBs:                    # stop + quote + right bracket
                    sent.append(z)
                    y = self.inp.read()
                    if y in Stops:
                        sent.append(y)          # take a stop right after the bracket
                    elif not ellyChar.isWhiteSpace(y):
                        self.inp.unread(y)      # whitespace here is not put back
                    inBrkt = False
                    break
                elif z in QUOs:                 # stop + quote + quote
                    sent.append(z)
                    inBrkt = False
                    break
                self.inp.unread(z)
            continue

        elif not c in Stops:
            continue

        else:
            d = self.inp.read()             # char after stop punctuation

            if d == None:
                d = u'!'

            # ellipsis check

            if c == u'.' and c == d:
                if self.inp.peek() != c:    # look for third '.' in ellipsis
                    self.inp.unread(d)      # if none, keep only first '.'
                else:
                    self.inp.skip()         # found ellipsis
                    sent.append(d)          # complete it in sentence buffer
                    sent.append(d)          #
                    x = self.inp.peek()     # look at char after ellipsis
                    if ellyChar.isCombining(x):
                        sent.append(SP)     # if part of token, put in space as separator
                    continue

            if c == ELLP:                   # found Unicode ellipsis
                if ellyChar.isUpperCaseLetter(d):
                    self.inp.unread(d)      # super special case of bad punctuation
                    self.inp.unread(' ')    # put in implied period and space
                    self.inp.unread('.')    #

            # special check for multiple stops

            if d in Stops:
                while True:                 # pass over the run of stop chars
                    d = self.inp.read()
                    if not d in Stops:
                        break
                self.inp.unread(d)          # put back first char after the run
                if not ellyChar.isWhiteSpace(d):
                    d = SP                  # make rightside context for stop

            # special check for blank or null after stops

            elif d != END and not ellyChar.isWhiteSpace(d):
                if self.shortBracketing(sent,d):
                    break
                if d in self._cl and self._cl[d] == 1:
                    # NOTE(review): self._cl is defined outside this method;
                    # presumably maps closing bracket chars to a code - confirm
                    dn = self.inp.peek()
                    if ellyChar.isWhiteSpace(dn):
                        sent.append(d)
                        break
                self.inp.unread(d)          # no space after punctuation
                continue

            # if no match for lookahead, put back

            elif d != END:
                self.inp.unread(d)

        # check special case of number ending in decimal point

        if c == '.':
            ixb = len(sent) - 2             # index of char just before the '.'
            ixn = ixb + 1
            cxn = ''
            while ixn > 0:                  # scan back over digits before the '.'
                ixn -= 1
                cxn = sent[ixn]
                if not ellyChar.isDigit(cxn):
                    break
            # at least one digit was passed and number follows space or sign
            if ixn < ixb and cxn in [ ' ' , '-' , '+' ]:
                prvw = self.inp.preview()
                if len(prvw) > 1 and not ellyChar.isUpperCaseLetter(prvw[1]):
                    continue                # no break unless uppercase follows

        # final check: is sentence long enough?

        if inBrkt:
            # inside bracketing, ':' or ';' or a low space count will not
            # break; d is taken into sentence and next input char skipped
            if c in [ ':' , ';' ] or nspc < 3:
                sent.append(d)
                self.inp.skip()
                nspc -= 1
                continue

        cx = self.inp.peek()                # NOTE(review): cx is not read by any live code after this
        if cx == None:
            cx = u'!!'
        if nAN > 1:                         # need at least 2 alphanumerics to break
            break

    if sent == [ u'\u2026' ]:               # special case of sentence
        return list("-.-")                  # with lone ellipsis
    elif len(sent) > 0 or self.last != END:
        return sent
    else:
        return None
def getNext ( self ):
    """
    extract next sentence for Elly translation from input stream

    chars are read one at a time from self.inp and collected in a list
    until end of input, a newline, the close of bracketing that began
    before any alphanumeric char, or stop punctuation that is accepted
    as the end of a sentence

    a run of consecutive stop chars is now passed over up to the first
    non-stop char; the scan used to test the opposite condition and so
    would throw away ordinary text until another stop char turned up

    arguments:
        self

    returns:
        list of chars for next sentence on success, None on empty stream
    """

    sent = [ ]                          # list buffer to fill

    parenstop = False                   # initially, parentheses will NOT stop sentence

    c = self.inp.read()
    if c == SP:                         # pass over one leading space
        c = self.inp.read()

    if c == END:                        # EOF check
        return None

    self.inp.unread(c,SP)               # put first char back to restore input

    # fill sentence buffer up to next stop punctuation

    nAN = 0                             # alphanumeric count in sentence

    while True:

        x = self.inp.read()             # next input char

        if x == END:                    # handle any EOF
            break

        # check for table delimiters in text

        if len(sent) == 0:
            if x == u'.' or x == u'-':          # look for multiple '.' or '-'
                while True:                     # scan up to end of current buffering
                    y = self.inp.read()         #
                    if y != x and y != SP:      # no more delimiter chars or spaces?
                        self.inp.unread(y)      # if so, done
                        break                   #
                continue                        # ignore everything seen so far

        #########################################
        # accumulate chars and count alphanumeric
        #########################################

        c = x
        sent.append(c)
        if ellyChar.isLetterOrDigit(c):
            nAN += 1
            continue                    # if alphanumeric, just add to sentence

        if c == SP:
            continue                    # if space, just add to sentence

        # NL will break a sentence

        if c == NL:
            sent.pop()                  # remove from sentence chars
            break

        # char was not alphanumeric or space
        # look for stop punctuation exception

        z = self.inp.peek()             # for context of match call

        if self.stpx.match(sent[:-1],c,z):
            if self.drop:
                sent.pop()              # remove punctuation char from sentence
            continue

        # handle any nonstandard punctuation

        exoticPunctuation.normalize(c,self.inp)

        # handle parentheses as possible stop

        if nAN == 0 and self.stpx.inBracketing():
            parenstop = True            # bracketing began before any alphanumeric
        elif parenstop and not self.stpx.inBracketing():
            break                       # treat as stop

        # check for dash

        if c == u'-':
            d = self.inp.read()
            if d == u'-':               # doubled hyphen
                while True:             # pass over any further hyphens
                    d = self.inp.read()
                    if d != u'-':
                        break
                sent.append(c)          # keep just two hyphens in sentence
            self.inp.unread(d)          # put back first non-hyphen char
            continue

        # check for sentence break on punctuation

        if c not in Stops:
            continue

        d = self.inp.read()             # char after stop punctuation

        if d is None:
            d = u'!'

        # ellipsis check

        if c == u'.' and c == d:
            if self.inp.peek() != c:    # look for third '.' in ellipsis
                self.inp.unread(c)      # if none, keep only first '.'
            else:
                self.inp.skip()         # found ellipsis
                sent.append(d)          # complete it in sentence buffer
                sent.append(d)          #
                x = self.inp.peek()     # look at char after ellipsis
                if ellyChar.isCombining(x):
                    sent.append(' ')    # if part of token, put in space as separator
                continue

        # special check for multiple stops

        if d in Stops:
            while True:                 # pass over the run of stop chars
                d = self.inp.read()
                if d not in Stops:      # first non-stop char ends the run
                    break
            self.inp.unread(d)          # and is put back for later reading
            if not ellyChar.isWhiteSpace(d):
                d = u' '

        # break sentence except when in parentheses

        elif d in RBs:
            if not self.stpx.inBracketing():
                break
            else:
                if self.drop:
                    sent.pop()
                self.inp.unread(d)
                continue

        # special check for single or double quotes, which should
        # be included with current sentence after stop punctuation

        elif d in QUOs:
            x = self.inp.peek()
            if x == END or ellyChar.isWhiteSpace(x):
                sent.append(d)
                break
            else:
                self.inp.unread(SP)
                continue

        # special check for blank or null after stops

        elif d != END and not ellyChar.isWhiteSpace(d):
            sent.append(d)
            continue

        # if no match for lookahead, put back

        elif d != '':
            self.inp.unread(d)

        # final check: is sentence long enough?

        cx = self.inp.peek()            # kept from original; value is not otherwise used
        if cx is None:
            cx = u'!!'
        if nAN > 1:                     # need at least 2 alphanumerics to break
            break

    if len(sent) > 0 or self.last != END:
        return sent
    else:
        return None