Python isCombining Examples

Programming Language: Python

Namespace/Package Name: ellyChar

Method/Function: isCombining

Examples at hotexamples.com: 7

Python isCombining - 7 examples found. These are the top rated real world Python examples of ellyChar.isCombining extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

    def atToken(self):
        """
        look for combining token char at start of buffer

        arguments:
            self

        returns:
            True if found, False otherwise
        """

        if len(self.buffer) == 0: return False
        x = self.buffer[0]
        if x == '-' or x == '+':  # look for suffix or prefix
            return True
        else:
            return ellyChar.isCombining(self.buffer[0])

Example #2

Show file

File: ellyBuffer.py Project: ivanjelinek/pyelly

    def atToken ( self ):

        """
        look for token char at start of buffer

        arguments:
            self

        returns:
            True if found, False otherwise
        """

        if len(self.buffer) == 0: return False
        x = self.buffer[0]
        if x == u'-' or x == u'+': # look for suffix or prefix
            return True
        else:
            return ellyChar.isCombining(self.buffer[0])

Example #3

Show file

    def getNext ( self ):

        """
        extract next sentence for Elly translation from input stream

        arguments:
            self

        returns:
            list of chars for next sentence on success, None on empty stream
        """

#       print 'getNext'

        self.resetBracketing()

        sent = [ ]         # list buffer to fill

        x  = self.inp.read()
        if x == SP:
            x = self.inp.read()

        if x == END:       # EOF check
            return None

        c  = END           # reset
        lc = END

#       print 'x=' , '<' + x + '>' , ord(x)
        self.inp.unread(x,SP)       # put first char back to restore input
#       print '0  <<" , self.inp.buf

        # fill sentence buffer up to next stop punctuation in input

        nAN = 0                     # alphanumeric count in sentence

        while True:

            x = self.inp.read()     # next input char

            if x == END:            # handle any EOF
                break

#           print 'x=' , '<' + x + '>' , 'c=' , '<' + c + '>'
#           print 'sent=' , sent

            # check for table delimiters in text

            if len(sent) == 0:
#               print 'table'
#               print '1  <<' , self.inp.buf

                if x == u'.' or x == u'-':      # look for multiple '.' or '-'
                    while True:                 # scan up to end of current buffering
                        y = self.inp.read()     #
                        if y != x and y != SP:  # no more delimiter chars or spaces?
                            self.inp.unread(y)  # if so, done
                            break               #
                    continue                    # ignore everything seen so far

            #########################################
            # accumulate chars and count alphanumeric
            #########################################

            lc = c
            c  = x
            nc = self.inp.peek()
            if ellyChar.isWhiteSpace(nc): nc = SP

#           print 'lc=' , '<' + lc + '>, nc=' , '<' + nc + '>'
            if lc == SP or lc == END: # normalize chars for proper bracketing
                if x == SQuo:         #
                    x = LSQm          # a SQuo preceded by a space becomes LSQm
                elif x == DQuo:       #
                    x = LDQm          # a DQuo preceded by a space becomes LDQm
            if nc == SP or nc == END: #
                if x == SQuo:         # a SQuo followed by a space becomes RSQm
                    x = RSQm          #
                elif x == DQuo:       # a DQuo followed by a space becomes RDQm
                    x = RDQm          #
            elif not ellyChar.isLetterOrDigit(nc):
                if x == SQuo:         # a SQuo followed by nonalphanumeric becomes RSQm
                    x = RSQm          #
                elif x == DQuo:       # a DQuo followed by nonalphanumeric becomes RDQm
                    x = RDQm          #

            inBrkt = self.checkBracketing(x)    # do bracket checking with modified chars

#           print 'lc=' , '<' + lc + '>' , 'bracketing x=' , '<' + x + '>' , inBrkt

            sent.append(c)                      # but buffer original chars
            if ellyChar.isLetterOrDigit(c):
                nAN += 1
                continue                        # if alphanumeric, just add to sentence

            if c == SP:
                continue                        # if space, just add to sentence

            # NL will break a sentence

            if c == NL:
                sent.pop()                      # remove from sentence chars
                break

            # char was not alphanumeric or space
            # look for stop punctuation exception

            z = self.inp.peek()  # for context of match call

#           print '0  <<' , self.inp.buf

#           print 'sent=' , sent[:-1]
#           print 'punc=' , '<' + c + '>'
#           print 'next=' , '<' + z + '>'
            if c in Stops and self.stpx.match(sent[:-1],c,z):
#               print 'exception MATCH'
                if self.drop:
                    sent.pop()   # remove punctuation char from sentence
                    lc = SP
                continue

#           print 'no stop exception MATCH for' , c

#           print '@1  <<' , self.inp.buf

            # handle any nonstandard punctuation

            exoticPunctuation.normalize(c,self.inp)

#           print '@2  <<' , self.inp.buf

            # check for dash

            if c == u'-':
                d = self.inp.read()
                if d == u'-':
#                   print 'dash'
                    while True:
                        d = self.inp.read()
                        if d != u'-': break
                    sent.append(c)
                self.inp.unread(d)
                continue

            # check for sentence break on punctuation

#           print '@3  c=' , c

            if c in QUOs or c in RBs:

                # special check for single or double quotes or
                # bracketing, which can immediately follow stop
                # punctuation for current sentence

#               print 'bracketing c=' , c , ord(c) , inBrkt , 'at' , len(sent)

                if not inBrkt:
#                   print sent , 'so far'
                    z = self.inp.read()
                    if self.shortBracketing(sent,z):
                        break
                    self.inp.unread(z)
#                   print 'z=' , '[' + z + ']' , 'lc=' , '[' + lc + ']'
                    if z == END or ellyChar.isWhiteSpace(z) and lc in Stops:
                        if nAN > 1:
                            break
                continue

            elif not c in Stops or inBrkt:
                continue

            else:
#               print 'check stopping!'
                d = self.inp.read()
#               print '@3  <<' , self.inp.buf

                if d == None: d = u'!'
#               print 'stop=' , '<' + c + '> <' + d + '>'

#               print 'ellipsis check'
                if c == u'.' and c == d:
                    if self.inp.peek() != c: # look for third '.' in ellipsis
                        self.inp.unread(c)   # if none, keep only first '.'
                    else:
                        self.inp.skip()      # found ellipsis
                        sent.append(d)       # complete it in sentence buffer
                        sent.append(d)       #
                        x = self.inp.peek()  # look at char after ellipsis
                        if ellyChar.isCombining(x):
                            sent.append(SP)  # if part of token, put in space as separator
                    continue

                # special check for multiple stops

#               print 'next char d=' , d , ord(d) if d != END else 'NONE'
                if d in Stops:
                    while True:
                        d = self.inp.read()
                        if not d in Stops: break
                    self.inp.unread(d)
                    if not ellyChar.isWhiteSpace(d):
                        d = SP               # make rightside context for stop

                # special check for blank or null after stops

                elif d != END and not ellyChar.isWhiteSpace(d):
                    if self.shortBracketing(sent,d): break
                    self.inp.unread(d)
#                   print 'no space after punc'
                    continue

                # if no match for lookahead, put back

                elif d != END:
#                   print 'unread d=' , d
                    self.inp.unread(d)

                # final check: is sentence long enough?

#               print '@4  <<' , self.inp.buf
                cx = self.inp.peek()
                if cx == None: cx = u'!!'
#               print 'sentence break: next=' , '<' + cx + '>' , len(cx) , sent
                if nAN > 1 and not inBrkt:
                    break

        if len(sent) > 0 or self.last != END:
            return sent
        else:
            return None

Example #4

Show file

File: ellyBuffer.py Project: ivanjelinek/pyelly

    def _getRaw ( self ):

        """
        obtain next raw token from buffer

        arguments:
            self

        returns:
            EllyToken on success, None otherwise
        """

        self.skipSpaces()
#       print "|",len(self.buffer)
        ln = len(self.buffer)
#       print "|",len(self.buffer)
        if ln == 0:
            return None
#       print "proceed"
            
        ## get length of next token and if it has
        ## initial - or +, check for word fragment

        k = 0                   # number of chars for next token
        
        if self.match(MIN):     # check for hyphen
            if self.match(DSH): # it is a dash when doubled
                k = 2
            else:
                k = self.find(separators,1)
        elif self.match(PLS):   # check for elly prefix
            k = self.find(separators,1)
        elif self.match(DOT):   # check for period
            if self.match(ELP): # it is ellipsis when tripled
                k = 3
            else:
                k = 1
        elif not ellyChar.isCombining(self.buffer[0]):
            k = 1               # if next char cannot start a token, take it as a token
        else:
            k = self.find(separators)
            if k < 0:           # break a token at next separator
                k = ln
            while k < ln:       # look at separator if it exists
                x = self.buffer[k]
                if x != MIN and x != COM:
                    break       # a hyphen or comma is not absolute break
                if not ellyChar.isDigit(self.buffer[k+1]):
                    break       # accept hyphen or comma if NOT followed by digit
                else:           # otherwise, look for another separator
                    k = self.find(separators,k+2)
                    if k < 0:
                        k = ln
        
        ## if token not delimited, take rest of buffer as
        ## will fit into token working area
        
        if k < 0: k = ln

#       print "take",k,"chars from",len(self.buffer),self.buffer
            
        buf = self.extract(k) # get k characters

        ## special check for - next in buffer after extraction

        if self.match(MIN):                    # hyphen immediately following?
            self.skip()                        # if so, take it
            if self.atSpace():                 # when followed by space
                buf.append(MIN)                # append hyphen to candidate token
                k += 1
            else:
                if not self.match(MIN):        # when not followed by another hyphen
                    self.prepend(ellyChar.SPC) # put back a space
                else:
                    self.skip()                # double hyphen = dash
                    self.prepend(ellyChar.SPC) # put back space after dash
                    self.prepend(MIN)          # put back second hyphen
                self.prepend(MIN)              # put back first
                self.prepend(ellyChar.SPC)     # put extra space before hyphen or dash
        
        ## fill preallocated token for current position from working area
        
#       print "raw text for token:" , '[' + u''.join(buf).encode('utf8') + ']'
        to = ellyToken.EllyToken(u''.join(buf))
        
        ## strip off trailing non-token chars from token and put back in buffer
        
        km = k - 1
        while km > 0:
            x = buf[km]
            if ellyChar.isLetterOrDigit(x) or x == MIN or x == PLS or x == UNS:
                break
            if x == APO and km > 0 and buf[km - 1] == 's':
                break
            self.prepend(x)
            km -= 1
        km += 1
        if km < k:
            to.shortenBy(k - km,both=True)
        
        return to

Example #5

Show file

    def getNext(self):
        """
        extract next sentence for Elly translation from input stream

        arguments:
            self

        returns:
            list of chars for next sentence on success, None on empty stream
        """

        #       print ( 'getNext' )

        self.resetBracketing()
        inBrkt = False

        nspc = 0  # set space count

        sent = []  # list buffer to fill

        x = self.inp.read()
        if x == SP:
            x = self.inp.read()

        if x == END:  # EOF check
            return None

        c = END  # reset
        lc = END

        #       print ( 'x=' , '<' + x + '>' , ord(x) )
        self.inp.unread(x, SP)  # put first char back to restore input
        #       print ( '0  <<' , self.inp.buf )

        # fill sentence buffer up to next stop punctuation in input

        nAN = 0  # alphanumeric count in sentence

        while True:

            x = self.inp.read()  # next input char

            if x == END:  # handle any EOF
                break

#           print ( 'x=' , '<' + x + '>' , 'c=' , '<' + c + '>' )
#           print ( 'sent=' , sent , 'nspc=' , nspc )

# check for table delimiters in text

            if len(sent) == 0:
                #               print ( 'table' )
                #               print ( '1  <<' , self.inp.buf )

                if x == '.' or x == '-':  # look for multiple '.' or '-'
                    while True:  # scan up to end of current buffering
                        y = self.inp.read()  #
                        if y != x and y != SP:  # no more delimiter chars or spaces?
                            self.inp.unread(y)  # if so, done
                            break  #
                    continue  # ignore everything seen so far

            ####################################################
            # accumulate chars and count alphanumeric and spaces
            ####################################################

            lc = c
            c = x
            nc = self.inp.peek()
            if ellyChar.isWhiteSpace(nc): nc = SP

            #           print ( 'lc=' , '<' + lc + '>, nc=' , '<' + nc + '>' )
            if lc == SP or lc == END:  # normalize chars for proper bracketing
                if x == SQuo:  #
                    x = LSQm  # a SQuo preceded by a space becomes LSQm
                elif x == DQuo:  #
                    x = LDQm  # a DQuo preceded by a space becomes LDQm
            if nc == SP or nc == END:  #
                if x == SQuo:  # a SQuo followed by a space becomes RSQm
                    x = RSQm  #
                elif x == DQuo:  # a DQuo followed by a space becomes RDQm
                    x = RDQm  #
            elif not ellyChar.isLetterOrDigit(nc):
                if x == SQuo:  # a SQuo followed by nonalphanumeric becomes RSQm
                    x = RSQm  #
                elif x == DQuo:  # a DQuo followed by nonalphanumeric becomes RDQm
                    x = RDQm  #
            elif ellyChar.isWhiteSpace(c) and inBrkt:
                nspc += 1

            svBrkt = inBrkt
            inBrkt = self.checkBracketing(
                x)  # do bracketing check with modified chars
            if svBrkt and not inBrkt: nspc = 0

            #           print ( 'lc=' , '<' + lc + '>' , 'bracketing x=' , '<' + x + '>' , inBrkt )

            sent.append(c)  # put original char into sentence buffer
            if ellyChar.isLetterOrDigit(c):
                nAN += 1
                continue  # if alphanumeric, just add to sentence

            if c == SP:
                continue  # if space, just add to sentence

            # NL will break a sentence

            if c == NL:
                sent.pop()  # remove from sentence chars
                break

            # certain Unicode punctuation will always break

            if c in Hards:
                break

            # char was not alphanumeric or space
            # look for stop punctuation exception

            cx = self.inp.preview()  # for context of match call

            #           print ( '0  <<' , self.inp.buf )

            #           print ( 'sent=' , sent[:-1] )
            #           print ( 'punc=' , '<' + c + '>' )
            #           print ( 'next=' , cx )
            if c in Stops and len(cx) > 0 and cx[0] == SP:
                if self.stpx.match(sent[:-1], c, cx):
                    #                   print ( 'stop exception MATCH' )
                    if self.drop:
                        sent.pop()  # remove punctuation char from sentence
                        lc = SP
                    continue

#           print ( 'no stop exception MATCH for' , c )

#           print ( '@1  <<' , self.inp.buf )

# handle any nonstandard punctuation

            exoticPunctuation.normalize(c, self.inp)

            #           print ( '@2  <<' , self.inp.buf )

            # check for dash

            if c == '-':
                d = self.inp.read()
                if d == '-':
                    #                   print ( 'dash' )
                    while True:
                        d = self.inp.read()
                        if d != '-': break
                    sent.append(c)
                self.inp.unread(d)
                continue

            # check for sentence break on punctuation

#           print ( '@3  c=' , c , inBrkt )

            if c in QUOs or c in RBs:

                # special check for single or double quotes or
                # bracketing, which can immediately follow stop
                # punctuation for current sentence

                #               print ( 'bracketing c=' , c , ord(c) , inBrkt , 'at' , len(sent) )

                if not inBrkt:
                    #                   print ( sent , 'so far' )
                    z = self.inp.read()
                    if self.shortBracketing(sent, z):
                        break
                    self.inp.unread(z)
                    #                   print ( 'z=' , '[' + z + ']' , 'lc=' , '[' + lc + ']' )
                    if z == END or ellyChar.isWhiteSpace(z) and lc in Stops:
                        if nAN > 1:
                            break
                elif c in QUOs and lc in Stops:
                    #                   print ( 'stop+quote' )
                    z = self.inp.read()
                    if z in RBs:
                        sent.append(z)
                        y = self.inp.read()
                        if y in Stops:
                            sent.append(y)
                        elif not ellyChar.isWhiteSpace(y):
                            self.inp.unread(y)
                        inBrkt = False
                        break
                    elif z in QUOs:
                        #                       print ( 'stop+quote+quote' )
                        sent.append(z)
                        inBrkt = False
                        break
                    self.inp.unread(z)
#               print ( 'continue' )
                continue

            elif not c in Stops:
                continue

            else:
                #               print ( 'check stopping!' )
                d = self.inp.read()
                #               print ( '@3  <<' , self.inp.buf )

                if d == None: d = '!'
                #               print ( 'stop=' , '<' + c + '> <' + d + '>' )

                #               print ( 'ellipsis check' )
                if c == '.' and c == d:
                    if self.inp.peek() != c:  # look for third '.' in ellipsis
                        self.inp.unread(d)  # if none, keep only first '.'
                    else:
                        self.inp.skip()  # found ellipsis
                        sent.append(d)  # complete it in sentence buffer
                        sent.append(d)  #
                        x = self.inp.peek()  # look at char after ellipsis
                        if ellyChar.isCombining(x):
                            sent.append(
                                SP
                            )  # if part of token, put in space as separator
                    continue

                if c == ELLP:
                    #                   print ( 'found Unicode ellipsis, d=' , d )
                    if ellyChar.isUpperCaseLetter(d):
                        self.inp.unread(
                            d)  # super special case of bad punctuation
                        self.inp.unread(' ')  # put in implied period and space
                        self.inp.unread('.')  #

                # special check for multiple stops

#               print ( 'next char d=' , d , ord(d) if d != END else 'NONE' )
                if d in Stops:
                    while True:
                        d = self.inp.read()
                        if not d in Stops: break
                    self.inp.unread(d)
                    if not ellyChar.isWhiteSpace(d):
                        d = SP  # make rightside context for stop

                # special check for blank or null after stops

                elif d != END and not ellyChar.isWhiteSpace(d):
                    if self.shortBracketing(sent, d): break
                    if d in self._cl and self._cl[d] == 1:
                        dn = self.inp.peek()
                        if ellyChar.isWhiteSpace(dn):
                            sent.append(d)
                            break
                    self.inp.unread(d)
                    #                   print ( 'no space after punc' )
                    continue

                # if no match for lookahead, put back

                elif d != END:
                    #                   print ( 'unread d=' , d )
                    self.inp.unread(d)

#               print ( 'possible stop' )

# check special case of number ending in decimal point

                if c == '.':
                    ixb = len(sent) - 2
                    ixn = ixb + 1
                    cxn = ''
                    #                   print ( 'sent=' , sent )
                    #                   print ( 'ixn=' ,ixn )
                    while ixn > 0:
                        ixn -= 1
                        cxn = sent[ixn]
                        #                       print ( 'cxn=' , cxn )
                        if not ellyChar.isDigit(cxn): break
#                   print ( 'break: ixn=' , ixn , 'ixb=' , ixb )
                    if ixn < ixb and cxn in [' ', '-', '+']:
                        prvw = self.inp.preview()
                        #                       print ( 'prvw=' , prvw )
                        if len(prvw) > 1 and not ellyChar.isUpperCaseLetter(
                                prvw[1]):
                            continue

                # final check: is sentence long enough?

                if inBrkt:
                    #                   print ( 'c=' , '<' + c + '>' , 'd=' , '<' + d + '>' , 'preview=' , self.inp.preview() )
                    #                   print ( 'nspc=' , nspc )
                    if c in [':', ';'] or nspc < 3:
                        sent.append(d)
                        #                       print ( 'add' , '<' + d + '> to sentence' )
                        #                       print ( 'sent=' , sent )
                        self.inp.skip()
                        nspc -= 1
                        continue

#               print ( '@4  <<' , self.inp.buf )
                cx = self.inp.peek()
                if cx == None: cx = '!!'
                #               print ( 'sentence break: next=' , '<' + cx + '>' , len(cx) , sent )
                #               print ( 'nAN=' , nAN , 'inBrkt=' , inBrkt )
                if nAN > 1:
                    break

        if sent == ['\u2026']:  # special case of sentence
            return list("-.-")  # with lone ellipsis
        elif len(sent) > 0 or self.last != END:
            return sent
        else:
            return None

Example #6

Show file

File: ellySentenceReader.py Project: prohippo/pyelly

    def getNext ( self ):

        """
        extract next sentence for Elly translation from input stream

        arguments:
            self

        returns:
            list of chars for next sentence on success, None on empty stream
        """

#       print 'getNext'

        self.resetBracketing()
        inBrkt = False

        nspc = 0           # set space count

        sent = [ ]         # list buffer to fill

        x  = self.inp.read()
        if x == SP:
            x = self.inp.read()

        if x == END:       # EOF check
            return None

        c  = END           # reset
        lc = END

#       print 'x=' , '<' + x + '>' , ord(x)
        self.inp.unread(x,SP)       # put first char back to restore input
#       print '0  <<" , self.inp.buf

        # fill sentence buffer up to next stop punctuation in input

        nAN = 0                     # alphanumeric count in sentence

        while True:

            x = self.inp.read()     # next input char

            if x == END:            # handle any EOF
                break

#           print 'x=' , '<' + x + '>' , 'c=' , '<' + c + '>'
#           print 'sent=' , sent , 'nspc=' , nspc

            # check for table delimiters in text

            if len(sent) == 0:
#               print 'table'
#               print '1  <<' , self.inp.buf

                if x == u'.' or x == u'-':      # look for multiple '.' or '-'
                    while True:                 # scan up to end of current buffering
                        y = self.inp.read()     #
                        if y != x and y != SP:  # no more delimiter chars or spaces?
                            self.inp.unread(y)  # if so, done
                            break               #
                    continue                    # ignore everything seen so far

            ####################################################
            # accumulate chars and count alphanumeric and spaces
            ####################################################

            lc = c
            c  = x
            nc = self.inp.peek()
            if ellyChar.isWhiteSpace(nc): nc = SP

#           print 'lc=' , '<' + lc + '>, nc=' , '<' + nc + '>'
            if lc == SP or lc == END: # normalize chars for proper bracketing
                if x == SQuo:         #
                    x = LSQm          # a SQuo preceded by a space becomes LSQm
                elif x == DQuo:       #
                    x = LDQm          # a DQuo preceded by a space becomes LDQm
            if nc == SP or nc == END: #
                if x == SQuo:         # a SQuo followed by a space becomes RSQm
                    x = RSQm          #
                elif x == DQuo:       # a DQuo followed by a space becomes RDQm
                    x = RDQm          #
            elif not ellyChar.isLetterOrDigit(nc):
                if x == SQuo:         # a SQuo followed by nonalphanumeric becomes RSQm
                    x = RSQm          #
                elif x == DQuo:       # a DQuo followed by nonalphanumeric becomes RDQm
                    x = RDQm          #
            elif ellyChar.isWhiteSpace(c) and inBrkt:
                nspc += 1

            svBrkt = inBrkt
            inBrkt = self.checkBracketing(x)    # do bracketing check with modified chars
            if svBrkt and not inBrkt: nspc = 0

#           print 'lc=' , '<' + lc + '>' , 'bracketing x=' , '<' + x + '>' , inBrkt

            sent.append(c)                      # put original char into sentence buffer
            if ellyChar.isLetterOrDigit(c):
                nAN += 1
                continue                        # if alphanumeric, just add to sentence

            if c == SP:
                continue                        # if space, just add to sentence

            # NL will break a sentence

            if c == NL:
                sent.pop()                      # remove from sentence chars
                break

            # char was not alphanumeric or space
            # look for stop punctuation exception

            cx = self.inp.preview()  # for context of match call

#           print '0  <<' , self.inp.buf

#           print 'sent=' , sent[:-1]
#           print 'punc=' , '<' + c + '>'
#           print 'next=' , cx
            if c in Stops and len(cx) > 0 and cx[0] == SP:
                if self.stpx.match(sent[:-1],c,cx):
#                   print 'stop exception MATCH'
                    if self.drop:
                        sent.pop()   # remove punctuation char from sentence
                        lc = SP
                    continue

#           print 'no stop exception MATCH for' , c

#           print '@1  <<' , self.inp.buf

            # handle any nonstandard punctuation

            exoticPunctuation.normalize(c,self.inp)

#           print '@2  <<' , self.inp.buf

            # check for dash

            if c == u'-':
                d = self.inp.read()
                if d == u'-':
#                   print 'dash'
                    while True:
                        d = self.inp.read()
                        if d != u'-': break
                    sent.append(c)
                self.inp.unread(d)
                continue

            # check for sentence break on punctuation

#           print '@3  c=' , c , inBrkt

            if c in QUOs or c in RBs:

                # special check for single or double quotes or
                # bracketing, which can immediately follow stop
                # punctuation for current sentence

#               print 'bracketing c=' , c , ord(c) , inBrkt , 'at' , len(sent)

                if not inBrkt:
#                   print sent , 'so far'
                    z = self.inp.read()
                    if self.shortBracketing(sent,z):
                        break
                    self.inp.unread(z)
#                   print 'z=' , '[' + z + ']' , 'lc=' , '[' + lc + ']'
                    if z == END or ellyChar.isWhiteSpace(z) and lc in Stops:
                        if nAN > 1:
                            break
                elif c in QUOs and lc in Stops:
#                   print 'stop+quote'
                    z = self.inp.read()
                    if z in RBs:
                        sent.append(z)
                        y = self.inp.read()
                        if y in Stops:
                            sent.append(y)
                        elif not ellyChar.isWhiteSpace(y):
                            self.inp.unread(y)
                        inBrkt = False
                        break
                    elif z in QUOs:
#                       print 'stop+quote+quote'
                        sent.append(z)
                        inBrkt = False
                        break
                    self.inp.unread(z)
#               print 'continue'
                continue

            elif not c in Stops:
                continue

            else:
#               print 'check stopping!'
                d = self.inp.read()
#               print '@3  <<' , self.inp.buf

                if d == None: d = u'!'
#               print 'stop=' , '<' + c + '> <' + d + '>'

#               print 'ellipsis check'
                if c == u'.' and c == d:
                    if self.inp.peek() != c: # look for third '.' in ellipsis
                        self.inp.unread(d)   # if none, keep only first '.'
                    else:
                        self.inp.skip()      # found ellipsis
                        sent.append(d)       # complete it in sentence buffer
                        sent.append(d)       #
                        x = self.inp.peek()  # look at char after ellipsis
                        if ellyChar.isCombining(x):
                            sent.append(SP)  # if part of token, put in space as separator
                    continue

                if c == ELLP:
#                   print 'found Unicode ellipsis, d=' , d
                    if ellyChar.isUpperCaseLetter(d):
                        self.inp.unread(d)   # super special case of bad punctuation
                        self.inp.unread(' ') # put in implied period and space
                        self.inp.unread('.') #

                # special check for multiple stops

#               print 'next char d=' , d , ord(d) if d != END else 'NONE'
                if d in Stops:
                    while True:
                        d = self.inp.read()
                        if not d in Stops: break
                    self.inp.unread(d)
                    if not ellyChar.isWhiteSpace(d):
                        d = SP               # make rightside context for stop

                # special check for blank or null after stops

                elif d != END and not ellyChar.isWhiteSpace(d):
                    if self.shortBracketing(sent,d): break
                    if d in self._cl and self._cl[d] == 1:
                        dn = self.inp.peek()
                        if ellyChar.isWhiteSpace(dn):
                            sent.append(d)
                            break
                    self.inp.unread(d)
#                   print 'no space after punc'
                    continue

                # if no match for lookahead, put back

                elif d != END:
#                   print 'unread d=' , d
                    self.inp.unread(d)

#               print 'possible stop'

                # check special case of number ending in decimal point

                if c == '.':
                    ixb = len(sent) - 2
                    ixn = ixb + 1
                    cxn = ''
#                   print 'sent=' , sent
#                   print 'ixn=' ,ixn
                    while ixn > 0:
                        ixn -= 1
                        cxn = sent[ixn]
#                       print 'cxn=' , cxn
                        if not ellyChar.isDigit(cxn): break
#                   print 'break: ixn=' , ixn , 'ixb=' , ixb
                    if ixn < ixb and cxn in [ ' ' , '-' , '+' ]:
                        prvw = self.inp.preview()
#                       print 'prvw=' , prvw
                        if len(prvw) > 1 and not ellyChar.isUpperCaseLetter(prvw[1]):
                            continue

                # final check: is sentence long enough?

                if inBrkt:
#                   print 'c=' , '<' + c + '>' , 'd=' , '<' + d + '>' , 'preview=' , self.inp.preview()
#                   print 'nspc=' , nspc
                    if c in [ ':' , ';' ] or nspc < 3:
                        sent.append(d)
#                       print 'add' , '<' + d + '> to sentence'
#                       print 'sent=' , sent
                        self.inp.skip()
                        nspc -= 1
                        continue

#               print '@4  <<' , self.inp.buf
                cx = self.inp.peek()
                if cx == None: cx = u'!!'
#               print 'sentence break: next=' , '<' + cx + '>' , len(cx) , sent
#               print 'nAN=' , nAN , 'inBrkt=' , inBrkt
                if nAN > 1:
                    break

        if sent == [ u'\u2026' ]:  # special case of sentence
            return list("-.-")     # with lone ellipsis
        elif len(sent) > 0 or self.last != END:
            return sent
        else:
            return None

Example #7

Show file

File: ellySentenceReader.py Project: belkhir-nacim/pyelly

    def getNext ( self ):

        """
        extract next sentence for Elly translation from input stream

        arguments:
            self

        returns:
            list of chars for next sentence on success, None on empty stream
        """

#       print 'getNext'

        sent = [ ]         # list buffer to fill

        parenstop = False  # initially, parentheses will NOT stop sentence

        c = self.inp.read()
        if c == SP:
            c = self.inp.read()

        if c == END:       # EOF check
            return None

#       print 'c=' , ord(c)
        self.inp.unread(c,SP)
#       print '0  <<" , self.inp.buf

        # fill sentence buffer up to next stop punctuation

        nAN = 0            # alphanumeric count in sentence

        while True:

            x = self.inp.read()     # next input char

            if x == END:            # handle any EOF
                break

#           print 'x=' , '<' + x + '>'
#           print 'sent=' , sent

            # check for table delimiters in text

            if len(sent) == 0:
#               print 'table'
#               print '1  <<' , self.inp.buf

                if x == u'.' or x == u'-':      # look for multiple '.' or '-'
                    while True:                 # scan up to end of current buffering
                        y = self.inp.read()     #
                        if y != x and y != SP:  # no more delimiter chars or spaces?
                            self.inp.unread(y)  # if so, done
                            break               #
                    continue                    # ignore everything seen so far

            #########################################
            # accumulate chars and count alphanumeric
            #########################################

            c = x
            sent.append(c)
            if ellyChar.isLetterOrDigit(c):
                nAN += 1
                continue                        # if alphanumeric, just add to sentence

            if c == SP:
                continue                        # if space, just add to sentence

            # NL will break a sentence

            if c == NL:
                sent.pop()                      # remove from sentence chars
                break

            # char was not alphanumeric or space
            # look for stop punctuation exception

            z = self.inp.peek()  # for context of match call
#           print 'peek z=' , z

#           print '0  <<' , self.inp.buf

#           print 'sent=' , sent[:-1]
#           print 'punc=' , '<' + c + '>'
#           print 'next=' , '<' + z + '>'
            if self.stpx.match(sent[:-1],c,z):
#               print 'exception MATCH'
                if self.drop:
                    sent.pop()   # remove punctuation char from sentence
                continue

#           print '1  <<' , self.inp.buf

#           print 'no exception MATCH'

            # handle any nonstandard punctuation

            exoticPunctuation.normalize(c,self.inp)

#           print '2  <<' , self.inp.buf

            # handle parentheses as possible stop

            if nAN == 0 and self.stpx.inBracketing():
                parenstop = True
            elif parenstop and not self.stpx.inBracketing():
                break            # treat as stop

#           print '3  <<' , self.inp.buf

            # check for dash

            if c == u'-':
                d = self.inp.read()
                if d == u'-':
#                   print 'dash'
                    while True:
                        d = self.inp.read()
                        if d != u'-': break
                    sent.append(c)
                self.inp.unread(d)
                continue

            # check for sentence break on punctuation

            if not c in Stops:
                continue

            else:
#               print 'stopping possible!'
                d = self.inp.read()
#               print '4  <<' , self.inp.buf

                if d == None: d = u'!'
#               print 'stop=' , '<' + c + '> <' + d + '>'

#               print 'ellipsis check'
                if c == u'.' and c == d:
                    if self.inp.peek() != c: # look for third '.' in ellipsis
                        self.inp.unread(c)   # if none, keep only first '.'
                    else:
                        self.inp.skip()      # found ellipsis
                        sent.append(d)       # complete it in sentence buffer
                        sent.append(d)       #
                        x = self.inp.peek()  # look at char after ellipsis
                        if ellyChar.isCombining(x):
                            sent.append(' ') # if part of token, put in space as separator
                    continue

                # special check for multiple stops

#               print 'Stops d=' , d , ord(d) if d != '' else 'NONE'
                if d in Stops:
                    while True:
                        d = self.inp.read()
                        if d in Stops: break
                    self.inp.unread(d)
                    if not ellyChar.isWhiteSpace(d):
                        d = u' '

                # break sentence except when in parentheses

                elif d in RBs:
#                   print 'followed by' , '<' + d + '>'
                    if not self.stpx.inBracketing():
                        break
                    else:
                        if self.drop:
                            sent.pop()
                        self.inp.unread(d)
                        continue

                # special check for single or double quotes, which should
                # be included with current sentence after stop punctuation

                elif d in QUOs:
#                   print 'QUO d=' , d , ord(d)
                    x = self.inp.peek()
                    if x == END or ellyChar.isWhiteSpace(x):
                        sent.append(d)
                        break
                    else:
                        self.inp.unread(SP)
                        continue

                # special check for blank or null after stops

                elif d != END and not ellyChar.isWhiteSpace(d):
                    sent.append(d)
                    continue

                # if no match for lookahead, put back

                elif d != '':
#                   print 'unread d=' , d
                    self.inp.unread(d)

                # final check: is sentence long enough?

#               print '5  <<' , self.inp.buf
                cx = self.inp.peek()
                if cx == None: cx = u'!!'
#               print 'sentence break: next=' , '<' + cx + '>' , len(cx) , sent
                if nAN > 1:
                    break

        if len(sent) > 0 or self.last != END:
            return sent
        else:
            return None