def normalize ( self , s ):

    """
    convert all unrecognizable input chars to _ and collapse any run of
    consecutive white space into a single plain space

    arguments:
        self  -
        s     - Unicode string or char list to operate on

    returns:
        normalized sequence as list of chars
    """

    out = [ ]             # normalized output collected here
    inGap = False         # True just after emitting a space

    for ch in s:
        if ellyChar.isLetter(ch):         # letters pass through unchanged
            inGap = False
            out.append(ch)
        elif ellyChar.isWhiteSpace(ch):   # white space collapses to one space
            if inGap:
                continue                  # drop extra spaces in a run
            inGap = True
            out.append(' ')
        elif ellyChar.isText(ch):         # other recognized text chars kept
            inGap = False
            out.append(ch)
        else:                             # everything unrecognizable becomes _
            inGap = False
            out.append('_')

    return out
def normalize ( s ):

    """
    convert all unrecognizable input chars to _ and collapse any run of
    consecutive white space into a single plain space

    arguments:
        s  - Unicode string or char list to operate on

    returns:
        normalized sequence as list of chars
    """

    result = [ ]              # normalized chars
    lastWasSpace = False      # was the previous output char a space?
    k = 0
    lim = len(s)

    while k < lim:            # scan input by explicit index
        c = s[k]
        k += 1
        if ellyChar.isLetter(c):
            lastWasSpace = False          # letter kept as is
        elif ellyChar.isWhiteSpace(c):
            if lastWasSpace:
                continue                  # collapse white space run
            c = ' '
            lastWasSpace = True
        elif not ellyChar.isText(c):
            c = '_'                       # replace unrecognizable char
            lastWasSpace = False
        else:
            lastWasSpace = False          # other text char kept as is
        result.append(c)

    return result
def minMatch ( patn ):

    """
    compute minimum number of chars matched by pattern

    arguments:
        patn  - pattern with possible Elly wildcards

    returns:
        minimum count of chars matched
    """

    count = 0             # minimum chars accumulated
    optional = False      # inside an optional subsequence?

    for pc in patn:
        if pc == ellyChar.SPC:            # space in pattern will stop scan
            if not optional:
                break
        elif ellyChar.isText(pc):         # ordinary text char is counted
            if not optional:
                count += 1
        elif pc == cSOS:                  # optional start code
            optional = True
        elif pc == cEOS:                  # optional end code
            optional = False
        elif pc == cALL:                  # ALL (*) wildcard matches zero or more
            pass
        elif pc == cEND:                  # END code matches nothing
            pass
        else:                             # any other wildcard matches at least one
            count += 1

    return count
def minMatch ( patn ):

    """
    compute minimum number of chars matched by pattern

    arguments:
        patn  - pattern with possible Elly wildcards

    returns:
        minimum count of chars matched
    """

    k = 0                 # running minimum
    j = 0                 # pattern scan index
    lim = len(patn)
    skipping = False      # within an optional [ ... ] section?

    while j < lim:
        pc = patn[j]
        j += 1
        if pc == ellyChar.SPC:        # a space outside an optional part ends scan
            if not skipping:
                break
        elif ellyChar.isText(pc):     # plain text char adds one to the minimum
            if not skipping:
                k += 1
        elif pc == cSOS:              # entering optional section
            skipping = True
        elif pc == cEOS:              # leaving optional section
            skipping = False
        elif pc in ( cALL , cEND ):   # * and $ can match nothing at all
            pass
        else:                         # any other wildcard must match one char
            k += 1

    return k
def _store ( self , defs , nowarn ):

    """
    put macro substitutions into table with indexing by first char of pattern

    arguments:
        self    -
        defs    - list of macro definitions as strings
        nowarn  - whether to turn warnings off

    exceptions:
        TableFailure on error
    """

    while True:
        l = defs.readline()                    # next macro rule
#       print ( "rule input=" , l )
        if len(l) == 0: break                  # EOF check
        dl = definitionLine.DefinitionLine(l,False)
        left = dl.left                         # pattern to be matched
        tail = dl.tail                         # transformation to apply to match
        if left == None or tail == None:
            self._err(l=l)                     # report incomplete rule
            continue
        mp = ellyWildcard.convert(left)        # encode wildcards in pattern
        if mp == None:
            self._err('bad wildcards',l)
            continue
        pe = mp[-1]
        if pe != ellyWildcard.cALL and pe != ellyWildcard.cEND:
            mp += ellyWildcard.cEND            # pattern must end in $ if it does not end in *
        if not _checkBindings(mp,tail):
            self._err('bad bindings in substitution',l)
            continue
        if not nowarn and not _checkExpansion(mp,tail):
            self._err('substitution longer than original string',l,0)
        r = [ mp , tail ]                      # rule = coded pattern plus rewrite
#       print ( "rule =" , [ left , tail ] )
        pat = r[0]                             # get coded pattern
        if pat == None:
            self._err('no pattern',l)
            continue
        c = pat[0]                             # first char of pattern

        # check type to see how to index rule

#       print ( 'c=' , ord(c) )
        p = pat
        while c == ellyWildcard.cSOS:          # optional sequence at start?
            k = p.find(ellyWildcard.cEOS)      # if so, find the end of sequence
            if k < 0 or k == 1: break          # if no end or empty sequence, stop
            k += 1
            if k == len(pat): break            # should be something after sequence
            m = ellyChar.toIndex(pat[1])       # index by first char of optional sequence
            self.index[m].append(r)            # (must be non-wildcard)
            p = p[k:]                          # move up in pattern
            c = p[0]                           # but check for another optional sequence

        if c == ellyWildcard.cSOS:
            self._err(l=l)
            continue                           # bad sequence, skip this rule

#       print ( 'c=' , ord(c) )
        if ellyChar.isLetterOrDigit(c):        # check effective first char of pattern
            m = ellyChar.toIndex(c)
            self.index[m].append(r)            # add to index under alphanumeric char
        elif ellyChar.isText(c):
            self.index[0].append(r)            # add to index under punctuation
        elif not c in ellyWildcard.Matching:
            if c == ellyWildcard.cEND:
                # FIX: these were Python 2 'print >>' statements, which are
                # syntax errors under Python 3; converted to print() calls
                # consistent with the other _store variant in this file
                print('** macro warning: pattern can have empty match', file=sys.stderr)
                print('* at [', l, ']', file=sys.stderr)
            else:
                dc = '=' + str(ord(c) - ellyWildcard.X)
                self._err('bad wildcard code', dc)
                continue
        elif c == ellyWildcard.cANY or c == ellyWildcard.cALL:
            self.anyWx.append(r)               # under general wildcards
        elif c == ellyWildcard.cCAN:
            self.index[0].append(r)            # under punctuation
        elif c == ellyWildcard.cDIG or c == ellyWildcard.cSDG:
            self.digWx.append(r)               # under digit wildcards
        elif c == ellyWildcard.cSAN:
            self.digWx.append(r)               # under both digit and
            self.letWx.append(r)               # letter wildcards
        elif c == ellyWildcard.cSPC or c == ellyWildcard.cEND:
            self._err('bad wildcard in context',l)
            continue                           # wildcards unacceptable here
        else:
            self.letWx.append(r)               # everything else under letter wildcard

        self.count += 1                        # count up macro substitution

    if self._errcount > 0:
        # FIX: converted from Python 2 'print >>' statements to print()
        print('**', self._errcount, 'macro errors in all', file=sys.stderr)
        print('macro table definition FAILed', file=sys.stderr)
        raise ellyException.TableFailure
def read(self):

    """
    get next char from input stream with filtering

    arguments:
        self

    returns:
        single Unicode char on success, null string otherwise
    """

#   print ( 'reading: buf=' , self.buf )
    while True:

        if not self._reload():            # check if buffer empty and reload if needed
            return END                    # return EOF if no more chars available

#       print ( 'buf=' , self.buf )
        c = self.buf.pop(0)               # next raw char in buffer

        if c == SHYP:                     # ignore soft hyphen
            if len(self.buf) > 0:
                if self.buf[0] == SP:     # also drop a space just after it
                    c = self.buf.pop(0)
            continue

        if not ellyChar.isText(c):        # unrecognizable Elly char?
#           print ( 'c=' , '{0:04x}'.format(ord(c)) )
            if ellyChar.isCJK(c):
                if ellyConfiguration.language != 'ZH':
                    c = '_'               # special handling for non-Chinese input
            elif not c in [u'\uff0c', u'\u3002']:
#               print ( 'replace' , c , 'with NBSP' )
                c = NBSP                  # by default, replace with no-break space

        lc = self._lc                     # copy saved last char
        self._lc = c                      # set new last char

        if c == HYPH:                     # special treatment for isolated hyphens
            if spc(lc) and spc(self.peek()):
                c = DASH
            break
        elif c == '.':                    # check for ellipsis
            bb = self.buf
            bl = len(bb)
#           print ( 'bl=' , bl , 'bb=' , bb )
            if bl >= 2 and bb[0] == '.' and bb[1] == '.':
                self.buf = bb[2:]
                c = ELLP
            elif bl >= 4 and bb[0] == ' ' and bb[1] == '.' and bb[2] == ' ' and bb[3] == '.':
                self.buf = bb[4:]
                c = ELLP
            break
        elif c == RSQm:                   # check for single quote
            nc = self.peek()              # look at next char
            if nc == RSQm:                # doubling of single quote?
                self.buf.pop(0)           # if so, combine two single quotes
                c = RDQm                  # into one double quote
            break                         # FIX: break was missing here, so a
                                          # non-doubled single quote fell out of
                                          # the chain and was silently dropped
        elif not ellyChar.isWhiteSpace(c):
            if ellyChar.isWhiteSpace(lc):
                self._cap = ellyChar.isUpperCaseLetter(c)
            break
        elif c == CR:                     # always ignore
            continue
        elif c == NL:                     # special handling of \n
            nc = self.peek()              # look at next char
            while nc == CR:
                self.buf.pop(0)           # skip over CR's
                nc = self.peek()
            if lc != NL and nc == NL:
                self.buf.pop(0)           # special case when NL can be returned
                break
            if nc == NL:                  # NL followed NL?
                while nc == NL or nc == CR:
                    self.buf.pop(0)       # ignore subsequent new line chars
                    nc = self.peek()
            elif nc == END or ellyChar.isWhiteSpace(nc):
                continue                  # NL followed by space is ignored
            elif nc == u'.' or nc == u'-':
                pass
            else:
                c = SP                    # convert NL to SP if not before another NL
        else:
            c = SP                        # otherwise, convert white space to plain space

        self._cap = False                 # NOTE(review): exact placement of this
                                          # reset was ambiguous in the original
                                          # layout — confirm against history
        if not ellyChar.isWhiteSpace(lc): # preceding char was not white space?
            break                         # if so, keep one space in stream

    return c                              # next filtered char
def _store ( self , defs , nowarn ):

    """
    put macro substitutions into table with indexing by first char of pattern

    arguments:
        self    -
        defs    - list of macro definitions as strings
        nowarn  - whether to turn warnings off

    exceptions:
        TableFailure on error
    """

#   print ( defs.linecount() , 'lines' )
    while True:
        l = defs.readline()                   # next macro rule
#       print ( "rule input=" , l )
        if len(l) == 0: break                 # EOF check
        dl = definitionLine.DefinitionLine(l,False)
        left = dl.left                        # pattern to be matched
        tail = dl.tail                        # transformation to apply to match
#       print ( 'dl.left=' , left )
        if left == None or tail == None:
            self._err(l=l)                    # report missing part of rule
            continue
        if left.find(' ') >= 0:               # pattern side of macro rule
            ms = 'pattern in macro contains spaces'
            self._err(s=ms,l=l,d=1)           # cannot contain any space chars
            continue
        lefts = list(left)
#       print ( 'left=' , lefts )
        nspm = ellyWildcard.numSpaces(lefts)  # count of spaces encoded in pattern
        pat = ellyWildcard.convert(left)      # get pattern with encoded wildcards
        if pat == None:
            self._err('bad wildcards',l)
            continue
#       print ( 'pat=' , ellyWildcard.deconvert(pat) , 'len=' , len(pat) )
        pe = pat[-1]
        if not pe in [ ellyWildcard.cALL , ellyWildcard.cEND , ellyWildcard.cSPC ]:
            pat += ellyWildcard.cEND          # pattern must end in $ if it does not end in * or _
        if not _checkBindings(pat,tail):
            self._err('bad bindings in substitution',l)
            continue
        if not nowarn and not _checkExpansion(pat,tail):
            self._err('substitution may be longer than original string',l,0)
#       print ( "rule =" , [ left , nspm , tail ] )
        if pat == None:
            self._err('no pattern',l)
            continue
        r = Rule(pat,nspm,tail)               # rule record for indexing
        c = pat[0]                            # first char of pattern

        # check type to see how to index rule

        p = pat
        while c == ellyWildcard.cSOS:         # optional sequence at start?
            if not cEOS in p: break
            k = p.index(cEOS)                 # if so, find the end of sequence
            if k < 0 or k == 1: break         # if no end or empty sequence, stop
            k += 1
            if k == len(pat): break           # should be something after sequence
            m = ellyChar.toIndex(pat[1])      # index by first char of optional sequence
            self.index[m].append(r)           # (must be non-wildcard)
            p = p[k:]                         # move up in pattern
            c = p[0]                          # but check for another optional sequence

        if c == ellyWildcard.cSOS:
            self._err(l=l)
            continue                          # bad sequence, skip this rule

#       print ( 'c=' , ord(c) )
        if ellyChar.isLetterOrDigit(c):       # check effective first char of pattern
            m = ellyChar.toIndex(c)
            self.index[m].append(r)           # add to index under alphanumeric char
        elif ellyChar.isText(c):
            self.index[0].append(r)           # add to index under punctuation
        elif not c in ellyWildcard.Matching:
            if c == ellyWildcard.cEND:
                # FIX: these were Python 2 'print >>' statements, which are
                # syntax errors under Python 3; converted to print() calls
                # consistent with the py3 _store variant in this file
                print('** macro warning: pattern can have empty match', file=sys.stderr)
                print('* at [', l, ']', file=sys.stderr)
            else:
                dc = '=' + str(ord(c) - ellyWildcard.X)
                self._err('bad wildcard code', dc)
                continue
        elif c == ellyWildcard.cANY or c == ellyWildcard.cALL:
            self.anyWx.append(r)              # under general wildcards
        elif c == ellyWildcard.cCAN:
            self.index[0].append(r)           # under punctuation
        elif c == ellyWildcard.cDIG or c == ellyWildcard.cSDG:
            self.digWx.append(r)              # under digit wildcards
        elif c == ellyWildcard.cSAN:
            self.digWx.append(r)              # under both digit and
            self.letWx.append(r)              # letter wildcards
        elif c == ellyWildcard.cAPO:          # right single quote or apostrophe
            self.apoWx.append(r)              #
        elif c == ellyWildcard.cSPC or c == ellyWildcard.cEND:
            self._err('bad wildcard in context',l)
            continue                          # wildcards unacceptable here
        else:
            self.letWx.append(r)              # everything else under letter wildcard

        self.count += 1                       # count up macro substitution

#   print ( 'count=' , self.count )
    if self._errcount > 0:
        # FIX: converted from Python 2 'print >>' statements to print()
        print('**', self._errcount, 'macro errors in all', file=sys.stderr)
        print('macro table definition FAILed', file=sys.stderr)
        raise ellyException.TableFailure
def match ( patn , text , offs=0 , limt=None , nsps=0 ):

    """
    compare a pattern with wildcards to input text

    arguments:
        patn  - pattern to matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit for any matching
        nsps  - number of spaces to match in pattern

    returns:
        bindings if match is successful, None otherwise
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
            nsps   - saved count of spaces matched
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - of winding
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1
            self.nsps  = 0

        def __str__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self

            returns:
                attributes as array
            """
            return ( '[kd=' + str(self.kind)  +
                     ',ct=' + str(self.count) +
                     ',pa=' + str(self.pats)  +
                     ',tx=' + str(self.txts)  +
                     ',bd=' + str(self.bnds)  +
                     ',ns=' + str(self.nsps)  + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # four private functions using local variables of match() defined just above
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame

        arguments:
            ns  - optional initial span of text for binding

        returns:
            binding frame
        """
#       print ( "binding:",offs,ns )
        os = offs - 1                 # binding starts at char before current offset
        if ns == None: ns = 1         # by default, binding is to 1 char
        if mbi == len(mbd):           # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]                 # next available record
        bf[0] = os                    # set binding to range of chars
        bf[1] = os + ns               #
        return bf

    def _modify ( ):
        """
        set special tag for binding

        arguments:

        """
        mbd[mbi].append(None)         # extra None element marks binding as special

    def _mark ( kind , nsp ):
        """
        set up for backing up pattern match

        arguments:
            kind  - 0=optional 1=* wildcard
            nsp   - number of spaces in pattern still unmatched

        returns:
            unwinding frame
        """
        if unj == len(unw):           # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]                 # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp                 # snapshot current match state
        uf.txts  = offs
        uf.bnds  = mbi
        uf.nsps  = nsp
        return uf

    def _span ( typw , nsp=0 ):
        """
        count chars available for wildcard match

        arguments:
            typw  - wildcard
            nsp   - spaces to be matched in pattern

        returns:
            non-negative count if any match possible, otherwise -1
        """
#       print ( "_span: typw=" , '{:04x}'.format(ord(typw)) , deconvert(typw) )
#       print ( "_span: txt @",offs,"pat @",mp,"nsp=",nsp )
        k = minMatch(patn[mp:])       # calculate min char count to match rest of pattern
#       print ( "exclude=",k,"chars from possible span for rest of pattern" )

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findExtendedBreak(text,offs,nsp)
#       print ( mx,"chars available to scan" )
        mx -= k                       # max span reduced by exclusion
        if mx < 0: return -1          # cannot match if max span < 0
        tfn = Matching[typw]          # matchup function for wildcard type
#       print ( "text at",offs,"maximum wildcard match=",mx )
        nm = 0
        for i in range(mx):
            c = text[offs+i]          # next char in text from offset
#           print ( 'span c=' , c )
            if not tfn(c): break      # stop when it fails to match
            nm += 1
#       print ( "maximum wildcard span=",nm )
        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False                   # successful pattern match yet?

    if limt == None: limt = len(text)

#   print ( 'starting match, limt=',limt,text[offs:limt],":",patn )
#   print ( 'nsps=' , nsps )
    mp = 0                            # pattern index
    ml = len(patn)                    # pattern match limit
    last = ''                         # most recent text char scanned

    while True:

        ## literally match as many next chars as possible

#       print ( '---- loop mp=' , mp , 'ml=' , ml )
        while mp < ml:
            if offs >= limt:
#               print ( "offs=",offs,"limt=",limt )
                last = ''             # no more text, but keep scanning pattern
            elif limt == 0:
                break
            else:
                last = text[offs]
                offs += 1
#           print ( 'patn=' , patn )
            mc = patn[mp]
#           print ( 'matching last=' , last , 'at' , offs , 'against' , mc )
            if mc != last:
                if mc != last.lower():
                    if mc == Hyphn and last == ' ' and limt - offs > 2:
                        # pattern hyphen may match " - " in the text
#                       print ( 'hyphen special matching, limt=', limt , 'offs=' , offs )
#                       print ( 'text[offs:]=' , text[offs:] )
                        if text[offs] != Hyphn or text[offs+1] != ' ':
                            break
                        offs += 2
                    else:
#                       print ( 'no special matching of hyphen' )
                        break
#           print ( 'matched @mp=' , mp )
            mp += 1

        ## check whether mismatch is due to special pattern char

#       print ( 'pat @',mp,"<",ml )
#       print ( "txt @",offs,'<',limt,'last=',last )
#       print ( '@',offs,text[offs:] )

        if mp >= ml:                  # past end of pattern?
            matched = True            # if so, match is made
            break

        tc = patn[mp]                 # otherwise, get unmatched pattern element
        mp += 1                       #
#       print ( "tc=",'{:04x}'.format(ord(tc)),deconvert(tc) )

        if tc == cALL:                # a * wildcard?
#           print ( "ALL last=< " + last + " >" )
            if last != '': offs -= 1

            nm = _span(cALL,nsps)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1  # get new binding record
            bf[0] = offs              # bind from current offset
            offs += nm                # move offset past end of span
            bf[1] = offs              # bind to new offset
#           print ( "offs=",offs,'nm=',nm )
            uf = _mark(1,nsps); unj += 1  # get new unwinding record
            uf.count = nm             # can back up this many times on mismatch
            continue

        elif tc == cEND:              # end specification
#           print ( "END $:",last )
            if last == '':
                continue
            elif last == '-':
                offs -= 1
                continue
            elif last in [ '.' , ',' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]
                if ellyChar.isWhiteSpace(txc) or txc in Trmls or txc in Grk:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' , ellyChar.HYPH ]:
                offs -= 1
                continue
            elif not ellyChar.isText(last):
                offs -= 1
                continue

        elif tc == cANY:              # alphanumeric wildcard?
#           print ( "ANY:",last,offs )
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cCAN:              # nonalphanumeric wildcard?
#           print ( 'at cCAN' )
            if last != ellyChar.AMP:
                if last == '' or not ellyChar.isLetterOrDigit(last):
                    _bind(); mbi += 1
                    continue

        elif tc == cDIG:              # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF:              # letter wildcard?
#           print ( "ALF:",last,offs )
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cUPR:              # uppercase letter wildcard?
#           print ( "UPR:",last,'@',offs )
            if last != '' and ellyChar.isUpperCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cLWR:              # lowercase letter wildcard?
#           print ( "LWR:",last,'@',offs )
            if last != '' and ellyChar.isLowerCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC:              # space wildcard?
#           print ( "SPC:","["+last+"]" )
            if last != '' and ellyChar.isWhiteSpace(last):
                nsps -= 1             # one less space still to match
                _bind(); _modify(); mbi += 1
                continue
#           print ( 'NO space' )

        elif tc == cAPO:              # apostrophe wildcard?
#           print ( "APO: last=" , last )
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS:              # start of optional sequence?
#           print ( "SOS" )
#           print ( last,'@',offs )
            mf = _bind(0); mbi += 1   # dummy record to block
            mf[0] = -1                # later binding consolidation
            if last != '': offs -= 1  # try for rematch
            m = mp                    # find corresponding EOS
            while m < ml:             #
                if patn[m] == cEOS: break
                m += 1
            else:                     # no EOS?
                m -= 1                # if so, pretend there is one anyway
            uf = _mark(0,nsps); unj += 1  # for unwinding on any later match failure
            uf.pats = m + 1           # i.e. one char past next EOS
            uf.txts = offs            # start of text before optional match
            continue

        elif tc == cEOS:              # end of optional sequence?
#           print ( "EOS" )
            if last != '': offs -= 1  # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:  # spanning wildcard?
#           print ( 'spanning wildcard, offs=' , offs , 'last=(' + last + ')' )
            if last != '':            # still more to match?
                offs -= 1
#               print ( 'nsps=' , nsps )
#               print ( '@' , offs , text )
                nm = _span(tc,nsps)   # maximum match possible
#               print ( 'spanning=' , nm )
                if nm == 0:           # compensate for findExtendedBreak peculiarity
                    if offs + 1 < limt and mp < ml:   # with closing ] or ) to be matched in pattern
                        if patn[mp] in Enc:           # from text input
                            nm += 1
#               print ( 'spanning=' , nm )
                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs      # bind from current offset
                    offs += nm        # move offset past end of span
                    bf[1] = offs      # bind to new offset
                    uf = _mark(1,nsps); unj += 1
                    uf.count = nm - 1 # at least one char must be matched
#                   print ( 'offs=' , offs )
                    last = text[offs] if offs < limt else ''
                    continue
#           print ( 'fail tc=' , deconvert(tc) )

        elif tc == '':                # empty pattern element
                                      # NOTE(review): presumably a sentinel when
                                      # patn is a char list — confirm with caller
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True        # successful match
                break

        ## match failure: rewind to last match branch
        ##

#       print ( "fail - unwinding" , unj )

        while unj > 0:                # try unwinding, if possible
#           print ( "unw:",unj )
            uf = unw[unj-1]           # get most recent unwinding record
#           print ( uf )
            if uf.count <= 0:         # if available count is used up,
                unj -= 1              # go to next unwinding record
                continue
            uf.count -= 1             # decrement available count
            uf.txts -= uf.kind        # back up one char for scanning text input
            mp = uf.pats              # unwind pattern pointer
            offs = uf.txts            # unwind text input
            mbi = uf.bnds             # restore binding
            mbd[mbi-1][1] -= uf.kind  # reduce span of binding if for wildcard
            nsps = uf.nsps            #
            break
        else:
#           print ( "no unwinding" )
            break                     # quit if unwinding is exhausted
#       print ( 'cnt=' , uf.count , 'off=' , offs )

    ##
    ## clean up on match mode or on no match possible
    ##

#   print ( "matched=",matched )

    if not matched: return None       # no bindings

#   print ( text,offs )

    ## consolidate contiguous bindings for subsequent substitutions

#   print ( "BEFORE consolidating consecutive bindings" )
#   print ( "bd:",len(mbd) )
#   print ( mbd[0] )
#   print ( '----' )
#   for b in mbd[1:]:
#       print ( b )

    mbdo = mbd
    lb  = -1                          # binding reference
    lbd = [ 0 , -1 ]                  # sentinel value, not a real binding
    mbd = [ lbd ]                     # initialize with new offset after matching
    mbdo.pop(0)                       # ignore empty binding
    while len(mbdo) > 0:              #
        bd = mbdo.pop(0)              # get next binding
        if len(bd) > 2:               # special (tagged) binding kept as is
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb  = -1
        elif bd[0] < 0:               # check for optional match indicator here
            lb = -1                   # if so, drop from new consolidated bindings
        elif lb == bd[0]:             # check for binding continuous with previous
            lb = bd[1]                #
            lbd[1] = lb               # if so, combine with previous binding
        else:                         #
            mbd.append(bd)            # otherwise, add new binding
            lbd = bd                  #
            lb  = bd[1]               #

    mbd[0] = offs                     # replace start of bindings with text length matched

#   print ( "AFTER" )
#   print ( "bd:",len(mbd) )
#   print ( mbd[0] )
#   print ( '----' )
#   for b in mbd[1:]:
#       print ( b )

    return mbd                        # consolidated bindings plus new offset
def _store(self, defs, nowarn):

    """
    put macro substitutions into table with indexing by first char of pattern

    arguments:
        self    -
        defs    - list of macro definition as strings
        nowarn  - whether to turn warnings off

    exceptions:
        TableFailure on error
    """

#   print ( defs.linecount() , 'lines' )
    while True:
        l = defs.readline()                   # next macro rule
#       print ( "rule input=" , l )
        if len(l) == 0: break                 # EOF check
        dl = definitionLine.DefinitionLine(l, False)
        left = dl.left                        # pattern to be matched
        tail = dl.tail                        # transformation to apply to match
#       print ( 'dl.left=' , left )
        if left == None or tail == None:
            self._err(l=l)                    # report missing part of rule
            continue
        if left.find(' ') >= 0:               # pattern side of macro rule
            ms = 'pattern in macro contains spaces'
            self._err(s=ms, l=l, d=1)         # cannot contain any space chars
            continue
        lefts = list(left)
#       print ( 'left=' , lefts )
        nspm = ellyWildcard.numSpaces(lefts)  # how many spaces encoded in pattern
        pat = ellyWildcard.convert(
            left)                             # get pattern with encoded wildcards
        if pat == None:
            self._err('bad wildcards', l)
            continue
#       print ( 'pat=' , ellyWildcard.deconvert(pat) , 'len=' , len(pat) )
#       print ( 'pat=' , list(pat) )
        pe = pat[-1]
        if not pe in [ ellyWildcard.cALL, ellyWildcard.cEND, ellyWildcard.cSPC ]:
            pat += ellyWildcard.cEND          # pattern must end in $ if it does not end in * or _
        if not _checkBindings(pat, tail):
            self._err('bad bindings in substitution', l)
            continue
        if not nowarn and not _checkExpansion(pat, tail):
            self._err('substitution may be longer than original string', l, 0)
#       print ( "rule =" , [ left , nspm , tail ] )
        if pat == None:
            self._err('no pattern', l)
            continue
        r = Rule(pat, nspm, tail)             # rule record for indexing below
        c = pat[0]                            # first char of pattern

        # check type to see how to index rule

#       print ( 'c=' , ellyWildcard.deconvert(c) , ', pat=' , ellyWildcard.deconvert(pat) )
        p = pat
        while c == ellyWildcard.cSOS:         # optional sequence at start?
            if not cEOS in p: break
            k = p.index(cEOS)                 # if so, find the end of sequence
            if k < 0 or k == 1: break         # if no end or empty sequence, stop
            k += 1
            if k == len(pat): break           # should be something after sequence
            m = ellyChar.toIndex(
                pat[1])                       # index by first char of optional sequence
            self.index[m].append(r)           # (must be non-wildcard)
            p = p[k:]                         # move up in pattern
            c = p[0]                          # but check for another optional sequence

        if c == ellyWildcard.cSOS:
            self._err(l=l)
            continue                          # bad sequence, skip this rule

#       print ( 'c=' , ord(c) )
        if ellyChar.isLetterOrDigit(
                c):                           # check effective first char of pattern
            m = ellyChar.toIndex(c)
            self.index[m].append(r)           # add to index under alphanumeric char
        elif ellyChar.isText(c):
            self.index[0].append(r)           # add to index under punctuation
        elif not c in ellyWildcard.Matching:
            if c == ellyWildcard.cEND:
                print('** macro warning: pattern can have empty match', file=sys.stderr)
                print('* at [', l, ']', file=sys.stderr)
            else:
                dc = '=' + str(ord(c) - ellyWildcard.X)
                self._err('bad wildcard code', dc)
                continue
        elif c == ellyWildcard.cANY or c == ellyWildcard.cALL:
            self.anyWx.append(r)              # under general wildcards
        elif c == ellyWildcard.cCAN:
            self.index[0].append(r)           # under punctuation
        elif c == ellyWildcard.cDIG or c == ellyWildcard.cSDG:
            self.digWx.append(r)              # under digit wildcards
        elif c == ellyWildcard.cSAN:
            self.digWx.append(r)              # under both digit and
            self.letWx.append(r)              # letter wildcards
        elif c == ellyWildcard.cAPO:          # right single quote or apostrophe
            self.apoWx.append(r)              #
        elif c == ellyWildcard.cSPC or c == ellyWildcard.cEND:
            self._err('bad wildcard in context', l)
            continue                          # wildcards unacceptable here
        else:
            self.letWx.append(r)              # everything else under letter wildcard

        self.count += 1                       # count up macro substitution

#   print ( 'count=' , self.count )
    if self._errcount > 0:
        print(self._errcount, 'macro errors in all', file=sys.stderr)
        print('macro table definition FAILed', file=sys.stderr)
        raise ellyException.TableFailure
def read ( self ):

    """
    get next char from input stream with filtering

    arguments:
        self

    returns:
        single Unicode char on success, null string otherwise
    """

#   print ( 'reading: buf=' , self.buf )
    while True:

        if not self._reload():            # check if buffer empty and reload if needed
            return END                    # return EOF if no more chars available

#       print ( 'buf=' , self.buf )
        c = self.buf.pop(0)               # next raw char in buffer

        if c == SHYP:                     # ignore soft hyphen
            if len(self.buf) > 0:
                if self.buf[0] == SP:     # also drop a space just after it
                    c = self.buf.pop(0)
            continue

        if not ellyChar.isText(c):        # unrecognizable Elly char?
#           print ( 'c=' , '{0:04x}'.format(ord(c)) )
            if ellyChar.isCJK(c):
                c = '_'                   # special handling for Chinese
            else:
#               print ( 'replace' , c , 'with NBSP' )
                c = NBSP                  # by default, replace with no-break space

        lc = self._lc                     # copy saved last char
        self._lc = c                      # set new last char

        if c == HYPH:                     # special treatment for isolated hyphens
            if spc(lc) and spc(self.peek()):
                c = DASH
            break
        elif c == '.':                    # check for ellipsis
            bb = self.buf
            bl = len(bb)
#           print ( 'bl=' , bl , 'bb=' , bb )
            if bl >= 2 and bb[0] == '.' and bb[1] == '.':
                self.buf = bb[2:]
                c = ELLP
            elif bl >= 4 and bb[0] == ' ' and bb[1] == '.' and bb[2] == ' ' and bb[3] == '.':
                self.buf = bb[4:]
                c = ELLP
            break
        elif c == RSQm:                   # check for single quote
            nc = self.peek()              # look at next char
            if nc == RSQm:                # doubling of single quote?
                self.buf.pop(0)           # if so, combine two single quotes
                c = RDQm                  # into one double quote
            break                         # FIX: break was missing here, so a
                                          # non-doubled single quote fell out of
                                          # the chain and was silently dropped
        elif not ellyChar.isWhiteSpace(c):
            if ellyChar.isWhiteSpace(lc):
                self._cap = ellyChar.isUpperCaseLetter(c)
            break
        elif c == CR:                     # always ignore
            continue
        elif c == NL:                     # special handling of \n
            nc = self.peek()              # look at next char
            while nc == CR:
                self.buf.pop(0)           # skip over CR's
                nc = self.peek()
            if lc != NL and nc == NL:
                self.buf.pop(0)           # special case when NL can be returned
                break
            if nc == NL:                  # NL followed NL?
                while nc == NL or nc == CR:
                    self.buf.pop(0)       # ignore subsequent new line chars
                    nc = self.peek()
            elif nc == END or ellyChar.isWhiteSpace(nc):
                continue                  # NL followed by space is ignored
            elif nc == u'.' or nc == u'-':
                pass
            else:
                c = SP                    # convert NL to SP if not before another NL
        else:
            c = SP                        # otherwise, convert white space to plain space

        self._cap = False                 # NOTE(review): exact placement of this
                                          # reset was ambiguous in the original
                                          # layout — confirm against history
        if not ellyChar.isWhiteSpace(lc): # preceding char was not white space?
            break                         # if so, keep one space in stream

    return c                              # next filtered char
def read ( self ):

    """
    get next char from input stream with filtering

    arguments:
        self

    returns:
        single Unicode char on success, null string otherwise
    """

    while True:

        # refill the buffer when drained; END signals input exhausted

        if not self._reload():
            return END

        ch = self.buf.pop(0)              # take next raw char from buffer

        if not ellyChar.isText(ch):       # anything unrecognizable becomes
            ch = NBSP                     # a no-break space

        prev = self._lc                   # char returned on previous call
        self._lc = ch                     # record current one for next call

        if ch == HYPH:                    # hyphen surrounded by spaces
            if spc(prev) and spc(self.peek()):
                ch = DASH                 # is promoted to a dash
            break
        elif not ellyChar.isWhiteSpace(ch):
            break                         # ordinary char passes through
        elif ch == CR:
            continue                      # carriage returns always dropped
        elif ch == NL:                    # newline needs look-ahead
            ahead = self.peek()
            while ahead == CR:            # discard CR's following the newline
                self.buf.pop(0)
                ahead = self.peek()
            if prev != NL and ahead == NL:
                self.buf.pop(0)           # first NL of a pair can be returned
                break
            if ahead == NL:               # run of newlines?
                while ahead == NL or ahead == CR:
                    self.buf.pop(0)       # swallow the entire run
                    ahead = self.peek()
            elif ahead == END or ellyChar.isWhiteSpace(ahead):
                continue                  # NL before white space is dropped
            elif ahead == u'.' or ahead == u'-':
                pass                      # NL kept before period or hyphen
            else:
                ch = SP                   # otherwise NL acts as a plain space
        else:
            ch = SP                       # any other white space becomes plain space

        if not ellyChar.isWhiteSpace(prev):
            break                         # keep one space after a non-space char

    return ch                             # next filtered char