Esempio n. 1
0
    def normalize(self, s):
        """
        convert all unrecognizable input chars to _ and any
        consecutive white spaces to a single space

        arguments:
            self -
            s    - Unicode string or char list to operate on
        returns:
            normalized sequence
        """

        #       print ( '__ normalize' )
        spaced = False
        n = len(s)
        ns = []
        for i in range(n):
            x = s[i]
            if ellyChar.isLetter(x):
                spaced = False
            elif ellyChar.isWhiteSpace(x):
                if spaced: continue
                x = ' '
                spaced = True
            elif not ellyChar.isText(x):
                x = '_'
                spaced = False
            else:
                spaced = False
            ns.append(x)
        return ns
Esempio n. 2
0
def normalize ( s ):

    """
    convert all unrecognizable input chars to _ and any
    consecutive white spaces to a single space

    arguments:
        s   - Unicode string or char list to operate on
    returns:
        normalized sequence
    """

    spaced = False
    n = len(s)
    ns = [ ]
    for i in range(n):
        x = s[i]
        if ellyChar.isLetter(x):
            spaced = False
        elif ellyChar.isWhiteSpace(x):
            if spaced: continue
            x = ' '
            spaced = True
        elif not ellyChar.isText(x):
            x = '_'
            spaced = False
        else:
            spaced = False
        ns.append(x)
    return ns
Esempio n. 3
0
def minMatch ( patn ):

    """
    compute minimum number of chars matched by pattern

    arguments:
        patn  - pattern with possible Elly wildcards

    returns:
        minimum count of chars matched
    """

    inOption = False

    k = 0
    m = 0
    ml = len(patn)

    while m < ml:

        tmc = patn[m]

        if tmc == ellyChar.SPC:        # space in pattern will stop scan
            if not inOption: break
        elif ellyChar.isText(tmc):     # ordinary text char is counted
            if not inOption: k += 1
        elif tmc == cSOS:              # optional start code
#           print ( "START optional match" , inOption )
            inOption = True
        elif tmc == cEOS:              # optional end   code
#           print ( "END optional match" , inOption )
            inOption = False
        elif tmc == cALL:              # ALL (*) wildcard?
            pass
        elif tmc == cEND:              # END code
            pass
        else:                          # any other wildcard
#           print ( "count up wildcard minimum match" )
            k += 1

        m += 1

    return k
Esempio n. 4
0
def minMatch ( patn ):

    """
    compute minimum number of chars matched by pattern

    arguments:
        patn  - pattern with possible Elly wildcards

    returns:
        minimum count of chars matched
    """

    inOption = False

    k = 0
    m = 0
    ml = len(patn)

    while m < ml:

        tmc = patn[m]

        if tmc == ellyChar.SPC:        # space in pattern will stop scan
            if not inOption: break
        elif ellyChar.isText(tmc):     # ordinary text char is counted
            if not inOption: k += 1
        elif tmc == cSOS:              # optional start code
#           print "START optional match" , inOption
            inOption = True
        elif tmc == cEOS:              # optional end   code
#           print "END optional match" , inOption
            inOption = False
        elif tmc == cALL:              # ALL (*) wildcard?
            pass
        elif tmc == cEND:              # END code
            pass
        else:                          # any other wildcard
#           print "count up wildcard minimum match"
            k += 1

        m += 1

    return k
Esempio n. 5
0
    def _store ( self , defs , nowarn ):

        """
        put macro substitutions into table with indexing by first char of pattern

        arguments:
            self   -
            defs   - list of macro definition as strings
            nowarn - whether to turn warnings off

        exceptions:
            TableFailure on error
        """

        while True:
            l = defs.readline()               # next macro rule
#           print "rule input=" , l
            if len(l) == 0: break             # EOF check
            dl = definitionLine.DefinitionLine(l,False)
            left = dl.left                    # pattern to be matched
            tail = dl.tail                    # transformation to apply to match
            if left == None or tail == None:
                self._err(l=l)
                continue
            mp = ellyWildcard.convert(left)
            if mp == None:
                self._err('bad wildcards',l)
                continue
            pe = mp[-1]
            if pe != ellyWildcard.cALL and pe != ellyWildcard.cEND:
                mp += ellyWildcard.cEND       # pattern must end in $ if it does not end in *
            if not _checkBindings(mp,tail):
                self._err('bad bindings in substitution',l)
                continue
            if not nowarn and not _checkExpansion(mp,tail):
                self._err('substitution longer than original string',l,0)
            r = [ mp , tail ]
#           print "rule =" , [ left , tail ]
            pat = r[0]                        # get coded pattern
            if pat == None:
                self._err('no pattern',l)
                continue
            c = pat[0]                        # first char of pattern
                                              # check type to see how to index rule
#           print 'c=' , ord(c)
            p = pat
            while c == ellyWildcard.cSOS:     # optional sequence?
                k = p.find(ellyWildcard.cEOS) # if so, find the end of sequence
                if k < 0 or k == 1: break     # if no end or empty sequence, stop
                k += 1
                if k == len(pat): break       # should be something after sequence
                m = ellyChar.toIndex(pat[1])  # index by first char of optional sequence
                self.index[m].append(r)       #   (must be non-wildcard)
                p = p[k:]                     # move up in pattern
                c = p[0]                      #   but check for another optional sequence

            if c == ellyWildcard.cSOS:
                self._err(l=l)
                continue                      # bad sequence, skip this rule

#           print 'c=' , ord(c)
            if ellyChar.isLetterOrDigit(c):   # check effective first char of pattern
                m = ellyChar.toIndex(c)
                self.index[m].append(r)       # add to index under alphanumeric char
            elif ellyChar.isText(c):
                self.index[0].append(r)       # add to index under punctuation
            elif not c in ellyWildcard.Matching:
                if c == ellyWildcard.cEND:
                    print >> sys.stderr , '** macro warning: pattern can have empty match'
                    print >> sys.stderr , '*  at [' , l , ']'
                else:
                    dc = '=' + str(ord(c) - ellyWildcard.X)
                    self._err('bad wildcard code' , dc)
                continue
            elif c == ellyWildcard.cANY or c == ellyWildcard.cALL:
                self.anyWx.append(r)          # under general wildcards
            elif c == ellyWildcard.cCAN:
                self.index[0].append(r)       # under punctuation
            elif c == ellyWildcard.cDIG or c == ellyWildcard.cSDG:
                self.digWx.append(r)          # under digit wildcards
            elif c == ellyWildcard.cSAN:
                self.digWx.append(r)          # under both digit and
                self.letWx.append(r)          #   letter wildcards
            elif c == ellyWildcard.cSPC or c == ellyWildcard.cEND:
                self._err('bad wildcard in context',l)
                continue                      # wildcards unacceptable here
            else:
                self.letWx.append(r)          # everything else under letter wildcard

            self.count += 1                   # count up macro substitution

        if self._errcount > 0:
            print >> sys.stderr , '**' , self._errcount , 'macro errors in all'
            print >> sys.stderr , 'macro table definition FAILed'
            raise ellyException.TableFailure
Esempio n. 6
0
    def read(self):
        """
        get next char from input stream with filtering

        arguments:
            self

        returns:
            single Unicode char on success, null string otherwise
        """

        #       print 'reading: buf=' , self.buf

        while True:

            if not self._reload(
            ):  # check if buffer empty and reload if needed
                return END  # return EOF if no more chars available

#           print 'buf=' , self.buf

            c = self.buf.pop(0)  # next raw char in buffer

            if c == SHYP:  # ignore soft hyphen
                if len(self.buf) > 0:
                    if self.buf[0] == SP:
                        c = self.buf.pop(0)
                continue

            if not ellyChar.isText(c):  # unrecognizable Elly char?
                #               print 'c=' , '{0:04x}'.format(ord(c))
                if ellyChar.isCJK(c):
                    if ellyConfiguration.language != 'ZH':
                        c = '_'  # special handling for non-Chinese input
                elif not c in [u'\uff0c', u'\u3002']:
                    #                   print 'replace' , c , 'with NBSP'
                    c = NBSP  # by default, replace with no-break space

            lc = self._lc  # copy saved last char
            #           print 'lc=' , ord(lc)
            self._lc = c  # set new last char

            #           if c == "'":
            #               print 'apostrophe' , self.buf

            #           print 'c=' , '<' + c + '>'

            if c == HYPH:  # special treatment for isolated hyphens
                if spc(lc) and spc(self.peek()):
                    c = DASH
                break
            elif c == '.':  # check for ellipsis
                bb = self.buf
                bl = len(bb)
                #               print 'bl=' , bl , 'bb=' , bb
                if bl >= 2 and bb[0] == '.' and bb[1] == '.':
                    self.buf = bb[2:]
                    c = ELLP
                elif bl >= 4 and bb[0] == ' ' and bb[1] == '.' and bb[
                        2] == ' ' and bb[3] == '.':
                    self.buf = bb[4:]
                    c = ELLP
                break
            elif c == RSQm:  # check for single quote
                #               print 'at single quote'
                nc = self.peek()  # look at next char
                #               print 'next=' , nc
                if nc == RSQm:  # doubling of single quote?
                    self.buf.pop(0)  # if so, combine two single quotes
                    c = RDQm  # into one double quote
            elif not ellyChar.isWhiteSpace(c):
                if ellyChar.isWhiteSpace(lc):
                    self._cap = ellyChar.isUpperCaseLetter(c)
                break
            elif c == CR:  # always ignore
                continue
            elif c == NL:  # special handling of \n
                #               print 'got NL'
                nc = self.peek()  # look at next char

                while nc == CR:
                    self.buf.pop(0)  # skip over CR's
                    nc = self.peek()
#               print "lc= '" + lc + "'"
                if lc != NL and nc == NL:
                    self.buf.pop(0)  # special case when NL can be returned
                    break

                if nc == NL:  # NL followed NL?
                    while nc == NL or nc == CR:
                        self.buf.pop(0)  # ignore subsequent new line chars
                        nc = self.peek()
                elif nc == END or ellyChar.isWhiteSpace(nc):
                    continue  # NL followed by space is ignored
                elif nc == u'.' or nc == u'-':
                    pass
                else:
                    #                   print 'NL to SP, lc=' , ord(lc)
                    c = SP  # convert NL to SP if not before another NL
            else:
                #               print 'lc=' , ord(lc) , 'c=' , ord(c)
                c = SP  # otherwise, convert white space to plain space

            self._cap = False

            if not ellyChar.isWhiteSpace(
                    lc):  # preceding char was not white space?
                #               print 'return SP'
                break  # if so, keep space in stream

        return c  # next filtered char
Esempio n. 7
0
    def _store ( self , defs , nowarn ):

        """
        put macro substitutions into table with indexing by first char of pattern

        arguments:
            self   -
            defs   - list of macro definition as strings
            nowarn - whether to turn warnings off

        exceptions:
            TableFailure on error
        """

#       print defs.linecount() , 'lines'
        while True:
            l = defs.readline()               # next macro rule
#           print "rule input=" , l
            if len(l) == 0: break             # EOF check
            dl = definitionLine.DefinitionLine(l,False)
            left = dl.left                    # pattern to be matched
            tail = dl.tail                    # transformation to apply to match
#           print 'dl.left=' , left
            if left == None or tail == None:
                self._err(l=l)                # report missing part of rule
                continue
            if left.find(' ') >= 0:           # pattern side of macro rule
                ms = 'pattern in macro contains spaces'
                self._err(s=ms,l=l,d=1)       # cannot contain any space chars
                continue

            lefts = list(left)
#           print 'left=' , lefts
            nspm = ellyWildcard.numSpaces(lefts)
            pat = ellyWildcard.convert(left)  # get pattern with encoded wildcards
            if pat == None:
                self._err('bad wildcards',l)
                continue
#           print 'pat=' , ellyWildcard.deconvert(pat) , 'len=' , len(pat)
#           print 'pat=' , list(pat)
            pe = pat[-1]
            if not pe in [ ellyWildcard.cALL , ellyWildcard.cEND , ellyWildcard.cSPC ]:
                pat += ellyWildcard.cEND      # pattern must end in $ if it does not end in * or _
            if not _checkBindings(pat,tail):
                self._err('bad bindings in substitution',l)
                continue
            if not nowarn and not _checkExpansion(pat,tail):
                self._err('substitution may be longer than original string',l,0)

#           print "rule =" , [ left , nspm , tail ]
            if pat == None:
                self._err('no pattern',l)
                continue

            r = Rule( pat , nspm , tail )

            c = pat[0]                        # first char of pattern
                                              # check type to see how to index rule
#           print 'c=' , ellyWildcard.deconvert(c) , ', pat=' , ellyWildcard.deconvert(pat)
            p = pat
            while c == ellyWildcard.cSOS:     # optional sequence?
                if not cEOS in p:
                    break
                k = p.index(cEOS)             # if so, find the end of sequence
                if k < 0 or k == 1: break     # if no end or empty sequence, stop
                k += 1
                if k == len(pat): break       # should be something after sequence
                m = ellyChar.toIndex(pat[1])  # index by first char of optional sequence
                self.index[m].append(r)       #   (must be non-wildcard)
                p = p[k:]                     # move up in pattern
                c = p[0]                      #   but check for another optional sequence

            if c == ellyWildcard.cSOS:
                self._err(l=l)
                continue                      # bad sequence, skip this rule

#           print 'c=' , ord(c)
            if ellyChar.isLetterOrDigit(c):   # check effective first char of pattern
                m = ellyChar.toIndex(c)
                self.index[m].append(r)       # add to index under alphanumeric char
            elif ellyChar.isText(c):
                self.index[0].append(r)       # add to index under punctuation
            elif not c in ellyWildcard.Matching:
                if c == ellyWildcard.cEND:
                    print >> sys.stderr , '** macro warning: pattern can have empty match'
                    print >> sys.stderr , '*  at [' , l , ']'
                else:
                    dc = '=' + str(ord(c) - ellyWildcard.X)
                    self._err('bad wildcard code' , dc)
                continue
            elif c == ellyWildcard.cANY or c == ellyWildcard.cALL:
                self.anyWx.append(r)          # under general wildcards
            elif c == ellyWildcard.cCAN:
                self.index[0].append(r)       # under punctuation
            elif c == ellyWildcard.cDIG or c == ellyWildcard.cSDG:
                self.digWx.append(r)          # under digit wildcards
            elif c == ellyWildcard.cSAN:
                self.digWx.append(r)          # under both digit and
                self.letWx.append(r)          #   letter wildcards
            elif c == ellyWildcard.cAPO:      # right single quote or apostrophe
                self.apoWx.append(r)          #
            elif c == ellyWildcard.cSPC or c == ellyWildcard.cEND:
                self._err('bad wildcard in context',l)
                continue                      # wildcards unacceptable here
            else:
                self.letWx.append(r)          # everything else under letter wildcard

            self.count += 1                   # count up macro substitution
#           print 'count=' , self.count

        if self._errcount > 0:
            print >> sys.stderr , '**' , self._errcount , 'macro errors in all'
            print >> sys.stderr , 'macro table definition FAILed'
            raise ellyException.TableFailure
Esempio n. 8
0
def match ( patn , text , offs=0 , limt=None , nsps=0 ):

    """
    compare a pattern with wildcards to input text

    arguments:
        patn  - pattern to matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit for any matching
        nsps  - number of spaces to match in pattern

    returns:
        bindings if match is successful, None otherwise
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
            nsps   - saved count of spaces matched
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - of winding
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1
            self.nsps  = 0

        def __str__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self
            returns:
                attributes as array
            """
            return ( '[kd=' + str(self.kind)  +
                     ',ct=' + str(self.count) +
                     ',pa=' + str(self.pats)  +
                     ',tx=' + str(self.txts)  +
                     ',bd=' + str(self.bnds)  +
                     ',ns=' + str(self.nsps)  + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # four private functions using local variables of match() defined just above
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame
        arguments:
            ns  - optional initial span of text for binding
        returns:
            binding frame
        """
#       print ( "binding:",offs,ns )
        os = offs - 1
        if ns == None: ns = 1 # by default, binding is to 1 char
        if mbi == len(mbd):   # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]         # next available record
        bf[0] = os            # set binding to range of chars
        bf[1] = os + ns       #
        return bf

    def _modify ( ):
        """
        set special tag for binding
        arguments:

        """
        mbd[mbi].append(None)

    def _mark ( kind , nsp ):
        """
        set up for backing up pattern match
        arguments:
            kind  - 0=optional 1=* wildcard
            nsp   - number of spaces in pattern still unmatched
        returns:
            unwinding frame
        """
        if unj == len(unw): # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]       # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp
        uf.txts  = offs
        uf.bnds  = mbi
        uf.nsps  = nsp
        return uf

    def _span ( typw , nsp=0 ):
        """
        count chars available for wildcard match
        arguments:
            typw - wildcard
            nsp  - spaces to be matched in pattern
        returns:
            non-negative count if any match possible, otherwise -1
        """
#       print ( "_span: typw=" , '{:04x}'.format(ord(typw)) , deconvert(typw) )
#       print ( "_span: txt @",offs,"pat @",mp,"nsp=",nsp )
#       print ( "text to span:",text[offs:] )
#       print ( "pat rest=" , patn[mp:] )
        k = minMatch(patn[mp:])                # calculate min char count to match rest of pattern

#       print ( "exclude=",k,"chars from possible span for rest of pattern" )

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findExtendedBreak(text,offs,nsp)
#       print ( mx,"chars available to scan" )
        mx -= k                                # max span reduced by exclusion
        if mx < 0: return -1                   # cannot match if max span < 0

        tfn = Matching[typw]                   # matchup function for wildcard type

#       print ( "text at",offs,"maximum wildcard match=",mx )

        nm = 0
        for i in range(mx):
            c = text[offs+i]                   # next char in text from offset
#           print ( 'span c=' , c )
            if not tfn(c): break               # stop when it fails to match
            nm += 1

#       print ( "maximum wildcard span=",nm )

        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False  # successful pattern match yet?

    if limt == None: limt = len(text)

#   print ( 'starting match, limt=',limt,text[offs:limt],":",patn )
#   print ( 'nsps=' , nsps )

    mp = 0           # pattern index
    ml = len(patn)   # pattern match limit
    last = ''

    while True:

        ## literally match as many next chars as possible

#       print ( '---- loop mp=' , mp , 'ml=' , ml )
        while mp < ml:
            if offs >= limt:
#               print ( "offs=",offs,"limt=",limt )
                last = ''
            elif limt == 0:
                break
            else:
                last = text[offs]
                offs += 1
#           print ( 'patn=' , patn )
            mc = patn[mp]
#           print ( 'matching last=' , last, '(' , '{:04x}'.format(ord(last)) if last != '' else '-', ') at' , offs )
#           print ( 'against       ' , mc  , '(' , '{:04x}'.format(ord(mc)) , ')' )
            if mc != last:
                if mc != last.lower():
                    if mc == Hyphn and last == ' ' and limt - offs > 2:
#                       print ( 'hyphen special matching, limt=', limt , 'offs=' , offs )
#                       print ( 'text[offs:]=' , text[offs:] )
                        if text[offs] != Hyphn or text[offs+1] != ' ':
                            break
                        offs += 2
                    else:
#                       print ( 'no special matching of hyphen' )
                        break

#           print ( 'matched @mp=' , mp )
            mp += 1

        ## check whether mismatch is due to special pattern char

#       print ( 'pat @',mp,"<",ml )
#       print ( "txt @",offs,'<',limt,'last=',last )
#       print ( '@',offs,text[offs:] )

        if mp >= ml:        # past end of pattern?
            matched = True  # if so, match is made
            break

        tc = patn[mp]       # otherwise, get unmatched pattern element
        mp += 1             #
#       print ( "tc=",'{:04x}'.format(ord(tc)),deconvert(tc) )

        if tc == cALL:      # a * wildcard?

#           print ( "ALL last=< " + last + " >" )
            if last != '': offs -= 1

            nm = _span(cALL,nsps)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1     # get new binding record
            bf[0] = offs                 # bind from current offset
            offs += nm                   # move offset past end of span
            bf[1] = offs                 # bind to   new     offset
#           print ( "offs=",offs,'nm=',nm )
            uf = _mark(1,nsps); unj += 1 # get new unwinding record
            uf.count = nm                # can back up this many times on mismatch
            continue

        elif tc == cEND: # end specification
#           print ( "END $:",last )
            if last == '':
                continue
            elif last == '-':
                offs -= 1
                continue
            elif last in [ '.' , ',' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]
                if ellyChar.isWhiteSpace(txc) or txc in Trmls or txc in Grk:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' , ellyChar.HYPH ]:
                offs -= 1
                continue
            elif not ellyChar.isText(last):
                offs -= 1
                continue

        elif tc == cANY: # alphanumeric wildcard?
#           print ( "ANY:",last,offs )
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cCAN: # nonalphanumeric wildcard?
#           print ( 'at cCAN' )
            if last != ellyChar.AMP:
                if last == '' or not ellyChar.isLetterOrDigit(last):
                    _bind(); mbi += 1
                    continue

        elif tc == cDIG: # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF: # letter wildcard?
#           print ( "ALF:",last,offs )
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cUPR: # uppercase letter wildcard?
#           print ( "UPR:",last,'@',offs )
            if last != '' and ellyChar.isUpperCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cLWR: # lowercase letter wildcard?
#           print ( "LWR:",last,'@',offs )
            if last != '' and ellyChar.isLowerCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC: # space wildcard?
#           print ( "SPC:","["+last+"]" )
            if last != '' and ellyChar.isWhiteSpace(last):
                nsps -= 1
                _bind(); _modify(); mbi += 1
                continue
#           print ( 'NO space' )

        elif tc == cAPO: # apostrophe wildcard?
#           print ( "APO: last=" , last )
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS:
#           print ( "SOS" )
#           print ( last,'@',offs )
            mf = _bind(0); mbi += 1      # dummy record to block
            mf[0] = -1                   #   later binding consolidation
            if last != '':
                offs -= 1                # try for rematch
            m = mp                       # find corresponding EOS
            while m < ml:                #
                if patn[m] == cEOS: break
                m += 1
            else:                        # no EOS?
                m -= 1                   # if so, pretend there is one anyway
            uf = _mark(0,nsps); unj += 1 # for unwinding on any later match failure
            uf.pats = m + 1              # i.e. one char past next EOS
            uf.txts = offs               # start of text before optional match
            continue

        elif tc == cEOS:
#           print ( "EOS" )
            if last != '':
                offs -= 1                # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:
#           print ( 'spanning wildcard, offs=' , offs , 'last=(' + last + ')' )
            if last != '':               # still more to match?
                offs -= 1
#               print ( 'nsps=' , nsps )
#               print ( '@' , offs , text )
                nm = _span(tc,nsps)      # maximum match possible

#               print ( 'spanning=' , nm )
                if nm == 0:                             # compensate for findExtendedBreak peculiarity
                    if offs + 1 < limt and mp < ml:     # with closing ] or ) to be matched in pattern
                        if patn[mp] in Enc:             # from text input
                            nm += 1

#               print ( 'spanning=' , nm )
                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs      # bind from current offset
                    offs += nm        # move offset past end of span
                    bf[1] = offs      # bind to   new     offset
                    uf = _mark(1,nsps); unj += 1
                    uf.count = nm - 1 # at least one char must be matched
#                   print ( 'offs=' , offs )
                    last = text[offs] if offs < limt else ''
                    continue
#           print ( 'fail tc=' , deconvert(tc) )

        elif tc == '':
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True        # successful match
                break

        ## match failure: rewind to last match branch
        ##

#       print ( "fail - unwinding" , unj )

        while unj > 0:               # try unwinding, if possible
#           print ( "unw:",unj )
            uf = unw[unj-1]          # get most recent unwinding record
#           print ( uf )
            if uf.count <= 0:        # if available count is used up,
                unj -= 1             # go to next unwinding record
                continue
            uf.count -= 1            # decrement available count
            uf.txts -= uf.kind       # back up one char for scanning text input
            mp = uf.pats             # unwind pattern pointer
            offs = uf.txts           # unwind text input
            mbi = uf.bnds            # restore binding
            mbd[mbi-1][1] -= uf.kind # reduce span of binding if for wildcard
            nsps = uf.nsps           #
            break
        else:
#           print ( "no unwinding" )
            break                   # quit if unwinding is exhausted
#       print ( 'cnt=' , uf.count , 'off=' , offs )

    ##
    ## clean up on match mode or on no match possible
    ##

#   print ( "matched=",matched )

    if not matched: return None     # no bindings

#   print ( text,offs )

    ## consolidate contiguous bindings for subsequent substitutions

#   print ( "BEFORE consolidating consecutive bindings" )
#   print ( "bd:",len(mbd) )
#   print ( mbd[0] )
#   print ( '----' )
#   for b in mbd[1:]:
#       print ( b )

    mbdo = mbd
    lb  = -1               # binding reference
    lbd = [ 0 , -1 ]       # sentinel value, not real binding
    mbd = [ lbd ]          # initialize with new offset after matching
    mbdo.pop(0)            # ignore empty binding
    while len(mbdo) > 0:   #
        bd = mbdo.pop(0)   # get next binding
        if len(bd) > 2:
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb = -1
        elif bd[0] < 0:    # check for optional match indicator here
            lb = -1        # if so, drop from new consolidated bindings
        elif lb == bd[0]:  # check for binding continuous with previous
            lb = bd[1]     #
            lbd[1] = lb    # if so, combine with previous binding
        else:              #
            mbd.append(bd) # otherwise, add new binding
            lbd = bd       #
            lb = bd[1]     #

    mbd[0] = offs          # replace start of bindings with text length matched

#   print ( "AFTER" )
#   print ( "bd:",len(mbd) )
#   print ( mbd[0] )
#   print ( '----' )
#   for b in mbd[1:]:
#       print ( b )

    return mbd             # consolidated bindings plus new offset
Esempio n. 9
0
    def _store(self, defs, nowarn):
        """
        put macro substitutions into table with indexing by first char of pattern

        arguments:
            self   -
            defs   - list of macro definition as strings
            nowarn - whether to turn warnings off

        exceptions:
            TableFailure on error
        """

        #       print ( defs.linecount() , 'lines' )
        while True:
            l = defs.readline()  # next macro rule
            #           print ( "rule input=" , l )
            if len(l) == 0: break  # EOF check
            dl = definitionLine.DefinitionLine(l, False)
            left = dl.left  # pattern to be matched
            tail = dl.tail  # transformation to apply to match
            #           print ( 'dl.left=' , left )
            if left == None or tail == None:
                self._err(l=l)  # report missing part of rule
                continue
            if left.find(' ') >= 0:  # pattern side of macro rule
                ms = 'pattern in macro contains spaces'
                self._err(s=ms, l=l, d=1)  # cannot contain any space chars
                continue

            lefts = list(left)
            #           print ( 'left=' , lefts )
            nspm = ellyWildcard.numSpaces(lefts)
            pat = ellyWildcard.convert(
                left)  # get pattern with encoded wildcards
            if pat == None:
                self._err('bad wildcards', l)
                continue
#           print ( 'pat=' , ellyWildcard.deconvert(pat) , 'len=' , len(pat) )
#           print ( 'pat=' , list(pat) )
            pe = pat[-1]
            if not pe in [
                    ellyWildcard.cALL, ellyWildcard.cEND, ellyWildcard.cSPC
            ]:
                pat += ellyWildcard.cEND  # pattern must end in $ if it does not end in * or _
            if not _checkBindings(pat, tail):
                self._err('bad bindings in substitution', l)
                continue
            if not nowarn and not _checkExpansion(pat, tail):
                self._err('substitution may be longer than original string', l,
                          0)

#           print ( "rule =" , [ left , nspm , tail ] )
            if pat == None:
                self._err('no pattern', l)
                continue

            r = Rule(pat, nspm, tail)

            c = pat[0]  # first char of pattern
            # check type to see how to index rule
            #           print ( 'c=' , ellyWildcard.deconvert(c) , ', pat=' , ellyWildcard.deconvert(pat) )
            p = pat
            while c == ellyWildcard.cSOS:  # optional sequence?
                if not cEOS in p:
                    break
                k = p.index(cEOS)  # if so, find the end of sequence
                if k < 0 or k == 1: break  # if no end or empty sequence, stop
                k += 1
                if k == len(pat): break  # should be something after sequence
                m = ellyChar.toIndex(
                    pat[1])  # index by first char of optional sequence
                self.index[m].append(r)  #   (must be non-wildcard)
                p = p[k:]  # move up in pattern
                c = p[0]  #   but check for another optional sequence

            if c == ellyWildcard.cSOS:
                self._err(l=l)
                continue  # bad sequence, skip this rule

#           print ( 'c=' , ord(c) )
            if ellyChar.isLetterOrDigit(
                    c):  # check effective first char of pattern
                m = ellyChar.toIndex(c)
                self.index[m].append(r)  # add to index under alphanumeric char
            elif ellyChar.isText(c):
                self.index[0].append(r)  # add to index under punctuation
            elif not c in ellyWildcard.Matching:
                if c == ellyWildcard.cEND:
                    print('** macro warning: pattern can have empty match',
                          file=sys.stderr)
                    print('*  at [', l, ']', file=sys.stderr)
                else:
                    dc = '=' + str(ord(c) - ellyWildcard.X)
                    self._err('bad wildcard code', dc)
                continue
            elif c == ellyWildcard.cANY or c == ellyWildcard.cALL:
                self.anyWx.append(r)  # under general wildcards
            elif c == ellyWildcard.cCAN:
                self.index[0].append(r)  # under punctuation
            elif c == ellyWildcard.cDIG or c == ellyWildcard.cSDG:
                self.digWx.append(r)  # under digit wildcards
            elif c == ellyWildcard.cSAN:
                self.digWx.append(r)  # under both digit and
                self.letWx.append(r)  #   letter wildcards
            elif c == ellyWildcard.cAPO:  # right single quote or apostrophe
                self.apoWx.append(r)  #
            elif c == ellyWildcard.cSPC or c == ellyWildcard.cEND:
                self._err('bad wildcard in context', l)
                continue  # wildcards unacceptable here
            else:
                self.letWx.append(r)  # everything else under letter wildcard

            self.count += 1  # count up macro substitution


#           print ( 'count=' , self.count )

        if self._errcount > 0:
            print(self._errcount, 'macro errors in all', file=sys.stderr)
            print('macro table definition FAILed', file=sys.stderr)
            raise ellyException.TableFailure
Esempio n. 10
0
    def read ( self ):

        """
        get next char from input stream with filtering

        arguments:
            self

        returns:
            single Unicode char on success, null string otherwise
        """

#       print 'reading: buf=' , self.buf

        while True:

            if not self._reload():       # check if buffer empty and reload if needed
                return END               # return EOF if no more chars available

#           print 'buf=' , self.buf

            c = self.buf.pop(0)          # next raw char in buffer

            if c == SHYP:                # ignore soft hyphen
                if len(self.buf) > 0:
                    if self.buf[0] == SP:
                        c = self.buf.pop(0)
                continue

            if not ellyChar.isText(c):   # unrecognizable Elly char?
#               print 'c=' , '{0:04x}'.format(ord(c))
                if ellyChar.isCJK(c):
                    c = '_'              # special handling for Chinese
                else:
#                   print 'replace' , c , 'with NBSP'
                    c = NBSP             # by default, replace with no-break space

            lc = self._lc                # copy saved last char
#           print 'lc=' , ord(lc)
            self._lc = c                 # set new last char

#           if c == "'":
#               print 'apostrophe' , self.buf

#           print 'c=' , '<' + c + '>'

            if c == HYPH:                # special treatment for isolated hyphens
                if spc(lc) and spc(self.peek()):
                    c = DASH
                break
            elif c == '.':               # check for ellipsis
                bb = self.buf
                bl = len(bb)
#               print 'bl=' , bl , 'bb=' , bb
                if bl >= 2 and bb[0] == '.' and bb[1] == '.':
                    self.buf = bb[2:]
                    c = ELLP
                elif bl >= 4 and bb[0] == ' ' and bb[1] == '.' and bb[2] == ' ' and bb[3] == '.':
                    self.buf = bb[4:]
                    c = ELLP
                break
            elif c == RSQm:              # check for single quote
#               print 'at single quote'
                nc = self.peek()         # look at next char
#               print 'next=' , nc
                if nc == RSQm:           # doubling of single quote?
                    self.buf.pop(0)      # if so, combine two single quotes
                    c = RDQm             # into one double quote
            elif not ellyChar.isWhiteSpace(c):
                if ellyChar.isWhiteSpace(lc):
                    self._cap = ellyChar.isUpperCaseLetter(c)
                break
            elif c == CR:                # always ignore
                continue
            elif c == NL:                # special handling of \n
#               print 'got NL'
                nc = self.peek()         # look at next char

                while nc == CR:
                    self.buf.pop(0)      # skip over CR's
                    nc = self.peek()
#               print "lc= '" + lc + "'"
                if lc != NL and nc == NL:
                    self.buf.pop(0)      # special case when NL can be returned
                    break

                if nc == NL:             # NL followed NL?
                    while nc == NL or nc == CR:
                        self.buf.pop(0)  # ignore subsequent new line chars
                        nc = self.peek()
                elif nc == END or ellyChar.isWhiteSpace(nc):
                    continue             # NL followed by space is ignored
                elif nc == u'.' or nc == u'-':
                    pass
                else:
#                   print 'NL to SP, lc=' , ord(lc)
                    c = SP               # convert NL to SP if not before another NL
            else:
#               print 'lc=' , ord(lc) , 'c=' , ord(c)
                c = SP                   # otherwise, convert white space to plain space

            self._cap = False

            if not ellyChar.isWhiteSpace(lc): # preceding char was not white space?
#               print 'return SP'
                break                    # if so, keep space in stream

        return c                         # next filtered char
Esempio n. 11
0
    def read ( self ):

        """
        get next char from input stream with filtering

        arguments:
            self

        returns:
            single Unicode char on success, null string otherwise
        """

#       print 'reading: buf=' , self.buf

        while True:

            if not self._reload():       # check if buffer empty and reload if needed
                return END               # return EOF if no more chars available

#           print 'buf=' , self.buf

            c = self.buf.pop(0)          # next raw char in buffer

            if not ellyChar.isText(c):   # unrecognizable Elly char?
#               print 'c=' , ord(c)
                c = NBSP                 # if so, replace with no-break space

            lc = self._lc                # copy saved last char
#           print 'lc=' , ord(lc)
            self._lc = c                 # set new last char

#           if c == "'":
#               print 'apostrophe' , self.buf

            if c == HYPH:                # special treatment for isolated hyphens
                if spc(lc) and spc(self.peek()):
                    c = DASH
                break
            elif not ellyChar.isWhiteSpace(c):
                break
            elif c == CR:                # always ignore
                continue
            elif c == NL:                # special handling of \n
#               print 'got NL'
                nc = self.peek()         # look at next char

                while nc == CR:
                    self.buf.pop(0)      # skip over CR's
                    nc = self.peek()
#               print "lc= '" + lc + "'"
                if lc != NL and nc == NL:
                    self.buf.pop(0)      # special case when NL can be returned
                    break

                if nc == NL:             # NL followed NL?
                    while nc == NL or nc == CR:
                        self.buf.pop(0)  # ignore subsequent new line chars
                        nc = self.peek()
                elif nc == END or ellyChar.isWhiteSpace(nc):
                    continue             # NL followed by space is ignored
                elif nc == u'.' or nc == u'-':
                    pass
                else:
#                   print 'NL to SP, lc=' , ord(lc)
                    c = SP               # convert NL to SP if not before another NL
            else:
#               print 'lc=' , ord(lc) , 'c=' , ord(c)
                c = SP                   # otherwise, convert white space to plain space

            if not ellyChar.isWhiteSpace(lc): # preceding char was not white space?
#               print 'return SP'
                break                    # if so, keep space in stream

        return c                         # next filtered char