Example #1
    def run ( self , segm ):

        """
        execute each extractor and store results

        arguments:
            self  -
            segm  - input buffer

        returns:
            number of chars matched on success, 0 otherwise
        """

        mx = 0
        ms = [ ]
        capd = ellyChar.isUpperCaseLetter(segm[0])
        for xr in self.exs:       # try each extraction procedure in order
            m = xr[0](segm)       # apply extraction procedure to segment
            if m > 0:             # match?
                if mx > m:        # if so, it has to be longer than the longest previous
                    continue
                elif mx < m:      # if longer than longest previous, discard the previous
                    mx = m
                    ms = [ ]
                ms.append(xr[1:]) # add to match list
        if mx > 0:                # any matches?
            for mr in ms:         # if so, make phrases for them
                sbs = mr[2] if len(mr) > 2 else noBits
                bia = mr[3] if len(mr) > 3 else 0
                if self.ptr.addLiteralPhraseWithSemantics(mr[0],mr[1],sbs,bia,None,False,capd):
                    self.ptr.lastph.lens = mx
        return mx
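
A self-contained sketch of the longest-match selection performed above, with invented extractor signatures (the real entries in self.exs also carry syntactic and semantic data for phrase building):

def longest_matches(extractors, segm):
    # keep only the extractor results that tie for the longest match
    mx = 0
    ms = []
    for proc, tag in extractors:
        m = proc(segm)      # each procedure reports a matched char count
        if m <= 0:
            continue
        if m < mx:          # shorter than current best: ignore
            continue
        if m > mx:          # new best: discard earlier results
            mx = m
            ms = []
        ms.append(tag)
    return mx, ms

# hypothetical extractors for illustration only
exs = [ (lambda s: 3 if s[:3] == list('abc') else 0, 'ABC') ,
        (lambda s: 2 if s[:2] == list('ab')  else 0, 'AB' ) ]
print(longest_matches(exs, list('abcdef')))   # -> (3, ['ABC'])
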
Example #2
    def run(self, segm):
        """
        execute each extractor and store results

        arguments:
            self  -
            segm  - input buffer

        returns:
            number of chars matched on success, 0 otherwise
        """

        mx = 0
        ms = []
        capd = ellyChar.isUpperCaseLetter(segm[0])
        for xr in self.exs:  # try each extraction procedure in order
            m = xr[0](segm)  # apply extraction procedure to segment
            if m > 0:  # match?
                if mx > m:  # if so, it has to be longer than the longest previous
                    continue
                elif mx < m:  # if longer than longest previous, discard the previous
                    mx = m
                    ms = []
                ms.append(xr[1:])  # add to match list
        nmatch = 0
        if mx > 0:  # any matches?
            for mr in ms:  # if so, make phrases for them
                sbs = mr[2] if len(mr) > 2 else noBits
                bia = mr[3] if len(mr) > 3 else 0
                if self.ptr.addLiteralPhraseWithSemantics(
                        mr[0], mr[1], sbs, bia, None, False, capd):
                    self.ptr.lastph.lens = mx
                    nmatch += 1

        return mx if nmatch > 0 else 0
Example #3
def acronym ( buffr ):

    """
    recognize parenthesized introduction of acronym in text

    arguments:
        buffr - current contents as list of chars

    returns:
        char count matched on success, 0 otherwise
    """

    lb = len(buffr)
    if lb > Lmax: lb = Lmax
    if lb < Lmin or buffr[0] != '(': return 0

    nu = 0          # uppercase count
    ib = 1
    while ib < lb:
        bc = buffr[ib]
        ib += 1
        if bc == ')':
            break
        if not ellyChar.isLetter(bc): return 0
        if ellyChar.isUpperCaseLetter(bc): nu += 1
    else:
        return 0    # must have enclosing ')'

    if ib < Lmin or ib - 2*nu > 0: return 0
    if len(buffr) > ib and ellyChar.isLetterOrDigit(buffr[ib]): return 0

    return ib
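
Lmin, Lmax, and the ellyChar predicates above are module-level context not shown in this snippet. A rough standalone equivalent using plain str methods, with assumed bounds:

LMIN, LMAX = 3, 10          # assumed bounds; the real Lmin/Lmax are module constants

def acronym_sketch(buffr):
    # recognize '(' + letters + ')' at the start of the buffer
    lb = min(len(buffr), LMAX)
    if lb < LMIN or buffr[0] != '(':
        return 0
    nu = 0                  # uppercase count
    ib = 1
    while ib < lb:
        bc = buffr[ib]
        ib += 1
        if bc == ')':
            break
        if not bc.isalpha():
            return 0
        if bc.isupper():
            nu += 1
    else:
        return 0            # no enclosing ')'
    if ib < LMIN or ib - 2*nu > 0:   # mostly-uppercase test, parens counted in ib
        return 0
    if len(buffr) > ib and buffr[ib].isalnum():
        return 0
    return ib

print(acronym_sketch(list('(NASA) launched')))   # -> 6
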
Example #4
def matchtoo(txt, pnc, ctx):
    """
    complex checks - currently only for rightmost period of A.M. or P.M.

    arguments:
        txt   - list of text chars leading up to punctuation char
        pnc   - punctuation char
        ctx   - list of chars in context after punctuation

    returns:
        True on match, False otherwise
    """

    ln = len(txt)
    #   print ( 'nomatch() ln=' , ln , txt )
    nxt = ctx[0] if len(ctx) > 0 else ''
    if pnc != '.' or not ellyChar.isWhiteSpace(nxt) or ln < 5:
        return False
#   print ( 'check' , txt[-3:] )
    if not txt[-1] in ['M', 'm'] or txt[-2] != '.' or not txt[-3] in [
            'P', 'p', 'A', 'a'
    ] or txt[-4] != ' ':
        return False
    ch = txt[-5]
    #   print ( 'ch=' , ch )
    if ellyChar.isDigit(ch):  # only 1 digit will be checked here!
        #       print ( 'ONE DIGIT' )
        return True  # erring on the side of not to break sentence
    elif not ellyChar.isLetter(ch):
        return False

#
#   the following code is needed only when number transforms are turned off
#

    nn = 6
    while nn <= ln and ellyChar.isLetter(txt[-nn]):
        nn += 1

#   print ( 'nn=' , nn )
    if nn < 3 or nn > 6:
        return False
    elif nn > ln:
        if not txt[-nn] in [' ', '-']:
            return False
    wd = ''.join(txt[:-nn]).lower()

    #   print ( 'wd=' , wd )
    if wd in [
            'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight',
            'nine', 'ten', 'eleven', 'twelve'
    ]:
        if len(ctx) < 2 or ellyChar.isUpperCaseLetter(ctx[1]):
            return False
        else:
            return True
    else:
        return False
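
The rule encoded here: a period closing A.M. or P.M. that follows a digit or a spelled-out hour should not end the sentence. A compact regex restatement of roughly that rule, offered only as a sketch (it skips the word-length and following-capital checks of the original):

import re

# digit or spelled-out hour, then space, then A.M or P.M at the end of txt
_AMPM = re.compile(r'(\d|\b(one|two|three|four|five|six|seven|eight|nine'
                   r'|ten|eleven|twelve)) [ap]\.m$', re.IGNORECASE)

def looks_like_am_pm(txt, pnc, ctx):
    # txt: chars before punctuation; pnc: punctuation char; ctx: chars after
    if pnc != '.' or not ctx or not ctx[0].isspace():
        return False
    return _AMPM.search(''.join(txt)) is not None

print(looks_like_am_pm(list('see you at 5 P.M'), '.', list(' or so')))  # -> True
print(looks_like_am_pm(list('that is all'), '.', list(' Next')))        # -> False
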
Example #5
def matchtoo ( txt , pnc , ctx ):

    """
    complex checks - currently only for rightmost period of A.M. or P.M.

    arguments:
        txt   - list of text chars leading up to punctuation char
        pnc   - punctuation char
        ctx   - list of chars in context after punctuation

    returns:
        True on match, False otherwise
    """

    ln = len(txt)
#   print 'nomatch() ln=' , ln , txt
    nxt = ctx[0] if len(ctx) > 0 else ''
    if pnc != '.' or not ellyChar.isWhiteSpace(nxt) or ln < 5:
        return False
#   print 'check' , txt[-3:]
    if not txt[-1] in ['M','m'] or txt[-2] != '.' or not txt[-3] in ['P','p','A','a'] or txt[-4] != ' ':
        return False
    ch = txt[-5]
#   print 'ch=' , ch
    if ellyChar.isDigit(ch):        # only 1 digit will be checked here!
#       print 'ONE DIGIT'
        return True                 # erring on the side of not to break sentence
    elif not ellyChar.isLetter(ch):
        return False

#
#   the following code is needed only when number transforms are turned off
#

    nn = 6
    while nn <= ln and ellyChar.isLetter(txt[-nn]):
        nn += 1

#   print 'nn=' , nn
    if nn < 3 or nn > 6:
        return False
    elif nn > ln:
        if not txt[-nn] in [ ' ' , '-' ]:
            return False
    wd = ''.join(txt[:-nn]).lower()

#   print 'wd=' , wd
    if wd in [ 'one' , 'two' , 'three' , 'four' , 'five' , 'six' , 'seven' ,
               'eight' , 'nine' , 'ten' , 'eleven' , 'twelve' ]:
        if len(ctx) < 2 or ellyChar.isUpperCaseLetter(ctx[1]):
            return False
        else:
            return True
    else:
        return False
Example #6
def _planAhead ( buf ):

    """
    check for possible problems in the next scan while context
    is still available and set flags if needed

    arguments:
        buf  - buffer to be scanned
    """

    global _toscan

    nsk = 0                     # total skip count
    lb = len(buf)
    if lb > 4:
        if buf[0] == '(':       # skip initial '('
            nsk += 1
            buf = buf[1:]
        if buf[0] == '"':       # skip initial '"'
            nsk += 1
            buf = buf[1:]
        lb -= nsk

    nix = 0                    # scan count
    if lb > 8:
        for chx in buf:        # go to first non-letter
            if not ellyChar.isLetter(chx):
                if ellyChar.isWhiteSpace(chx):
                    break      # must be space
                return
            nix += 1

        sst = ''.join(buf[:nix]).lower()
        if not sst in _det:
            return            # must find determiner

        nix += 1              # skip space
        if ellyChar.isUpperCaseLetter(buf[nix]):
            nix += 1          # skip first letter
            buf = buf[nix:]
            for ch in buf:    # go to next non-letter
                if not ellyChar.isLetter(ch):
                    if ellyChar.isWhiteSpace(ch):
                        break
                    return
                nix += 1

            _toscan = lb + nsk - nix
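
_det is a module-level determiner set and _toscan a module-level count picked up by the next scan. A loose standalone restatement of the quantity computed, with an assumed determiner set and without the letter-by-letter validation:

_DET = {'a', 'an', 'the', 'this', 'that'}   # assumed determiner set

def remaining_after_name(buf):
    # chars left after 'determiner + Capitalized word' at the start, else None
    text = ''.join(buf)
    words = text.split(' ')
    if len(words) < 3:
        return None
    if words[0].lower() not in _DET or not words[1][:1].isupper():
        return None
    consumed = len(words[0]) + 1 + len(words[1])
    return len(text) - consumed     # roughly the value stored in _toscan

print(remaining_after_name(list('the Dow rallied sharply')))   # -> 16
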
Example #7
def title ( buffr ):

    """
    recognize double-quoted title in text

    arguments:
        buffr - current contents as list of chars

    returns:
        char count matched on success, 0 otherwise
    """

    lb = len(buffr)
    if lb > Tmax: lb = Tmax
    if lb < Tmin: return 0
    qm = buffr[0]
    if qm != aDQ and qm != lDQ: return 0

    ib = 1
    while ib < lb:
        bc = buffr[ib]
        ib += 1
        if bc == rDQ:
            break
        if not ellyChar.isUpperCaseLetter(bc): return 0

        while ib < lb:
            bc = buffr[ib]
            ib += 1
            if bc == ' ': break
            if qm == aDQ:
                if bc == aDQ: break
            else:
                if bc == rDQ: break
            if bc in [ '!' , '?' ]:
                return 0
        if bc == rDQ or bc == aDQ: break
    else:
        return 0    # must have enclosing rDQ or aDQ

    if ib < Tmin: return 0
    if len(buffr) > ib and ellyChar.isLetterOrDigit(buffr[ib]): return 0

    return ib
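
aDQ, lDQ, and rDQ are the module's straight and curly double-quote constants, and Tmin/Tmax its length bounds. An ASCII-only sketch with assumed bounds:

TMIN, TMAX = 4, 60          # assumed bounds; the real Tmin/Tmax are module constants

def title_sketch(buffr):
    # match '"' + capitalized words + '"' at the start of the buffer
    lb = min(len(buffr), TMAX)
    if lb < TMIN or buffr[0] != '"':
        return 0
    ib = 1
    while ib < lb:
        if not buffr[ib].isupper():      # each word must start uppercase
            return 0
        while ib < lb:                   # scan rest of word
            bc = buffr[ib]
            ib += 1
            if bc in ' "':
                break
            if bc in '!?':
                return 0
        if bc == '"':
            break
    else:
        return 0                         # no closing quote
    if ib < TMIN:
        return 0
    if len(buffr) > ib and buffr[ib].isalnum():
        return 0
    return ib

print(title_sketch(list('"War And Peace" is long')))   # -> 15
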
Example #8
def infer ( tok ):

    """
    infer a token as a possible name component with
    side effect of converting it to lowercase ASCII

    arguments:
        tok  - token as list of chars

    returns:
        True if inferred, False otherwise
    """

#   print 'inferring tok=' , tok
    nch = len(tok)
    if (nch < 5 or not ellyChar.isUpperCaseLetter(tok[0]) or
        len(ellyConfiguration.digraph) == 0): return False

    ellyChar.toLowerCaseASCII(tok,True)
    toks = ''.join(tok)

    miss = 0
    last = ''
    for i in range(1,nch):   # check plausibility of all digraphs
        digr = toks[i-1:i+1]
#       print 'digr=' , digr , 'last=' , last
        if (digr == last or
            not digr in ellyConfiguration.digraph):
            miss += 1
        last = digr

#   print 'miss=' , miss
    if nch < 7:
        return (miss == 0)
    else:
        return (miss <= 1)
Example #9
def infer ( tok ):

    """
    infer a token as a possible name component with
    side effect of converting it to lowercase ASCII

    arguments:
        tok  - token as list of chars

    returns:
        True if inferred, False otherwise
    """

#   print ( 'inferring tok=' , tok )
    nch = len(tok)
    if (nch < 5 or not ellyChar.isUpperCaseLetter(tok[0]) or
        len(ellyConfiguration.digraph) == 0): return False

    ellyChar.toLowerCaseASCII(tok,True)
    toks = ''.join(tok)

    miss = 0
    last = ''
    for i in range(1,nch):   # check plausibility of all digraphs
        digr = toks[i-1:i+1]
#       print ( 'digr=' , digr , 'last=' , last )
        if (digr == last or
            not digr in ellyConfiguration.digraph):
            miss += 1
        last = digr

#   print ( 'miss=' , miss )
    if nch < 7:
        return (miss == 0)
    else:
        return (miss <= 1)
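
ellyConfiguration.digraph is a table of letter pairs considered plausible in names. A self-contained approximation with a toy digraph set (the set is invented for illustration, and the in-place ASCII-conversion side effect is omitted):

DIGRAPHS = {'an','na','ta','al','li','ia','ha','ar','ra','on','no','er','re'}  # toy set

def infer_sketch(tok):
    # accept a capitalized token whose letter pairs mostly look plausible
    if len(tok) < 5 or not tok[0].isupper():
        return False
    s = ''.join(tok).lower()
    miss = 0
    last = ''
    for i in range(1, len(s)):
        digr = s[i-1:i+1]
        if digr == last or digr not in DIGRAPHS:   # repeats also count as misses
            miss += 1
        last = digr
    return miss == 0 if len(s) < 7 else miss <= 1

print(infer_sketch(list('Natalia')))   # -> True with this toy digraph set
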
Example #10
    def match ( self , segm , tree ):

        """
        compare text segment against FSA patterns

        arguments:
            self  -
            segm  - segment to match against
            tree  - parse tree in which to put leaf nodes for final matches

        returns:
            text length matched by FSA
        """

#       print 'comparing' , segm

        if len(self.indx) == 0: return 0  # no matches if FSA is empty

        lim = bound(segm)                 # get limit for matching

        mtl  = 0        # accumulated match length
        mtls = 0        # saved final match length

        state = 0       # set to mandatory initial state for FSA

        stk = [ ]       # for tracking multiple possible matches

        ls = self.indx[state]
        ix = 0
        sg = segm[:lim] # text subsegment for matching
        capd = False if len(sg) == 0 else ellyChar.isUpperCaseLetter(sg[0])

        while True:                 # run FSA to find all possible matches
#           print 'state=' , state
#           print 'count=' , mtl , 'matched so far'
#           print 'links=' , len(ls)
            nls = len(ls)           # how many links from current state

            if ix == nls:           # if none, then must back up
                if len(stk) == 0: break
                r = stk.pop()       # restore match status
                state = r[0]        # FSA state
                ls  = r[1]          # remaining links to check
                sg  = r[2]          # input string
                mtl = r[3]          # total match length
                ix = 0
                continue

            m = 0
            while ix < nls:
                lk = ls[ix]         # get next link at current state
                ix += 1             # and increment link index
#               print 'lk= [' , unicode(lk), '] , sg=' , sg
                if lk.patn == u'\x00': # do state change without matching?
                    m = 0           # no match length
                else:
#                   print 'match lk=' , unicode(lk) , 'sg=' , sg
                    bds = ellyWildcard.match(lk.patn,sg)
                    if bds == None: continue
#                   print 'bds=' , bds

                    m = bds[0]      # get match length, ignore wildcard bindings

                    if lk.nxts < 0: # final state?
#                       print 'flags=' , lk.synf , '/' , lk.semf
                        if tree.addLiteralPhraseWithSemantics(lk.catg,lk.synf,lk.semf,lk.bias,
                                                              cap=capd): # make phrase
                            mtls = mtl + m
                            tree.lastph.lens = mtls                      # save its length
#                           print 'match state=' , state , 'length=' , mtls
#                       else:
#                           print 'lastph=' , tree.lastph
#                           print 'seg=' , sg
#                           print 'cat=' , lk.catg, 'synf=' , lk.synf

#               print 'ix=' , ix , 'nls=' , nls
                if ix < nls:        # any links not yet checked?
                    r = [ state , ls[ix:] , sg , mtl ]
#                   print 'r=' , r
                    stk.append(r)   # if not, save info for later continuation

                mtl += m            # update match length
                break               # leave loop at this state, go to next state
            else:
#               print 'no matches'
                continue            # all patterns exhausted for state

            ix = 0
            sg = sg[m:]             # move up in text input
            state = lk.nxts         # next state
            if state < 0:
                ls = [ ]
            else:
                ls = self.indx[state]
#           print 'sg=' , sg
#           print 'state=' , state
#           print 'len(ls)=' , len(ls)

        return mtls
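
The link records (lk.patn, lk.nxts) come from PyElly's compiled FSA tables. A stripped-down illustration of the same save-and-backtrack scheme over a toy transition table, with literal string patterns in place of wildcards (all names invented):

# toy FSA: state -> list of (literal pattern, next state); -1 marks final
INDX = {
    0 : [ ('ab', 1) , ('a', 2) ],
    1 : [ ('c', -1) ],
    2 : [ ('bd', -1) ],
}

def fsa_match(segm):
    best = 0
    stack = [ (0, 0, 0) ]          # (state, link index, chars matched)
    while stack:
        state, ix, mtl = stack.pop()
        if state < 0:              # final state reached: record match length
            best = max(best, mtl)
            continue
        links = INDX.get(state, [])
        if ix >= len(links):       # no links left at this state
            continue
        stack.append((state, ix + 1, mtl))       # save next link for backtracking
        patn, nxts = links[ix]
        if segm[mtl:mtl+len(patn)] == patn:      # literal comparison stands in
            stack.append((nxts, 0, mtl + len(patn)))  # for wildcard matching
    return best

print(fsa_match('abd'))   # -> 3 via 'a' to state 2, then 'bd'
print(fsa_match('abc'))   # -> 3 via 'ab' to state 1, then 'c'
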
Example #11
    def getNext ( self ):

        """
        extract next sentence for Elly translation from input stream

        arguments:
            self

        returns:
            list of chars for next sentence on success, None on empty stream
        """

#       print 'getNext'

        self.resetBracketing()
        inBrkt = False

        nspc = 0           # set space count

        sent = [ ]         # list buffer to fill

        x  = self.inp.read()
        if x == SP:
            x = self.inp.read()

        if x == END:       # EOF check
            return None

        c  = END           # reset
        lc = END

#       print 'x=' , '<' + x + '>' , ord(x)
        self.inp.unread(x,SP)       # put first char back to restore input
#       print '0  <<' , self.inp.buf

        # fill sentence buffer up to next stop punctuation in input

        nAN = 0                     # alphanumeric count in sentence

        while True:

            x = self.inp.read()     # next input char

            if x == END:            # handle any EOF
                break

#           print 'x=' , '<' + x + '>' , 'c=' , '<' + c + '>'
#           print 'sent=' , sent , 'nspc=' , nspc

            # check for table delimiters in text

            if len(sent) == 0:
#               print 'table'
#               print '1  <<' , self.inp.buf

                if x == u'.' or x == u'-':      # look for multiple '.' or '-'
                    while True:                 # scan up to end of current buffering
                        y = self.inp.read()     #
                        if y != x and y != SP:  # no more delimiter chars or spaces?
                            self.inp.unread(y)  # if so, done
                            break               #
                    continue                    # ignore everything seen so far

            ####################################################
            # accumulate chars and count alphanumeric and spaces
            ####################################################

            lc = c
            c  = x
            nc = self.inp.peek()
            if ellyChar.isWhiteSpace(nc): nc = SP

#           print 'lc=' , '<' + lc + '>, nc=' , '<' + nc + '>'
            if lc == SP or lc == END: # normalize chars for proper bracketing
                if x == SQuo:         #
                    x = LSQm          # a SQuo preceded by a space becomes LSQm
                elif x == DQuo:       #
                    x = LDQm          # a DQuo preceded by a space becomes LDQm
            if nc == SP or nc == END: #
                if x == SQuo:         # a SQuo followed by a space becomes RSQm
                    x = RSQm          #
                elif x == DQuo:       # a DQuo followed by a space becomes RDQm
                    x = RDQm          #
            elif not ellyChar.isLetterOrDigit(nc):
                if x == SQuo:         # a SQuo followed by nonalphanumeric becomes RSQm
                    x = RSQm          #
                elif x == DQuo:       # a DQuo followed by nonalphanumeric becomes RDQm
                    x = RDQm          #
            elif ellyChar.isWhiteSpace(c) and inBrkt:
                nspc += 1

            svBrkt = inBrkt
            inBrkt = self.checkBracketing(x)    # do bracketing check with modified chars
            if svBrkt and not inBrkt: nspc = 0

#           print 'lc=' , '<' + lc + '>' , 'bracketing x=' , '<' + x + '>' , inBrkt

            sent.append(c)                      # put original char into sentence buffer
            if ellyChar.isLetterOrDigit(c):
                nAN += 1
                continue                        # if alphanumeric, just add to sentence

            if c == SP:
                continue                        # if space, just add to sentence

            # NL will break a sentence

            if c == NL:
                sent.pop()                      # remove from sentence chars
                break

            # char was not alphanumeric or space
            # look for stop punctuation exception

            cx = self.inp.preview()  # for context of match call

#           print '0  <<' , self.inp.buf

#           print 'sent=' , sent[:-1]
#           print 'punc=' , '<' + c + '>'
#           print 'next=' , cx
            if c in Stops and len(cx) > 0 and cx[0] == SP:
                if self.stpx.match(sent[:-1],c,cx):
#                   print 'stop exception MATCH'
                    if self.drop:
                        sent.pop()   # remove punctuation char from sentence
                        lc = SP
                    continue

#           print 'no stop exception MATCH for' , c

#           print '@1  <<' , self.inp.buf

            # handle any nonstandard punctuation

            exoticPunctuation.normalize(c,self.inp)

#           print '@2  <<' , self.inp.buf

            # check for dash

            if c == u'-':
                d = self.inp.read()
                if d == u'-':
#                   print 'dash'
                    while True:
                        d = self.inp.read()
                        if d != u'-': break
                    sent.append(c)
                self.inp.unread(d)
                continue

            # check for sentence break on punctuation

#           print '@3  c=' , c , inBrkt

            if c in QUOs or c in RBs:

                # special check for single or double quotes or
                # bracketing, which can immediately follow stop
                # punctuation for current sentence

#               print 'bracketing c=' , c , ord(c) , inBrkt , 'at' , len(sent)

                if not inBrkt:
#                   print sent , 'so far'
                    z = self.inp.read()
                    if self.shortBracketing(sent,z):
                        break
                    self.inp.unread(z)
#                   print 'z=' , '[' + z + ']' , 'lc=' , '[' + lc + ']'
                    if z == END or ellyChar.isWhiteSpace(z) and lc in Stops:
                        if nAN > 1:
                            break
                elif c in QUOs and lc in Stops:
#                   print 'stop+quote'
                    z = self.inp.read()
                    if z in RBs:
                        sent.append(z)
                        y = self.inp.read()
                        if y in Stops:
                            sent.append(y)
                        elif not ellyChar.isWhiteSpace(y):
                            self.inp.unread(y)
                        inBrkt = False
                        break
                    elif z in QUOs:
#                       print 'stop+quote+quote'
                        sent.append(z)
                        inBrkt = False
                        break
                    self.inp.unread(z)
#               print 'continue'
                continue

            elif not c in Stops:
                continue

            else:
#               print 'check stopping!'
                d = self.inp.read()
#               print '@3  <<' , self.inp.buf

                if d == None: d = u'!'
#               print 'stop=' , '<' + c + '> <' + d + '>'

#               print 'ellipsis check'
                if c == u'.' and c == d:
                    if self.inp.peek() != c: # look for third '.' in ellipsis
                        self.inp.unread(d)   # if none, keep only first '.'
                    else:
                        self.inp.skip()      # found ellipsis
                        sent.append(d)       # complete it in sentence buffer
                        sent.append(d)       #
                        x = self.inp.peek()  # look at char after ellipsis
                        if ellyChar.isCombining(x):
                            sent.append(SP)  # if part of token, put in space as separator
                    continue

                if c == ELLP:
#                   print 'found Unicode ellipsis, d=' , d
                    if ellyChar.isUpperCaseLetter(d):
                        self.inp.unread(d)   # super special case of bad punctuation
                        self.inp.unread(' ') # put in implied period and space
                        self.inp.unread('.') #

                # special check for multiple stops

#               print 'next char d=' , d , ord(d) if d != END else 'NONE'
                if d in Stops:
                    while True:
                        d = self.inp.read()
                        if not d in Stops: break
                    self.inp.unread(d)
                    if not ellyChar.isWhiteSpace(d):
                        d = SP               # make rightside context for stop

                # special check for blank or null after stops

                elif d != END and not ellyChar.isWhiteSpace(d):
                    if self.shortBracketing(sent,d): break
                    if d in self._cl and self._cl[d] == 1:
                        dn = self.inp.peek()
                        if ellyChar.isWhiteSpace(dn):
                            sent.append(d)
                            break
                    self.inp.unread(d)
#                   print 'no space after punc'
                    continue

                # if no match for lookahead, put back

                elif d != END:
#                   print 'unread d=' , d
                    self.inp.unread(d)

#               print 'possible stop'

                # check special case of number ending in decimal point

                if c == '.':
                    ixb = len(sent) - 2
                    ixn = ixb + 1
                    cxn = ''
#                   print 'sent=' , sent
#                   print 'ixn=' ,ixn
                    while ixn > 0:
                        ixn -= 1
                        cxn = sent[ixn]
#                       print 'cxn=' , cxn
                        if not ellyChar.isDigit(cxn): break
#                   print 'break: ixn=' , ixn , 'ixb=' , ixb
                    if ixn < ixb and cxn in [ ' ' , '-' , '+' ]:
                        prvw = self.inp.preview()
#                       print 'prvw=' , prvw
                        if len(prvw) > 1 and not ellyChar.isUpperCaseLetter(prvw[1]):
                            continue

                # final check: is sentence long enough?

                if inBrkt:
#                   print 'c=' , '<' + c + '>' , 'd=' , '<' + d + '>' , 'preview=' , self.inp.preview()
#                   print 'nspc=' , nspc
                    if c in [ ':' , ';' ] or nspc < 3:
                        sent.append(d)
#                       print 'add' , '<' + d + '> to sentence'
#                       print 'sent=' , sent
                        self.inp.skip()
                        nspc -= 1
                        continue

#               print '@4  <<' , self.inp.buf
                cx = self.inp.peek()
                if cx == None: cx = u'!!'
#               print 'sentence break: next=' , '<' + cx + '>' , len(cx) , sent
#               print 'nAN=' , nAN , 'inBrkt=' , inBrkt
                if nAN > 1:
                    break

        if sent == [ u'\u2026' ]:  # special case of sentence
            return list("-.-")     # with lone ellipsis
        elif len(sent) > 0 or self.last != END:
            return sent
        else:
            return None
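
Stripped of bracketing, ellipsis, and exception handling, the core loop above accumulates chars and breaks on stop punctuation followed by whitespace once at least two alphanumerics have been seen. A toy reduction (names invented):

STOPS = set('.!?')

def next_sentence(stream):
    # accumulate chars up to stop punctuation followed by whitespace
    sent = []
    n_alnum = 0
    for i, c in enumerate(stream):
        sent.append(c)
        if c.isalnum():
            n_alnum += 1
            continue
        nxt = stream[i+1] if i + 1 < len(stream) else ' '
        if c in STOPS and nxt.isspace() and n_alnum > 1:
            break
    return sent

print(''.join(next_sentence('Dr. Smith arrived. Then we left.')))   # -> 'Dr.'

The toy version wrongly breaks after 'Dr.', which is exactly the kind of false stop the stpx.match() exception check in the real code is there to suppress.
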
Example #12
def match ( patn , text , offs=0 , limt=None , nsps=0 ):

    """
    compare a pattern with wildcards to input text

    arguments:
        patn  - pattern to be matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit for any matching
        nsps  - number of spaces to match in pattern

    returns:
        bindings if match is successful, None otherwise
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
            nsps   - saved count of spaces matched
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - kind of unwinding
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1
            self.nsps  = 0

        def __str__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self
            returns:
                attributes as a display string
            """
            return ( '[kd=' + str(self.kind)  +
                     ',ct=' + str(self.count) +
                     ',pa=' + str(self.pats)  +
                     ',tx=' + str(self.txts)  +
                     ',bd=' + str(self.bnds)  +
                     ',ns=' + str(self.nsps)  + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # four private functions using local variables of match() defined just above
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame
        arguments:
            ns  - optional initial span of text for binding
        returns:
            binding frame
        """
#       print ( "binding:",offs,ns )
        os = offs - 1
        if ns == None: ns = 1 # by default, binding is to 1 char
        if mbi == len(mbd):   # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]         # next available record
        bf[0] = os            # set binding to range of chars
        bf[1] = os + ns       #
        return bf

    def _modify ( ):
        """
        set special tag for binding
        arguments:

        """
        mbd[mbi].append(None)

    def _mark ( kind , nsp ):
        """
        set up for backing up pattern match
        arguments:
            kind  - 0=optional 1=* wildcard
            nsp   - number of spaces in pattern still unmatched
        returns:
            unwinding frame
        """
        if unj == len(unw): # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]       # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp
        uf.txts  = offs
        uf.bnds  = mbi
        uf.nsps  = nsp
        return uf

    def _span ( typw , nsp=0 ):
        """
        count chars available for wildcard match
        arguments:
            typw - wildcard
            nsp  - spaces to be matched in pattern
        returns:
            non-negative count if any match possible, otherwise -1
        """
#       print ( "_span: typw=" , '{:04x}'.format(ord(typw)) , deconvert(typw) )
#       print ( "_span: txt @",offs,"pat @",mp,"nsp=",nsp )
#       print ( "text to span:",text[offs:] )
#       print ( "pat rest=" , patn[mp:] )
        k = minMatch(patn[mp:])                # calculate min char count to match rest of pattern

#       print ( "exclude=",k,"chars from possible span for rest of pattern" )

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findExtendedBreak(text,offs,nsp)
#       print ( mx,"chars available to scan" )
        mx -= k                                # max span reduced by exclusion
        if mx < 0: return -1                   # cannot match if max span < 0

        tfn = Matching[typw]                   # matchup function for wildcard type

#       print ( "text at",offs,"maximum wildcard match=",mx )

        nm = 0
        for i in range(mx):
            c = text[offs+i]                   # next char in text from offset
#           print ( 'span c=' , c )
            if not tfn(c): break               # stop when it fails to match
            nm += 1

#       print ( "maximum wildcard span=",nm )

        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False  # successful pattern match yet?

    if limt == None: limt = len(text)

#   print ( 'starting match, limt=',limt,text[offs:limt],":",patn )
#   print ( 'nsps=' , nsps )

    mp = 0           # pattern index
    ml = len(patn)   # pattern match limit
    last = ''

    while True:

        ## literally match as many next chars as possible

#       print ( '---- loop mp=' , mp , 'ml=' , ml )
        while mp < ml:
            if offs >= limt:
#               print ( "offs=",offs,"limt=",limt )
                last = ''
            elif limt == 0:
                break
            else:
                last = text[offs]
                offs += 1
#           print ( 'patn=' , patn )
            mc = patn[mp]
#           print ( 'matching last=' , last, '(' , '{:04x}'.format(ord(last)) if last != '' else '-', ') at' , offs )
#           print ( 'against       ' , mc  , '(' , '{:04x}'.format(ord(mc)) , ')' )
            if mc != last:
                if mc != last.lower():
                    if mc == Hyphn and last == ' ' and limt - offs > 2:
#                       print ( 'hyphen special matching, limt=', limt , 'offs=' , offs )
#                       print ( 'text[offs:]=' , text[offs:] )
                        if text[offs] != Hyphn or text[offs+1] != ' ':
                            break
                        offs += 2
                    else:
#                       print ( 'no special matching of hyphen' )
                        break

#           print ( 'matched @mp=' , mp )
            mp += 1

        ## check whether mismatch is due to special pattern char

#       print ( 'pat @',mp,"<",ml )
#       print ( "txt @",offs,'<',limt,'last=',last )
#       print ( '@',offs,text[offs:] )

        if mp >= ml:        # past end of pattern?
            matched = True  # if so, match is made
            break

        tc = patn[mp]       # otherwise, get unmatched pattern element
        mp += 1             #
#       print ( "tc=",'{:04x}'.format(ord(tc)),deconvert(tc) )

        if tc == cALL:      # a * wildcard?

#           print ( "ALL last=< " + last + " >" )
            if last != '': offs -= 1

            nm = _span(cALL,nsps)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1     # get new binding record
            bf[0] = offs                 # bind from current offset
            offs += nm                   # move offset past end of span
            bf[1] = offs                 # bind to   new     offset
#           print ( "offs=",offs,'nm=',nm )
            uf = _mark(1,nsps); unj += 1 # get new unwinding record
            uf.count = nm                # can back up this many times on mismatch
            continue

        elif tc == cEND: # end specification
#           print ( "END $:",last )
            if last == '':
                continue
            elif last == '-':
                offs -= 1
                continue
            elif last in [ '.' , ',' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]
                if ellyChar.isWhiteSpace(txc) or txc in Trmls or txc in Grk:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' , ellyChar.HYPH ]:
                offs -= 1
                continue
            elif not ellyChar.isText(last):
                offs -= 1
                continue

        elif tc == cANY: # alphanumeric wildcard?
#           print ( "ANY:",last,offs )
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cCAN: # nonalphanumeric wildcard?
#           print ( 'at cCAN' )
            if last != ellyChar.AMP:
                if last == '' or not ellyChar.isLetterOrDigit(last):
                    _bind(); mbi += 1
                    continue

        elif tc == cDIG: # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF: # letter wildcard?
#           print ( "ALF:",last,offs )
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cUPR: # uppercase letter wildcard?
#           print ( "UPR:",last,'@',offs )
            if last != '' and ellyChar.isUpperCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cLWR: # lowercase letter wildcard?
#           print ( "LWR:",last,'@',offs )
            if last != '' and ellyChar.isLowerCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC: # space wildcard?
#           print ( "SPC:","["+last+"]" )
            if last != '' and ellyChar.isWhiteSpace(last):
                nsps -= 1
                _bind(); _modify(); mbi += 1
                continue
#           print ( 'NO space' )

        elif tc == cAPO: # apostrophe wildcard?
#           print ( "APO: last=" , last )
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS:
#           print ( "SOS" )
#           print ( last,'@',offs )
            mf = _bind(0); mbi += 1      # dummy record to block
            mf[0] = -1                   #   later binding consolidation
            if last != '':
                offs -= 1                # try for rematch
            m = mp                       # find corresponding EOS
            while m < ml:                #
                if patn[m] == cEOS: break
                m += 1
            else:                        # no EOS?
                m -= 1                   # if so, pretend there is one anyway
            uf = _mark(0,nsps); unj += 1 # for unwinding on any later match failure
            uf.pats = m + 1              # i.e. one char past next EOS
            uf.txts = offs               # start of text before optional match
            continue

        elif tc == cEOS:
#           print ( "EOS" )
            if last != '':
                offs -= 1                # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:
#           print ( 'spanning wildcard, offs=' , offs , 'last=(' + last + ')' )
            if last != '':               # still more to match?
                offs -= 1
#               print ( 'nsps=' , nsps )
#               print ( '@' , offs , text )
                nm = _span(tc,nsps)      # maximum match possible

#               print ( 'spanning=' , nm )
                if nm == 0:                             # compensate for findExtendedBreak peculiarity
                    if offs + 1 < limt and mp < ml:     # with closing ] or ) to be matched in pattern
                        if patn[mp] in Enc:             # from text input
                            nm += 1

#               print ( 'spanning=' , nm )
                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs      # bind from current offset
                    offs += nm        # move offset past end of span
                    bf[1] = offs      # bind to   new     offset
                    uf = _mark(1,nsps); unj += 1
                    uf.count = nm - 1 # at least one char must be matched
#                   print ( 'offs=' , offs )
                    last = text[offs] if offs < limt else ''
                    continue
#           print ( 'fail tc=' , deconvert(tc) )

        elif tc == '':
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True        # successful match
                break

        ## match failure: rewind to last match branch
        ##

#       print ( "fail - unwinding" , unj )

        while unj > 0:               # try unwinding, if possible
#           print ( "unw:",unj )
            uf = unw[unj-1]          # get most recent unwinding record
#           print ( uf )
            if uf.count <= 0:        # if available count is used up,
                unj -= 1             # go to next unwinding record
                continue
            uf.count -= 1            # decrement available count
            uf.txts -= uf.kind       # back up one char for scanning text input
            mp = uf.pats             # unwind pattern pointer
            offs = uf.txts           # unwind text input
            mbi = uf.bnds            # restore binding
            mbd[mbi-1][1] -= uf.kind # reduce span of binding if for wildcard
            nsps = uf.nsps           #
            break
        else:
#           print ( "no unwinding" )
            break                   # quit if unwinding is exhausted
#       print ( 'cnt=' , uf.count , 'off=' , offs )

    ##
    ## clean up on match mode or on no match possible
    ##

#   print ( "matched=",matched )

    if not matched: return None     # no bindings

#   print ( text,offs )

    ## consolidate contiguous bindings for subsequent substitutions

#   print ( "BEFORE consolidating consecutive bindings" )
#   print ( "bd:",len(mbd) )
#   print ( mbd[0] )
#   print ( '----' )
#   for b in mbd[1:]:
#       print ( b )

    mbdo = mbd
    lb  = -1               # binding reference
    lbd = [ 0 , -1 ]       # sentinel value, not real binding
    mbd = [ lbd ]          # initialize with new offset after matching
    mbdo.pop(0)            # ignore empty binding
    while len(mbdo) > 0:   #
        bd = mbdo.pop(0)   # get next binding
        if len(bd) > 2:
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb = -1
        elif bd[0] < 0:    # check for optional match indicator here
            lb = -1        # if so, drop from new consolidated bindings
        elif lb == bd[0]:  # check for binding continuous with previous
            lb = bd[1]     #
            lbd[1] = lb    # if so, combine with previous binding
        else:              #
            mbd.append(bd) # otherwise, add new binding
            lbd = bd       #
            lb = bd[1]     #

    mbd[0] = offs          # replace start of bindings with text length matched

#   print ( "AFTER" )
#   print ( "bd:",len(mbd) )
#   print ( mbd[0] )
#   print ( '----' )
#   for b in mbd[1:]:
#       print ( b )

    return mbd             # consolidated bindings plus new offset
Example #13
def match ( patn , text , offs=0 , limt=None , nsps=0 ):

    """
    compare a pattern with wildcards to input text

    arguments:
        patn  - pattern to be matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit for any matching
        nsps  - number of spaces to match in pattern

    returns:
        bindings if match is successful, None otherwise
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
            nsps   - saved count of spaces matched
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - kind of unwinding
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1
            self.nsps  = 0

        def __unicode__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self
            returns:
                attributes as a display string
            """
            return ( '[kd=' + unicode(self.kind)  +
                     ',ct=' + unicode(self.count) +
                     ',pa=' + unicode(self.pats)  +
                     ',tx=' + unicode(self.txts)  +
                     ',bd=' + unicode(self.bnds)  +
                     ',ns=' + unicode(self.nsps)  + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # four private functions using local variables of match() defined just above
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame
        arguments:
            ns  - optional initial span of text for binding
        returns:
            binding frame
        """
#       print "binding:",offs,ns
        os = offs - 1
        if ns == None: ns = 1 # by default, binding is to 1 char
        if mbi == len(mbd):   # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]         # next available record
        bf[0] = os            # set binding to range of chars
        bf[1] = os + ns       #
        return bf

    def _modify ( ):
        """
        set special tag for binding
        arguments:

        """
        mbd[mbi].append(None)

    def _mark ( kind , nsp ):
        """
        set up for backing up pattern match
        arguments:
            kind  - 0=optional 1=* wildcard
            nsp   - number of spaces in pattern still unmatched
        returns:
            unwinding frame
        """
        if unj == len(unw): # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]       # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp
        uf.txts  = offs
        uf.bnds  = mbi
        uf.nsps  = nsp
        return uf

    def _span ( typw , nsp=0 ):
        """
        count chars available for wildcard match
        arguments:
            typw - wildcard
            nsp  - spaces to be matched in pattern
        returns:
            non-negative count if any match possible, otherwise -1
        """
#       print "_span: typw=" , '{:04x}'.format(ord(typw)) , deconvert(typw)
#       print "_span: txt @",offs,"pat @",mp,"nsp=",nsp
#       print "text to span:",text[offs:]
#       print "pat rest=" , patn[mp:]
        k = minMatch(patn[mp:])                # calculate min char count to match rest of pattern

#       print "exclude=",k,"chars from possible span for rest of pattern"

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findExtendedBreak(text,offs,nsp)
#       print mx,"chars available to scan"
        mx -= k                                # max span reduced by exclusion
        if mx < 0: return -1                   # cannot match if max span < 0

        tfn = Matching[typw]                   # matchup function for wildcard type

#       print "text at",offs,"maximum wildcard match=",mx

        nm = 0
        for i in range(mx):
            c = text[offs+i]                   # next char in text from offset
#           print 'span c=' , c
            if not tfn(c): break               # stop when it fails to match
            nm += 1

#       print "maximum wildcard span=",nm

        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False  # successful pattern match yet?

    if limt == None: limt = len(text)

#   print 'starting match, limt=',limt,text[offs:limt],":",patn
#   print 'nsps=' , nsps

    mp = 0           # pattern index
    ml = len(patn)   # pattern match limit
    last = ''

    while True:

        ## literally match as many next chars as possible

#       print '---- loop mp=' , mp , 'ml=' , ml
        while mp < ml:
            if offs >= limt:
#               print "offs=",offs,"limt=",limt
                last = ''
            elif limt == 0:
                break
            else:
                last = text[offs]
                offs += 1
#           print 'patn=' , patn
            mc = patn[mp]
#           print 'matching last=' , last, '(' , '{:04x}'.format(ord(last)) if last != '' else '-', ') at' , offs
#           print 'against       ' , mc  , '(' , '{:04x}'.format(ord(mc)) , ')'
            if mc != last:
                if mc != last.lower():
                    if mc == Hyphn and last == ' ' and limt - offs > 2:
#                       print 'hyphen special matching, limt=', limt , 'offs=' , offs
#                       print 'text[offs:]=' , text[offs:]
                        if text[offs] != Hyphn or text[offs+1] != ' ':
                            break
                        offs += 2
                    else:
#                       print 'no special matching of hyphen'
                        break

#           print 'matched @mp=' , mp
            mp += 1

        ## check whether mismatch is due to special pattern char

#       print 'pat @',mp,"<",ml
#       print "txt @",offs,'<',limt,'last=',last
#       print '@',offs,text[offs:]

        if mp >= ml:        # past end of pattern?
            matched = True  # if so, match is made
            break

        tc = patn[mp]       # otherwise, get unmatched pattern element
        mp += 1             #
#       print "tc=",'{:04x}'.format(ord(tc)),deconvert(tc)

        if tc == cALL:      # a * wildcard?

#           print "ALL last=< " + last + " >"
            if last != '': offs -= 1

            nm = _span(cALL,nsps)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1     # get new binding record
            bf[0] = offs                 # bind from current offset
            offs += nm                   # move offset past end of span
            bf[1] = offs                 # bind to   new     offset
#           print "offs=",offs,'nm=',nm
            uf = _mark(1,nsps); unj += 1 # get new unwinding record
            uf.count = nm                # can back up this many times on mismatch
            continue

        elif tc == cEND: # end specification
#           print "END $:",last
            if last == '':
                continue
            elif last == '-':
                offs -= 1
                continue
            elif last in [ '.' , ',' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]
                if ellyChar.isWhiteSpace(txc) or txc in Trmls or txc in Grk:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' , ellyChar.HYPH ]:
                offs -= 1
                continue

        elif tc == cANY: # alphanumeric wildcard?
#           print "ANY:",last,offs
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cCAN: # nonalphanumeric wildcard?
#           print 'at cCAN'
            if last != ellyChar.AMP:
                if last == '' or not ellyChar.isLetterOrDigit(last):
                    _bind(); mbi += 1
                    continue

        elif tc == cDIG: # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF: # letter wildcard?
#           print "ALF:",last,offs
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cUPR: # uppercase letter wildcard?
#           print "UPR:",last,'@',offs
            if last != '' and ellyChar.isUpperCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cLWR: # lowercase letter wildcard?
#           print "LWR:",last,'@',offs
            if last != '' and ellyChar.isLowerCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC: # space wildcard?
#           print "SPC:","["+last+"]"
            if last != '' and ellyChar.isWhiteSpace(last):
                nsps -= 1
                _bind(); _modify(); mbi += 1
                continue
#           print 'NO space'

        elif tc == cAPO: # apostrophe wildcard?
#           print "APO: last=" , last
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS:
#           print "SOS"
#           print last,'@',offs
            mf = _bind(0); mbi += 1      # dummy record to block
            mf[0] = -1                   #   later binding consolidation
            if last != '':
                offs -= 1                # try for rematch
            m = mp                       # find corresponding EOS
            while m < ml:                #
                if patn[m] == cEOS: break
                m += 1
            else:                        # no EOS?
                m -= 1                   # if so, pretend there is one anyway
            uf = _mark(0,nsps); unj += 1 # for unwinding on any later match failure
            uf.pats = m + 1              # i.e. one char past next EOS
            uf.txts = offs               # start of text before optional match
            continue

        elif tc == cEOS:
#           print "EOS"
            if last != '':
                offs -= 1                # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:
#           print 'spanning wildcard, offs=' , offs , 'last=(' + last + ')'
            if last != '':               # still more to match?
                offs -= 1
#               print 'nsps=' , nsps
#               print '@' , offs , text
                nm = _span(tc,nsps)      # maximum match possible

#               print 'spanning=' , nm
                if nm == 0:                             # compensate for findExtendedBreak peculiarity
                    if offs + 1 < limt and mp < ml:     # with closing ] or ) to be matched in pattern
                        if patn[mp] in Enc:             # from text input
                            nm += 1

#               print 'spanning=' , nm
                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs      # bind from current offset
                    offs += nm        # move offset past end of span
                    bf[1] = offs      # bind to   new     offset
                    uf = _mark(1,nsps); unj += 1
                    uf.count = nm - 1 # at least one char must be matched
#                   print 'offs=' , offs
                    last = text[offs] if offs < limt else ''
                    continue
#           print 'fail tc=' , deconvert(tc)

        elif tc == '':
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True        # successful match
                break

        ## match failure: rewind to last match branch
        ##

#       print "fail - unwinding" , unj

        while unj > 0:               # try unwinding, if possible
#           print "unw:",unj
            uf = unw[unj-1]          # get most recent unwinding record
#           print uf
            if uf.count <= 0:        # if available count is used up,
                unj -= 1             # go to next unwinding record
                continue
            uf.count -= 1            # decrement available count
            uf.txts -= uf.kind       # back up one char for scanning text input
            mp = uf.pats             # unwind pattern pointer
            offs = uf.txts           # unwind text input
            mbi = uf.bnds            # restore binding
            mbd[mbi-1][1] -= uf.kind # reduce span of binding if for wildcard
            nsps = uf.nsps           #
            break
        else:
#           print "no unwinding"
            break                   # quit if unwinding is exhausted
#       print 'cnt=' , uf.count , 'off=' , offs

    ##
    ## clean up on match mode or on no match possible
    ##

#   print "matched=",matched

    if not matched: return None     # no bindings

#   print text,offs

    ## consolidate contiguous bindings for subsequent substitutions

#   print "BEFORE consolidating consecutive bindings"
#   print "bd:",len(mbd)
#   print mbd[0]
#   print '----'
#   for b in mbd[1:]:
#       print b

    mbdo = mbd
    lb  = -1               # binding reference
    lbd = [ 0 , -1 ]       # sentinel value, not real binding
    mbd = [ lbd ]          # initialize with new offset after matching
    mbdo.pop(0)            # ignore empty binding
    while len(mbdo) > 0:   #
        bd = mbdo.pop(0)   # get next binding
        if len(bd) > 2:
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb = -1
        elif bd[0] < 0:    # check for optional match indicator here
            lb = -1        # if so, drop from new consolidated bindings
        elif lb == bd[0]:  # check for binding continuous with previous
            lb = bd[1]     #
            lbd[1] = lb    # if so, combine with previous binding
        else:              #
            mbd.append(bd) # otherwise, add new binding
            lbd = bd       #
            lb = bd[1]     #

    mbd[0] = offs          # replace start of bindings with text length matched

#   print "AFTER"
#   print "bd:",len(mbd)
#   print mbd[0]
#   print '----'
#   for b in mbd[1:]:
#       print b

    return mbd             # consolidated bindings plus new offset
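
The value returned above is a flat list: element 0 holds the total number of chars matched, followed by the surviving [ start , end ] binding pairs. A minimal standalone sketch of just the consolidation step, using plain lists and a hypothetical consolidate() helper rather than the Elly match state:

def consolidate ( bindings , offs ):
    # merge [ start , end ] pairs whose spans are contiguous,
    # mirroring the loop above; element 0 of the result holds
    # the total number of chars matched
    merged = [ offs ]
    last = None
    for bd in bindings:
        if last != None and bd[0] == last[1]:
            last[1] = bd[1]        # extend previous binding
        else:
            merged.append(bd)      # otherwise keep as new binding
            last = bd
    return merged

# consolidate( [ [0,2] , [2,5] , [7,9] ] , 9 ) == [ 9 , [0,5] , [7,9] ]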
Example #17
0
    def _scanText(self, k):
        """
        try to match in buffer regardless of word boundaries
        using Elly vocabulary, pattern, and template tables and
        also running Elly entity extractors

        arguments:
            self  -
            k     - length of first component in buffer

        returns:
            match parameters [ text span of match , vocabulary match , suffix removed ]

        exceptions:
            ParseOverflow
        """

        #       print ( '_scanText k=' , k )
        sb = self.sbu.buffer  # input buffer
        tr = self.ptr  # parse tree for results

        #       print ( '_scanText sb=' , sb )
        # initialize match status
        nspan = 0  #   total span of match
        vmchs = []  #   chars of vocabulary entry matched
        suffx = ''  #   any suffix removed in match

        d = self.rul  # grammar rule definitions

        m = d.ptb.match(sb, tr)  # try token by pattern match next
        #       print ( 'pattern m=' , m )
        if nspan < m:
            nspan = m  # on longer match, update maximum

        m = d.ctb.match(sb, tr)  # try multi-word template  match next
        #       print ( 'template m=' , m )
        if nspan < m:
            nspan = m  # on longer match, update maximum

        m = self.iex.run(sb)  # try entity extractors next
        #       print ( 'extractor m=' , m )
        if nspan < m:
            nspan = m  # on longer match, update maximum

#       print ( 'nspan=' , nspan, sb[:nspan] )

        lm = len(sb)  # scan limit
        #       print ( 'lm=' , lm , 'm=' , m )
        capd = ellyChar.isUpperCaseLetter(sb[0])
        #       print ( 'next component=' , sb[:k] , ', context=' , sb[k:lm] )

        if self.vtb != None:  # look in external dictionary, if it exists
            ls = list(sb[:k])
            #           print ( 'vtb ls 0=' , ls )
            ellyChar.toLowerCaseASCII(ls)
            ss = ''.join(ls)  # where to start for vocabulary indexing
            #           print ( 'vtb ls 1=' , ls )
            n = vocabularyTable.delimitKey(ss)  # get actual indexing
            #           print ( 'delimiting n=' , n , ':' , '<' + ss[:n] + '>' )
            #           print ( vocabularyTable.listDBKeys(self.vtb.cdb) )

            rl = self.vtb.lookUp(sb, n)  # get list of the maximum text matches
            #           print ( 'external matches=' , len(rl) )
            #           print ( 'input text=' , sb )

            if len(rl) > 0:  #
                r0 = rl[0]  # look at first record
                #               print ( 'r0=' , r0 )
                vmln = r0.nspan  # should be same for all matches
                vchs = r0.vem.chs  #
                vsfx = r0.suffx  #
                #               print ( 'nspan=' , vmln , vsfx )

                if (vmln > nspan or vmln == nspan and vsfx == ''):

                    nspan = vmln  # keep vocabulary matches
                    vmchs = vchs  #
                    suffx = vsfx  #

                    for r in rl:
                        ve = r.vem  # get vocabulary entry
                        #                       print ( 've=' , ve )
                        #                       if ve.gen != None: print ( 've.gen=' , ve.gen )
                        if tr.addLiteralPhraseWithSemantics(
                                ve.cat, ve.syf, ve.smf, ve.bia, ve.gen,
                                len(suffx) > 0):
                            tr.lastph.lens = nspan  # char length of leaf phrase node
                            # needed for later selection
                            tr.lastph.krnl.cncp = ve.con
                            if capd:
                                tr.lastph.krnl.semf.set(0)
#                           print ( 'vocab phr=' , tr.lastph , 'len=' , tr.lastph.lens )
                            if suffx != '':
                                if ellyChar.isApostrophe(suffx[1]):
                                    tr.lastph.krnl.usen = 0

#               print ( 'vocabulary m=' , vmln )
#               print ( 'queue after table lookup:' , len(self.ptr.queue) )

#           print ( 'vtb sb=' , sb )

#       print ( 'maximum match=' , nspan )
#       print ( 'next input=' , sb[:nspan] )

        if nspan > 0:  # any matches at all?
            tr.requeue()  # if so, keep only longest of them
#       print ( 'queue after scan:' , len(self.ptr.queue) )

#       print ( 'returns [' , nspan , ',' , vmchs , ',' , suffx , ']' )
        return [nspan, vmchs, suffx]
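
_scanText() lets the pattern table, template table, entity extractors, and external vocabulary compete, and only the longest reported span survives. A toy standalone illustration of that longest-match policy (the matcher lambdas are invented stand-ins, not the real Elly tables):

def longestSpan ( buffr , matchers ):
    # each matcher reports the char span it can match at the
    # start of buffr; keep only the longest, as in _scanText()
    best = 0
    for match in matchers:
        m = match(buffr)
        if best < m:
            best = m
    return best

matchers = [
    lambda b: 3 if b.startswith('new') else 0 ,       # pattern-like match
    lambda b: 8 if b.startswith('new york') else 0 ,  # template-like match
]
print ( longestSpan('new york city',matchers) )       # prints 8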
Example #18
0
    def _lookUpNext(self):
        """
        look up possible next segments in input buffer by various means,
        keeping tokens only for the LONGEST segment

        arguments:
            self

        returns:
            True on successful lookup, False otherwise

        exceptions:
            ParseOverflow
        """

        self.sbu.skipSpaces()  # skip leading spaces
        s = self.sbu.buffer
        #       print ( '_lookUp@0 buffer=' , s )

        if len(s) == 0:  # check for end of input
            return False  # if so, done

#       print ( 'in =' , str(self.sbu) )
        if self.trs != None:  # preanalysis of number expressions
            self.trs.rewriteNumber(s)

#       print ( '_lookUp@1 buffer=' , self.sbu.buffer )
#       print ( 'macro expansion s[0]=' , s[0] )
        self.sbu.expand()  # apply macro substitutions
        #       print ( 'macro expanded  s[0]=' , s[0] )
        #       print ( '_lookUp@2 buffer=' , self.sbu.buffer )

        s = self.sbu.buffer

        #       print ( 'expanded len=' , len(s) )
        if len(s) == 0: return True  # macros can empty out buffer

        k = self.sbu.findBreak()  # find extent of first component for lookup
        if k == 0:
            k = 1  # must have at least one char in token

#       print ( 'break at k=' , k )
        kl = len(s)
        if k + 1 < kl and s[k] == '+' and s[k + 1] == ' ':
            k += 1  # recognize possible prefix

#       print ( 'len(s)=' , kl , 'k=' , k , 's=', s )

#       print ( '_lookUp@3 buffer=' , self.sbu.buffer )
        mr = self._scanText(k)  # text matching in various ways
        mx = mr[0]  # overall maximum match length
        chs = mr[1]  # any vocabulary element matched
        suf = mr[2]  # any suffix removed in matching
        #       print ( '_lookUp@4 buffer=' , self.sbu.buffer )
        s = self.sbu.buffer
        #       print ( 'k=' , k )
        #       print ( 'scan result mx=' , mx , 'chs=' , chs , 'suf=' , suf )
        #       print ( 'len(s)=' , len(s) , 's=' , s )

        if k < mx or (k == mx and suf != ''):  # next word cannot produce token as long as already seen?

            #           print ( 'queue:' , len(self.ptr.queue) )
            #           print ( 'chs=' , chs )
            if len(chs) > 0:  # any vocabulary matches?
                #               print ( 'put back' , suf , mx , s )
                self.sbu.skip(mx)  # if so, they supersede
                if suf != '':  # handle any suffix removal
                    self.sbu.prepend(list(suf))
#                   print ( 'suf=' , suf )
            else:
                chs = self.sbu.extract(mx)

#           print ( 'extract chs=' , chs )
            to = ellyToken.EllyToken(chs)
            #           print ( 'token=' , str(to) )
            self.ctx.addTokenToListing(to)
            if suf != '':
                if not ellyChar.isApostrophe(suf[1]):
                    to.dvdd = True  # must note suffix removal for token!
#           print ( 'only queue:' , len(self.ptr.queue) )
            return True

#       print ( 'mx=' , mx )
#       print ( 'plus queue:' , len(self.ptr.queue) )
        wsk = self.sbu.buffer[:k]
        cap = ellyChar.isUpperCaseLetter(wsk[0])
        #       print ( 'wsk=' , wsk )
        rws = ''.join(wsk)
        found = self.ptr.createPhrasesFromDictionary(rws.lower(), False, cap)
        if not found:
            if k > mx and k > 1 and ellyChar.isEmbeddedCombining(rws[-1]):
                k -= 1
                rws = rws[:-1]
                found = self.ptr.createPhrasesFromDictionary(
                    rws.lower(), False, cap)
#       print ( rws , 'found in dictionary=' , found )
        if found or mx > 0:  # match found in dictionary or by text scan
            if not found:
                k = mx  # if by text scan, must make token longer
                rws = rws[:k]  # if mx > k
            self.sbu.skip(k)
            #           print ( 'next=' , self.sbu.buffer[self.sbu.index:] )
            #           print ( 'queue after =' , len(self.ptr.queue) )
            to = ellyToken.EllyToken(rws[:k])
            if len(suf) > 1:  # change token to show suffix properly
                #               print ( 'suf=' , suf )
                cs = suf[1]  # first char in suffix after '-'
                rt = to.root  # this is a list!
                lk = -1  # start at last char in token
                while rt[lk] != cs:
                    lk -= 1
                sn = len(rt) + lk  # where to divide suffix from root
                #               print ( 'sn=' , sn , rt )
                to.root = rt[:sn]  # root without suffix
                self.sbu.prepend(suf)  # restore suffix to input for processing
            else:  # no suffix
                chx = self.sbu.peek()  # look at next char after match
                if chx == '-':  # if hyphen, need to separate it
                    self.sbu.skip()
                    if ellyChar.isLetter(self.sbu.peek()):
                        self.sbu.prepend(' ')
                    self.sbu.prepend('-')
#           print ( 'add' , str(to) )
            self.ctx.addTokenToListing(to)  # add token to listing for sentence
            return True

#       print ( '[' + rws + ']' , 'still unrecognized' )

        chx = rws[0]  # special hyphen check
        if chx == '-' and k > 1:
            #           print ( 'look in  internal dictionary' )
            if self.ptr.createPhrasesFromDictionary(chx, False, False):
                #               print ( 'found!' )
                to = ellyToken.EllyToken(chx)  # treat hyphen as token
                self.ctx.addTokenToListing(to)  # add it to token list
                self.sbu.skip()  # remove from input
                return True

        to = self._extractToken(mx)  # single-word matching with analysis and lookup

        #       print ( 'extracted to=' , str(to) )
        if to == None:  # if no match, we are done and will return
            #           print ( 'mx=' , mx )
            return False if mx == 0 else True  # still success if _scanText() found something
        if self.ptr.lastph != None:
            self.ptr.lastph.lens = to.getLength()

#       print ( 'to=' , str(to) , 'len(s)=' , len(s) , s )
#       posn = self.ctx.countTokensInListing()
#       print ( 'at', posn , 'in token list' )
        self.ctx.addTokenToListing(to)  # add token to listing for sentence
        #       tol = self.ctx.getNthTokenInListing(-1)
        #       print ( 'last token root=' , tol.root )
        return True  # successful lookup
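
When a suffix like -ing has been removed during matching, the code above rescans the token chars backward for the first suffix char and truncates the root there. A self-contained sketch of that division (divideRoot is a hypothetical name; the real logic operates on EllyToken.root in place):

def divideRoot ( root , suf ):
    # root is a list of chars; suf has the form '-xx'
    cs = suf[1]                # first char in suffix after '-'
    lk = -1                    # start at last char in root
    while root[lk] != cs:
        lk -= 1
    sn = len(root) + lk        # where to divide suffix from root
    return root[:sn] , root[sn:]

print ( divideRoot(list('walking'),'-ing') )
# (['w', 'a', 'l', 'k'], ['i', 'n', 'g'])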
Example #19
0
    def match(self, segm, tree):
        """
        compare text segment against all FSA patterns from state 0

        arguments:
            self  -
            segm  - segment to match against
            tree  - parse tree in which to put leaf nodes for final matches

        returns:
            text length matched by FSA
        """

        #       print 'comparing' , segm

        if len(self.indx) == 0: return 0  # no matches if FSA is empty

        if len(segm) == 0: return 0  # string is empty

        lim = bound(segm)  # get text limit for matching

        mtl = 0  # accumulated match length
        mtls = 0  # saved final match length

        state = 0  # set to mandatory initial state for FSA

        stk = []  # for tracking possible multiple matches

        ls = self.indx[state]  # for state 0!
        ix = 0  # index into current possible transitions
        sg = segm[:lim]  # text subsegment for matching
        #       print 'initial sg=' , sg
        #       print len(ls) , 'transitions from state 0'
        capd = False if len(sg) == 0 else ellyChar.isUpperCaseLetter(sg[0])

        while True:  # run FSA to find all possible matches
            #           print 'state=' , state
            #           print 'count=' , mtl , 'matched so far'
            #           print 'links=' , len(ls) , 'ix=' , ix
            nls = len(ls)  # how many links from current state

            if ix == nls:  # if none, then must back up
                if len(stk) == 0: break
                r = stk.pop()  # restore match status
                #               print 'pop r= [' , r[0] , r[1][0].shortcode() , ']'
                state = r[0]  # FSA state
                ls = r[1]  # remaining links to check
                sg = r[2]  # input string
                mtl = r[3]  # total match length
                ix = 0
                #               print 'pop sg=' , sg
                continue

#           print 'substring to match, sg=' , sg , 'nls=' , nls
            m = 0
            while ix < nls:
                lk = ls[ix]  # get next link at current state
                ix += 1  # and increment link index
                #               print '@' , state , 'lk= [' , unicode(lk), ']' , 'ix=' , ix
                #               print 'patn=' , lk.patn
                po = lk.patn[0]
                if po == u'\x00':  # do state change without matching?
                    m = 0  # no match length
                elif po != ellyWildcard.cEND:
                    #                   print 'po=' , po
                    bds = ellyWildcard.match(lk.patn, sg)
                    #                   print 'bds=' , bds
                    if bds == None: continue
                    m = bds[0]  # get match length, ignore wildcard bindings
                elif (len(sg) > 0 and (ellyChar.isLetterOrDigit(sg[0])
                                       or sg[0] == ellyChar.PRME)):
                    #                   print 'unmatched solitary $'
                    continue
                else:
                    #                   print 'matched solitary $, state=' , state
                    m = 0

#               print 'm=' , m

                if lk.nxts < 0:  # final state?
                    if lk.nxts == -2: m = 0  # last part of match not counted
                    #                   print 'state=' , state , unicode(lk)
                    #                   print 'flags=' , lk.synf , '/' , lk.semf
                    if tree.addLiteralPhraseWithSemantics(
                            lk.catg, lk.synf, lk.semf, lk.bias,
                            cap=capd):  # make phrase
                        ml = mtl + m
                        if mtls < ml: mtls = ml
                        #                       print 'success!' , 'mtls=' , mtls
                        tree.lastph.lens = mtls  # save its length
#                       print 'match state=' , state , 'length=' , mtls

#               print 'ix=' , ix , 'nls=' , nls
                if ix < nls:  # any links not yet checked?
                    r = [state, ls[ix:], sg, mtl]
                    #                   print 'saved r= ' , state ,
                    #                   print [ x.shortcode() for x in ls[ix:] ]
                    stk.append(r)  # if not, save info for later continuation

                mtl += m  # update match length
                break  # leave loop at this state, go to next state
            else:
                #               print 'no matches'
                continue  # all patterns exhausted for state

            ix = 0
            sg = sg[m:]  # move up in text input
            state = lk.nxts  # next state
            if state < 0:
                ls = []
            else:
                ls = self.indx[state]
#           print 'sg=' , sg
#           print 'state=' , state
#           print 'len(ls)=' , len(ls)

#       print 'mtls=' , mtls
        return mtls
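
The FSA above explores alternatives depth-first: links not yet tried are pushed on a stack and popped whenever a path dead-ends, and the best final-state match length seen is kept. A compact standalone sketch of that control flow with literal string links (the toy FSA dict is an assumption for illustration, not the Elly pattern table):

def runFSA ( fsa , text ):
    # fsa maps a state to a list of ( prefix , next state ) links;
    # a negative next state marks a final state, as in match()
    best = 0
    stk = [ ( 0 , text , 0 ) ]    # ( state , rest of input , length matched )
    while len(stk) > 0:
        state , sg , mtl = stk.pop()
        for prefix , nxts in fsa.get(state,[]):
            if not sg.startswith(prefix):
                continue          # link fails to match here
            m = len(prefix)
            if nxts < 0:          # final state?
                if best < mtl + m: best = mtl + m
            else:
                stk.append( ( nxts , sg[m:] , mtl + m ) )
    return best

fsa = { 0 : [ ('ab',1) , ('abc',-1) ] , 1 : [ ('cd',-1) ] }
print ( runFSA(fsa,'abcd') )      # prints 4 via 'ab' then 'cd'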
Example #20
0
    def getNext(self):
        """
        extract next sentence for Elly translation from input stream

        arguments:
            self

        returns:
            list of chars for next sentence on success, None on empty stream
        """

        #       print ( 'getNext' )

        self.resetBracketing()
        inBrkt = False

        nspc = 0  # set space count

        sent = []  # list buffer to fill

        x = self.inp.read()
        if x == SP:
            x = self.inp.read()

        if x == END:  # EOF check
            return None

        c = END  # reset
        lc = END

        #       print ( 'x=' , '<' + x + '>' , ord(x) )
        self.inp.unread(x, SP)  # put first char back to restore input
        #       print ( '0  <<' , self.inp.buf )

        # fill sentence buffer up to next stop punctuation in input

        nAN = 0  # alphanumeric count in sentence

        while True:

            x = self.inp.read()  # next input char

            if x == END:  # handle any EOF
                break

#           print ( 'x=' , '<' + x + '>' , 'c=' , '<' + c + '>' )
#           print ( 'sent=' , sent , 'nspc=' , nspc )

            # check for table delimiters in text

            if len(sent) == 0:
                #               print ( 'table' )
                #               print ( '1  <<' , self.inp.buf )

                if x == '.' or x == '-':  # look for multiple '.' or '-'
                    while True:  # scan up to end of current buffering
                        y = self.inp.read()  #
                        if y != x and y != SP:  # no more delimiter chars or spaces?
                            self.inp.unread(y)  # if so, done
                            break  #
                    continue  # ignore everything seen so far

            ####################################################
            # accumulate chars and count alphanumeric and spaces
            ####################################################

            lc = c
            c = x
            nc = self.inp.peek()
            if ellyChar.isWhiteSpace(nc): nc = SP

            #           print ( 'lc=' , '<' + lc + '>, nc=' , '<' + nc + '>' )
            if lc == SP or lc == END:  # normalize chars for proper bracketing
                if x == SQuo:  #
                    x = LSQm  # a SQuo preceded by a space becomes LSQm
                elif x == DQuo:  #
                    x = LDQm  # a DQuo preceded by a space becomes LDQm
            if nc == SP or nc == END:  #
                if x == SQuo:  # a SQuo followed by a space becomes RSQm
                    x = RSQm  #
                elif x == DQuo:  # a DQuo followed by a space becomes RDQm
                    x = RDQm  #
            elif not ellyChar.isLetterOrDigit(nc):
                if x == SQuo:  # a SQuo followed by nonalphanumeric becomes RSQm
                    x = RSQm  #
                elif x == DQuo:  # a DQuo followed by nonalphanumeric becomes RDQm
                    x = RDQm  #
            elif ellyChar.isWhiteSpace(c) and inBrkt:
                nspc += 1

            svBrkt = inBrkt
            inBrkt = self.checkBracketing(x)  # do bracketing check with modified chars
            if svBrkt and not inBrkt: nspc = 0

            #           print ( 'lc=' , '<' + lc + '>' , 'bracketing x=' , '<' + x + '>' , inBrkt )

            sent.append(c)  # put original char into sentence buffer
            if ellyChar.isLetterOrDigit(c):
                nAN += 1
                continue  # if alphanumeric, just add to sentence

            if c == SP:
                continue  # if space, just add to sentence

            # NL will break a sentence

            if c == NL:
                sent.pop()  # remove from sentence chars
                break

            # certain Unicode punctuation will always break

            if c in Hards:
                break

            # char was not alphanumeric or space
            # look for stop punctuation exception

            cx = self.inp.preview()  # for context of match call

            #           print ( '0  <<' , self.inp.buf )

            #           print ( 'sent=' , sent[:-1] )
            #           print ( 'punc=' , '<' + c + '>' )
            #           print ( 'next=' , cx )
            if c in Stops and len(cx) > 0 and cx[0] == SP:
                if self.stpx.match(sent[:-1], c, cx):
                    #                   print ( 'stop exception MATCH' )
                    if self.drop:
                        sent.pop()  # remove punctuation char from sentence
                        lc = SP
                    continue

#           print ( 'no stop exception MATCH for' , c )

#           print ( '@1  <<' , self.inp.buf )

            # handle any nonstandard punctuation

            exoticPunctuation.normalize(c, self.inp)

            #           print ( '@2  <<' , self.inp.buf )

            # check for dash

            if c == '-':
                d = self.inp.read()
                if d == '-':
                    #                   print ( 'dash' )
                    while True:
                        d = self.inp.read()
                        if d != '-': break
                    sent.append(c)
                self.inp.unread(d)
                continue

            # check for sentence break on punctuation

#           print ( '@3  c=' , c , inBrkt )

            if c in QUOs or c in RBs:

                # special check for single or double quotes or
                # bracketing, which can immediately follow stop
                # punctuation for current sentence

                #               print ( 'bracketing c=' , c , ord(c) , inBrkt , 'at' , len(sent) )

                if not inBrkt:
                    #                   print ( sent , 'so far' )
                    z = self.inp.read()
                    if self.shortBracketing(sent, z):
                        break
                    self.inp.unread(z)
                    #                   print ( 'z=' , '[' + z + ']' , 'lc=' , '[' + lc + ']' )
                    if z == END or (ellyChar.isWhiteSpace(z) and lc in Stops):
                        if nAN > 1:
                            break
                elif c in QUOs and lc in Stops:
                    #                   print ( 'stop+quote' )
                    z = self.inp.read()
                    if z in RBs:
                        sent.append(z)
                        y = self.inp.read()
                        if y in Stops:
                            sent.append(y)
                        elif not ellyChar.isWhiteSpace(y):
                            self.inp.unread(y)
                        inBrkt = False
                        break
                    elif z in QUOs:
                        #                       print ( 'stop+quote+quote' )
                        sent.append(z)
                        inBrkt = False
                        break
                    self.inp.unread(z)
#               print ( 'continue' )
                continue

            elif not c in Stops:
                continue

            else:
                #               print ( 'check stopping!' )
                d = self.inp.read()
                #               print ( '@3  <<' , self.inp.buf )

                if d == None: d = '!'
                #               print ( 'stop=' , '<' + c + '> <' + d + '>' )

                #               print ( 'ellipsis check' )
                if c == '.' and c == d:
                    if self.inp.peek() != c:  # look for third '.' in ellipsis
                        self.inp.unread(d)  # if none, keep only first '.'
                    else:
                        self.inp.skip()  # found ellipsis
                        sent.append(d)  # complete it in sentence buffer
                        sent.append(d)  #
                        x = self.inp.peek()  # look at char after ellipsis
                        if ellyChar.isCombining(x):
                            sent.append(SP)  # if part of token, put in space as separator
                    continue

                if c == ELLP:
                    #                   print ( 'found Unicode ellipsis, d=' , d )
                    if ellyChar.isUpperCaseLetter(d):
                        self.inp.unread(d)  # super special case of bad punctuation
                        self.inp.unread(' ')  # put in implied period and space
                        self.inp.unread('.')  #

                # special check for multiple stops

#               print ( 'next char d=' , d , ord(d) if d != END else 'NONE' )
                if d in Stops:
                    while True:
                        d = self.inp.read()
                        if not d in Stops: break
                    self.inp.unread(d)
                    if not ellyChar.isWhiteSpace(d):
                        d = SP  # make rightside context for stop

                # special check for blank or null after stops

                elif d != END and not ellyChar.isWhiteSpace(d):
                    if self.shortBracketing(sent, d): break
                    if d in self._cl and self._cl[d] == 1:
                        dn = self.inp.peek()
                        if ellyChar.isWhiteSpace(dn):
                            sent.append(d)
                            break
                    self.inp.unread(d)
                    #                   print ( 'no space after punc' )
                    continue

                # if no match for lookahead, put back

                elif d != END:
                    #                   print ( 'unread d=' , d )
                    self.inp.unread(d)

#               print ( 'possible stop' )

                # check special case of number ending in decimal point

                if c == '.':
                    ixb = len(sent) - 2
                    ixn = ixb + 1
                    cxn = ''
                    #                   print ( 'sent=' , sent )
                    #                   print ( 'ixn=' ,ixn )
                    while ixn > 0:
                        ixn -= 1
                        cxn = sent[ixn]
                        #                       print ( 'cxn=' , cxn )
                        if not ellyChar.isDigit(cxn): break
#                   print ( 'break: ixn=' , ixn , 'ixb=' , ixb )
                    if ixn < ixb and cxn in [' ', '-', '+']:
                        prvw = self.inp.preview()
                        #                       print ( 'prvw=' , prvw )
                        if len(prvw) > 1 and not ellyChar.isUpperCaseLetter(prvw[1]):
                            continue

                # final check: is sentence long enough?

                if inBrkt:
                    #                   print ( 'c=' , '<' + c + '>' , 'd=' , '<' + d + '>' , 'preview=' , self.inp.preview() )
                    #                   print ( 'nspc=' , nspc )
                    if c in [':', ';'] or nspc < 3:
                        sent.append(d)
                        #                       print ( 'add' , '<' + d + '> to sentence' )
                        #                       print ( 'sent=' , sent )
                        self.inp.skip()
                        nspc -= 1
                        continue

#               print ( '@4  <<' , self.inp.buf )
                cx = self.inp.peek()
                if cx == None: cx = '!!'
                #               print ( 'sentence break: next=' , '<' + cx + '>' , len(cx) , sent )
                #               print ( 'nAN=' , nAN , 'inBrkt=' , inBrkt )
                if nAN > 1:
                    break

        if sent == ['\u2026']:  # special case of sentence
            return list("-.-")  # with lone ellipsis
        elif len(sent) > 0 or self.last != END:
            return sent
        else:
            return None
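
getNext() breaks a sentence only at stop punctuation whose right context passes several tests: a following space, enough alphanumeric chars accumulated so far, and no bracketing in progress. A heavily simplified standalone sketch of that gating (splitSentences is an invented name; it omits the quote, bracket, ellipsis, and stop-exception handling above):

def splitSentences ( text , stops='.!?' ):
    # break at a stop char only when followed by whitespace and
    # at least two alphanumerics have accumulated, as in getNext()
    sents = [ ] ; sent = [ ] ; nAN = 0
    for i , c in enumerate(text):
        sent.append(c)
        if c.isalnum():
            nAN += 1
        elif c in stops:
            nxt = text[i+1] if i + 1 < len(text) else ' '
            if nxt.isspace() and nAN > 1:
                sents.append(''.join(sent).strip())
                sent = [ ] ; nAN = 0
    if len(sent) > 0:
        sents.append(''.join(sent).strip())
    return sents

print ( splitSentences('It rained. We left.') )
# ['It rained.', 'We left.']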
Example #21
0
    def read ( self ):

        """
        get next char from input stream with filtering

        arguments:
            self

        returns:
            single Unicode char on success, null string otherwise
        """

#       print 'reading: buf=' , self.buf

        while True:

            if not self._reload():       # check if buffer empty and reload if needed
                return END               # return EOF if no more chars available

#           print 'buf=' , self.buf

            c = self.buf.pop(0)          # next raw char in buffer

            if c == SHYP:                # ignore soft hyphen
                if len(self.buf) > 0:
                    if self.buf[0] == SP:
                        c = self.buf.pop(0)
                continue

            if not ellyChar.isText(c):   # unrecognizable Elly char?
#               print 'c=' , '{0:04x}'.format(ord(c))
                if ellyChar.isCJK(c):
                    c = '_'              # special handling for Chinese
                else:
#                   print 'replace' , c , 'with NBSP'
                    c = NBSP             # by default, replace with no-break space

            lc = self._lc                # copy saved last char
#           print 'lc=' , ord(lc)
            self._lc = c                 # set new last char

#           if c == "'":
#               print 'apostrophe' , self.buf

#           print 'c=' , '<' + c + '>'

            if c == HYPH:                # special treatment for isolated hyphens
                if spc(lc) and spc(self.peek()):
                    c = DASH
                break
            elif c == '.':               # check for ellipsis
                bb = self.buf
                bl = len(bb)
#               print 'bl=' , bl , 'bb=' , bb
                if bl >= 2 and bb[0] == '.' and bb[1] == '.':
                    self.buf = bb[2:]
                    c = ELLP
                elif bl >= 4 and bb[0] == ' ' and bb[1] == '.' and bb[2] == ' ' and bb[3] == '.':
                    self.buf = bb[4:]
                    c = ELLP
                break
            elif c == RSQm:              # check for single quote
#               print 'at single quote'
                nc = self.peek()         # look at next char
#               print 'next=' , nc
                if nc == RSQm:           # doubling of single quote?
                    self.buf.pop(0)      # if so, combine two single quotes
                    c = RDQm             # into one double quote
            elif not ellyChar.isWhiteSpace(c):
                if ellyChar.isWhiteSpace(lc):
                    self._cap = ellyChar.isUpperCaseLetter(c)
                break
            elif c == CR:                # always ignore
                continue
            elif c == NL:                # special handling of \n
#               print 'got NL'
                nc = self.peek()         # look at next char

                while nc == CR:
                    self.buf.pop(0)      # skip over CR's
                    nc = self.peek()
#               print "lc= '" + lc + "'"
                if lc != NL and nc == NL:
                    self.buf.pop(0)      # special case when NL can be returned
                    break

                if nc == NL:             # NL followed NL?
                    while nc == NL or nc == CR:
                        self.buf.pop(0)  # ignore subsequent new line chars
                        nc = self.peek()
                elif nc == END or ellyChar.isWhiteSpace(nc):
                    continue             # NL followed by space is ignored
                elif nc == u'.' or nc == u'-':
                    pass
                else:
#                   print 'NL to SP, lc=' , ord(lc)
                    c = SP               # convert NL to SP if not before another NL
            else:
#               print 'lc=' , ord(lc) , 'c=' , ord(c)
                c = SP                   # otherwise, convert white space to plain space

            self._cap = False

            if not ellyChar.isWhiteSpace(lc): # preceding char was not white space?
#               print 'return SP'
                break                    # if so, keep space in stream

        return c                         # next filtered char
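
A reduced standalone sketch of the filtering policy in read(): soft hyphens disappear, carriage returns are dropped, a lone newline becomes a plain space, and runs of white space collapse. This is an approximation only; the real method also handles ellipsis, quote doubling, and capitalization state:

SHYP = '\u00ad'                    # soft hyphen

def filterChars ( raw ):
    out = [ ]
    lc = ''
    for c in raw.replace('\r','').replace(SHYP,''):
        if c == '\n':
            c = ' '                # single newline reads as a space
        if c == ' ' and lc == ' ':
            continue               # collapse repeated white space
        out.append(c)
        lc = c
    return ''.join(out)

print ( repr(filterChars('co\u00adop-\r\nerate')) )   # 'coop- erate'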
Example #22
0
    def match(self, txt, pnc, ctx):
        """
        compare a punctuation mark and its context with a pattern

        arguments:
            self  -
            txt   - list of text chars leading up to punctuation char
            pnc   - punctuation char
            ctx   - next chars after punctuation

        returns:
            True on match, False otherwise
        """

        #       print ( 'matching for txt=' , txt , 'pnc= [' , pnc , ' ] ctx=' , ctx )

        if matchtoo(txt, pnc, ctx):  # exception by complex match?
            return True
#       print ( 'matchtoo() returned False' )

        sep = ctx[0] if len(ctx) > 0 else ''
        if sep == ellyChar.THS:
            return True
        nxt = ctx[1] if len(ctx) > 1 else ''

        #       print ( 'lstg=' , self.lstg.keys() )
        if not pnc in self.lstg:  # get stored patterns for punctuation
            return False

        lp = self.lstg[pnc]

        #       print ( len(lp) , 'patterns' )

        ltx = len(txt)  # current length of accumulated text so far
        ntr = 1
        while ntr <= ltx:
            if not ellyChar.isLetterOrDigit(txt[-ntr]):
                break
            ntr += 1
        nrg = ntr
        ntr -= 1  # available trailing chars for  wildcard * match

        while nrg <= ltx:
            c = txt[-nrg]
            if not ellyChar.isLetterOrDigit(
                    c) and not ellyChar.isEmbeddedCombining(c):
                #               print ( 'break at nrg=' , nrg , txt[-nrg] )
                break
            nrg += 1
        nrg -= 1  # end of range for all pattern matching

        #       print ( 'ntr=' , ntr , 'nrg=' , nrg )

        txt = txt[-nrg:]  # reset text to limit for matching
        ltx = len(txt)  # its new length

        #       print ( 'txt= ' + str(txt) + ' pnc= [' + pnc + '] nxt=[' + nxt + ']' )

        for p in lp:  # try matching each listed exception pattern

            if p.left != None and len(p.left) > 0:

                pat = p.left
                star = pat[-1] == ellyWildcard.cALL
                n = len(pat)  # each pattern element matches one sequence char
                if star:  # except for a final wildcard *
                    #                   print ( 'pattern ending with *' )
                    n -= 1
                    #                   print ( 'ltx=' , ltx , 'n=' , n )
                    if ltx < n:
                        continue  # cannot match pattern properly
                    pat = pat[:-1]
                    t = txt[:n]
                else:
                    if ltx < n:
                        continue  # cannot match pattern properly
                    t = txt[-n:]

                if not ellyWildcard.match(pat, t, 0):
                    #                   print ( 'no possible pattern match' )
                    continue

                k = ltx - n  # extra chars beyond any match
                #               print ( 'k=' , k , 't=' , t )
                #               print ( 'txt=' , txt )
                #               print ( 'pat=' , '[' + ellyWildcard.deconvert(pat) + ']' )
                #               print ( 'matches' , n , 'chars' )
                if not star and k > 0:
                    #                   print ( 'check text before [' , txt[-n] , ']' )
                    if ellyChar.isLetterOrDigit(txt[-n]):
                        c = txt[-n - 1]
                        #                       print ( 'preceding= [', c , ']' )
                        if ellyChar.isLetterOrDigit(c) or c == '&':
                            continue  # because break in text is required

#           print ( 'pat=' , ellyWildcard.deconvert(p.left) )
#           print ( 'n=' , n , 'ltx=' , ltx )
#           print ( 'txt=' , txt )

#           nc = '\\n' if nxt == '\n' else nxt
#           print ( 'right pat=' , '[' + ellyWildcard.deconvert(p.right) + ']' )
#           print ( 'versus c=' , nc )

            rp = p.right
            if rp == [] or rp[0] == ellyWildcard.cALL:
                return True
            pcx = rp[0]
            if pcx == nxt:  # check for specific char after possible stop
                #               print ( 'right=' , nxt )
                return True
            elif pcx == ellyWildcard.cALF:  # check for alphabetic
                if ellyChar.isLetter(nxt):
                    #                   print ( 'right is alphabetic=' , nxt )
                    return True
            elif pcx == ellyWildcard.cDIG:  # check for numeric
                if ellyChar.isDigit(nxt):
                    #                   print ( 'right is numeric=' , nxt )
                    return True
            elif pcx == ellyWildcard.cUPR:  # check for upper case
                if ellyChar.isUpperCaseLetter(nxt):
                    return True
            elif pcx == ellyWildcard.cLWR:  # check for lower case
                if ellyChar.isLowerCaseLetter(nxt):
                    return True
            elif pcx == ellyWildcard.cCAN:  # check for non-alphanumeric
                if not ellyChar.isLetterOrDigit(nxt):
                    #                   print ( 'right is non-alphanumeric=' , nxt )
                    return True

#       print ( "no matches" )
        return False
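
The left-pattern test above has two anchoring modes: a pattern ending in the wildcard * is matched against the start of the trailing text window (t = txt[:n]), while any other pattern must match the window's end exactly (t = txt[-n:]). A plain-string sketch of that distinction (leftMatches is hypothetical and ignores the wildcard chars themselves):

def leftMatches ( pattern , text ):
    # pattern ending in '*' anchors at the start of the window,
    # as with t = txt[:n] above; otherwise anchor at the end
    star = pattern.endswith('*')
    if star:
        pattern = pattern[:-1]
        return len(text) >= len(pattern) and text.startswith(pattern)
    else:
        return len(text) >= len(pattern) and text.endswith(pattern)

print ( leftMatches('Mr*','Mrs') )   # True
print ( leftMatches('Mr' ,'Dr' ) )   # False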
Example #23
0
    def _lookUpNext ( self ):

        """
        look up possible next segments in input buffer by various means,
        keeping tokens only for the LONGEST segment

        arguments:
            self

        returns:
            True on successful lookup, False otherwise

        exceptions:
            ParseOverflow
        """

        self.sbu.skipSpaces()          # skip leading spaces
        s = self.sbu.buffer
#       print '_lookUp@0 buffer=' , s

        if len(s) == 0:                # check for end of input
            return False               # if so, done

#       print 'in =' , unicode(self.sbu)
        if self.trs != None:           # preanalysis of number expressions
            self.trs.rewriteNumber(s)

#       print '_lookUp@1 buffer=' , self.sbu.buffer
#       print 'macro expansion s[0]=' , s[0]
        self.sbu.expand()              # apply macro substitutions
#       print 'macro expanded  s[0]=' , s[0]
#       print '_lookUp@2 buffer=' , self.sbu.buffer

        s = self.sbu.buffer

#       print 'expanded len=' , len(s)
        if len(s) == 0: return True    # macros can empty out buffer

        k = self.sbu.findBreak()       # find extent of first component for lookup
        if k == 0:
            k = 1                      # must have at least one char in token

#       print 'break at k=' , k
        kl = len(s)
        if  k + 1 < kl and s[k] == '+' and s[k+1] == ' ':
            k += 1                     # recognize possible prefix

#       print 'len(s)=' , kl , 'k=' , k , 's=', s

#       print '_lookUp@3 buffer=' , self.sbu.buffer
        mr = self._scanText(k)         # text matching in various ways
        mx  = mr[0]                    # overall maximum match length
        chs = mr[1]                    # any vocabulary element matched
        suf = mr[2]                    # any suffix removed in matching
#       print '_lookUp@4 buffer=' , self.sbu.buffer
        s = self.sbu.buffer
#       print 'k=' , k
#       print 'scan result mx=' , mx , 'chs=' , chs , 'suf=' , suf
#       print 'len(s)=' , len(s) , 's=' , s

        if ( k < mx or
             k == mx and suf != '' ):  # next word cannot produce token as long as already seen?

#           print 'queue:' , len(self.ptr.queue)
#           print 'chs=' , chs
            if len(chs) > 0:           # any vocabulary matches?
#               print 'put back' , suf , mx , s
                self.sbu.skip(mx)      # if so, they supersede
                if suf != '':          # handle any suffix removal
                    self.sbu.prepend(list(suf))
#                   print 'suf=' , suf
            else:
                chs = self.sbu.extract(mx)
#               print 'extracted chs=' , chs
#           print 'token chs=' , chs
            to = ellyToken.EllyToken(chs)
#           print 'long token=' , unicode(to)
            self.ctx.addTokenToListing(to)
            if suf != '':
                if not ellyChar.isApostrophe(suf[1]):
                    to.dvdd = True     # must note suffix removal for token!
#           print 'only queue:' , len(self.ptr.queue)
            return True

#       print 'mx=' , mx
#       print 'plus queue:' , len(self.ptr.queue)
        wsk = self.sbu.buffer[:k]
        cap = ellyChar.isUpperCaseLetter(wsk[0])
#       print 'wsk=' , wsk
        rws = u''.join(wsk)
        found = self.ptr.createPhrasesFromDictionary(rws.lower(),False,cap)
        if not found:
#           print 'not found, k=' , k
            if k > mx and k > 1 and ellyChar.isEmbeddedCombining(rws[-1]):
                k -= 1
                rws = rws[:-1]
                found = self.ptr.createPhrasesFromDictionary(rws.lower(),False,cap)
#       print 'found in dictionary=' , found
        if found or mx > 0:            # match found in dictionary or by text scan
            if not found:
                k = mx                 # if by text scan, must make token longer
                rws = rws[:k]          # if mx > k
            self.sbu.skip(k)
#           print 'next=' , self.sbu.buffer[self.sbu.index:]
#           print 'queue after =' , len(self.ptr.queue)
            to = ellyToken.EllyToken(rws[:k])
            if len(suf) > 1:           # change token to show suffix properly
#               print 'suf=' , suf
                cs = suf[1]            # first char in suffix after '-'
                rt = to.root           # this is a list!
                lk = -1                # start at last char in token
                while rt[lk] != cs: lk -= 1
                sn = len(rt) + lk      # where to divide suffix from root
#               print 'sn=' , sn , rt
                to.root = rt[:sn]      # root without suffix
                self.sbu.prepend(suf)  # restore suffix to input for processing
            else:                      # no suffix
                chx = self.sbu.peek()  # look at next char after match
                if chx == '-':         # if hyphen, need to separate it
                    self.sbu.skip()
                    if ellyChar.isLetter(self.sbu.peek()):
                        self.sbu.prepend(' ')
                    self.sbu.prepend('-')
#           print 'add' , unicode(to)
            self.ctx.addTokenToListing(to)  # add token to listing for sentence
            return True

#       print '[' + rws + ']' , 'still unrecognized'

        chx = rws[0]                   # special hyphen check
        if chx == '-' and k > 1:
#           print 'look in  internal dictionary'
            if self.ptr.createPhrasesFromDictionary(chx,False,False):
#               print 'found!'
                to = ellyToken.EllyToken(chx)  # treat hyphen as token
                self.ctx.addTokenToListing(to) # add it to token list
                self.sbu.skip()                # remove from input
                return True

        to = self._extractToken(mx)    # single-word matching with analysis and lookup

#       print 'extracted to=' , unicode(to)
        if to == None:                 # if no match, we are done and will return
#           print 'mx=' , mx
            return False if mx == 0 else True  # still success if _scanText() found something
        if self.ptr.lastph != None:
            self.ptr.lastph.lens = to.getLength()

#       print 'to=' , unicode(to) , 'len(s)=' , len(s) , s
#       posn = self.ctx.countTokensInListing()
#       print 'at', posn , 'in token list'
        self.ctx.addTokenToListing(to) # add token to listing for sentence
#       tol = self.ctx.getNthTokenInListing(-1)
#       print 'last token root=' , tol.root
        return True                    # successful lookup
Example #24
0
    def _scanText ( self , k ):

        """
        try to match in buffer regardless of word boundaries
        using Elly vocabulary, pattern, and template tables and
        also running Elly entity extractors

        arguments:
            self  -
            k     - length of first component in buffer

        returns:
            match parameters [ text span of match , vocabulary match , suffix removed ]

        exceptions:
            ParseOverflow
        """

#       print '_scanText k=' , k
        sb = self.sbu.buffer           # input buffer
        tr = self.ptr                  # parse tree for results

#       print '_scanText sb=' , sb
                                       # initialize match status
        nspan = 0                      #   total span of match
        vmchs = [ ]                    #   chars of vocabulary entry matched
        suffx = ''                     #   any suffix removed in match

        d = self.rul                   # grammar rule definitions

        m = d.ptb.match(sb,tr)         # try token by pattern match next
#       print 'pattern m=' , m
        if  nspan < m:
            nspan = m                  # on longer match, update maximum

        m = d.ctb.match(sb,tr)         # try multi-word template  match next
#       print 'template m=' , m
        if  nspan < m:
            nspan = m                  # on longer match, update maximum

        m = self.iex.run(sb)           # try entity extractors next
#       print 'extractor m=' , m
        if  nspan < m:
            nspan = m                  # on longer match, update maximum

#       lm = len(sb)                   # scan limit
#       print 'lm=' , lm , 'm=' , m
        capd = ellyChar.isUpperCaseLetter(sb[0])
#       print 'next component=' , sb[:k] , ', context=' , sb[k:lm]

        if self.vtb != None:           # look in external dictionary, if it exists
            ls = list(sb[:k])
#           print 'ls 0=' , ls
            ellyChar.toLowerCaseASCII(ls)
            ss = u''.join(ls)                   # where to start for vocabulary indexing
#           print 'ls 1=' , ls
            n = vocabularyTable.delimitKey(ss)  # get actual indexing
#           print 'delimiting n=' , n , '=' , '<' + ss[:n] + '>'
#           print vocabularyTable.listDBKeys(self.vtb.cdb)

            rl = self.vtb.lookUp(sb,n) # get list of the maximum text matches
#           print len(rl) , 'matches'
            if len(rl) > 0:            #
                r0 = rl[0]             # look at first record
#               print 'r0=' , r0
                vmln = r0.nspan        # should be same for all matches
                vchs = r0.vem.chs      #
                vsfx = r0.suffx        #
#               print 'nspan=' , vmln , vsfx

                if ( vmln > nspan or
                     vmln == nspan and vsfx == '' ):

                    nspan = vmln       # keep vocabulary matches
                    vmchs = vchs       #
                    suffx = vsfx       #

                    for r in rl:
                        ve = r.vem     # get vocabulary entry
#                       print 've=' , ve
#                       if ve.gen != None: print 've.gen=' , ve.gen
                        if tr.addLiteralPhraseWithSemantics(
                                ve.cat,ve.syf,ve.smf,ve.bia,ve.gen,len(suffx) > 0):
                            tr.lastph.lens = nspan  # char length of leaf phrase node
                                                    # needed for later selection
                            tr.lastph.krnl.cncp = ve.con
                            if capd:
                                tr.lastph.krnl.semf.set(0)
#                           print 'vocab phr=' , tr.lastph , 'len=' , tr.lastph.lens
                            if suffx != '':
                                if ellyChar.isApostrophe(suffx[1]):
                                    tr.lastph.krnl.usen = 0

#               print 'vocabulary m=' , vmln
#               print 'queue after table lookup:' , len(self.ptr.queue)

#           print 'sb=' , sb

#       print 'maximum match=' , nspan
#       print 'input=' , self.sbu.buffer[:nspan]

        if nspan > 0:                  # any matches at all?
            tr.requeue()               # if so, keep only longest of them
#       print 'queue after scan:' , len(self.ptr.queue)

#       print 'returns [' , nspan , ',' , vmchs , ',' , suffx , ']'
        return [ nspan , vmchs , suffx ]
Example #25
0
    def read(self):
        """
        get next char from input stream with filtering

        arguments:
            self

        returns:
            single Unicode char on success, null string otherwise
        """

        #       print 'reading: buf=' , self.buf

        while True:

            if not self._reload():  # check if buffer empty and reload if needed
                return END  # return EOF if no more chars available

#           print 'buf=' , self.buf

            c = self.buf.pop(0)  # next raw char in buffer

            if c == SHYP:  # ignore soft hyphen
                if len(self.buf) > 0:
                    if self.buf[0] == SP:
                        c = self.buf.pop(0)
                continue

            if not ellyChar.isText(c):  # unrecognizable Elly char?
                #               print 'c=' , '{0:04x}'.format(ord(c))
                if ellyChar.isCJK(c):
                    if ellyConfiguration.language != 'ZH':
                        c = '_'  # special handling for non-Chinese input
                elif not c in [u'\uff0c', u'\u3002']:
                    #                   print 'replace' , c , 'with NBSP'
                    c = NBSP  # by default, replace with no-break space

            lc = self._lc  # copy saved last char
            #           print 'lc=' , ord(lc)
            self._lc = c  # set new last char

            #           if c == "'":
            #               print 'apostrophe' , self.buf

            #           print 'c=' , '<' + c + '>'

            if c == HYPH:  # special treatment for isolated hyphens
                if spc(lc) and spc(self.peek()):
                    c = DASH
                break
            elif c == '.':  # check for ellipsis
                bb = self.buf
                bl = len(bb)
                #               print 'bl=' , bl , 'bb=' , bb
                if bl >= 2 and bb[0] == '.' and bb[1] == '.':
                    self.buf = bb[2:]
                    c = ELLP
                elif bl >= 4 and bb[0] == ' ' and bb[1] == '.' and bb[2] == ' ' and bb[3] == '.':
                    self.buf = bb[4:]
                    c = ELLP
                break
            elif c == RSQm:  # check for single quote
                #               print 'at single quote'
                nc = self.peek()  # look at next char
                #               print 'next=' , nc
                if nc == RSQm:  # doubling of single quote?
                    self.buf.pop(0)  # if so, combine two single quotes
                    c = RDQm  # into one double quote
            elif not ellyChar.isWhiteSpace(c):
                if ellyChar.isWhiteSpace(lc):
                    self._cap = ellyChar.isUpperCaseLetter(c)
                break
            elif c == CR:  # always ignore
                continue
            elif c == NL:  # special handling of \n
                #               print 'got NL'
                nc = self.peek()  # look at next char

                while nc == CR:
                    self.buf.pop(0)  # skip over CR's
                    nc = self.peek()
#               print "lc= '" + lc + "'"
                if lc != NL and nc == NL:
                    self.buf.pop(0)  # special case when NL can be returned
                    break

                if nc == NL:  # NL followed NL?
                    while nc == NL or nc == CR:
                        self.buf.pop(0)  # ignore subsequent new line chars
                        nc = self.peek()
                elif nc == END or ellyChar.isWhiteSpace(nc):
                    continue  # NL followed by space is ignored
                elif nc == u'.' or nc == u'-':
                    pass
                else:
                    #                   print 'NL to SP, lc=' , ord(lc)
                    c = SP  # convert NL to SP if not before another NL
            else:
                #               print 'lc=' , ord(lc) , 'c=' , ord(c)
                c = SP  # otherwise, convert white space to plain space

            self._cap = False

            if not ellyChar.isWhiteSpace(lc):  # preceding char was not white space?
                #               print 'return SP'
                break  # if so, keep space in stream

        return c  # next filtered char
Example #26
0
    def match ( self , txt , pnc , ctx ):

        """
        compare a punctuation mark and its context with a pattern

        arguments:
            self  -
            txt   - list of text chars leading up to punctuation char
            pnc   - punctuation char
            ctx   - next chars after punctuation

        returns:
            True on match, False otherwise
        """

#       print 'matching for txt=' , txt , 'pnc= [' , pnc , ' ] ctx=' , ctx

        if matchtoo(txt,pnc,ctx):     # exception by complex match?
            return True
#       print 'matchtoo() returned False'

        sep = ctx[0] if len(ctx) > 0 else ''
        if sep == ellyChar.THS:
            return True
        nxt = ctx[1] if len(ctx) > 1 else ''
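        # sep is the char immediately after pnc; nxt is the char after sep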

#       print 'lstg=' , self.lstg.keys()
        if pnc not in self.lstg:     # get stored patterns for punctuation
            return False

        lp = self.lstg[pnc]

#       print len(lp) , 'patterns'

        ltx = len(txt)               # length of the accumulated text so far
        ntr = 1
        while ntr <= ltx:
            if not ellyChar.isLetterOrDigit(txt[-ntr]):
                break
            ntr += 1
        nrg = ntr
        ntr -= 1                     # available trailing chars for wildcard * match

        while nrg <= ltx:
            c = txt[-nrg]
            if not ellyChar.isLetterOrDigit(c) and not ellyChar.isEmbeddedCombining(c):
#               print 'break at nrg=' , nrg , txt[-nrg]
                break
            nrg += 1
        nrg -= 1                     # end of range for all pattern matching
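        # ntr counts just the letters and digits directly before pnc;
        # nrg extends that span across any embedded combining chars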

#       print 'ntr=' , ntr , 'nrg=' , nrg

        txt = txt[-nrg:]             # reset text to limit for matching
        ltx = len(txt)               # its new length

#       print 'txt= ' + unicode(txt) + ' pnc= [' + pnc + '] nxt=[' + nxt + ']'

        for p in lp:                 # try matching each listed exception pattern

            if p.left is not None and len(p.left) > 0:

                pat = p.left
                star = pat[-1] == ellyWildcard.cALL
                n = len(pat)         # each pattern element matches one sequence char
                if star:             # except for a final wildcard *
#                   print 'pattern ending with *'
                    n -= 1
#                   print 'ltx=' , ltx , 'n=' , n
                    if ltx < n:
                        continue     # cannot match pattern properly
                    pat = pat[:-1]
                    t = txt[:n]
                else:
                    if ltx < n:
                        continue     # cannot match pattern properly
                    t = txt[-n:]

                if not ellyWildcard.match(pat,t,0):
#                   print 'no possible pattern match'
                    continue

                k = ltx - n          # extra chars beyond any match
#               print 'k=' , k , 't=' , t
#               print 'txt=' , txt
#               print 'pat=' , '[' + ellyWildcard.deconvert(pat) + ']'
#               print 'matches' , n , 'chars'
                if not star and k > 0:
#                   print 'check text before [' , txt[-n] , ']'
                    if ellyChar.isLetterOrDigit(txt[-n]):
                        c = txt[-n-1]
#                       print 'preceding= [', c , ']'
                        if ellyChar.isLetterOrDigit(c) or c == '&':
                            continue # because break in text is required

#           print 'pat=' , ellyWildcard.deconvert(p.left)
#           print 'n=' , n , 'ltx=' , ltx
#           print 'txt=' , txt

#           nc = '\\n' if nxt == '\n' else nxt
#           print 'right pat=' , '[' + ellyWildcard.deconvert(p.right) + ']'
#           print 'versus c=' , nc

            rp = p.right
            if rp == [] or rp[0] == ellyWildcard.cALL:
                return True
            pcx = rp[0]
            if pcx == nxt:                     # check for specific char after possible stop
#               print 'right=' , nxt
                return True
            elif pcx == ellyWildcard.cALF:     # check for alphabetic
                if ellyChar.isLetter(nxt):
#                   print 'right is alphabetic=' , nxt
                    return True
            elif pcx == ellyWildcard.cDIG:     # check for numeric
                if ellyChar.isDigit(nxt):
#                   print 'right is numeric=' , nxt
                    return True
            elif pcx == ellyWildcard.cUPR:     # check for upper case
                if ellyChar.isUpperCaseLetter(nxt):
                    return True
            elif pcx == ellyWildcard.cLWR:     # check for lower case
                if ellyChar.isLowerCaseLetter(nxt):
                    return True
            elif pcx == ellyWildcard.cCAN:     # check for non-alphanumeric
                if not ellyChar.isLetterOrDigit(nxt):
#                   print 'right is non-alphanumeric=' , nxt
                    return True

#       print "no matches"
        return False
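
The left/right split in these patterns is easier to follow with a toy
stand-in. Everything below is hypothetical scaffolding for illustration
only (PyElly's real pattern records are compiled from exception tables
and support wildcards):

class Pat(object):
    """trivial pattern record: literal left context plus one right char"""
    def __init__(self, left, right):
        self.left = left    # chars required just before the mark
        self.right = right  # char required just after it, '' = anything

def simple_match(txt, pnc, ctx, pats):
    """crude analogue of match(): no wildcards, looks one char past pnc"""
    nxt = ctx[0] if len(ctx) > 0 else ''
    for p in pats:
        n = len(p.left)
        if n > 0 and ''.join(txt[-n:]) != p.left:
            continue        # left context does not match
        if p.right == '' or p.right == nxt:
            return True     # right context accepted
    return False

pats = [Pat('Mr', ' ')]     # 'Mr.' before a space is not a sentence stop
print(simple_match(list('see Mr'), '.', [' ', 'S'], pats))    # True
print(simple_match(list('the end'), '.', [' ', 'S'], pats))   # False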