Example #1
def finAPO ( ss , sp ):

    """
    handle final apostrophes

    arguments:
        ss  - character stream
        sp  - last char of word in stream
    """

    lss = len(ss)
#   print 'finAPO lss=' , lss , ss[sp:]
    if lss > sp + 1:
#       print 'ending=' , ss[sp:]
        if ellyChar.isApostrophe(ss[sp+1]):
            if lss > sp + 2:
#               print 'ss=' , ss[sp:]
                if ss[sp+2].lower() == 's':
                    if terminate(ss,sp+3,lss):
                        sp += 1
#                       print 'sp=' , sp
                        ss[sp] = "'"
                        return
            if ss[sp].lower() == 's' and terminate(ss,sp+2,lss):
                sp += 1
                ss[sp] = "'"
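finAPO normalizes a possessive apostrophe in place, but only advances its local copy of sp, so a caller sees the rewritten character rather than the new offset. A minimal harness with stand-ins for ellyChar.isApostrophe and terminate (both assumed here, since their real definitions are not part of this example) exercises the 'S case:

import types

# stand-in for the ellyChar module used by finAPO (assumed behavior)
ellyChar = types.SimpleNamespace(isApostrophe=lambda c: c in ("'", '\u2019'))

def terminate ( ss , n , lss ):
    # assumed helper: position n is past the end of the stream or at a space
    return n >= lss or ss[n] == ' '

# with finAPO defined as above:
ss = list('dog\u2019s bone')       # curly apostrophe in the possessive
finAPO(ss,2)                       # sp = 2, last char of "dog"
print(''.join(ss))                 # -> "dog's bone" with ASCII apostrophe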
Example #2
    def getRules ( self , a ):

        """
        get appropriate macros for text with specified starting char

        arguments:
            self  -
            a     - first letter of current buffer contents (NOT space!)

        returns:
            a list of unpacked macro rules to try out
        """

#       print 'getRules(a=' , a , ')'
        if a == '': return [ ]
        if ellyChar.isLetterOrDigit(a):
            k = ellyChar.toIndex(a)
            ls = self.index[k]
#           print 'index a=' , a , 'k=' , k
            ws = self.letWx if ellyChar.isLetter(a) else self.digWx
            uniqueAdd(ls,ws)
            uniqueAdd(ls,self.anyWx)
        elif ellyChar.isApostrophe(a):
            ls = self.apoWx
        else:
            ls = self.index[0]
            uniqueAdd(ls,self.anyWx)
#       print len(ls) , ' rules to check'
        return [ r.unpack() for r in ls ]
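The uniqueAdd helper is not defined in this excerpt. From its use above it evidently merges one rule list into another while skipping duplicates; a minimal sketch under that assumption:

def uniqueAdd ( ls , ws ):
    """
    merge rules from ws into ls, skipping any already present
    (assumed behavior; the actual Elly helper is not shown here)
    """
    for w in ws:
        if w not in ls:
            ls.append(w)

Note that ls can alias self.index[k] above, so merged rules would persist in the index across calls; this excerpt alone does not show whether that accumulation is intended.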
Example #3
def finAPO(ss, sp):
    """
    handle final apostrophes

    arguments:
        ss  - character stream
        sp  - last char of word in stream
    """

    lss = len(ss)
    #   print 'finAPO lss=' , lss , ss[sp:]
    if lss > sp + 1:
        #       print 'ending=' , ss[sp:]
        if ellyChar.isApostrophe(ss[sp + 1]):
            if lss > sp + 2:
                #               print 'ss=' , ss[sp:]
                if ss[sp + 2].lower() == 's':
                    if terminate(ss, sp + 3, lss):
                        sp += 1
                        #                       print 'sp=' , sp
                        ss[sp] = "'"
                        return
            if ss[sp].lower() == 's' and terminate(ss, sp + 2, lss):
                sp += 1
                ss[sp] = "'"
Example #4
def delimitKey ( t ):

    """
    get part of term for vocabulary table indexing that
    ends in alphanumeric or is a single nonalphanumeric
    with special stripping of 'S at the end

    arguments:
        t  - text string to scan

    returns:
        count of chars to put into search key
    """

    ln = len(t)                   # number of chars in input text
    if ln == 0: return 0
    n = t.find(' ')               # find rough range of key for SQLite in text
    if n < 0: n = ln              # if undivided by spaces, take everything
    n -= 1                        # index of last char in range
    while n > 0:                  # scan input text backwards
        c = t[n]                  # check char for alphanumeric
        if ellyChar.isLetterOrDigit(c):
#           print 'n=' , n , 'c=' , c
            if n > 1:             # check for 'S as special case!
                if ( c in [ 's' , 'S' ] and
                     ellyChar.isApostrophe(t[n-1]) ):
#                   print 'drop \'S from SQLite key'
                    n -= 1
                else:
                    break
            else:
                break
        n -= 1                    # continue scanning backwards
    return n + 1                  # to get key length ending in alphanumeric
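A self-contained run of this version, with stand-ins for the two ellyChar predicates (assumptions, since ellyChar itself is not shown), illustrates the 'S stripping:

import types

ellyChar = types.SimpleNamespace(
    isLetterOrDigit=str.isalnum,                  # assumed approximation
    isApostrophe=lambda c: c in ("'", '\u2019')   # ASCII and right single quote
)

# with delimitKey defined as above:
print(delimitKey("dog's house"))    # -> 3: key covers "dog", the 'S is stripped
print(delimitKey('houses plural'))  # -> 6: plain key ends at the space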
Example #5
    def getRules(self, a):
        """
        get appropriate macros for text with specified starting char

        arguments:
            self  -
            a     - first letter of current buffer contents (NOT space!)

        returns:
            a list of unpacked macro rules to try out
        """

        #       print ( 'getRules(a=' , a , ')' )
        if a == '': return []
        if ellyChar.isLetterOrDigit(a):
            k = ellyChar.toIndex(a)
            ls = self.index[k]
            #           print ( 'index a=' , a , 'k=' , k )
            ws = self.letWx if ellyChar.isLetter(a) else self.digWx
            uniqueAdd(ls, ws)
            uniqueAdd(ls, self.anyWx)
        elif ellyChar.isApostrophe(a):
            ls = self.apoWx
        else:
            ls = self.index[0]
            uniqueAdd(ls, self.anyWx)
#       print ( len(ls) , ' rules to check' )
        return [r.unpack() for r in ls]
Example #6
    def simplify(self, strg):
        """
        apply inflectional stemming to string

        arguments:
           self  -
           strg  - input Unicode string

        returns:
           stemmed Unicode string
        """

        if len(strg) < 4: return strg
        if strg[-1] == "s" and ellyChar.isApostrophe(strg[-2]):
            return strg[:-2]
        else:
            t = ellyToken.EllyToken(strg)
            self.apply(t)
            return t.toUnicode()
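Only the possessive fast path of simplify can be demonstrated in isolation, since the fallback needs EllyToken and the stemmer's apply method. A sketch of that first branch, assuming isApostrophe accepts the ASCII and right single quotes:

# mirrors: len(strg) >= 4 and strg[-1] == 's' and isApostrophe(strg[-2])
for w in ( "yesterday's" , 'dogs' , 'cat' ):
    if len(w) >= 4 and w[-1] == 's' and w[-2] in ("'", '\u2019'):
        print(w, '->', w[:-2])      # trailing 'S stripped directly
    else:
        print(w, '-> (falls through to full inflectional stemming)')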
Example #7
def delimitKey ( t ):

    """
    get bounds of vocabulary table key for looking up a term
    starting at the front of a given text string
    with special stripping of 'S at the end

    arguments:
        t  - text string to scan

    returns:
        count of chars to take for search key
    """

    ln = len(t)                   # number of chars in input text
    if ln == 0: return 0
    if not ellyChar.isLetterOrDigit(t[0]): return 1

#   print 'delimitKey t=' , t

    k = t.find('-')               # find rough range of SQLite key in text
    n = t.find(' ')               # delimited by either a hyphen or a space
    if n < 0: n = ln              # if no space, take everything
    if k > 1 and n > k: n = k     # hyphen delimits if it comes first
    n -= 1                        # index of last char of candidate key
#   print 'k=' , k , 'n=' , n

    while n > 0:                  # scan input text backwards
        c = t[n]                  # check char for alphanumeric
        if ellyChar.isLetterOrDigit(c):
#           print 'n=' , n , 'c=' , c
            if n > 1:             # check for 'S as special case!
                if ( c in [ 's' , 'S' ] and
                     ellyChar.isApostrophe(t[n-1]) ):
#                   print 'drop \'S from SQLite key'
                    n -= 1
                else:
                    break
            else:
                break
        n -= 1                    # continue scanning backwards
#   print 'key=' , t[:n+1]
    return n + 1                  # to get key length ending in alphanumeric
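This variant adds hyphen delimiting and a guard for a nonalphanumeric first char. With the same ellyChar stand-ins as in the earlier delimitKey sketch (still an assumption about the real predicates):

print(delimitKey('mother-in-law x'))  # -> 6: hyphen comes first, key "mother"
print(delimitKey("it's fine"))        # -> 2: 'S stripped, key "it"
print(delimitKey('"quote'))           # -> 1: single nonalphanumeric char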
Example #8
def delimitKey(t):
    """
    get bounds of vocabulary table key for looking up a term
    starting at the front of a given text string
    with special stripping of 'S at the end

    arguments:
        t  - text string to scan

    returns:
        count of chars to take for search key
    """

    ln = len(t)  # number of chars in input text
    if ln == 0: return 0
    if not ellyChar.isLetterOrDigit(t[0]): return 1

    #   print ( 'delimitKey t=' , t )

    k = t.find('-')  # find rough range of SQLite key in text
    n = t.find(' ')  # delimited by either a hyphen or a space
    if n < 0: n = ln  # if no space, take everything
    if k > 1 and n > k: n = k  # hyphen delimits if it comes first
    n -= 1  # index of last char of candidate key
    #   print ( 'k=' , k , 'n=' , n )

    while n > 0:  # scan input text backwards
        c = t[n]  # check char for alphanumeric
        if ellyChar.isLetterOrDigit(c):
            #           print ( 'n=' , n , 'c=' , c )
            if n > 1:  # check for 'S as special case!
                if (c in ['s', 'S'] and ellyChar.isApostrophe(t[n - 1])):
                    #                   print ( 'drop \'S from SQLite key' )
                    n -= 1
                else:
                    break
            else:
                break
        n -= 1  # continue scanning backwards

    #   print ( 'key=' , t[:n+1] )
    return n + 1  # to get key length ending in alphanumeric
Example #9
    def simplify ( self , strg ):

        """
        apply inflectional stemming to string

        arguments:
           self  -
           strg  - input Unicode string

        returns:
           stemmed Unicode string
        """

        if len(strg) < 4: return strg
        if strg[-1] == "s" and ellyChar.isApostrophe(strg[-2]):
            return strg[:-2]
        else:
            t = ellyToken.EllyToken(strg)
            self.apply(t)
            return t.toUnicode()
Example #10
    def _scanText(self, k):
        """
        try to match in buffer regardless of word boundaries
        using Elly vocabulary, pattern, and template tables and
        also running Elly entity extractors

        arguments:
            self  -
            k     - length of first component in buffer

        returns:
            match parameters [ text span of match , vocabulary match , suffix removed ]

        exceptions:
            ParseOverflow
        """

        #       print ( '_scanText k=' , k )
        sb = self.sbu.buffer  # input buffer
        tr = self.ptr  # parse tree for results

        #       print ( '_scanText sb=' , sb )
        # initialize match status
        nspan = 0  #   total span of match
        vmchs = []  #   chars of vocabulary entry matched
        suffx = ''  #   any suffix removed in match

        d = self.rul  # grammar rule definitions

        m = d.ptb.match(sb, tr)  # try token by pattern match next
        #       print ( 'pattern m=' , m )
        if nspan < m:
            nspan = m  # on longer match, update maximum

        m = d.ctb.match(sb, tr)  # try multi-word template  match next
        #       print ( 'template m=' , m )
        if nspan < m:
            nspan = m  # on longer match, update maximum

        m = self.iex.run(sb)  # try entity extractors next
        #       print ( 'extractor m=' , m )
        if nspan < m:
            nspan = m  # on longer match, update maximum

#       print ( 'nspan=' , nspan, sb[:nspan] )

        lm = len(sb)  # scan limit
        #       print ( 'lm=' , lm , 'm=' , m )
        capd = ellyChar.isUpperCaseLetter(sb[0])
        #       print ( 'next component=' , sb[:k] , ', context=' , sb[k:lm] )

        if self.vtb != None:  # look in external dictionary, if it exists
            ls = list(sb[:k])
            #           print ( 'vtb ls 0=' , ls )
            ellyChar.toLowerCaseASCII(ls)
            ss = ''.join(ls)  # where to start for vocabulary indexing
            #           print ( 'vtb ls 1=' , ls )
            n = vocabularyTable.delimitKey(ss)  # get actual indexing
            #           print ( 'delimiting n=' , n , ':' , '<' + ss[:n] + '>' )
            #           print ( vocabularyTable.listDBKeys(self.vtb.cdb) )

            rl = self.vtb.lookUp(sb, n)  # get list of the maximum text matches
            #           print ( 'external matches=' , len(rl) )
            #           print ( 'input text=' , sb )

            if len(rl) > 0:  #
                r0 = rl[0]  # look at first record
                #               print ( 'r0=' , r0 )
                vmln = r0.nspan  # should be same for all matches
                vchs = r0.vem.chs  #
                vsfx = r0.suffx  #
                #               print ( 'nspan=' , vmln , vsfx )

                if (vmln > nspan or vmln == nspan and vsfx == ''):

                    nspan = vmln  # keep vocabulary matches
                    vmchs = vchs  #
                    suffx = vsfx  #

                    for r in rl:
                        ve = r.vem  # get vocabulary entry
                        #                       print ( 've=' , ve )
                        #                       if ve.gen != None: print ( 've.gen=' , ve.gen )
                        if tr.addLiteralPhraseWithSemantics(
                                ve.cat, ve.syf, ve.smf, ve.bia, ve.gen,
                                len(suffx) > 0):
                            tr.lastph.lens = nspan  # char length of leaf phrase node
                            # needed for later selection
                            tr.lastph.krnl.cncp = ve.con
                            if capd:
                                tr.lastph.krnl.semf.set(0)
#                           print ( 'vocab phr=' , tr.lastph , 'len=' , tr.lastph.lens )
                            if suffx != '':
                                if ellyChar.isApostrophe(suffx[1]):
                                    tr.lastph.krnl.usen = 0

#               print ( 'vocabulary m=' , vmln )
#               print ( 'queue after table lookup:' , len(self.ptr.queue) )

#           print ( 'vtb sb=' , sb )

#       print ( 'maximum match=' , nspan )
#       print ( 'next input=' , sb[:nspan] )

        if nspan > 0:  # any matches at all?
            tr.requeue()  # if so, keep only longest of them
#       print ( 'queue after scan:' , len(self.ptr.queue) )

#       print ( 'returns [' , nspan , ',' , vmchs , ',' , suffx , ']' )
        return [nspan, vmchs, suffx]
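The vocabulary tie-break in the middle of _scanText is easy to miss: an external match replaces the running maximum only when it is strictly longer, or equally long with no suffix removed. The condition in isolation:

def prefer_vocabulary ( nspan , vmln , vsfx ):
    # mirrors (vmln > nspan or vmln == nspan and vsfx == '') above
    return vmln > nspan or (vmln == nspan and vsfx == '')

print(prefer_vocabulary(5, 6, '-s'))  # True:  strictly longer span wins
print(prefer_vocabulary(5, 5, ''))    # True:  tie, but an exact match
print(prefer_vocabulary(5, 5, '-s'))  # False: tie with a suffix removed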
Example #11
def match ( patn , text , offs=0 , limt=None , nsps=0 ):

    """
    compare a pattern with wildcards to input text

    arguments:
        patn  - pattern to be matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit for any matching
        nsps  - number of spaces to match in pattern

    returns:
        bindings if match is successful, None otherwise
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
            nsps   - saved count of spaces matched
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - kind of unwinding
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1
            self.nsps  = 0

        def __str__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self
            returns:
                attributes as array
            """
            return ( '[kd=' + str(self.kind)  +
                     ',ct=' + str(self.count) +
                     ',pa=' + str(self.pats)  +
                     ',tx=' + str(self.txts)  +
                     ',bd=' + str(self.bnds)  +
                     ',ns=' + str(self.nsps)  + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # four private functions using local variables of match() defined just above
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame
        arguments:
            ns  - optional initial span of text for binding
        returns:
            binding frame
        """
#       print ( "binding:",offs,ns )
        os = offs - 1
        if ns == None: ns = 1 # by default, binding is to 1 char
        if mbi == len(mbd):   # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]         # next available record
        bf[0] = os            # set binding to range of chars
        bf[1] = os + ns       #
        return bf

    def _modify ( ):
        """
        set special tag for binding
        arguments:

        """
        mbd[mbi].append(None)

    def _mark ( kind , nsp ):
        """
        set up for backing up pattern match
        arguments:
            kind  - 0=optional 1=* wildcard
            nsp   - number of spaces in pattern still unmatched
        returns:
            unwinding frame
        """
        if unj == len(unw): # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]       # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp
        uf.txts  = offs
        uf.bnds  = mbi
        uf.nsps  = nsp
        return uf

    def _span ( typw , nsp=0 ):
        """
        count chars available for wildcard match
        arguments:
            typw - wildcard
            nsp  - spaces to be matched in pattern
        returns:
            non-negative count if any match possible, otherwise -1
        """
#       print ( "_span: typw=" , '{:04x}'.format(ord(typw)) , deconvert(typw) )
#       print ( "_span: txt @",offs,"pat @",mp,"nsp=",nsp )
#       print ( "text to span:",text[offs:] )
#       print ( "pat rest=" , patn[mp:] )
        k = minMatch(patn[mp:])                # calculate min char count to match rest of pattern

#       print ( "exclude=",k,"chars from possible span for rest of pattern" )

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findExtendedBreak(text,offs,nsp)
#       print ( mx,"chars available to scan" )
        mx -= k                                # max span reduced by exclusion
        if mx < 0: return -1                   # cannot match if max span < 0

        tfn = Matching[typw]                   # matchup function for wildcard type

#       print ( "text at",offs,"maximum wildcard match=",mx )

        nm = 0
        for i in range(mx):
            c = text[offs+i]                   # next char in text from offset
#           print ( 'span c=' , c )
            if not tfn(c): break               # stop when it fails to match
            nm += 1

#       print ( "maximum wildcard span=",nm )

        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False  # successful pattern match yet?

    if limt == None: limt = len(text)

#   print ( 'starting match, limt=',limt,text[offs:limt],":",patn )
#   print ( 'nsps=' , nsps )

    mp = 0           # pattern index
    ml = len(patn)   # pattern match limit
    last = ''

    while True:

        ## literally match as many next chars as possible

#       print ( '---- loop mp=' , mp , 'ml=' , ml )
        while mp < ml:
            if offs >= limt:
#               print ( "offs=",offs,"limt=",limt )
                last = ''
            elif limt == 0:
                break
            else:
                last = text[offs]
                offs += 1
#           print ( 'patn=' , patn )
            mc = patn[mp]
#           print ( 'matching last=' , last, '(' , '{:04x}'.format(ord(last)) if last != '' else '-', ') at' , offs )
#           print ( 'against       ' , mc  , '(' , '{:04x}'.format(ord(mc)) , ')' )
            if mc != last:
                if mc != last.lower():
                    if mc == Hyphn and last == ' ' and limt - offs > 2:
#                       print ( 'hyphen special matching, limt=', limt , 'offs=' , offs )
#                       print ( 'text[offs:]=' , text[offs:] )
                        if text[offs] != Hyphn or text[offs+1] != ' ':
                            break
                        offs += 2
                    else:
#                       print ( 'no special matching of hyphen' )
                        break

#           print ( 'matched @mp=' , mp )
            mp += 1

        ## check whether mismatch is due to special pattern char

#       print ( 'pat @',mp,"<",ml )
#       print ( "txt @",offs,'<',limt,'last=',last )
#       print ( '@',offs,text[offs:] )

        if mp >= ml:        # past end of pattern?
            matched = True  # if so, match is made
            break

        tc = patn[mp]       # otherwise, get unmatched pattern element
        mp += 1             #
#       print ( "tc=",'{:04x}'.format(ord(tc)),deconvert(tc) )

        if tc == cALL:      # a * wildcard?

#           print ( "ALL last=< " + last + " >" )
            if last != '': offs -= 1

            nm = _span(cALL,nsps)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1     # get new binding record
            bf[0] = offs                 # bind from current offset
            offs += nm                   # move offset past end of span
            bf[1] = offs                 # bind to   new     offset
#           print ( "offs=",offs,'nm=',nm )
            uf = _mark(1,nsps); unj += 1 # get new unwinding record
            uf.count = nm                # can back up this many times on mismatch
            continue

        elif tc == cEND: # end specification
#           print ( "END $:",last )
            if last == '':
                continue
            elif last == '-':
                offs -= 1
                continue
            elif last in [ '.' , ',' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]
                if ellyChar.isWhiteSpace(txc) or txc in Trmls or txc in Grk:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' , ellyChar.HYPH ]:
                offs -= 1
                continue
            elif not ellyChar.isText(last):
                offs -= 1
                continue

        elif tc == cANY: # alphanumeric wildcard?
#           print ( "ANY:",last,offs )
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cCAN: # nonalphanumeric wildcard?
#           print ( 'at cCAN' )
            if last != ellyChar.AMP:
                if last == '' or not ellyChar.isLetterOrDigit(last):
                    _bind(); mbi += 1
                    continue

        elif tc == cDIG: # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF: # letter wildcard?
#           print ( "ALF:",last,offs )
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cUPR: # uppercase letter wildcard?
#           print ( "UPR:",last,'@',offs )
            if last != '' and ellyChar.isUpperCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cLWR: # lowercase letter wildcard?
#           print ( "LWR:",last,'@',offs )
            if last != '' and ellyChar.isLowerCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC: # space wildcard?
#           print ( "SPC:","["+last+"]" )
            if last != '' and ellyChar.isWhiteSpace(last):
                nsps -= 1
                _bind(); _modify(); mbi += 1
                continue
#           print ( 'NO space' )

        elif tc == cAPO: # apostrophe wildcard?
#           print ( "APO: last=" , last )
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS:
#           print ( "SOS" )
#           print ( last,'@',offs )
            mf = _bind(0); mbi += 1      # dummy record to block
            mf[0] = -1                   #   later binding consolidation
            if last != '':
                offs -= 1                # try for rematch
            m = mp                       # find corresponding EOS
            while m < ml:                #
                if patn[m] == cEOS: break
                m += 1
            else:                        # no EOS?
                m -= 1                   # if so, pretend there is one anyway
            uf = _mark(0,nsps); unj += 1 # for unwinding on any later match failure
            uf.pats = m + 1              # i.e. one char past next EOS
            uf.txts = offs               # start of text before optional match
            continue

        elif tc == cEOS:
#           print ( "EOS" )
            if last != '':
                offs -= 1                # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:
#           print ( 'spanning wildcard, offs=' , offs , 'last=(' + last + ')' )
            if last != '':               # still more to match?
                offs -= 1
#               print ( 'nsps=' , nsps )
#               print ( '@' , offs , text )
                nm = _span(tc,nsps)      # maximum match possible

#               print ( 'spanning=' , nm )
                if nm == 0:                             # compensate for findExtendedBreak peculiarity
                    if offs + 1 < limt and mp < ml:     # with closing ] or ) to be matched in pattern
                        if patn[mp] in Enc:             # from text input
                            nm += 1

#               print ( 'spanning=' , nm )
                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs      # bind from current offset
                    offs += nm        # move offset past end of span
                    bf[1] = offs      # bind to   new     offset
                    uf = _mark(1,nsps); unj += 1
                    uf.count = nm - 1 # at least one char must be matched
#                   print ( 'offs=' , offs )
                    last = text[offs] if offs < limt else ''
                    continue
#           print ( 'fail tc=' , deconvert(tc) )

        elif tc == '':
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True        # successful match
                break

        ## match failure: rewind to last match branch
        ##

#       print ( "fail - unwinding" , unj )

        while unj > 0:               # try unwinding, if possible
#           print ( "unw:",unj )
            uf = unw[unj-1]          # get most recent unwinding record
#           print ( uf )
            if uf.count <= 0:        # if available count is used up,
                unj -= 1             # go to next unwinding record
                continue
            uf.count -= 1            # decrement available count
            uf.txts -= uf.kind       # back up one char for scanning text input
            mp = uf.pats             # unwind pattern pointer
            offs = uf.txts           # unwind text input
            mbi = uf.bnds            # restore binding
            mbd[mbi-1][1] -= uf.kind # reduce span of binding if for wildcard
            nsps = uf.nsps           #
            break
        else:
#           print ( "no unwinding" )
            break                   # quit if unwinding is exhausted
#       print ( 'cnt=' , uf.count , 'off=' , offs )

    ##
    ## clean up on match mode or on no match possible
    ##

#   print ( "matched=",matched )

    if not matched: return None     # no bindings

#   print ( text,offs )

    ## consolidate contiguous bindings for subsequent substitutions

#   print ( "BEFORE consolidating consecutive bindings" )
#   print ( "bd:",len(mbd) )
#   print ( mbd[0] )
#   print ( '----' )
#   for b in mbd[1:]:
#       print ( b )

    mbdo = mbd
    lb  = -1               # binding reference
    lbd = [ 0 , -1 ]       # sentinel value, not real binding
    mbd = [ lbd ]          # initialize with new offset after matching
    mbdo.pop(0)            # ignore empty binding
    while len(mbdo) > 0:   #
        bd = mbdo.pop(0)   # get next binding
        if len(bd) > 2:
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb = -1
        elif bd[0] < 0:    # check for optional match indicator here
            lb = -1        # if so, drop from new consolidated bindings
        elif lb == bd[0]:  # check for binding continuous with previous
            lb = bd[1]     #
            lbd[1] = lb    # if so, combine with previous binding
        else:              #
            mbd.append(bd) # otherwise, add new binding
            lbd = bd       #
            lb = bd[1]     #

    mbd[0] = offs          # replace start of bindings with text length matched

#   print ( "AFTER" )
#   print ( "bd:",len(mbd) )
#   print ( mbd[0] )
#   print ( '----' )
#   for b in mbd[1:]:
#       print ( b )

    return mbd             # consolidated bindings plus new offset
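The unwinding machinery above is easier to see in miniature. The sketch below is not Elly code and supports only literal chars plus a greedy '*', but it shows the same idea: each wildcard pushes a record of what it consumed, and on a later mismatch the most recent record gives back one char at a time before matching resumes:

def mini_match ( pat , txt ):
    stack = [ ]                     # unwinding records [pat pos, txt pos, span]
    mp = tp = 0
    while True:
        if mp == len(pat):
            return True             # pattern exhausted: match
        if pat[mp] == '*':
            span = len(txt) - tp    # greedy: take everything left
            stack.append([ mp , tp , span ])
            tp += span
            mp += 1
        elif tp < len(txt) and pat[mp] == txt[tp]:
            mp += 1                 # literal char matched
            tp += 1
        else:                       # mismatch: unwind most recent '*'
            while stack and stack[-1][2] == 0:
                stack.pop()         # drop records with no chars to give back
            if not stack:
                return False        # unwinding exhausted: no match
            stack[-1][2] -= 1       # shorten the wildcard span by one
            mp = stack[-1][0] + 1   # resume just past the wildcard
            tp = stack[-1][1] + stack[-1][2]

print(mini_match('a*c', 'abbc'))    # True
print(mini_match('a*c', 'abbd'))    # False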
Example #12
    def doMatchUp ( self , vcs , txs ):

        """
        match current text with vocabulary entry, possibly removing final inflection
        (this method assumes English; override for other languages)

        arguments:
            self  -
            vcs   - vocabulary entry chars
            txs   - text chars to be matched

        returns:
            count of txs chars matched, 0 on mismatch
        """

#       print 'match up vcs=' , vcs
        self.endg = ''                       # default inflection
        lvc = len(vcs)
        ltx = len(txs)
        if ltx < lvc: return 0

        nr = icmpr(vcs,txs)                  # do match on lists of chars
#       print 'nr=' , nr , 'nt='
#       print 'txs=' , txs , 'ltx=' , ltx

        if nr == 0:                          # vocabulary entry fully matched?
            if ltx == lvc:
                return ltx                   # if no more text, done
            dnc = ltx - lvc                  # otherwise, check text past match
#           print 'dnc=' , dnc

            if ellyChar.isApostrophe(txs[lvc]):             # apostrophe check
                if dnc > 1 and txs[lvc+1] in [ 's' , 'S' ]: # 'S found?
                    if dnc == 2 or _terminate(txs[lvc+2]):
                        self.endg = '-\'s'                  #
                        return lvc + 2                      # if so, remove ending
                    return 0
                if txs[lvc-1] in [ 's' , 'S' ]:             # S' found?
                    if dnc == 1 or _terminate(txs[lvc+1]):
                        self.endg = '-\'s'                  # put in implied S!
                        return lvc + 1                      # if so, remove ending
                    return 0
                return lvc                                  # break at apostrophe
            if _terminate(txs[lvc]):
                return lvc                                  # successful match

#       an alphanumeric char follows either a full or a partial match;
#       try inflectional stemming to align a match here

        if self.stm == None:
            return 0                     # if no stemmer, no match possible

        k = lvc - nr + 1                 # get extent of text to match against
        while k < ltx:
            if _terminate(txs[k]):
                break                    # find current end of text to match
            k += 1
        n = k - 1
        while n > 0:
            if _terminate(txs[n]):
                n += 1
                break                    # find start of stemming
            n -= 1

#       print 'k=' , k , 'n=' , n , 'nr=' , nr

        if k - n < nr:                   # check if stemming could help match
            return 0

        tc = txs[k-1]                    # last char at end of text to match
#       print 'tc=' , tc
        if tc != 's' and tc != 'd' and tc != 'g':
            return 0                     # only -S, -ED, and -ING checked

        tw = ''.join(txs[n:k])           # segment of text for stemming
        sw = self.stm.simplify(tw)       # inflectional stemming
#       print 'sw=' , sw , 'tw=' , tw
        if len(sw) + n != lvc:           # stemmed result should now align
            return 0                     #   with vocabulary entry

#       print 'nr=' , nr
        ns = 0 if nr == 0 else icmpr(vcs[-nr:],sw[-nr:]) # continue from previous match
#       print 'ns=' , ns
        if ns == 0:                      # mismatch gone?
            self.endg = ( '-s'   if tc == 's' else
                          '-ed'  if tc == 'd' else
                          '-ing' )       # indicate ending removed
#           print 'txs=' , txs
#           print 'ltx=' , ltx , 'endg=' , self.endg
            return k                     # successful match

        return 0                         # no match by default
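doMatchUp leans on icmpr, which is not shown here. From the calls above it evidently compares two char sequences and returns how many chars of the first remain unmatched, with 0 meaning a full match; a plausible sketch under that assumption:

def icmpr ( v , t ):
    # assumed helper: count chars of v left unmatched against t (0 = full match)
    for i , c in enumerate(v):
        if i >= len(t) or c != t[i]:
            return len(v) - i
    return 0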
Example #13
def match ( patn , text , offs=0 , limt=None , nsps=0 ):

    """
    compare a pattern with wildcards to input text

    arguments:
        patn  - pattern to be matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit for any matching
        nsps  - number of spaces to match in pattern

    returns:
        bindings if match is successful, None otherwise
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
            nsps   - saved count of spaces matched
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - kind of unwinding
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1
            self.nsps  = 0

        def __unicode__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self
            returns:
                attributes as array
            """
            return ( '[kd=' + unicode(self.kind)  +
                     ',ct=' + unicode(self.count) +
                     ',pa=' + unicode(self.pats)  +
                     ',tx=' + unicode(self.txts)  +
                     ',bd=' + unicode(self.bnds)  +
                     ',ns=' + unicode(self.nsps)  + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # four private functions using local variables of match() defined just above
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame
        arguments:
            ns  - optional initial span of text for binding
        returns:
            binding frame
        """
#       print "binding:",offs,ns
        os = offs - 1
        if ns == None: ns = 1 # by default, binding is to 1 char
        if mbi == len(mbd):   # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]         # next available record
        bf[0] = os            # set binding to range of chars
        bf[1] = os + ns       #
        return bf

    def _modify ( ):
        """
        set special tag for binding
        arguments:

        """
        mbd[mbi].append(None)

    def _mark ( kind , nsp ):
        """
        set up for backing up pattern match
        arguments:
            kind  - 0=optional 1=* wildcard
            nsp   - number of spaces in pattern still unmatched
        returns:
            unwinding frame
        """
        if unj == len(unw): # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]       # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp
        uf.txts  = offs
        uf.bnds  = mbi
        uf.nsps  = nsp
        return uf

    def _span ( typw , nsp=0 ):
        """
        count chars available for wildcard match
        arguments:
            typw - wildcard
            nsp  - spaces to be matched in pattern
        returns:
            non-negative count if any match possible, otherwise -1
        """
#       print "_span: typw=" , '{:04x}'.format(ord(typw)) , deconvert(typw)
#       print "_span: txt @",offs,"pat @",mp,"nsp=",nsp
#       print "text to span:",text[offs:]
#       print "pat rest=" , patn[mp:]
        k = minMatch(patn[mp:])                # calculate min char count to match rest of pattern

#       print "exclude=",k,"chars from possible span for rest of pattern"

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findExtendedBreak(text,offs,nsp)
#       print mx,"chars available to scan"
        mx -= k                                # max span reduced by exclusion
        if mx < 0: return -1                   # cannot match if max span < 0

        tfn = Matching[typw]                   # matchup function for wildcard type

#       print "text at",offs,"maximum wildcard match=",mx

        nm = 0
        for i in range(mx):
            c = text[offs+i]                   # next char in text from offset
#           print 'span c=' , c
            if not tfn(c): break               # stop when it fails to match
            nm += 1

#       print "maximum wildcard span=",nm

        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False  # successful pattern match yet?

    if limt == None: limt = len(text)

#   print 'starting match, limt=',limt,text[offs:limt],":",patn
#   print 'nsps=' , nsps

    mp = 0           # pattern index
    ml = len(patn)   # pattern match limit
    last = ''

    while True:

        ## literally match as many next chars as possible

#       print '---- loop mp=' , mp , 'ml=' , ml
        while mp < ml:
            if offs >= limt:
#               print "offs=",offs,"limt=",limt
                last = ''
            elif limt == 0:
                break
            else:
                last = text[offs]
                offs += 1
#           print 'patn=' , patn
            mc = patn[mp]
#           print 'matching last=' , last, '(' , '{:04x}'.format(ord(last)) if last != '' else '-', ') at' , offs
#           print 'against       ' , mc  , '(' , '{:04x}'.format(ord(mc)) , ')'
            if mc != last:
                if mc != last.lower():
                    if mc == Hyphn and last == ' ' and limt - offs > 2:
#                       print 'hyphen special matching, limt=', limt , 'offs=' , offs
#                       print 'text[offs:]=' , text[offs:]
                        if text[offs] != Hyphn or text[offs+1] != ' ':
                            break
                        offs += 2
                    else:
#                       print 'no special matching of hyphen'
                        break

#           print 'matched @mp=' , mp
            mp += 1

        ## check whether mismatch is due to special pattern char

#       print 'pat @',mp,"<",ml
#       print "txt @",offs,'<',limt,'last=',last
#       print '@',offs,text[offs:]

        if mp >= ml:        # past end of pattern?
            matched = True  # if so, match is made
            break

        tc = patn[mp]       # otherwise, get unmatched pattern element
        mp += 1             #
#       print "tc=",'{:04x}'.format(ord(tc)),deconvert(tc)

        if tc == cALL:      # a * wildcard?

#           print "ALL last=< " + last + " >"
            if last != '': offs -= 1

            nm = _span(cALL,nsps)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1     # get new binding record
            bf[0] = offs                 # bind from current offset
            offs += nm                   # move offset past end of span
            bf[1] = offs                 # bind to   new     offset
#           print "offs=",offs,'nm=',nm
            uf = _mark(1,nsps); unj += 1 # get new unwinding record
            uf.count = nm                # can back up this many times on mismatch
            continue

        elif tc == cEND: # end specification
#           print "END $:",last
            if last == '':
                continue
            elif last == '-':
                offs -= 1
                continue
            elif last in [ '.' , ',' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]
                if ellyChar.isWhiteSpace(txc) or txc in Trmls or txc in Grk:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' , ellyChar.HYPH ]:
                offs -= 1
                continue

        elif tc == cANY: # alphanumeric wildcard?
#           print "ANY:",last,offs
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cCAN: # nonalphanumeric wildcard?
#           print 'at cCAN'
            if last != ellyChar.AMP:
                if last == '' or not ellyChar.isLetterOrDigit(last):
                    _bind(); mbi += 1
                    continue

        elif tc == cDIG: # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF: # letter wildcard?
#           print "ALF:",last,offs
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cUPR: # uppercase letter wildcard?
#           print "UPR:",last,'@',offs
            if last != '' and ellyChar.isUpperCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cLWR: # lowercase letter wildcard?
#           print "LWR:",last,'@',offs
            if last != '' and ellyChar.isLowerCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC: # space wildcard?
#           print "SPC:","["+last+"]"
            if last != '' and ellyChar.isWhiteSpace(last):
                nsps -= 1
                _bind(); _modify(); mbi += 1
                continue
#           print 'NO space'

        elif tc == cAPO: # apostrophe wildcard?
#           print "APO: last=" , last
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS:
#           print "SOS"
#           print last,'@',offs
            mf = _bind(0); mbi += 1      # dummy record to block
            mf[0] = -1                   #   later binding consolidation
            if last != '':
                offs -= 1                # try for rematch
            m = mp                       # find corresponding EOS
            while m < ml:                #
                if patn[m] == cEOS: break
                m += 1
            else:                        # no EOS?
                m -= 1                   # if so, pretend there is one anyway
            uf = _mark(0,nsps); unj += 1 # for unwinding on any later match failure
            uf.pats = m + 1              # i.e. one char past next EOS
            uf.txts = offs               # start of text before optional match
            continue

        elif tc == cEOS:
#           print "EOS"
            if last != '':
                offs -= 1                # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:
#           print 'spanning wildcard, offs=' , offs , 'last=(' + last + ')'
            if last != '':               # still more to match?
                offs -= 1
#               print 'nsps=' , nsps
#               print '@' , offs , text
                nm = _span(tc,nsps)      # maximum match possible

#               print 'spanning=' , nm
                if nm == 0:                             # compensate for findExtendedBreak peculiarity
                    if offs + 1 < limt and mp < ml:     # with closing ] or ) to be matched in pattern
                        if patn[mp] in Enc:             # from text input
                            nm += 1

#               print 'spanning=' , nm
                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs      # bind from current offset
                    offs += nm        # move offset past end of span
                    bf[1] = offs      # bind to   new     offset
                    uf = _mark(1,nsps); unj += 1
                    uf.count = nm - 1 # at least one char must be matched
#                   print 'offs=' , offs
                    last = text[offs] if offs < limt else ''
                    continue
#           print 'fail tc=' , deconvert(tc)

        elif tc == '':
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True        # successful match
                break

        ## match failure: rewind to last match branch
        ##

#       print "fail - unwinding" , unj

        while unj > 0:               # try unwinding, if possible
#           print "unw:",unj
            uf = unw[unj-1]          # get most recent unwinding record
#           print uf
            if uf.count <= 0:        # if available count is used up,
                unj -= 1             # go to next unwinding record
                continue
            uf.count -= 1            # decrement available count
            uf.txts -= uf.kind       # back up one char for scanning text input
            mp = uf.pats             # unwind pattern pointer
            offs = uf.txts           # unwind text input
            mbi = uf.bnds            # restore binding
            mbd[mbi-1][1] -= uf.kind # reduce span of binding if for wildcard
            nsps = uf.nsps           #
            break
        else:
#           print "no unwinding"
            break                   # quit if unwinding is exhausted
#       print 'cnt=' , uf.count , 'off=' , offs

    ##
    ## clean up on match mode or on no match possible
    ##

#   print "matched=",matched

    if not matched: return None     # no bindings

#   print text,offs

    ## consolidate contiguous bindings for subsequent substitutions

#   print "BEFORE consolidating consecutive bindings"
#   print "bd:",len(mbd)
#   print mbd[0]
#   print '----'
#   for b in mbd[1:]:
#       print b

    mbdo = mbd
    lb  = -1               # binding reference
    lbd = [ 0 , -1 ]       # sentinel value, not real binding
    mbd = [ lbd ]          # initialize with new offset after matching
    mbdo.pop(0)            # ignore empty binding
    while len(mbdo) > 0:   #
        bd = mbdo.pop(0)   # get next binding
        if len(bd) > 2:
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb = -1
        elif bd[0] < 0:    # check for optional match indicator here
            lb = -1        # if so, drop from new consolidated bindings
        elif lb == bd[0]:  # check for binding continuous with previous
            lb = bd[1]     #
            lbd[1] = lb    # if so, combine with previous binding
        else:              #
            mbd.append(bd) # otherwise, add new binding
            lbd = bd       #
            lb = bd[1]     #

    mbd[0] = offs          # replace start of bindings with text length matched

#   print "AFTER"
#   print "bd:",len(mbd)
#   print mbd[0]
#   print '----'
#   for b in mbd[1:]:
#       print b

    return mbd             # consolidated bindings plus new offset
Example #14
    def _lookUpNext(self):
        """
        look up possible next segments in input buffer by various means,
        keeping tokens only for the LONGEST segment

        arguments:
            self

        returns:
            True on successful lookup, False otherwise

        exceptions:
            ParseOverflow
        """

        self.sbu.skipSpaces()  # skip leading spaces
        s = self.sbu.buffer
        #       print ( '_lookUp@0 buffer=' , s )

        if len(s) == 0:  # check for end of input
            return False  # if so, done

#       print ( 'in =' , str(self.sbu) )
        if self.trs != None:  # preanalysis of number expressions
            self.trs.rewriteNumber(s)

#       print ( '_lookUp@1 buffer=' , self.sbu.buffer )
#       print ( 'macro expansion s[0]=' , s[0] )
        self.sbu.expand()  # apply macro substitutions
        #       print ( 'macro expanded  s[0]=' , s[0] )
        #       print ( '_lookUp@2 buffer=' , self.sbu.buffer )

        s = self.sbu.buffer

        #       print ( 'expanded len=' , len(s) )
        if len(s) == 0: return True  # macros can empty out buffer

        k = self.sbu.findBreak()  # find extent of first component for lookup
        if k == 0:
            k = 1  # must have at least one char in token

#       print ( 'break at k=' , k )
        kl = len(s)
        if k + 1 < kl and s[k] == '+' and s[k + 1] == ' ':
            k += 1  # recognize possible prefix

#       print ( 'len(s)=' , kl , 'k=' , k , 's=', s )

#       print ( '_lookUp@3 buffer=' , self.sbu.buffer )
        mr = self._scanText(k)  # text matching in various ways
        mx = mr[0]  # overall maximum match length
        chs = mr[1]  # any vocabulary element matched
        suf = mr[2]  # any suffix removed in matching
        #       print ( '_lookUp@4 buffer=' , self.sbu.buffer )
        s = self.sbu.buffer
        #       print ( 'k=' , k )
        #       print ( 'scan result mx=' , mx , 'chs=' , chs , 'suf=' , suf )
        #       print ( 'len(s)=' , len(s) , 's=' , s )

        # next word cannot produce token as long as already seen?
        if k < mx or (k == mx and suf != ''):

            #           print ( 'queue:' , len(self.ptr.queue) )
            #           print ( 'chs=' , chs )
            if len(chs) > 0:  # any vocabulary matches?
                #               print ( 'put back' , suf , mx , s )
                self.sbu.skip(mx)  # if so, they supersede
                if suf != '':  # handle any suffix removal
                    self.sbu.prepend(list(suf))
#                   print ( 'suf=' , suf )
            else:
                chs = self.sbu.extract(mx)

#           print ( 'extract chs=' , chs )
            to = ellyToken.EllyToken(chs)
            #           print ( 'token=' , str(to) )
            self.ctx.addTokenToListing(to)
            if suf != '':
                if not ellyChar.isApostrophe(suf[1]):
                    to.dvdd = True  # must note suffix removal for token!
#           print ( 'only queue:' , len(self.ptr.queue) )
            return True

#       print ( 'mx=' , mx )
#       print ( 'plus queue:' , len(self.ptr.queue) )
        wsk = self.sbu.buffer[:k]
        cap = ellyChar.isUpperCaseLetter(wsk[0])
        #       print ( 'wsk=' , wsk )
        rws = ''.join(wsk)
        found = self.ptr.createPhrasesFromDictionary(rws.lower(), False, cap)
        if not found:
            if k > mx and k > 1 and ellyChar.isEmbeddedCombining(rws[-1]):
                k -= 1
                rws = rws[:-1]
                found = self.ptr.createPhrasesFromDictionary(
                    rws.lower(), False, cap)
#       print ( rws , 'found in dictionary=' , found )
        if found or mx > 0:  # match found in dictionary or by text scan
            if not found:
                k = mx  # if by text scan, must make token longer
                rws = rws[:k]  # if mx > k
            self.sbu.skip(k)
            #           print ( 'next=' , self.sbu.buffer[self.sbu.index:] )
            #           print ( 'queue after =' , len(self.ptr.queue) )
            to = ellyToken.EllyToken(rws[:k])
            if len(suf) > 1:  # change token to show suffix properly
                #               print ( 'suf=' , suf )
                cs = suf[1]  # first char in suffix after '-'
                rt = to.root  # this is a list!
                lk = -1  # start at last char in token
                while rt[lk] != cs:
                    lk -= 1
                sn = len(rt) + lk  # where to divide suffix from root
                #               print ( 'sn=' , sn , rt )
                to.root = rt[:sn]  # root without suffix
                self.sbu.prepend(suf)  # restore suffix to input for processing
            else:  # no suffix
                chx = self.sbu.peek()  # look at next char after match
                if chx == '-':  # if hyphen, need to separate it
                    self.sbu.skip()
                    if ellyChar.isLetter(self.sbu.peek()):
                        self.sbu.prepend(' ')
                    self.sbu.prepend('-')
#           print ( 'add' , str(to) )
            self.ctx.addTokenToListing(to)  # add token to listing for sentence
            return True

#       print ( '[' + rws + ']' , 'still unrecognized' )

        chx = rws[0]  # special hyphen check
        if chx == '-' and k > 1:
            #           print ( 'look in  internal dictionary' )
            if self.ptr.createPhrasesFromDictionary(chx, False, False):
                #               print ( 'found!' )
                to = ellyToken.EllyToken(chx)  # treat hyphen as token
                self.ctx.addTokenToListing(to)  # add it to token list
                self.sbu.skip()  # remove from input
                return True

        to = self._extractToken(mx)  # single-word matching with analysis and lookup

        #       print ( 'extracted to=' , str(to) )
        if to == None:  # if no match, we are done and will return
            #           print ( 'mx=' , mx )
            return False if mx == 0 else True  # still success if _scanText() found something
        if self.ptr.lastph != None:
            self.ptr.lastph.lens = to.getLength()

#       print ( 'to=' , str(to) , 'len(s)=' , len(s) , s )
#       posn = self.ctx.countTokensInListing()
#       print ( 'at', posn , 'in token list' )
        self.ctx.addTokenToListing(to)  # add token to listing for sentence
        #       tol = self.ctx.getNthTokenInListing(-1)
        #       print ( 'last token root=' , tol.root )
        return True  # successful lookup
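The suffix re-division step inside _lookUpNext can be exercised on its own: scan backwards through the token root for the first char of the suffix (the one after its leading '-') and cut there. Because the suffix was just removed from the end of the token, the last occurrence of that char marks the divide point:

# the split from _lookUpNext in isolation
rt = list('walking')        # token root as a list, as in EllyToken
suf = '-ing'                # suffix reported by the scan
lk = -1
while rt[lk] != suf[1]:     # look for 'i' scanning backwards from the end
    lk -= 1
sn = len(rt) + lk           # where to divide suffix from root
print(''.join(rt[:sn]))     # -> 'walk'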
Example #15
    def _extractToken ( self , mnl ):

        """
        extract next token from input buffer and look up in grammar table

        arguments:
            self  -
            mnl   - minimum length for any previous match

        returns:
            ellyToken on success, otherwise None

        exceptions:
            ParseOverflow
        """

        d = self.rul                        # grammar rule definitions

        tree = self.ptr                     # parse tree
        buff = self.sbu                     # input source

#       print 'start extraction'
        try:
            w = buff.getNext()              # extract next token
#           print 'got token=' , w
            ws = u''.join(w.root)
        except ellyException.StemmingError as e:
            print >> sys.stderr , 'FATAL error' , e
            sys.exit(1)
#       print 'extracted' , '['+ ws + ']'
        wcapzn = w.isCapitalized()
        wsplit = w.isSplit()

        found = False                       # recognition flag across all lookups
        wl = len(ws)
        if wl > mnl:
            found = self._simpleTableLookUp(ws,tree,wsplit,wcapzn) > 0
#           print 'found in external table=' , found

        if wl >= mnl:
            if ws in self.rul.gtb.dctn:     # look up internally
#               print '"' + ws + '" in dictionary'
                if tree.createPhrasesFromDictionary(ws,wsplit,wcapzn):
                    found = True

#       print 'found in internal dictionary=' , found
        if found:                           # if any success, we are done
            return w
        if mnl > 0:
            return None                     # defer to previous lookup

#       print 'affix logic:'
#       print d.man.pref
#       print d.man.suff
        dvdd = False
        if d.man.analyze(w):                # any analysis possible?
            root = u''.join(w.root)         # if so, get parts of analysis
            tan = w.pres + [ root ] + w.sufs
            if len(w.sufs) > 0:
                sx = w.sufs[-1]
                dvdd = not ellyChar.isApostrophe(sx[1])
#           print 'token analysis=' , tan
            while len(tan) > 0:             # and put back into input
                x = tan.pop()
                buff.prepend(x)
                buff.prepend(' ')
            w = buff.getNext()              # get token again with stemming and macros

#           print 'analyzed w=' , w

            ws = u''.join(w.root)

            if ws[-1] == '+':
#               print 'len(queue)=' , len(tree.queue)
                m = d.ptb.match(w.root,tree)
#               print 'root=' , w.root
#               print 'match=' , m
#               print 'len(queue)=' , len(tree.queue)
#               print 'char span=' , tree.lastph.lens
                if m > 0:
                    tree.lastph.bias = 2
                    found = True

#           print 'after found=' , found
            if len(ws) < mnl: return None   # external lookup?
            if self._simpleTableLookUp(ws,tree,False,wcapzn):  # external lookup
                found = True

            if ws in self.rul.gtb.dctn:     # internal lookup?
                if tree.createPhrasesFromDictionary(ws,wsplit,wcapzn):
                    found = True

        if found:                           # if any success, we are done
#           print 'token recognized'
            w.dvdd = dvdd
            return w

#       print 'still unrecognized token w=' , unicode(w)

        lws = len(ws)
        if lws > 1:                         # special handling of + or -
            if ws[0] == '+' and ws[-1] != '+':
#               print 'root=' , ws          # marks root with prefixes removed
                if self._simpleTableLookUp(ws[1:],tree) > 0:
                    return w
            if ws[0] == '-':
                w.shortenBy(lws-1)          # -X not recognized as suffix
#               print 'w=' , w              # try processing - separately
                cn = buff.peek()
                if ellyChar.isLetterOrDigit(cn):
                    buff.prepend(' ')
                buff.prepend(ws[1:])        # put back X for further analysis

        if self.pnc.match(w.root):          # check if next token is punctuation
#           print 'catg=' , self.pnc.catg , self.pnc.synf.hexadecimal()
            if tree.addLiteralPhrase(self.pnc.catg,self.pnc.synf):
                tree.lastph.lens = w.getLength()
                tree.lastph.krnl.semf.combine(self.pnc.semf)
#               print 'semf=' , self.pnc.semf
#               print 'lastph=' , tree.lastph
#           print 'punc w=' , unicode(w)
        else:
#           print 'must create UNKN leaf node'
            tree.createUnknownPhrase(w)     # unknown type as last resort
            tree.lastph.lens = len(ws)

        return w
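
The mnl argument carries the length of the best match already found by _scanText(), and _extractToken() only claims a token when its own lookup succeeds; otherwise it defers to the earlier match if there was one, or falls through to affix analysis. A condensed sketch of that contract (standalone illustration, not PyElly code):

def resolveLookup ( recognized , mnl ):
    # mirrors the branching above after the first round of lookups
    if recognized:
        return 'accept'        # table or dictionary lookup succeeded
    if mnl > 0:
        return 'defer'         # an earlier scan already matched enough
    return 'analyze'           # nothing to defer to: try affix logic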
Example #16
    def _scanText ( self , k ):

        """
        try to match in buffer regardless of word boundaries
        using Elly vocabulary, pattern, and template tables and
        also running Elly entity extractors

        arguments:
            self  -
            k     - length of first component in buffer

        returns:
            match parameters [ text span of match , vocabulary match , suffix removed ]

        exceptions:
            ParseOverflow
        """

#       print '_scanText k=' , k
        sb = self.sbu.buffer           # input buffer
        tr = self.ptr                  # parse tree for results

#       print '_scanText sb=' , sb
                                       # initialize match status
        nspan = 0                      #   total span of match
        vmchs = [ ]                    #   chars of vocabulary entry matched
        suffx = ''                     #   any suffix removed in match

        d = self.rul                   # grammar rule definitions

        m = d.ptb.match(sb,tr)         # try token by pattern match next
#       print 'pattern m=' , m
        if  nspan < m:
            nspan = m                  # on longer match, update maximum

        m = d.ctb.match(sb,tr)         # try multi-word template  match next
#       print 'template m=' , m
        if  nspan < m:
            nspan = m                  # on longer match, update maximum

        m = self.iex.run(sb)           # try entity extractors next
#       print 'extractor m=' , m
        if  nspan < m:
            nspan = m                  # on longer match, update maximum

#       lm = len(sb)                   # scan limit
#       print 'lm=' , lm , 'm=' , m
        capd = ellyChar.isUpperCaseLetter(sb[0])
#       print 'next component=' , sb[:k] , ', context=' , sb[k:lm]

        if self.vtb != None:           # look in external dictionary, if it exists
            ls = list(sb[:k])
#           print 'ls 0=' , ls
            ellyChar.toLowerCaseASCII(ls)
            ss = u''.join(ls)                   # where to start for vocabulary indexing
#           print 'ls 1=' , ls
            n = vocabularyTable.delimitKey(ss)  # get actual indexing
#           print 'delimiting n=' , n , '=' , '<' + ss[:n] + '>'
#           print vocabularyTable.listDBKeys(self.vtb.cdb)

            rl = self.vtb.lookUp(sb,n) # get list of the maximum text matches
#           print len(rl) , 'matches'
            if len(rl) > 0:            #
                r0 = rl[0]             # look at first record
#               print 'r0=' , r0
                vmln = r0.nspan        # should be same for all matches
                vchs = r0.vem.chs      #
                vsfx = r0.suffx        #
#               print 'nspan=' , vmln , vsfx

                if ( vmln > nspan or
                     vmln == nspan and vsfx == '' ):

                    nspan = vmln       # keep vocabulary matches
                    vmchs = vchs       #
                    suffx = vsfx       #

                    for r in rl:
                        ve = r.vem     # get vocabulary entry
#                       print 've=' , ve
#                       if ve.gen != None: print 've.gen=' , ve.gen
                        if tr.addLiteralPhraseWithSemantics(
                                ve.cat,ve.syf,ve.smf,ve.bia,ve.gen,len(suffx) > 0):
                            tr.lastph.lens = nspan  # char length of leaf phrase node
                                                    # needed for later selection
                            tr.lastph.krnl.cncp = ve.con
                            if capd:
                                tr.lastph.krnl.semf.set(0)
#                           print 'vocab phr=' , tr.lastph , 'len=' , tr.lastph.lens
                            if suffx != '':
                                if ellyChar.isApostrophe(suffx[1]):
                                    tr.lastph.krnl.usen = 0

#               print 'vocabulary m=' , vmln
#               print 'queue after table lookup:' , len(self.ptr.queue)

#           print 'sb=' , sb

#       print 'maximum match=' , nspan
#       print 'input=' , self.sbu.buffer[:nspan]

        if nspan > 0:                  # any matches at all?
            tr.requeue()               # if so, keep only longest of them
#       print 'queue after scan:' , len(self.ptr.queue)

#       print 'returns [' , nspan , ',' , vmchs , ',' , suffx , ']'
        return [ nspan , vmchs , suffx ]
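
Note the tie-breaking rule applied to vocabulary matches above: a vocabulary entry supersedes an equal-length pattern, template, or extractor match only when no suffix had to be removed. As a standalone predicate (a sketch of the vmln comparison in the code):

def vocabWins ( vmln , vsfx , nspan ):
    # vocabulary match wins if strictly longer, or equally long but
    # exact (no suffix stripped during matching)
    return vmln > nspan or (vmln == nspan and vsfx == '')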
Example #17
    def _lookUpNext ( self ):

        """
        look up possible next segments in input buffer by various means,
        keeping tokens only for the LONGEST segment

        arguments:
            self

        returns:
            True on successful lookup, False otherwise

        exceptions:
            ParseOverflow
        """

        self.sbu.skipSpaces()          # skip leading spaces
        s = self.sbu.buffer
#       print '_lookUp@0 buffer=' , s

        if len(s) == 0:                # check for end of input
            return False               # if so, done

#       print 'in =' , unicode(self.sbu)
        if self.trs != None:           # preanalysis of number expressions
            self.trs.rewriteNumber(s)

#       print '_lookUp@1 buffer=' , self.sbu.buffer
#       print 'macro expansion s[0]=' , s[0]
        self.sbu.expand()              # apply macro substitutions
#       print 'macro expanded  s[0]=' , s[0]
#       print '_lookUp@2 buffer=' , self.sbu.buffer

        s = self.sbu.buffer

#       print 'expanded len=' , len(s)
        if len(s) == 0: return True    # macros can empty out buffer

        k = self.sbu.findBreak()       # find extent of first component for lookup
        if k == 0:
            k = 1                      # must have at least one char in token

#       print 'break at k=' , k
        kl = len(s)
        if  k + 1 < kl and s[k] == '+' and s[k+1] == ' ':
            k += 1                     # recognize possible prefix

#       print 'len(s)=' , kl , 'k=' , k , 's=', s

#       print '_lookUp@3 buffer=' , self.sbu.buffer
        mr = self._scanText(k)         # text matching in various ways
        mx  = mr[0]                    # overall maximum match length
        chs = mr[1]                    # any vocabulary element matched
        suf = mr[2]                    # any suffix removed in matching
#       print '_lookUp@4 buffer=' , self.sbu.buffer
        s = self.sbu.buffer
#       print 'k=' , k
#       print 'scan result mx=' , mx , 'chs=' , chs , 'suf=' , suf
#       print 'len(s)=' , len(s) , 's=' , s

        if ( k < mx or
             k == mx and suf != '' ):  # next word cannot produce token as long as already seen?

#           print 'queue:' , len(self.ptr.queue)
#           print 'chs=' , chs
            if len(chs) > 0:           # any vocabulary matches?
#               print 'put back' , suf , mx , s
                self.sbu.skip(mx)      # if so, they supersede
                if suf != '':          # handle any suffix removal
                    self.sbu.prepend(list(suf))
#                   print 'suf=' , suf
            else:
                chs = self.sbu.extract(mx)
#               print 'extracted chs=' , chs
#           print 'token chs=' , chs
            to = ellyToken.EllyToken(chs)
#           print 'long token=' , unicode(to)
            self.ctx.addTokenToListing(to)
            if suf != '':
                if not ellyChar.isApostrophe(suf[1]):
                    to.dvdd = True     # must note suffix removal for token!
#           print 'only queue:' , len(self.ptr.queue)
            return True

#       print 'mx=' , mx
#       print 'plus queue:' , len(self.ptr.queue)
        wsk = self.sbu.buffer[:k]
        cap = ellyChar.isUpperCaseLetter(wsk[0])
#       print 'wsk=' , wsk
        rws = u''.join(wsk)
        found = self.ptr.createPhrasesFromDictionary(rws.lower(),False,cap)
        if not found:
#           print 'not found, k=' , k
            if k > mx and k > 1 and ellyChar.isEmbeddedCombining(rws[-1]):
                k -= 1
                rws = rws[:-1]
                found = self.ptr.createPhrasesFromDictionary(rws.lower(),False,cap)
#       print 'found in dictionary=' , found
        if found or mx > 0:            # match found in dictionary or by text scan
            if not found:
                k = mx                 # if matched only by text scan, token
                rws = rws[:k]          # must cover exactly the scanned span
            self.sbu.skip(k)
#           print 'next=' , self.sbu.buffer[self.sbu.index:]
#           print 'queue after =' , len(self.ptr.queue)
            to = ellyToken.EllyToken(rws[:k])
            if len(suf) > 1:           # change token to show suffix properly
#               print 'suf=' , suf
                cs = suf[1]            # first char in suffix after '-'
                rt = to.root           # this is a list!
                lk = -1                # start at last char in token
                while rt[lk] != cs: lk -= 1
                sn = len(rt) + lk      # where to divide suffix from root
#               print 'sn=' , sn , rt
                to.root = rt[:sn]      # root without suffix
                self.sbu.prepend(suf)  # restore suffix to input for processing
            else:                      # no suffix
                chx = self.sbu.peek()  # look at next char after match
                if chx == '-':         # if hyphen, need to separate it
                    self.sbu.skip()
                    if ellyChar.isLetter(self.sbu.peek()):
                        self.sbu.prepend(' ')
                    self.sbu.prepend('-')
#           print 'add' , unicode(to)
            self.ctx.addTokenToListing(to)  # add token to listing for sentence
            return True

#       print '[' + rws + ']' , 'still unrecognized'

        chx = rws[0]                   # special hyphen check
        if chx == '-' and k > 1:
#           print 'look in  internal dictionary'
            if self.ptr.createPhrasesFromDictionary(chx,False,False):
#               print 'found!'
                to = ellyToken.EllyToken(chx)  # treat hyphen as token
                self.ctx.addTokenToListing(to) # add it to token list
                self.sbu.skip()                # remove from input
                return True

        to = self._extractToken(mx)    # single-word matching with analysis and lookup

#       print 'extracted to=' , unicode(to)
        if to == None:                 # if no match, we are done and will return
#           print 'mx=' , mx
            return False if mx == 0 else True  # still success if _scanText() found something
        if self.ptr.lastph != None:
            self.ptr.lastph.lens = to.getLength()

#       print 'to=' , unicode(to) , 'len(s)=' , len(s) , s
#       posn = self.ctx.countTokensInListing()
#       print 'at', posn , 'in token list'
        self.ctx.addTokenToListing(to) # add token to listing for sentence
#       tol = self.ctx.getNthTokenInListing(-1)
#       print 'last token root=' , tol.root
        return True                    # successful lookup
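
The suffix re-division step in this method walks backward through the token root to locate the first character of the stripped suffix (the one after its leading '-') and cuts the root there. The same computation in isolation, keeping the list-of-chars convention of the code (sketch only):

def divideRoot ( rt , suf ):
    # rt = token root as a char list; suf = stripped suffix like '-ing'
    cs = suf[1]                # first suffix char after the '-'
    lk = -1                    # scan backward from end of root
    while rt[lk] != cs:
        lk -= 1
    sn = len(rt) + lk          # index where the suffix starts in rt
    return rt[:sn]             # root with suffix removed

# e.g. divideRoot(list('running'),'-ing') gives list('runn')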
Example #18
def match ( patn , text , offs=0 , limt=None ):

    """
    compare a pattern with wildcards to input text

    arguments:
        patn  - pattern to be matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit of matching

    returns:
        bindings if match is successful, None otherwise
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - of unwinding (0=optional, 1=* wildcard)
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1

        def __unicode__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self
            returns:
                attributes formatted as a bracketed string
            """
            return ( '[kd=' + unicode(self.kind)  +
                     ',ct=' + unicode(self.count) +
                     ',pa=' + unicode(self.pats)  +
                     ',tx=' + unicode(self.txts)  +
                     ',bd=' + unicode(self.bnds)  + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # three private functions using local variables of match()
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame
        arguments:
            ns  - optional initial span of text for binding
        returns:
            binding frame
        """
#       print "binding:",offs,ns
        os = offs - 1
        if ns == None: ns = 1 # by default, binding is to 1 char
        if mbi == len(mbd):   # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]         # next available record
        bf[0] = os            # set binding to range of chars
        bf[1] = os + ns       #
        return bf

    def _modify ( ):
        """
        set special tag for binding
        arguments:
            (none - operates on enclosing locals of match)
        """
        mbd[mbi].append(None)

    def _mark ( kind ):
        """
        set up for backing up pattern match
        arguments:
            kind  - 0=optional 1=* wildcard
        returns:
            unwinding frame
        """
        if unj == len(unw): # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]       # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp
        uf.txts  = offs
        uf.bnds  = mbi
        return uf

    def _span ( typw ):
        """
        count chars available for wildcard match
        arguments:
            typw - wildcard
        returns:
            non-negative count if any match possible, otherwise -1
        """
        k = minMatch(patn[mp:])  # calculate min char count to match rest of pattern

#       print "exclude=",k,"@",offs

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findBreak(text,offs) - k # max span reduced by exclusion
        if mx < 0: return -1                   # cannot match if max span < 0

        tfn = Matching[typw]                   # char type matching a wildcard

#       print "text at",offs,"maximum wildcard match=",mx

        nm = 0
        for i in range(mx):
            c = text[offs+i]                   # next char in text from offset
            if not tfn(c): break               # stop when it fails to match
            nm += 1

#       print "maximum wildcard span=",nm

        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False  # successful pattern match?

    if limt == None: limt = len(text)

    mp = 0           # pattern index
    ml = len(patn)   # pattern match limit

#   print text[offs:limt],":",list(patn)

    while True:

        ## literally match as many next chars as possible

        while mp < ml:
            if offs >= limt:
                last = ''
            else:
                last = text[offs].lower()
                offs += 1
#           print 'matching last=' , last , 'at' , offs
            if patn[mp] != last: break
            mp += 1

        ## check whether mismatch is due to special pattern char

#       print 'pat',mp,"<",ml
#       print "txt @",offs

        if mp >= ml:        # past end of pattern?
            matched = True  # if so, match is made
            break

        tc = patn[mp]       # otherwise, get unmatched pattern element
        mp += 1             #
#       print "tc=",ord(tc)

        if tc == cALL:   # a * wildcard?

#           print "ALL last=< " + last + " >"
            if last != '': offs -= 1

            nm = _span(cALL)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1  # get new binding record
            bf[0] = offs              # bind from current offset
            offs += nm                # move offset past end of span
            bf[1] = offs              # bind to   new     offset
#           print "offs=",offs
            uf = _mark(1); unj += 1   # get new unwinding record
            uf.count = nm             # can back up this many times on mismatch
            continue

        elif tc == cEND: # end specification
#           print "END $:",last
            if last == '':
                continue
            elif last in [ '.' , ',' , '-' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]
                if ellyChar.isWhiteSpace(txc) or txc in Trmls:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' ]:
                offs -= 1
                continue

        elif tc == cANY: # alphanumeric wildcard?
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cDIG: # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF: # letter wildcard?
#           print "ALF:",last,offs
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC: # space wildcard?
#           print "SPC:"
            if last != '' and ellyChar.isWhiteSpace(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cAPO: # apostrophe wildcard?
#           print "APO: last=" , last
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS:
#           print "SOS"
#           print last,'@',offs
            mf = _bind(0); mbi += 1   # dummy record to block
            mf[0] = -1                #   later binding consolidation
            if last != '':
                offs -= 1             # try for rematch
            m = mp                    # find corresponding EOS
            while m < ml:             #
                if patn[m] == cEOS: break
                m += 1
            else:                     # no EOS?
                m -= 1                # if so, pretend there is one anyway
            uf = _mark(0); unj += 1   # for unwinding on any later match failure
            uf.pats = m + 1           # i.e. one char past next EOS
            uf.txts = offs            # start of text before optional match
            continue

        elif tc == cEOS:
#           print "EOS"
            if last != '':
                offs -= 1             # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:
            if last != '':            # still more to match?
                offs -= 1
                nm = _span(tc)        # maximum match possible
#               print 'spanning=' , nm
                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs      # bind from current offset
                    offs += nm        # move offset past end of span
                    bf[1] = offs      # bind to   new     offset
                    uf = _mark(1); unj += 1
                    uf.count = nm - 1 # at least one char must be matched
                    continue

        elif tc == '':
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True        # successful match
                break

        ## match failure: rewind to last match branch

#       print "fail - unwinding",unj

        while unj > 0:               # try unwinding, if possible
#           print "unw:",unj
            uf = unw[unj-1]          # get most recent unwinding record
#           print uf
            if uf.count <= 0:        # if available count is used up,
                unj -= 1             # go to next unwinding record
                continue
            uf.count -= 1            # decrement available count
            uf.txts -= uf.kind       # back up one char for scanning text input
            mp = uf.pats             # unwind pattern pointer
            offs = uf.txts           # unwind text input
            mbi = uf.bnds            # restore binding
            mbd[mbi-1][1] -= uf.kind # reduce span of binding if for wildcard
            break
        else:
#           print "no unwinding"
            break                   # quit if unwinding is exhausted

    ##
    ## clean up on match mode or on no match possible
    ##

#   print "matched=",matched

    if not matched: return None     # no bindings

#   print text,offs

    ## consolidate contiguous bindings for subsequent substitutions

#   print "BEFORE consolidating"
#   print "bd:",len(mbd)
#   for b in mbd:
#       print b

    mbdo = mbd
    lb  = -1               # binding reference
    lbd = [ 0 , -1 ]       # sentinel value, not real binding
    mbd = [ lbd ]          # initialize with new offset after matching
    mbdo.pop(0)            # ignore empty binding
    while len(mbdo) > 0:   #
        bd = mbdo.pop(0)   # get next binding
        if len(bd) > 2:
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb = -1
        elif bd[0] < 0:    # check for optional match indicator here
            lb = -1        # if so, drop from new consolidated bindings
        elif lb == bd[0]:  # check for binding continuous with previous
            lb = bd[1]     #
            lbd[1] = lb    # if so, combine with previous binding
        else:              #
            mbd.append(bd) # otherwise, add new binding
            lbd = bd       #
            lb = bd[1]     #

    mbd[0] = offs          # replace start of bindings with text length matched

#   print "AFTER"
#   print "bd:",len(mbd)
#   for b in mbd:
#       print b

    return mbd             # consolidated bindings plus new offset
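
A hedged usage sketch for match(): with a purely literal pattern, no wildcard bindings are produced, and a successful call returns a list whose first element is the count of characters matched (the wildcard constants cALL, cANY, and the rest are assumed to be defined elsewhere in this module, as the code above implies):

bnds = match(list('cat'), list('cats'))
if bnds != None:
    print(bnds[0])             # 3 = span of text consumed by the match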
Example #19
    def _extractToken(self, mnl):
        """
        extract next token from input buffer and look up in grammar table

        arguments:
            self  -
            mnl   - minimum length for any previous match

        returns:
            ellyToken on success, otherwise None

        exceptions:
            ParseOverflow
        """

        d = self.rul  # grammar rule definitions

        tree = self.ptr  # parse tree
        buff = self.sbu  # input source

        #       print ( 'start extraction' )
        try:
            w = buff.getNext()  # extract next token
            #           print ( 'got token=' , w )
            ws = ''.join(w.root)
        except ellyException.StemmingError as e:
            print ( 'FATAL error' , e , file=sys.stderr )
            sys.exit(1)
#       print ( 'extracted' , '['+ ws + ']' )
        wcapzn = w.isCapitalized()
        wsplit = w.isSplit()

        wl = len(ws)
        found = False  # default until some lookup succeeds
        if wl > mnl:
            found = self._simpleTableLookUp(ws, tree, wsplit, wcapzn) > 0
#           print ( 'found in external table=' , found )

        if wl >= mnl:
            if ws in self.rul.gtb.dctn:  # look up internally
                #               print ( '"' + ws + '" in dictionary' )
                if tree.createPhrasesFromDictionary(ws, wsplit, wcapzn):
                    found = True

#       print ( 'found in internal dictionary=' , found )
        if found:  # if any success, we are done
            return w
        if mnl > 0:
            return None  # defer to previous lookup

#       print ( 'affix logic:' )
#       print ( d.man.pref )
#       print ( d.man.suff )
        dvdd = False
        if d.man.analyze(w):  # any analysis possible?
            root = ''.join(w.root)  # if so, get parts of analysis
            tan = w.pres + [root] + w.sufs
            if len(w.sufs) > 0:
                sx = w.sufs[-1]
                dvdd = not ellyChar.isApostrophe(sx[1])
#           print ( 'token analysis=' , tan )
            while len(tan) > 0:  # and put back into input
                x = tan.pop()
                buff.prepend(x)
                buff.prepend(' ')
            w = buff.getNext()  # get token again with stemming and macros

            #           print ( 'analyzed w=' , w )

            ws = ''.join(w.root)

            if ws[-1] == '+':
                #               print ( 'len(queue)=' , len(tree.queue) )
                m = d.ptb.match(w.root, tree)
                #               print ( 'root=' , w.root )
                #               print ( 'match=' , m )
                #               print ( 'len(queue)=' , len(tree.queue) )
                #               print ( 'char span=' , tree.lastph.lens )
                if m > 0:
                    tree.lastph.bias = 2
                    found = True

#           print ( 'after found=' , found )
            if len(ws) < mnl: return None  # external lookup?
            if self._simpleTableLookUp(ws, tree, False, wcapzn):  # external lookup
                found = True

            if ws in self.rul.gtb.dctn:  # internal lookup?
                if tree.createPhrasesFromDictionary(ws, wsplit, wcapzn):
                    found = True

        if found:  # if any success, we are done
            #           print ( 'token recognized' )
            w.dvdd = dvdd
            return w

#       print ( 'still unrecognized token w=' , str(w) )

        lws = len(ws)
        if lws > 1:  # special handling of + or -
            if ws[0] == '+' and ws[-1] != '+':
                #               print ( 'root=' , ws )      # marks root with prefixes removed
                if self._simpleTableLookUp(ws[1:], tree) > 0:
                    return w
            if ws[0] == '-':
                w.shortenBy(lws - 1)  # -X not recognized as suffix
                #               print ( 'w=' , w )          # try processing - separately
                cn = buff.peek()
                if ellyChar.isLetterOrDigit(cn):
                    buff.prepend(' ')
                buff.prepend(ws[1:])  # put back X for further analysis

        if self.pnc.match(w.root):  # check if next token is punctuation
            #           print ( 'catg=' , self.pnc.catg , self.pnc.synf.hexadecimal() )
            if tree.addLiteralPhrase(self.pnc.catg, self.pnc.synf):
                tree.lastph.lens = w.getLength()
                tree.lastph.krnl.semf.combine(self.pnc.semf)
#               print ( 'semf=' , self.pnc.semf )
#               print ( 'lastph=' , tree.lastph )
#           print ( 'punc w=' , str(w) )
        else:
            #           print ( 'must create UNKN leaf node' )
            tree.createUnknownPhrase(w)  # unknown type as last resort
            tree.lastph.lens = len(ws)

        return w
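
The dvdd flag computed from w.sufs[-1] distinguishes genuine suffix division from contraction handling: a stripped ending like -ing marks the token as divided, while an apostrophe ending such as -'s does not. The check as a standalone predicate (sketch; assumes PyElly's ellyChar module is importable):

import ellyChar

def wasDivided ( suf ):
    # suf is a stripped ending such as "-ing" or "-'s";
    # the character right after the '-' decides
    return not ellyChar.isApostrophe(suf[1])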