Example #1
def finAPO ( ss , sp ):

    """
    handle final apostrophes

    arguments:
        ss  - character stream
        sp  - last char of word in stream
    """

    lss = len(ss)
#   print 'finAPO lss=' , lss , ss[sp:]
    if lss > sp + 1:
#       print 'ending=' , ss[sp:]
        if ellyChar.isApostrophe(ss[sp+1]):
            if lss > sp + 2:
#               print 'ss=' , ss[sp:]
                if ss[sp+2].lower() == 's':
                    if terminate(ss,sp+3,lss):
                        sp += 1
#                       print 'sp=' , sp
                        ss[sp] = "'"
                        return
            if ss[sp].lower() == 's' and terminate(ss,sp+2,lss):
                sp += 1
                ss[sp] = "'"   # assumed: normalize to ASCII apostrophe, as in the 'S case
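
#   Effect sketch (traced from the logic above; assumes terminate() treats a
#   following space as a word end): for ss = list("dog's "), sp = 2, the char
#   at index 3 is rewritten as a plain ASCII apostrophe; for ss = list("dogs' "),
#   sp = 3, the char at index 4 is normalized the same way. Only ss changes;
#   the local increment of sp is not visible to the caller.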
Example #2
    def getRules ( self , a ):

        """
        get appropriate macros for text with specified starting char

        arguments:
            self  -
            a     - first letter of current buffer contents (NOT space!)

        returns:
            a list of unpacked macro rules to try out
        """

#       print 'getRules(a=' , a , ')'
        if a == '': return [ ]
        if ellyChar.isLetterOrDigit(a):
            k = ellyChar.toIndex(a)
            ls = self.index[k]
#           print 'index a=' , a , 'k=' , k
            ws = self.letWx if ellyChar.isLetter(a) else self.digWx
            uniqueAdd(ls,ws)
            uniqueAdd(ls,self.anyWx)
        elif ellyChar.isApostrophe(a):
            ls = self.apoWx
        else:
            ls = self.index[0]
            uniqueAdd(ls,self.anyWx)
#       print len(ls) , ' rules to check'
        return [ r.unpack() for r in ls ]
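
#   Dispatch sketch (cases as coded above):
#       getRules('a')  -> index bucket for 'a' plus letWx and anyWx rules
#       getRules('9')  -> index bucket for '9' plus digWx and anyWx rules
#       getRules("'")  -> apoWx rules only
#       getRules('%')  -> catchall index[0] plus anyWx rules
#   each returned element is a macro rule already unpacked for trial.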
Example #3
def finAPO(ss, sp):
    """
    handle final apostrophes

    arguments:
        ss  - character stream
        sp  - last char of word in stream
    """

    lss = len(ss)
    #   print 'finAPO lss=' , lss , ss[sp:]
    if lss > sp + 1:
        #       print 'ending=' , ss[sp:]
        if ellyChar.isApostrophe(ss[sp + 1]):
            if lss > sp + 2:
                #               print 'ss=' , ss[sp:]
                if ss[sp + 2].lower() == 's':
                    if terminate(ss, sp + 3, lss):
                        sp += 1
                        #                       print 'sp=' , sp
                        ss[sp] = "'"
                        return
            if ss[sp].lower() == 's' and terminate(ss, sp + 2, lss):
                sp += 1
                ss[sp] = "'"  # assumed: normalize to ASCII apostrophe, as in the 'S case
Example #4
def delimitKey ( t ):

    """
    get part of term for vocabulary table indexing that
    ends in alphanumeric or is a single nonalphanumeric
    with special stripping of 'S at the end

    arguments:
        t  - text string to scan

    returns:
        count of chars to put into search key
    """

    ln = len(t)                   # number of chars in input text
    if ln == 0: return 0
    n = t.find(' ')               # find rough range of key for SQLite in text
    if n < 0: n = ln              # if undivided by spaces, take everything
    n -= 1                        # index of last char in range
    while n > 0:                  # scan input text backwards
        c = t[n]                  # check char for alphanumeric
        if ellyChar.isLetterOrDigit(c):
#           print 'n=' , n , 'c=' , c
            if n > 1:             # check for 'S as special case!
                if ( c in [ 's' , 'S' ] and
                     ellyChar.isApostrophe(t[n-1]) ):
#                   print 'drop \'S from SQLite key'
                    n -= 1
                else:
                    break
            else:
                break
        n -= 1                    # continue scanning backwards
    return n + 1                  # to get key length ending in alphanumeric
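
#   Usage sketch (return values traced from the scan above):
#       delimitKey("dog's house")   # -> 3 , 'S stripped, key is "dog"
#       delimitKey("cat toy")       # -> 3 , key ends at last alphanumeric
#       delimitKey("a")             # -> 1 , single char taken whole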
Example #5
    def getRules(self, a):
        """
        get appropriate macros for text with specified starting char

        arguments:
            self  -
            a     - first letter of current buffer contents (NOT space!)

        returns:
            a list of unpacked macro rules to try out
        """

        #       print ( 'getRules(a=' , a , ')' )
        if a == '': return []
        if ellyChar.isLetterOrDigit(a):
            k = ellyChar.toIndex(a)
            ls = self.index[k]
            #           print ( 'index a=' , a , 'k=' , k )
            ws = self.letWx if ellyChar.isLetter(a) else self.digWx
            uniqueAdd(ls, ws)
            uniqueAdd(ls, self.anyWx)
        elif ellyChar.isApostrophe(a):
            ls = self.apoWx
        else:
            ls = self.index[0]
            uniqueAdd(ls, self.anyWx)
#       print ( len(ls) , ' rules to check' )
        return [r.unpack() for r in ls]
Example #6
    def simplify(self, strg):
        """
        apply inflectional stemming to string

        arguments:
           self  -
           strg  - input Unicode string

        returns:
           stemmed Unicode string
        """

        if len(strg) < 4: return strg
        if strg[-1] == "s" and ellyChar.isApostrophe(strg[-2]):
            return strg[:-2]
        else:
            t = ellyToken.EllyToken(strg)
            self.apply(t)
            return t.toUnicode()
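
#   Behavior sketch (first two cases direct from the code; the last assumes
#   the usual inflectional stemming of a final -S):
#       simplify("ox")      # -> "ox"  , too short to stem
#       simplify("dog's")   # -> "dog" , apostrophe-S dropped outright
#       simplify("dogs")    # -> "dog" , via EllyToken stemming (assumed)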
Example #7
def delimitKey ( t ):

    """
    get bounds of vocabulary table key for looking up a term
    starting at the front of a given text string
    with special stripping of 'S at the end

    arguments:
        t  - text string to scan

    returns:
        count of chars to take for search key
    """

    ln = len(t)                   # number of chars in input text
    if ln == 0: return 0
    if not ellyChar.isLetterOrDigit(t[0]): return 1

#   print 'delimitKey t=' , t

    k = t.find('-')               # find rough range of SQLite key in text
    n = t.find(' ')               # delimited by either a hyphen or a space
    if n < 0: n = ln              # if space, take everything
    if k > 1 and n > k: n = k     # hyphen delimits if it comes first
    n -= 1                        # index of last char of candidate key
#   print 'k=' , k , 'n=' , n

    while n > 0:                  # scan input text backwards
        c = t[n]                  # check char for alphanumeric
        if ellyChar.isLetterOrDigit(c):
#           print 'n=' , n , 'c=' , c
            if n > 1:             # check for 'S as special case!
                if ( c in [ 's' , 'S' ] and
                     ellyChar.isApostrophe(t[n-1]) ):
#                   print 'drop \'S from SQLite key'
                    n -= 1
                else:
                    break
            else:
                break
        n -= 1                    # continue scanning backwards
#   print 'key=' , t[:n+1]
    return n + 1                  # to get key length ending in alphanumeric
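
#   Usage sketch (return values traced from the scan above):
#       delimitKey("well-known fact")  # -> 4 , hyphen delimits before space
#       delimitKey("dog's house")      # -> 3 , 'S stripped as in the comments
#       delimitKey("'quote")           # -> 1 , leading nonalphanumeric alone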
Example #8
def delimitKey(t):
    """
    get bounds of vocabulary table key for looking up a term
    starting at the front of a given text string
    with special stripping of 'S at the end

    arguments:
        t  - text string to scan

    returns:
        count of chars to take for search key
    """

    ln = len(t)  # number of chars in input text
    if ln == 0: return 0
    if not ellyChar.isLetterOrDigit(t[0]): return 1

    #   print ( 'delimitKey t=' , t )

    k = t.find('-')  # find rough range of SQLite key in text
    n = t.find(' ')  # delimited by either a hyphen or a space
    if n < 0: n = ln  # if space, take everything
    if k > 1 and n > k: n = k  # hyphen delimits if it comes first
    n -= 1  # index of last char of candidate key
    #   print ( 'k=' , k , 'n=' , n )

    while n > 0:  # scan input text backwards
        c = t[n]  # check char for alphanumeric
        if ellyChar.isLetterOrDigit(c):
            #           print ( 'n=' , n , 'c=' , c )
            if n > 1:  # check for 'S as special case!
                if (c in ['s', 'S'] and ellyChar.isApostrophe(t[n - 1])):
                    #                   print ( 'drop \'S from SQLite key' )
                    n -= 1
                else:
                    break
            else:
                break
        n -= 1  # continue scanning backwards

    #   print ( 'key=' , t[:n+1] )
    return n + 1  # to get key length ending in alphanumeric
Example #9
    def simplify ( self , strg ):

        """
        apply inflectional stemming to string

        arguments:
           self  -
           strg  - input Unicode string

        returns:
           stemmed Unicode string
        """

        if len(strg) < 4: return strg
        if strg[-1] == "s" and ellyChar.isApostrophe(strg[-2]):
            return strg[:-2]
        else:
            t = ellyToken.EllyToken(strg)
            self.apply(t)
            return t.toUnicode()
Example #10
    def _scanText(self, k):
        """
        try to match in buffer regardless of word boundaries
        using Elly vocabulary, pattern, and template tables and
        also running Elly entity extractors

        arguments:
            self  -
            k     - length of first component in buffer

        returns:
            match parameters [ text span of match , vocabulary match , suffix removed ]

        exceptions:
            ParseOverflow
        """

        #       print ( '_scanText k=' , k )
        sb = self.sbu.buffer  # input buffer
        tr = self.ptr  # parse tree for results

        #       print ( '_scanText sb=' , sb )
        # initialize match status
        nspan = 0  #   total span of match
        vmchs = []  #   chars of vocabulary entry matched
        suffx = ''  #   any suffix removed in match

        d = self.rul  # grammar rule definitions

        m = d.ptb.match(sb, tr)  # try token by pattern match next
        #       print ( 'pattern m=' , m )
        if nspan < m:
            nspan = m  # on longer match, update maximum

        m = d.ctb.match(sb, tr)  # try multi-word template  match next
        #       print ( 'template m=' , m )
        if nspan < m:
            nspan = m  # on longer match, update maximum

        m = self.iex.run(sb)  # try entity extractors next
        #       print ( 'extractor m=' , m )
        if nspan < m:
            nspan = m  # on longer match, update maximum

#       print ( 'nspan=' , nspan, sb[:nspan] )

        lm = len(sb)  # scan limit
        #       print ( 'lm=' , lm , 'm=' , m )
        capd = ellyChar.isUpperCaseLetter(sb[0])
        #       print ( 'next component=' , sb[:k] , ', context=' , sb[k:lm] )

        if self.vtb != None:  # look in external dictionary, if it exists
            ls = list(sb[:k])
            #           print ( 'vtb ls 0=' , ls )
            ellyChar.toLowerCaseASCII(ls)
            ss = ''.join(ls)  # where to start for vocabulary indexing
            #           print ( 'vtb ls 1=' , ls )
            n = vocabularyTable.delimitKey(ss)  # get actual indexing
            #           print ( 'delimiting n=' , n , ':' , '<' + ss[:n] + '>' )
            #           print ( vocabularyTable.listDBKeys(self.vtb.cdb) )

            rl = self.vtb.lookUp(sb, n)  # get list of the maximum text matches
            #           print ( 'external matches=' , len(rl) )
            #           print ( 'input text=' , sb )

            if len(rl) > 0:  #
                r0 = rl[0]  # look at first record
                #               print ( 'r0=' , r0 )
                vmln = r0.nspan  # should be same for all matches
                vchs = r0.vem.chs  #
                vsfx = r0.suffx  #
                #               print ( 'nspan=' , vmln , vsfx )

                if (vmln > nspan or vmln == nspan and vsfx == ''):

                    nspan = vmln  # keep vocabulary matches
                    vmchs = vchs  #
                    suffx = vsfx  #

                    for r in rl:
                        ve = r.vem  # get vocabulary entry
                        #                       print ( 've=' , ve )
                        #                       if ve.gen != None: print ( 've.gen=' , ve.gen )
                        if tr.addLiteralPhraseWithSemantics(
                                ve.cat, ve.syf, ve.smf, ve.bia, ve.gen,
                                len(suffx) > 0):
                            tr.lastph.lens = nspan  # char length of leaf phrase node
                            # needed for later selection
                            tr.lastph.krnl.cncp = ve.con
                            if capd:
                                tr.lastph.krnl.semf.set(0)
#                           print ( 'vocab phr=' , tr.lastph , 'len=' , tr.lastph.lens )
                            if suffx != '':
                                if ellyChar.isApostrophe(suffx[1]):
                                    tr.lastph.krnl.usen = 0

#               print ( 'vocabulary m=' , vmln )
#               print ( 'queue after table lookup:' , len(self.ptr.queue) )

#           print ( 'vtb sb=' , sb )

#       print ( 'maximum match=' , nspan )
#       print ( 'next input=' , sb[:nspan] )

        if nspan > 0:  # any matches at all?
            tr.requeue()  # if so, keep only longest of them
#       print ( 'queue after scan:' , len(self.ptr.queue) )

#       print ( 'returns [' , nspan , ',' , vmchs , ',' , suffx , ']' )
        return [nspan, vmchs, suffx]
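
#       Precedence sketch (as coded above): pattern, template, and entity
#       extractor matches compete purely on span length; a vocabulary match
#       supersedes them only when strictly longer, or equally long with no
#       suffix removed; requeue() then keeps only the longest interpretation.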
Example #11
def match ( patn , text , offs=0 , limt=None , nsps=0 ):

    """
    compare a pattern with wildcards to input text

    arguments:
        patn  - pattern to be matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit for any matching
        nsps  - number of spaces to match in pattern

    returns:
        bindings if match is successful, None otherwise
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
            nsps   - saved count of spaces matched
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - kind of unwinding
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1
            self.nsps  = 0

        def __str__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self
            returns:
                attributes as array
            """
            return ( '[kd=' + str(self.kind)  +
                     ',ct=' + str(self.count) +
                     ',pa=' + str(self.pats)  +
                     ',tx=' + str(self.txts)  +
                     ',bd=' + str(self.bnds)  +
                     ',ns=' + str(self.nsps)  + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # four private functions using local variables of match() defined just above
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame
        arguments:
            ns  - optional initial span of text for binding
        returns:
            binding frame
        """
#       print ( "binding:",offs,ns )
        os = offs - 1
        if ns == None: ns = 1 # by default, binding is to 1 char
        if mbi == len(mbd):   # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]         # next available record
        bf[0] = os            # set binding to range of chars
        bf[1] = os + ns       #
        return bf

    def _modify ( ):
        """
        set special tag for binding
        arguments:

        """
        mbd[mbi].append(None)

    def _mark ( kind , nsp ):
        """
        set up for backing up pattern match
        arguments:
            kind  - 0=optional 1=* wildcard
            nsp   - number of spaces in pattern still unmatched
        returns:
            unwinding frame
        """
        if unj == len(unw): # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]       # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp
        uf.txts  = offs
        uf.bnds  = mbi
        uf.nsps  = nsp
        return uf

    def _span ( typw , nsp=0 ):
        """
        count chars available for wildcard match
        arguments:
            typw - wildcard
            nsp  - spaces to be matched in pattern
        returns:
            non-negative count if any match possible, otherwise -1
        """
#       print ( "_span: typw=" , '{:04x}'.format(ord(typw)) , deconvert(typw) )
#       print ( "_span: txt @",offs,"pat @",mp,"nsp=",nsp )
#       print ( "text to span:",text[offs:] )
#       print ( "pat rest=" , patn[mp:] )
        k = minMatch(patn[mp:])                # calculate min char count to match rest of pattern

#       print ( "exclude=",k,"chars from possible span for rest of pattern" )

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findExtendedBreak(text,offs,nsp)
#       print ( mx,"chars available to scan" )
        mx -= k                                # max span reduced by exclusion
        if mx < 0: return -1                   # cannot match if max span < 0

        tfn = Matching[typw]                   # matchup function for wildcard type

#       print ( "text at",offs,"maximum wildcard match=",mx )

        nm = 0
        for i in range(mx):
            c = text[offs+i]                   # next char in text from offset
#           print ( 'span c=' , c )
            if not tfn(c): break               # stop when it fails to match
            nm += 1

#       print ( "maximum wildcard span=",nm )

        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False  # successful pattern match yet?

    if limt == None: limt = len(text)

#   print ( 'starting match, limt=',limt,text[offs:limt],":",patn )
#   print ( 'nsps=' , nsps )

    mp = 0           # pattern index
    ml = len(patn)   # pattern match limit
    last = ''

    while True:

        ## literally match as many next chars as possible

#       print ( '---- loop mp=' , mp , 'ml=' , ml )
        while mp < ml:
            if offs >= limt:
#               print ( "offs=",offs,"limt=",limt )
                last = ''
            elif limt == 0:
                break
            else:
                last = text[offs]
                offs += 1
#           print ( 'patn=' , patn )
            mc = patn[mp]
#           print ( 'matching last=' , last, '(' , '{:04x}'.format(ord(last)) if last != '' else '-', ') at' , offs )
#           print ( 'against       ' , mc  , '(' , '{:04x}'.format(ord(mc)) , ')' )
            if mc != last:
                if mc != last.lower():
                    if mc == Hyphn and last == ' ' and limt - offs > 2:
#                       print ( 'hyphen special matching, limt=', limt , 'offs=' , offs )
#                       print ( 'text[offs:]=' , text[offs:] )
                        if text[offs] != Hyphn or text[offs+1] != ' ':
                            break
                        offs += 2
                    else:
#                       print ( 'no special matching of hyphen' )
                        break

#           print ( 'matched @mp=' , mp )
            mp += 1

        ## check whether mismatch is due to special pattern char

#       print ( 'pat @',mp,"<",ml )
#       print ( "txt @",offs,'<',limt,'last=',last )
#       print ( '@',offs,text[offs:] )

        if mp >= ml:        # past end of pattern?
            matched = True  # if so, match is made
            break

        tc = patn[mp]       # otherwise, get unmatched pattern element
        mp += 1             #
#       print ( "tc=",'{:04x}'.format(ord(tc)),deconvert(tc) )

        if tc == cALL:      # a * wildcard?

#           print ( "ALL last=< " + last + " >" )
            if last != '': offs -= 1

            nm = _span(cALL,nsps)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1     # get new binding record
            bf[0] = offs                 # bind from current offset
            offs += nm                   # move offset past end of span
            bf[1] = offs                 # bind to   new     offset
#           print ( "offs=",offs,'nm=',nm )
            uf = _mark(1,nsps); unj += 1 # get new unwinding record
            uf.count = nm                # can back up this many times on mismatch
            continue

        elif tc == cEND: # end specification
#           print ( "END $:",last )
            if last == '':
                continue
            elif last == '-':
                offs -= 1
                continue
            elif last in [ '.' , ',' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]
                if ellyChar.isWhiteSpace(txc) or txc in Trmls or txc in Grk:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' , ellyChar.HYPH ]:
                offs -= 1
                continue
            elif not ellyChar.isText(last):
                offs -= 1
                continue

        elif tc == cANY: # alphanumeric wildcard?
#           print ( "ANY:",last,offs )
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cCAN: # nonalphanumeric wildcard?
#           print ( 'at cCAN' )
            if last != ellyChar.AMP:
                if last == '' or not ellyChar.isLetterOrDigit(last):
                    _bind(); mbi += 1
                    continue

        elif tc == cDIG: # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF: # letter wildcard?
#           print ( "ALF:",last,offs )
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cUPR: # uppercase letter wildcard?
#           print ( "UPR:",last,'@',offs )
            if last != '' and ellyChar.isUpperCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cLWR: # lowercase letter wildcard?
#           print ( "LWR:",last,'@',offs )
            if last != '' and ellyChar.isLowerCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC: # space wildcard?
#           print ( "SPC:","["+last+"]" )
            if last != '' and ellyChar.isWhiteSpace(last):
                nsps -= 1
                _bind(); _modify(); mbi += 1
                continue
#           print ( 'NO space' )

        elif tc == cAPO: # apostrophe wildcard?
#           print ( "APO: last=" , last )
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS:
#           print ( "SOS" )
#           print ( last,'@',offs )
            mf = _bind(0); mbi += 1      # dummy record to block
            mf[0] = -1                   #   later binding consolidation
            if last != '':
                offs -= 1                # try for rematch
            m = mp                       # find corresponding EOS
            while m < ml:                #
                if patn[m] == cEOS: break
                m += 1
            else:                        # no EOS?
                m -= 1                   # if so, pretend there is one anyway
            uf = _mark(0,nsps); unj += 1 # for unwinding on any later match failure
            uf.pats = m + 1              # i.e. one char past next EOS
            uf.txts = offs               # start of text before optional match
            continue

        elif tc == cEOS:
#           print ( "EOS" )
            if last != '':
                offs -= 1                # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:
#           print ( 'spanning wildcard, offs=' , offs , 'last=(' + last + ')' )
            if last != '':               # still more to match?
                offs -= 1
#               print ( 'nsps=' , nsps )
#               print ( '@' , offs , text )
                nm = _span(tc,nsps)      # maximum match possible

#               print ( 'spanning=' , nm )
                if nm == 0:                             # compensate for findExtendedBreak peculiarity
                    if offs + 1 < limt and mp < ml:     # with closing ] or ) to be matched in pattern
                        if patn[mp] in Enc:             # from text input
                            nm += 1

#               print ( 'spanning=' , nm )
                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs      # bind from current offset
                    offs += nm        # move offset past end of span
                    bf[1] = offs      # bind to   new     offset
                    uf = _mark(1,nsps); unj += 1
                    uf.count = nm - 1 # at least one char must be matched
#                   print ( 'offs=' , offs )
                    last = text[offs] if offs < limt else ''
                    continue
#           print ( 'fail tc=' , deconvert(tc) )

        elif tc == '':
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True        # successful match
                break

        ## match failure: rewind to last match branch
        ##

#       print ( "fail - unwinding" , unj )

        while unj > 0:               # try unwinding, if possible
#           print ( "unw:",unj )
            uf = unw[unj-1]          # get most recent unwinding record
#           print ( uf )
            if uf.count <= 0:        # if available count is used up,
                unj -= 1             # go to next unwinding record
                continue
            uf.count -= 1            # decrement available count
            uf.txts -= uf.kind       # back up one char for scanning text input
            mp = uf.pats             # unwind pattern pointer
            offs = uf.txts           # unwind text input
            mbi = uf.bnds            # restore binding
            mbd[mbi-1][1] -= uf.kind # reduce span of binding if for wildcard
            nsps = uf.nsps           #
            break
        else:
#           print ( "no unwinding" )
            break                   # quit if unwinding is exhausted
#       print ( 'cnt=' , uf.count , 'off=' , offs )

    ##
    ## clean up on match mode or on no match possible
    ##

#   print ( "matched=",matched )

    if not matched: return None     # no bindings

#   print ( text,offs )

    ## consolidate contiguous bindings for subsequent substitutions

#   print ( "BEFORE consolidating consecutive bindings" )
#   print ( "bd:",len(mbd) )
#   print ( mbd[0] )
#   print ( '----' )
#   for b in mbd[1:]:
#       print ( b )

    mbdo = mbd
    lb  = -1               # binding reference
    lbd = [ 0 , -1 ]       # sentinel value, not real binding
    mbd = [ lbd ]          # initialize with new offset after matching
    mbdo.pop(0)            # ignore empty binding
    while len(mbdo) > 0:   #
        bd = mbdo.pop(0)   # get next binding
        if len(bd) > 2:
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb = -1
        elif bd[0] < 0:    # check for optional match indicator here
            lb = -1        # if so, drop from new consolidated bindings
        elif lb == bd[0]:  # check for binding continuous with previous
            lb = bd[1]     #
            lbd[1] = lb    # if so, combine with previous binding
        else:              #
            mbd.append(bd) # otherwise, add new binding
            lbd = bd       #
            lb = bd[1]     #

    mbd[0] = offs          # replace start of bindings with text length matched

#   print ( "AFTER" )
#   print ( "bd:",len(mbd) )
#   print ( mbd[0] )
#   print ( '----' )
#   for b in mbd[1:]:
#       print ( b )

    return mbd             # consolidated bindings plus new offset
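
#   Result sketch (from the consolidation above): on success, match() returns
#   a list whose element 0 is the new text offset after matching and whose
#   remaining elements are [start,end] index pairs bound by wildcards, with
#   contiguous pairs merged; on failure, it returns None.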
Example #12
    def doMatchUp ( self , vcs , txs ):

        """
        match current text with vocabulary entry, possibly removing final inflection
        (this method assumes English; override for other languages)

        arguments:
            self  -
            vcs   - vocabulary entry chars
            txs   - text chars to be matched

        returns:
            count of txs chars matched, 0 on mismatch
        """

#       print 'match up vcs=' , vcs
        self.endg = ''                       # default inflection
        lvc = len(vcs)
        ltx = len(txs)
        if ltx < lvc: return 0

        nr = icmpr(vcs,txs)                  # do match on lists of chars
#       print 'nr=' , nr , 'nt='
#       print 'txs=' , txs , 'ltx=' , ltx

        if nr == 0:                          # vocabulary entry fully matched?
            if ltx == lvc:
                return ltx                   # if no more text, done
            dnc = ltx - lvc                  # otherwise, check text past match
#           print 'dnc=' , dnc

            if ellyChar.isApostrophe(txs[lvc]):             # apostrophe check
                if dnc > 1 and txs[lvc+1] in [ 's' , 'S' ]: # 'S found?
                    if dnc == 2 or _terminate(txs[lvc+2]):
                        self.endg = '-\'s'                  #
                        return lvc + 2                      # if so, remove ending
                    return 0
                if txs[lvc-1] in [ 's' , 'S' ]:             # S' found?
                    if dnc == 1 or _terminate(txs[lvc+1]):
                        self.endg = '-\'s'                  # put in implied S!
                        return lvc + 1                      # if so, remove ending
                    return 0
                return lvc                                  # break at apostrophe
            if _terminate(txs[lvc]):
                return lvc                                  # successful match

#       alphanumeric follows match either full or partial
#       try inflectional stemming to align a match here

        if self.stm == None:
            return 0                     # if no stemmer, no match possible

        k = lvc - nr + 1                 # get extent of text to match against
        while k < ltx:
            if _terminate(txs[k]):
                break                    # find current end of text to match
            k += 1
        n = k - 1
        while n > 0:
            if _terminate(txs[n]):
                n += 1
                break                    # find start of stemming
            n -= 1

#       print 'k=' , k , 'n=' , n , 'nr=' , nr

        if k - n < nr:                   # check if stemming could help match
            return 0

        tc = txs[k-1]                    # last char at end of text to match
#       print 'tc=' , tc
        if tc != 's' and tc != 'd' and tc != 'g':
            return 0                     # only -S, -ED, and -ING checked

        tw = ''.join(txs[n:k])           # segment of text for stemming
        sw = self.stm.simplify(tw)       # inflectional stemming
#       print 'sw=' , sw , 'tw=' , tw
        if len(sw) + n != lvc:           # stemmed result should now align
            return 0                     #   with vocabulary entry

#       print 'nr=' , nr
        ns = 0 if nr == 0 else icmpr(vcs[-nr:],sw[-nr:]) # continue from previous match
#       print 'ns=' , ns
        if ns == 0:                      # mismatch gone?
            self.endg = ( '-s'   if tc == 's' else
                          '-ed'  if tc == 'd' else
                          '-ing' )       # indicate ending removed
#           print 'txs=' , txs
#           print 'ltx=' , ltx , 'endg=' , self.endg
            return k                     # successful match

        return 0                         # no match by default
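
#       Matching sketch (first two cases traced from the code; the last also
#       assumes the stemmer reduces "dogs" to "dog"):
#           vcs=list("dog")  , txs=list("dog's x")  # -> 5 , self.endg = "-'s"
#           vcs=list("dogs") , txs=list("dogs' x")  # -> 5 , implied S put in
#           vcs=list("dog")  , txs=list("dogs x")   # -> 4 , self.endg = "-s"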
Example #13
def match ( patn , text , offs=0 , limt=None , nsps=0 ):

    """
    compare a pattern with wildcards to input text

    arguments:
        patn  - pattern to be matched
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit for any matching
        nsps  - number of spaces to match in pattern

    returns:
        bindings if match is successful, None otherwise
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
            nsps   - saved count of spaces matched
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - kind of unwinding
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1
            self.nsps  = 0

        def __unicode__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self
            returns:
                attributes as array
            """
            return ( '[kd=' + unicode(self.kind)  +
                     ',ct=' + unicode(self.count) +
                     ',pa=' + unicode(self.pats)  +
                     ',tx=' + unicode(self.txts)  +
                     ',bd=' + unicode(self.bnds)  +
                     ',ns=' + unicode(self.nsps)  + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # four private functions using local variables of match() defined just above
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame
        arguments:
            ns  - optional initial span of text for binding
        returns:
            binding frame
        """
#       print "binding:",offs,ns
        os = offs - 1
        if ns == None: ns = 1 # by default, binding is to 1 char
        if mbi == len(mbd):   # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]         # next available record
        bf[0] = os            # set binding to range of chars
        bf[1] = os + ns       #
        return bf

    def _modify ( ):
        """
        set special tag for binding
        arguments:

        """
        mbd[mbi].append(None)

    def _mark ( kind , nsp ):
        """
        set up for backing up pattern match
        arguments:
            kind  - 0=optional 1=* wildcard
            nsp   - number of spaces in pattern still unmatched
        returns:
            unwinding frame
        """
        if unj == len(unw): # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]       # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp
        uf.txts  = offs
        uf.bnds  = mbi
        uf.nsps  = nsp
        return uf

    def _span ( typw , nsp=0 ):
        """
        count chars available for wildcard match
        arguments:
            typw - wildcard
            nsp  - spaces to be matched in pattern
        returns:
            non-negative count if any match possible, otherwise -1
        """
#       print "_span: typw=" , '{:04x}'.format(ord(typw)) , deconvert(typw)
#       print "_span: txt @",offs,"pat @",mp,"nsp=",nsp
#       print "text to span:",text[offs:]
#       print "pat rest=" , patn[mp:]
        k = minMatch(patn[mp:])                # calculate min char count to match rest of pattern

#       print "exclude=",k,"chars from possible span for rest of pattern"

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findExtendedBreak(text,offs,nsp)
#       print mx,"chars available to scan"
        mx -= k                                # max span reduced by exclusion
        if mx < 0: return -1                   # cannot match if max span < 0

        tfn = Matching[typw]                   # matchup function for wildcard type

#       print "text at",offs,"maximum wildcard match=",mx

        nm = 0
        for i in range(mx):
            c = text[offs+i]                   # next char in text from offset
#           print 'span c=' , c
            if not tfn(c): break               # stop when it fails to match
            nm += 1

#       print "maximum wildcard span=",nm

        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False  # successful pattern match yet?

    if limt == None: limt = len(text)

#   print 'starting match, limt=',limt,text[offs:limt],":",patn
#   print 'nsps=' , nsps

    mp = 0           # pattern index
    ml = len(patn)   # pattern match limit
    last = ''

    while True:

        ## literally match as many next chars as possible

#       print '---- loop mp=' , mp , 'ml=' , ml
        while mp < ml:
            if offs >= limt:
#               print "offs=",offs,"limt=",limt
                last = ''
            elif limt == 0:
                break
            else:
                last = text[offs]
                offs += 1
#           print 'patn=' , patn
            mc = patn[mp]
#           print 'matching last=' , last, '(' , '{:04x}'.format(ord(last)) if last != '' else '-', ') at' , offs
#           print 'against       ' , mc  , '(' , '{:04x}'.format(ord(mc)) , ')'
            if mc != last:
                if mc != last.lower():
                    if mc == Hyphn and last == ' ' and limt - offs > 2:
#                       print 'hyphen special matching, limt=', limt , 'offs=' , offs
#                       print 'text[offs:]=' , text[offs:]
                        if text[offs] != Hyphn or text[offs+1] != ' ':
                            break
                        offs += 2
                    else:
#                       print 'no special matching of hyphen'
                        break

#           print 'matched @mp=' , mp
            mp += 1

        ## check whether mismatch is due to special pattern char

#       print 'pat @',mp,"<",ml
#       print "txt @",offs,'<',limt,'last=',last
#       print '@',offs,text[offs:]

        if mp >= ml:        # past end of pattern?
            matched = True  # if so, match is made
            break

        tc = patn[mp]       # otherwise, get unmatched pattern element
        mp += 1             #
#       print "tc=",'{:04x}'.format(ord(tc)),deconvert(tc)

        if tc == cALL:      # a * wildcard?

#           print "ALL last=< " + last + " >"
            if last != '': offs -= 1

            nm = _span(cALL,nsps)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1     # get new binding record
            bf[0] = offs                 # bind from current offset
            offs += nm                   # move offset past end of span
            bf[1] = offs                 # bind to   new     offset
#           print "offs=",offs,'nm=',nm
            uf = _mark(1,nsps); unj += 1 # get new unwinding record
            uf.count = nm                # can back up this many times on mismatch
            continue

        elif tc == cEND: # end specification
#           print "END $:",last
            if last == '':
                continue
            elif last == '-':
                offs -= 1
                continue
            elif last in [ '.' , ',' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]
                if ellyChar.isWhiteSpace(txc) or txc in Trmls or txc in Grk:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' , ellyChar.HYPH ]:
                offs -= 1
                continue

        elif tc == cANY: # alphanumeric wildcard?
#           print "ANY:",last,offs
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cCAN: # nonalphanumeric wildcard?
#           print 'at cCAN'
            if last != ellyChar.AMP:
                if last == '' or not ellyChar.isLetterOrDigit(last):
                    _bind(); mbi += 1
                    continue

        elif tc == cDIG: # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF: # letter wildcard?
#           print "ALF:",last,offs
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cUPR: # uppercase letter wildcard?
#           print "UPR:",last,'@',offs
            if last != '' and ellyChar.isUpperCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cLWR: # lowercase letter wildcard?
#           print "LWR:",last,'@',offs
            if last != '' and ellyChar.isLowerCaseLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC: # space wildcard?
#           print "SPC:","["+last+"]"
            if last != '' and ellyChar.isWhiteSpace(last):
                nsps -= 1
                _bind(); _modify(); mbi += 1
                continue
#           print 'NO space'

        elif tc == cAPO: # apostrophe wildcard?
#           print "APO: last=" , last
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS:
#           print "SOS"
#           print last,'@',offs
            mf = _bind(0); mbi += 1      # dummy record to block
            mf[0] = -1                   #   later binding consolidation
            if last != '':
                offs -= 1                # try for rematch
            m = mp                       # find corresponding EOS
            while m < ml:                #
                if patn[m] == cEOS: break
                m += 1
            else:                        # no EOS?
                m -= 1                   # if so, pretend there is one anyway
            uf = _mark(0,nsps); unj += 1 # for unwinding on any later match failure
            uf.pats = m + 1              # i.e. one char past next EOS
            uf.txts = offs               # start of text before optional match
            continue

        elif tc == cEOS:
#           print "EOS"
            if last != '':
                offs -= 1                # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:
#           print 'spanning wildcard, offs=' , offs , 'last=(' + last + ')'
            if last != '':               # still more to match?
                offs -= 1
#               print 'nsps=' , nsps
#               print '@' , offs , text
                nm = _span(tc,nsps)      # maximum match possible

#               print 'spanning=' , nm
                if nm == 0:                             # compensate for findExtendedBreak peculiarity
                    if offs + 1 < limt and mp < ml:     # with closing ] or ) to be matched in pattern
                        if patn[mp] in Enc:             # from text input
                            nm += 1

#               print 'spanning=' , nm
                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs      # bind from current offset
                    offs += nm        # move offset past end of span
                    bf[1] = offs      # bind to   new     offset
                    uf = _mark(1,nsps); unj += 1
                    uf.count = nm - 1 # at least one char must be matched
#                   print 'offs=' , offs
                    last = text[offs] if offs < limt else ''
                    continue
#           print 'fail tc=' , deconvert(tc)

        elif tc == '':
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True        # successful match
                break

        ## match failure: rewind to last match branch
        ##

#       print "fail - unwinding" , unj

        while unj > 0:               # try unwinding, if possible
#           print "unw:",unj
            uf = unw[unj-1]          # get most recent unwinding record
#           print uf
            if uf.count <= 0:        # if available count is used up,
                unj -= 1             # go to next unwinding record
                continue
            uf.count -= 1            # decrement available count
            uf.txts -= uf.kind       # back up one char for scanning text input
            mp = uf.pats             # unwind pattern pointer
            offs = uf.txts           # unwind text input
            mbi = uf.bnds            # restore binding
            mbd[mbi-1][1] -= uf.kind # reduce span of binding if for wildcard
            nsps = uf.nsps           #
            break
        else:
#           print "no unwinding"
            break                   # quit if unwinding is exhausted
#       print 'cnt=' , uf.count , 'off=' , offs

    ##
    ## clean up on match mode or on no match possible
    ##

#   print "matched=",matched

    if not matched: return None     # no bindings

#   print text,offs

    ## consolidate contiguous bindings for subsequent substitutions

#   print "BEFORE consolidating consecutive bindings"
#   print "bd:",len(mbd)
#   print mbd[0]
#   print '----'
#   for b in mbd[1:]:
#       print b

    mbdo = mbd
    lb  = -1               # binding reference
    lbd = [ 0 , -1 ]       # sentinel value, not real binding
    mbd = [ lbd ]          # initialize with new offset after matching
    mbdo.pop(0)            # ignore empty binding
    while len(mbdo) > 0:   #
        bd = mbdo.pop(0)   # get next binding
        if len(bd) > 2:
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb = -1
        elif bd[0] < 0:    # check for optional match indicator here
            lb = -1        # if so, drop from new consolidated bindings
        elif lb == bd[0]:  # check for binding continuous with previous
            lb = bd[1]     #
            lbd[1] = lb    # if so, combine with previous binding
        else:              #
            mbd.append(bd) # otherwise, add new binding
            lbd = bd       #
            lb = bd[1]     #

    mbd[0] = offs          # replace start of bindings with text length matched

#   print "AFTER"
#   print "bd:",len(mbd)
#   print mbd[0]
#   print '----'
#   for b in mbd[1:]:
#       print b

    return mbd             # consolidated bindings plus new offset
Example #14
    def _lookUpNext(self):
        """
        look up possible next segments in input buffer by various means,
        keeping tokens only for the LONGEST segment

        arguments:
            self

        returns:
            True on successful lookup, False otherwise

        exceptions:
            ParseOverflow
        """

        self.sbu.skipSpaces()  # skip leading spaces
        s = self.sbu.buffer
        #       print ( '_lookUp@0 buffer=' , s )

        if len(s) == 0:  # check for end of input
            return False  # if so, done

#       print ( 'in =' , str(self.sbu) )
        if self.trs != None:  # preanalysis of number expressions
            self.trs.rewriteNumber(s)

#       print ( '_lookUp@1 buffer=' , self.sbu.buffer )
#       print ( 'macro expansion s[0]=' , s[0] )
        self.sbu.expand()  # apply macro substitutions
        #       print ( 'macro expanded  s[0]=' , s[0] )
        #       print ( '_lookUp@2 buffer=' , self.sbu.buffer )

        s = self.sbu.buffer

        #       print ( 'expanded len=' , len(s) )
        if len(s) == 0: return True  # macros can empty out buffer

        k = self.sbu.findBreak()  # find extent of first component for lookup
        if k == 0:
            k = 1  # must have at least one char in token

#       print ( 'break at k=' , k )
        kl = len(s)
        if k + 1 < kl and s[k] == '+' and s[k + 1] == ' ':
            k += 1  # recognize possible prefix

#       print ( 'len(s)=' , kl , 'k=' , k , 's=', s )

#       print ( '_lookUp@3 buffer=' , self.sbu.buffer )
        mr = self._scanText(k)  # text matching in various ways
        mx = mr[0]  # overall maximum match length
        chs = mr[1]  # any vocabulary element matched
        suf = mr[2]  # any suffix removed in matching
        #       print ( '_lookUp@4 buffer=' , self.sbu.buffer )
        s = self.sbu.buffer
        #       print ( 'k=' , k )
        #       print ( 'scan result mx=' , mx , 'chs=' , chs , 'suf=' , suf )
        #       print ( 'len(s)=' , len(s) , 's=' , s )

        if k < mx or k == mx and suf != '':  # next word cannot produce token as long as already seen?

            #           print ( 'queue:' , len(self.ptr.queue) )
            #           print ( 'chs=' , chs )
            if len(chs) > 0:  # any vocabulary matches?
                #               print ( 'put back' , suf , mx , s )
                self.sbu.skip(mx)  # if so, they supersede
                if suf != '':  # handle any suffix removal
                    self.sbu.prepend(list(suf))
#                   print ( 'suf=' , suf )
            else:
                chs = self.sbu.extract(mx)

#           print ( 'extract chs=' , chs )
            to = ellyToken.EllyToken(chs)
            #           print ( 'token=' , str(to) )
            self.ctx.addTokenToListing(to)
            if suf != '':
                if not ellyChar.isApostrophe(suf[1]):
                    to.dvdd = True  # must note suffix removal for token!
#           print ( 'only queue:' , len(self.ptr.queue) )
            return True

#       print ( 'mx=' , mx )
#       print ( 'plus queue:' , len(self.ptr.queue) )
        wsk = self.sbu.buffer[:k]
        cap = ellyChar.isUpperCaseLetter(wsk[0])
        #       print ( 'wsk=' , wsk )
        rws = ''.join(wsk)
        found = self.ptr.createPhrasesFromDictionary(rws.lower(), False, cap)
        if not found:
            if k > mx and k > 1 and ellyChar.isEmbeddedCombining(rws[-1]):
                k -= 1
                rws = rws[:-1]
                found = self.ptr.createPhrasesFromDictionary(
                    rws.lower(), False, cap)
#       print ( rws , 'found in dictionary=' , found )
        if found or mx > 0:  # match found in dictionary or by text scan
            if not found:
                k = mx  # if by text scan, must make token longer
                rws = rws[:k]  # if mx > k
            self.sbu.skip(k)
            #           print ( 'next=' , self.sbu.buffer[self.sbu.index:] )
            #           print ( 'queue after =' , len(self.ptr.queue) )
            to = ellyToken.EllyToken(rws[:k])
            if len(suf) > 1:  # change token to show suffix properly
                #               print ( 'suf=' , suf )
                cs = suf[1]  # first char in suffix after '-'
                rt = to.root  # this is a list!
                lk = -1  # start at last char in token
                while rt[lk] != cs:
                    lk -= 1
                sn = len(rt) + lk  # where to divide suffix from root
                #               print ( 'sn=' , sn , rt )
                to.root = rt[:sn]  # root without suffix
                self.sbu.prepend(suf)  # restore suffix to input for processing
            else:  # no suffix
                chx = self.sbu.peek()  # look at next char after match
                if chx == '-':  # if hyphen, need to separate it
                    self.sbu.skip()
                    if ellyChar.isLetter(self.sbu.peek()):
                        self.sbu.prepend(' ')
                    self.sbu.prepend('-')
#           print ( 'add' , str(to) )
            self.ctx.addTokenToListing(to)  # add token to listing for sentence
            return True

#       print ( '[' + rws + ']' , 'still unrecognized' )

        chx = rws[0]  # special hyphen check
        if chx == '-' and k > 1:
            #           print ( 'look in  internal dictionary' )
            if self.ptr.createPhrasesFromDictionary(chx, False, False):
                #               print ( 'found!' )
                to = ellyToken.EllyToken(chx)  # treat hyphen as token
                self.ctx.addTokenToListing(to)  # add it to token list
                self.sbu.skip()  # remove from input
                return True

        to = self._extractToken(mx)  # single-word matching with analysis and lookup

        #       print ( 'extracted to=' , str(to) )
        if to == None:  # if no match, we are done and will return
            #           print ( 'mx=' , mx )
            return False if mx == 0 else True  # still success if _scanText() found something
        if self.ptr.lastph != None:
            self.ptr.lastph.lens = to.getLength()

#       print ( 'to=' , str(to) , 'len(s)=' , len(s) , s )
#       posn = self.ctx.countTokensInListing()
#       print ( 'at', posn , 'in token list' )
        self.ctx.addTokenToListing(to)  # add token to listing for sentence
        #       tol = self.ctx.getNthTokenInListing(-1)
        #       print ( 'last token root=' , tol.root )
        return True  # successful lookup
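
#       Flow sketch (as coded above): multi-word matches from _scanText()
#       that beat the first component length win outright; otherwise the
#       component is tried against the dictionary, a lone hyphen is handled
#       specially, and _extractToken() is the single-word fallback. False
#       comes back only on empty input or total failure to match anything.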
Example #15
    def _extractToken ( self , mnl ):

        """
        extract next token from input buffer and look up in grammar table

        arguments:
            self  -
            mnl   - minimum length for any previous match

        returns:
            ellyToken on success, otherwise None

        exceptions:
            ParseOverflow
        """

        d = self.rul                        # grammar rule definitions

        tree = self.ptr                     # parse tree
        buff = self.sbu                     # input source

#       print 'start extraction'
        try:
            w = buff.getNext()              # extract next token
#           print 'got token=' , w
            ws = u''.join(w.root)
        except ellyException.StemmingError as e:
            print >> sys.stderr , 'FATAL error' , e
            sys.exit(1)
#       print 'extracted' , '['+ ws + ']'
        wcapzn = w.isCapitalized()
        wsplit = w.isSplit()

        wl = len(ws)
        found = False                       # successful lookup yet?
        if wl > mnl:
            found = self._simpleTableLookUp(ws,tree,wsplit,wcapzn) > 0
#           print 'found in external table=' , found

        if wl >= mnl:
            if ws in self.rul.gtb.dctn:     # look up internally
#               print '"' + ws + '" in dictionary'
                if tree.createPhrasesFromDictionary(ws,wsplit,wcapzn):
                    found = True

#       print 'found in internal dictionary=' , found
        if found:                           # if any success, we are done
            return w
        if mnl > 0:
            return None                     # defer to previous lookup

#       print 'affix logic:'
#       print d.man.pref
#       print d.man.suff
        dvdd = False
        if d.man.analyze(w):                # any analysis possible?
            root = u''.join(w.root)         # if so, get parts of analysis
            tan = w.pres + [ root ] + w.sufs
            if len(w.sufs) > 0:
                sx = w.sufs[-1]
                dvdd = not ellyChar.isApostrophe(sx[1])
#           print 'token analysis=' , tan
            while len(tan) > 0:             # and put back into input
                x = tan.pop()
                buff.prepend(x)
                buff.prepend(' ')
            w = buff.getNext()              # get token again with stemming and macros

#           print 'analyzed w=' , w

            ws = u''.join(w.root)

            if ws[-1] == '+':
#               print 'len(queue)=' , len(tree.queue)
                m = d.ptb.match(w.root,tree)
#               print 'root=' , w.root
#               print 'match=' , m
#               print 'len(queue)=' , len(tree.queue)
#               print 'char span=' , tree.lastph.lens
                if m > 0:
                    tree.lastph.bias = 2
                    found = True

#           print 'after found=' , found
            if len(ws) < mnl: return None   # external lookup?
            if self._simpleTableLookUp(ws,tree,False,wcapzn):  # external lookup
                found = True

            if ws in self.rul.gtb.dctn:     # internal lookup?
                if tree.createPhrasesFromDictionary(ws,wsplit,wcapzn):
                    found = True

        if found:                           # if any success, we are done
#           print 'token recognized'
            w.dvdd = dvdd
            return w

#       print 'still unrecognized token w=' , unicode(w)

        lws = len(ws)
        if lws > 1:                         # special handling of + or -
            if ws[0] == '+' and ws[-1] != '+':
#               print 'root=' , ws          # marks root with prefixes removed
                if self._simpleTableLookUp(ws[1:],tree) > 0:
                    return w
            if ws[0] == '-':
                w.shortenBy(lws-1)          # -X not recognized as suffix
#               print 'w=' , w              # try processing - separately
                cn = buff.peek()
                if ellyChar.isLetterOrDigit(cn):
                    buff.prepend(' ')
                buff.prepend(ws[1:])        # put back X for further analysis

        if self.pnc.match(w.root):          # check if next token is punctuation
#           print 'catg=' , self.pnc.catg , self.pnc.synf.hexadecimal()
            if tree.addLiteralPhrase(self.pnc.catg,self.pnc.synf):
                tree.lastph.lens = w.getLength()
                tree.lastph.krnl.semf.combine(self.pnc.semf)
#               print 'semf=' , self.pnc.semf
#               print 'lastph=' , tree.lastph
#           print 'punc w=' , unicode(w)
        else:
#           print 'must create UNKN leaf node'
            tree.createUnknownPhrase(w)     # unknown type as last resort
            tree.lastph.lens = len(ws)

        return w
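
The affix logic in _extractToken() above splits a word into prefix, root, and suffix pieces and then pushes them back onto the front of the input, each preceded by a space, so that every piece is re-read as its own token. A toy illustration of that put-back loop; ToyBuffer is a made-up stand-in for PyElly's buffer class, assumed here for demonstration only:

class ToyBuffer ( object ):

    """character buffer with prepend, standing in for the Elly input buffer"""

    def __init__ ( self , text ):
        self.chars = list(text)

    def prepend ( self , s ):
        self.chars = list(s) + self.chars

    def nextToken ( self ):
        while len(self.chars) > 0 and self.chars[0] == ' ':
            self.chars.pop(0)              # skip leading spaces
        tok = [ ]
        while len(self.chars) > 0 and self.chars[0] != ' ':
            tok.append(self.chars.pop(0))  # collect chars up to next space
        return ''.join(tok)

buff = ToyBuffer(' more input')            # rest of input after extraction
tan = [ 'un+' ] + [ 'happi' ] + [ '-ly' ]  # i.e. pres + [ root ] + sufs
while len(tan) > 0:                        # put pieces back into input
    buff.prepend(tan.pop())                # last piece goes in deepest
    buff.prepend(' ')
print ( buff.nextToken() )                 # 'un+'   (prefix comes off first)
print ( buff.nextToken() )                 # 'happi' (then the root)
print ( buff.nextToken() )                 # '-ly'   (then the suffix)
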
Example #16
    def _scanText ( self , k ):

        """
        try to match in buffer regardless of word boundaries
        using Elly vocabulary, pattern, and template tables and
        also running Elly entity extractors

        arguments:
            self  -
            k     - length of first component in buffer

        returns:
            match parameters [ text span of match , vocabulary match , suffix removed ]

        exceptions:
            ParseOverflow
        """

#       print '_scanText k=' , k
        sb = self.sbu.buffer           # input buffer
        tr = self.ptr                  # parse tree for results

#       print '_scanText sb=' , sb
                                       # initialize match status
        nspan = 0                      #   total span of match
        vmchs = [ ]                    #   chars of vocabulary entry matched
        suffx = ''                     #   any suffix removed in match

        d = self.rul                   # grammar rule definitions

        m = d.ptb.match(sb,tr)         # try token by pattern match next
#       print 'pattern m=' , m
        if  nspan < m:
            nspan = m                  # on longer match, update maximum

        m = d.ctb.match(sb,tr)         # try multi-word template  match next
#       print 'template m=' , m
        if  nspan < m:
            nspan = m                  # on longer match, update maximum

        m = self.iex.run(sb)           # try entity extractors next
#       print 'extractor m=' , m
        if  nspan < m:
            nspan = m                  # on longer match, update maximum

#       lm = len(sb)                   # scan limit
#       print 'lm=' , lm , 'm=' , m
        capd = ellyChar.isUpperCaseLetter(sb[0])
#       print 'next component=' , sb[:k] , ', context=' , sb[k:lm]

        if self.vtb != None:           # look in external dictionary, if it exists
            ls = list(sb[:k])
#           print 'ls 0=' , ls
            ellyChar.toLowerCaseASCII(ls)
            ss = u''.join(ls)                   # where to start for vocabulary indexing
#           print 'ls 1=' , ls
            n = vocabularyTable.delimitKey(ss)  # get actual indexing
#           print 'delimiting n=' , n , '=' , '<' + ss[:n] + '>'
#           print vocabularyTable.listDBKeys(self.vtb.cdb)

            rl = self.vtb.lookUp(sb,n) # get list of the maximum text matches
#           print len(rl) , 'matches'
            if len(rl) > 0:            #
                r0 = rl[0]             # look at first record
#               print 'r0=' , r0
                vmln = r0.nspan        # should be same for all matches
                vchs = r0.vem.chs      #
                vsfx = r0.suffx        #
#               print 'nspan=' , vmln , vsfx

                if ( vmln > nspan or
                     vmln == nspan and vsfx == '' ):

                    nspan = vmln       # keep vocabulary matches
                    vmchs = vchs       #
                    suffx = vsfx       #

                    for r in rl:
                        ve = r.vem     # get vocabulary entry
#                       print 've=' , ve
#                       if ve.gen != None: print 've.gen=' , ve.gen
                        if tr.addLiteralPhraseWithSemantics(
                                ve.cat,ve.syf,ve.smf,ve.bia,ve.gen,len(suffx) > 0):
                            tr.lastph.lens = nspan  # char length of leaf phrase node
                                                    # needed for later selection
                            tr.lastph.krnl.cncp = ve.con
                            if capd:
                                tr.lastph.krnl.semf.set(0)
#                           print 'vocab phr=' , tr.lastph , 'len=' , tr.lastph.lens
                            if suffx != '':
                                if ellyChar.isApostrophe(suffx[1]):
                                    tr.lastph.krnl.usen = 0

#               print 'vocabulary m=' , vmln
#               print 'queue after table lookup:' , len(self.ptr.queue)

#           print 'sb=' , sb

#       print 'maximum match=' , nspan
#       print 'input=' , self.sbu.buffer[:nspan]

        if nspan > 0:                  # any matches at all?
            tr.requeue()               # if so, keep only longest of them
#       print 'queue after scan:' , len(self.ptr.queue)

#       print 'returns [' , nspan , ',' , vmchs , ',' , suffx , ']'
        return [ nspan , vmchs , suffx ]
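
_scanText() keeps only the longest span any of its matchers reports, and a vocabulary match additionally wins a tie when it removed no suffix. That selection policy, extracted as a self-contained sketch (the matcher spans and suffix argument below are made up for illustration):

def bestSpan ( matchers , vocabSpan=0 , vocabSuffix='' ):
    """return winning span from matcher results plus a vocabulary match"""
    nspan = 0
    for m in matchers:             # pattern, template, extractor spans
        if nspan < m:
            nspan = m              # on longer match, update maximum
    if ( vocabSpan > nspan or
         vocabSpan == nspan and vocabSuffix == '' ):
        nspan = vocabSpan          # vocabulary supersedes on tie w/o suffix
    return nspan

print ( bestSpan([ 3 , 5 , 4 ],vocabSpan=5) )                   # 5 (vocabulary wins tie)
print ( bestSpan([ 3 , 7 , 4 ],vocabSpan=5,vocabSuffix='-s') )  # 7
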
Example #17
    def _lookUpNext ( self ):

        """
        look up possible next segments in input buffer by various means,
        keeping tokens only for the LONGEST segment

        arguments:
            self

        returns:
            True on successful lookup, False otherwise

        exceptions:
            ParseOverflow
        """

        self.sbu.skipSpaces()          # skip leading spaces
        s = self.sbu.buffer
#       print '_lookUp@0 buffer=' , s

        if len(s) == 0:                # check for end of input
            return False               # if so, done

#       print 'in =' , unicode(self.sbu)
        if self.trs != None:           # preanalysis of number expressions
            self.trs.rewriteNumber(s)

#       print '_lookUp@1 buffer=' , self.sbu.buffer
#       print 'macro expansion s[0]=' , s[0]
        self.sbu.expand()              # apply macro substitutions
#       print 'macro expanded  s[0]=' , s[0]
#       print '_lookUp@2 buffer=' , self.sbu.buffer

        s = self.sbu.buffer

#       print 'expanded len=' , len(s)
        if len(s) == 0: return True    # macros can empty out buffer

        k = self.sbu.findBreak()       # find extent of first component for lookup
        if k == 0:
            k = 1                      # must have at least one char in token

#       print 'break at k=' , k
        kl = len(s)
        if  k + 1 < kl and s[k] == '+' and s[k+1] == ' ':
            k += 1                     # recognize possible prefix

#       print 'len(s)=' , kl , 'k=' , k , 's=', s

#       print '_lookUp@3 buffer=' , self.sbu.buffer
        mr = self._scanText(k)         # text matching in various ways
        mx  = mr[0]                    # overall maximum match length
        chs = mr[1]                    # any vocabulary element matched
        suf = mr[2]                    # any suffix removed in matching
#       print '_lookUp@4 buffer=' , self.sbu.buffer
        s = self.sbu.buffer
#       print 'k=' , k
#       print 'scan result mx=' , mx , 'chs=' , chs , 'suf=' , suf
#       print 'len(s)=' , len(s) , 's=' , s

        if ( k < mx or
             k == mx and suf != '' ):  # next word cannot produce token as long as already seen?

#           print 'queue:' , len(self.ptr.queue)
#           print 'chs=' , chs
            if len(chs) > 0:           # any vocabulary matches?
#               print 'put back' , suf , mx , s
                self.sbu.skip(mx)      # if so, they supersede
                if suf != '':          # handle any suffix removal
                    self.sbu.prepend(list(suf))
#                   print 'suf=' , suf
            else:
                chs = self.sbu.extract(mx)
#               print 'extracted chs=' , chs
#           print 'token chs=' , chs
            to = ellyToken.EllyToken(chs)
#           print 'long token=' , unicode(to)
            self.ctx.addTokenToListing(to)
            if suf != '':
                if not ellyChar.isApostrophe(suf[1]):
                    to.dvdd = True     # must note suffix removal for token!
#           print 'only queue:' , len(self.ptr.queue)
            return True

#       print 'mx=' , mx
#       print 'plus queue:' , len(self.ptr.queue)
        wsk = self.sbu.buffer[:k]
        cap = ellyChar.isUpperCaseLetter(wsk[0])
#       print 'wsk=' , wsk
        rws = u''.join(wsk)
        found = self.ptr.createPhrasesFromDictionary(rws.lower(),False,cap)
        if not found:
#           print 'not found, k=' , k
            if k > mx and k > 1 and ellyChar.isEmbeddedCombining(rws[-1]):
                k -= 1
                rws = rws[:-1]
                found = self.ptr.createPhrasesFromDictionary(rws.lower(),False,cap)
#       print 'found in dictionary=' , found
        if found or mx > 0:            # match found in dictionary or by text scan
            if not found:
                k = mx                 # if by text scan, must make token longer
                rws = rws[:k]          # if mx > k
            self.sbu.skip(k)
#           print 'next=' , self.sbu.buffer[self.sbu.index:]
#           print 'queue after =' , len(self.ptr.queue)
            to = ellyToken.EllyToken(rws[:k])
            if len(suf) > 1:           # change token to show suffix properly
#               print 'suf=' , suf
                cs = suf[1]            # first char in suffix after '-'
                rt = to.root           # this is a list!
                lk = -1                # start at last char in token
                while rt[lk] != cs: lk -= 1
                sn = len(rt) + lk      # where to divide suffix from root
#               print 'sn=' , sn , rt
                to.root = rt[:sn]      # root without suffix
                self.sbu.prepend(suf)  # restore suffix to input for processing
            else:                      # no suffix
                chx = self.sbu.peek()  # look at next char after match
                if chx == '-':         # if hyphen, need to separate it
                    self.sbu.skip()
                    if ellyChar.isLetter(self.sbu.peek()):
                        self.sbu.prepend(' ')
                    self.sbu.prepend('-')
#           print 'add' , unicode(to)
            self.ctx.addTokenToListing(to)  # add token to listing for sentence
            return True

#       print '[' + rws + ']' , 'still unrecognized'

        chx = rws[0]                   # special hyphen check
        if chx == '-' and k > 1:
#           print 'look in  internal dictionary'
            if self.ptr.createPhrasesFromDictionary(chx,False,False):
#               print 'found!'
                to = ellyToken.EllyToken(chx)  # treat hyphen as token
                self.ctx.addTokenToListing(to) # add it to token list
                self.sbu.skip()                # remove from input
                return True

        to = self._extractToken(mx)    # single-word matching with analysis and lookup

#       print 'extracted to=' , unicode(to)
        if to == None:                 # if no match, we are done and will return
#           print 'mx=' , mx
            return False if mx == 0 else True  # still success if _scanText() found something
        if self.ptr.lastph != None:
            self.ptr.lastph.lens = to.getLength()

#       print 'to=' , unicode(to) , 'len(s)=' , len(s) , s
#       posn = self.ctx.countTokensInListing()
#       print 'at', posn , 'in token list'
        self.ctx.addTokenToListing(to) # add token to listing for sentence
#       tol = self.ctx.getNthTokenInListing(-1)
#       print 'last token root=' , tol.root
        return True                    # successful lookup
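
When a suffix has been removed, _lookUpNext() divides the token root by scanning backward for the suffix's first real character (the one after the leading '-'). The same step as a standalone function; PyElly keeps the root as a list of chars, which is assumed here:

def divideRoot ( root , suf ):
    """split a char-list root before a removed suffix like '-ing'"""
    cs = suf[1]                # first suffix char after '-'
    lk = -1                    # scan backward from end of root
    while root[lk] != cs:      # assumes cs occurs in root,
        lk -= 1                #   since the suffix came from it
    sn = len(root) + lk        # index dividing root from suffix
    return root[:sn]

print ( ''.join(divideRoot(list('walking'),'-ing')) )  # 'walk'
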
Example #18
def match ( patn , text , offs=0 , limt=None ):

    """
    compare a pattern with wildcards to input text

    arguments:
        patn  - pattern to match
        text  - what to match against
        offs  - start text offset for matching
        limt  - limit of matching

    returns:
        bindings if match is successful, None otherwise
    """

    class Unwinding(object):

        """
        to hold unwinding information for macro pattern backup and rematch

        attributes:
            kind   - 0=optional 1=* wildcard
            count  - how many backups allowed
            pats   - saved pattern index for backup
            txts   - saved input text index
            bnds   - saved binding index
        """

        def __init__ ( self , kind ):
            """
            initialize to defaults

            arguments:
                self  -
                kind  - kind of unwinding (0=optional, 1=* wildcard)
            """
            self.kind  = kind
            self.count = 0
            self.pats  = 0
            self.txts  = 0
            self.bnds  = 1

        def __unicode__ ( self ):
            """
            show unwinding contents for debugging

            arguments:
                self
            returns:
                attributes as array
            """
            return ( '[kd=' + unicode(self.kind)  +
                     ',ct=' + unicode(self.count) +
                     ',pa=' + unicode(self.pats)  +
                     ',tx=' + unicode(self.txts)  +
                     ',bd=' + unicode(self.bnds)  + ']' )

    #### local variables for match( ) ####

    mbd = [ 0 ]       # stack for pattern match bindings (first usable index = 1)
    mbi = 1           # current binding index
    unw = [ ]         # stack for unwinding on match failure
    unj = 0           # current unwinding index

    ##
    # three private functions using local variables of match()
    #

    def _bind ( ns=None ):
        """
        get next available wildcard binding frame
        arguments:
            ns  - optional initial span of text for binding
        returns:
            binding frame
        """
#       print "binding:",offs,ns
        os = offs - 1
        if ns == None: ns = 1 # by default, binding is to 1 char
        if mbi == len(mbd):   # check if at end of available frames
            mbd.append([ 0 , 0 ])
        bf = mbd[mbi]         # next available record
        bf[0] = os            # set binding to range of chars
        bf[1] = os + ns       #
        return bf

    def _modify ( ):
        """
        set special tag for binding
        arguments:

        """
        mbd[mbi].append(None)

    def _mark ( kind ):
        """
        set up for backing up pattern match
        arguments:
            kind  - 0=optional 1=* wildcard
        returns:
            unwinding frame
        """
        if unj == len(unw): # check if at end of available frames
            unw.append(Unwinding(kind))
        uf = unw[unj]       # next available
        uf.kind  = kind
        uf.count = 1
        uf.pats  = mp
        uf.txts  = offs
        uf.bnds  = mbi
        return uf

    def _span ( typw ):
        """
        count chars available for wildcard match
        arguments:
            typw - wildcard
        returns:
            non-negative count if any match possible, otherwise -1
        """
        k = minMatch(patn[mp:])  # calculate min char count to match rest of pattern

#       print "exclude=",k,"@",offs

        # calculate maximum chars a wildcard can match

        mx = ellyChar.findBreak(text,offs) - k # max span reduced by exclusion
        if mx < 0: return -1                   # cannot match if max span < 0

        tfn = Matching[typw]                   # char type matching a wildcard

#       print "text at",offs,"maximum wildcard match=",mx

        nm = 0
        for i in range(mx):
            c = text[offs+i]                   # next char in text from offset
            if not tfn(c): break               # stop when it fails to match
            nm += 1

#       print "maximum wildcard span=",nm

        return nm

    #
    # end of private functions
    ##

    #############################
    ####  main matching loop ####
    #############################

    matched = False  # successful pattern match?

    if limt == None: limt = len(text)

    mp = 0           # pattern index
    ml = len(patn)   # pattern match limit

#   print text[offs:limt],":",list(patn)

    while True:

        ## literally match as many next chars as possible

        while mp < ml:
            if offs >= limt:
                last = ''
            else:
                last = text[offs].lower()
                offs += 1
#           print 'matching last=' , last , 'at' , offs
            if patn[mp] != last: break
            mp += 1

        ## check whether mismatch is due to special pattern char

#       print 'pat',mp,"<",ml
#       print "txt @",offs

        if mp >= ml:        # past end of pattern?
            matched = True  # if so, match is made
            break

        tc = patn[mp]       # otherwise, get unmatched pattern element
        mp += 1             #
#       print "tc=",ord(tc)

        if tc == cALL:   # a * wildcard?

#           print "ALL last=< " + last + " >"
            if last != '': offs -= 1

            nm = _span(cALL)

            ## save info for restart of matching on subsequent failure

            bf = _bind(nm); mbi += 1  # get new binding record
            bf[0] = offs              # bind from current offset
            offs += nm                # move offset past end of span
            bf[1] = offs              # bind to   new     offset
#           print "offs=",offs
            uf = _mark(1); unj += 1   # get new unwinding record
            uf.count = nm             # can back up this many times on mismatch
            continue

        elif tc == cEND: # end specification
#           print "END $:",last
            if last == '':
                continue
            elif last in [ '.' , ',' , '-' ]:
                if offs == limt:
                    offs -= 1
                    continue
                txc = text[offs]
                if ellyChar.isWhiteSpace(txc) or txc in Trmls:
                    offs -= 1
                    continue
            elif last in ellyBuffer.separators:
                offs -= 1
                continue
            elif last in [ '?' , '!' ]:
                offs -= 1
                continue

        elif tc == cANY: # alphanumeric wildcard?
            if last != '' and ellyChar.isLetterOrDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cDIG: # digit wildcard?
            if last != '' and ellyChar.isDigit(last):
                _bind(); mbi += 1
                continue

        elif tc == cALF: # letter wildcard?
#           print "ALF:",last,offs
            if last != '' and ellyChar.isLetter(last):
                _bind(); mbi += 1
                continue

        elif tc == cSPC: # space wildcard?
#           print "SPC:"
            if last != '' and ellyChar.isWhiteSpace(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cAPO: # apostrophe wildcard?
#           print "APO: last=" , last
            if ellyChar.isApostrophe(last):
                _bind(); _modify(); mbi += 1
                continue

        elif tc == cSOS:
#           print "SOS"
#           print last,'@',offs
            mf = _bind(0); mbi += 1   # dummy record to block
            mf[0] = -1                #   later binding consolidation
            if last != '':
                offs -= 1             # try for rematch
            m = mp                    # find corresponding EOS
            while m < ml:             #
                if patn[m] == cEOS: break
                m += 1
            else:                     # no EOS?
                m -= 1                # if so, pretend there is one anyway
            uf = _mark(0); unj += 1   # for unwinding on any later match failure
            uf.pats = m + 1           # i.e. one char past next EOS
            uf.txts = offs            # start of text before optional match
            continue

        elif tc == cEOS:
#           print "EOS"
            if last != '':
                offs -= 1             # back up for rematch
            continue

        elif tc == cSAN or tc == cSDG or tc == cSAL:
            if last != '':            # still more to match?
                offs -= 1
                nm = _span(tc)        # maximum match possible
#               print 'spanning=' , nm
                if nm >= 1:
                    bf = _bind(nm); mbi += 1
                    bf[0] = offs      # bind from current offset
                    offs += nm        # move offset past end of span
                    bf[1] = offs      # bind to   new     offset
                    uf = _mark(1); unj += 1
                    uf.count = nm - 1 # at least one char must be matched
                    continue

        elif tc == '':
            if last == '' or not ellyChar.isPureCombining(last):
                matched = True        # successful match
                break

        ## match failure: rewind to last match branch

#       print "fail - unwinding",unj

        while unj > 0:               # try unwinding, if possible
#           print "unw:",unj
            uf = unw[unj-1]          # get most recent unwinding record
#           print uf
            if uf.count <= 0:        # if available count is used up,
                unj -= 1             # go to next unwinding record
                continue
            uf.count -= 1            # decrement available count
            uf.txts -= uf.kind       # back up one char for scanning text input
            mp = uf.pats             # unwind pattern pointer
            offs = uf.txts           # unwind text input
            mbi = uf.bnds            # restore binding
            mbd[mbi-1][1] -= uf.kind # reduce span of binding if for wildcard
            break
        else:
#           print "no unwinding"
            break                   # quit if unwinding is exhausted

    ##
    ## clean up on match mode or on no match possible
    ##

#   print "matched=",matched

    if not matched: return None     # no bindings

#   print text,offs

    ## consolidate contiguous bindings for subsequent substitutions

#   print "BEFORE consolidating"
#   print "bd:",len(mbd)
#   for b in mbd:
#       print b

    mbdo = mbd
    lb  = -1               # binding reference
    lbd = [ 0 , -1 ]       # sentinel value, not real binding
    mbd = [ lbd ]          # initialize with new offset after matching
    mbdo.pop(0)            # ignore empty binding
    while len(mbdo) > 0:   #
        bd = mbdo.pop(0)   # get next binding
        if len(bd) > 2:
            bd.pop()
            mbd.append(bd)
            lbd = bd
            lb = -1
        elif bd[0] < 0:    # check for optional match indicator here
            lb = -1        # if so, drop from new consolidated bindings
        elif lb == bd[0]:  # check for binding continuous with previous
            lb = bd[1]     #
            lbd[1] = lb    # if so, combine with previous binding
        else:              #
            mbd.append(bd) # otherwise, add new binding
            lbd = bd       #
            lb = bd[1]     #

    mbd[0] = offs          # replace start of bindings with text length matched

#   print "AFTER"
#   print "bd:",len(mbd)
#   for b in mbd:
#       print b

    return mbd             # consolidated bindings plus new offset
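
match() handles a * wildcard by first binding the longest possible span and then, through the unwinding stack, giving characters back one at a time when a later pattern element fails. A much-reduced recursive sketch of that greedy-then-back-off behavior, without the Elly wildcard types or binding consolidation (illustrative code only):

def tinyMatch ( patn , text ):
    """match patn against a prefix of text; '*' spans zero or more chars
       returns (chars matched, wildcard bindings) or None"""
    def walk ( mp , tp , bnds ):
        if mp == len(patn):                # pattern exhausted = success
            return ( tp , bnds )
        if patn[mp] == '*':
            for n in range(len(text) - tp,-1,-1):  # greedy, then back off
                r = walk(mp + 1,tp + n,bnds + [ text[tp:tp+n] ])
                if r != None:
                    return r               # first success wins
            return None
        if tp < len(text) and patn[mp] == text[tp]:
            return walk(mp + 1,tp + 1,bnds)
        return None
    return walk(0,0,[ ])

print ( tinyMatch('a*c','abbcde') )        # (4, ['bb'])
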
Example #19
    def _extractToken(self, mnl):
        """
        extract next token from input buffer and look up in grammar table

        arguments:
            self  -
            mnl   - minimum length for any previous match

        returns:
            ellyToken on success, otherwise None

        exceptions:
            ParseOverflow
        """

        d = self.rul  # grammar rule definitions

        tree = self.ptr  # parse tree
        buff = self.sbu  # input source

        #       print ( 'start extraction' )
        try:
            w = buff.getNext()  # extract next token
            #           print ( 'got token=' , w )
            ws = ''.join(w.root)
        except ellyException.StemmingError as e:
            print ( 'FATAL error' , e , file=sys.stderr )
            sys.exit(1)
#       print ( 'extracted' , '['+ ws + ']' )
        wcapzn = w.isCapitalized()
        wsplit = w.isSplit()

        wl = len(ws)
        found = False  # no lookup success yet
        if wl > mnl:
            found = self._simpleTableLookUp(ws, tree, wsplit, wcapzn) > 0
#           print ( 'found in external table=' , found )

        if wl >= mnl:
            if ws in self.rul.gtb.dctn:  # look up internally
                #               print ( '"' + ws + '" in dictionary' )
                if tree.createPhrasesFromDictionary(ws, wsplit, wcapzn):
                    found = True

#       print ( 'found in internal dictionary=' , found )
        if found:  # if any success, we are done
            return w
        if mnl > 0:
            return None  # defer to previous lookup

#       print ( 'affix logic:' )
#       print ( d.man.pref )
#       print ( d.man.suff )
        dvdd = False
        if d.man.analyze(w):  # any analysis possible?
            root = ''.join(w.root)  # if so, get parts of analysis
            tan = w.pres + [root] + w.sufs
            if len(w.sufs) > 0:
                sx = w.sufs[-1]
                dvdd = not ellyChar.isApostrophe(sx[1])
#           print ( 'token analysis=' , tan )
            while len(tan) > 0:  # and put back into input
                x = tan.pop()
                buff.prepend(x)
                buff.prepend(' ')
            w = buff.getNext()  # get token again with stemming and macros

            #           print ( 'analyzed w=' , w )

            ws = ''.join(w.root)

            if ws[-1] == '+':
                #               print ( 'len(queue)=' , len(tree.queue) )
                m = d.ptb.match(w.root, tree)
                #               print ( 'root=' , w.root )
                #               print ( 'match=' , m )
                #               print ( 'len(queue)=' , len(tree.queue) )
                #               print ( 'char span=' , tree.lastph.lens )
                if m > 0:
                    tree.lastph.bias = 2
                    found = True

#           print ( 'after found=' , found )
            if len(ws) < mnl: return None  # external lookup?
            if self._simpleTableLookUp(ws, tree, False, wcapzn):  # external lookup
                found = True

            if ws in self.rul.gtb.dctn:  # internal lookup?
                if tree.createPhrasesFromDictionary(ws, wsplit, wcapzn):
                    found = True

        if found:  # if any success, we are done
            #           print ( 'token recognized' )
            w.dvdd = dvdd
            return w

#       print ( 'still unrecognized token w=' , str(w) )

        lws = len(ws)
        if lws > 1:  # special handling of + or -
            if ws[0] == '+' and ws[-1] != '+':
                #               print ( 'root=' , ws )      # marks root with prefixes removed
                if self._simpleTableLookUp(ws[1:], tree) > 0:
                    return w
            if ws[0] == '-':
                w.shortenBy(lws - 1)  # -X not recognized as suffix
                #               print ( 'w=' , w )          # try processing - separately
                cn = buff.peek()
                if ellyChar.isLetterOrDigit(cn):
                    buff.prepend(' ')
                buff.prepend(ws[1:])  # put back X for further analysis

        if self.pnc.match(w.root):  # check if next token is punctuation
            #           print ( 'catg=' , self.pnc.catg , self.pnc.synf.hexadecimal() )
            if tree.addLiteralPhrase(self.pnc.catg, self.pnc.synf):
                tree.lastph.lens = w.getLength()
                tree.lastph.krnl.semf.combine(self.pnc.semf)
#               print ( 'semf=' , self.pnc.semf )
#               print ( 'lastph=' , tree.lastph )
#           print ( 'punc w=' , str(w) )
        else:
            #           print ( 'must create UNKN leaf node' )
            tree.createUnknownPhrase(w)  # unknown type as last resort
            tree.lastph.lens = len(ws)

        return w
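
The dvdd ('divided') flag computed near the top of _extractToken() records whether a removed suffix splits the word at a real morphological boundary: a suffix whose first character after the '-' is an apostrophe (a contraction such as -'s) does not count. A small sketch of that test, assuming suffixes carry a leading '-' as they do above and approximating the apostrophe check with a two-char class:

def isDivided ( sufs ):
    """True if the last removed suffix divides the word"""
    if len(sufs) == 0:
        return False
    sx = sufs[-1]                    # last suffix, e.g. '-ing' or "-'s"
    return sx[1] not in "'\u2019"    # not divided if apostrophe follows '-'

print ( isDivided([ '-ing' ]) )      # True
print ( isDivided([ "-'s" ]) )       # False
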