Example 1
    def getNext(self):
        """
        get single Chinese character

        arguments:
            self

        returns:
            a token or None if buffer is empty

        exceptions:
            StemmingError
        """

        #       print super(EllyBufferZH,self) , 'ZH getNext'
        ln = len(self.buffer)
        if ln == 0:
            return None

#       print 'buffer=' , self.buffer
        n = 1
        if ellyChar.isDigit(self.buffer[0]):
            while n < ln and ellyChar.isDigit(self.buffer[n]):
                n += 1

        w = ellyToken.EllyToken(self.extract(n))
        #       print 'return token=' , w
        #       print 'ZH extracted'
        #       print 'buffer=' , self.buffer
        return w
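
The rule above emits one token per Chinese character but keeps a run of digits together. A minimal standalone sketch of that grouping rule, with Python's built-in str.isdigit() standing in for ellyChar.isDigit():

def nextTokenLength(buffer):
    # length of next token: a digit run stays together,
    # any other character stands alone
    ln = len(buffer)
    if ln == 0:
        return 0
    n = 1
    if buffer[0].isdigit():
        while n < ln and buffer[n].isdigit():
            n += 1
    return n

print(nextTokenLength(list('2024年')))  # 4 -- digit run grouped
print(nextTokenLength(list('年2024')))  # 1 -- single character token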
Example 2
    def simplify(self, strg):
        """
        apply inflectional stemming to string

        arguments:
           self  -
           strg  - input Unicode string

        returns:
           stemmed Unicode string
        """

        if len(strg) < 4: return strg
        if strg[-1] == "s" and ellyChar.isApostrophe(strg[-2]):
            return strg[:-2]
        else:
            t = ellyToken.EllyToken(strg)
            self.apply(t)
            return t.toUnicode()
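
A hedged sketch of just the possessive shortcut above, with a plain apostrophe test standing in for ellyChar.isApostrophe() and the full inflectional stemming path omitted:

def stripPossessive(strg):
    # drop a trailing apostrophe-s from strings long enough to matter
    if len(strg) >= 4 and strg[-1] == 's' and strg[-2] in ("'", '\u2019'):
        return strg[:-2]
    return strg

print(stripPossessive("cat's"))  # cat
print(stripPossessive("its"))    # its -- shorter than 4 chars, left alone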
Example 3
def stemTest(stemmer, suffix=None):
    """
    test stemmer with examples from standard input

    arguments:
        stemmer - must be of class with apply(x) method
        suffix  - suffix to report in output
    """

    out = ''

    print("testing ", stemmer)
    if suffix != None:
        out = '[-' + suffix + ']'
        print('suffix', out)
    print("enter words to stem:")

    while True:
        try:
            sys.stdout.write("> ")
            sys.stdout.flush()
            line = sys.stdin.readline()
        except KeyboardInterrupt:
            break

        w = line.rstrip()
        if len(w) == 0: break  # stop on empty line or EOF
        print('"%s"' % w, end=' ')  #
        tok = ellyToken.EllyToken(w)  # make new token
        try:
            sta = stemmer.apply(tok)  # apply stemmer
        except ellyException.StemmingError:
            print('stemming error!', file=sys.stderr)
            sys.exit(1)
        print("-->>", ''.join(tok.root), end=' ')  # stemming result
        if suffix == None:
            print(tok.getSuffixes(), end=' ')  # list of suffixes removed
        else:
            print(out, end=' ')
        print(" success code=", sta)
Example 4
    print('pre=', pre)

    treeLogic.dumpLT(pre.indx)

    xs = [  # test cases
        'telegraph +graph', 'telephone +phone', 'transportation +portation',
        'pseudopod +pod'
    ]

    nfail = 0

    for x in xs:
        rec = x.strip().split()  # next test case
        if len(rec) != 2: continue  # better be [ input , expected output ]

        w = rec[0]  # get separate components
        r = rec[1]

        t = ellyToken.EllyToken(w)  # make token for matching
        b = pre.match(t)
        a = ''.join(t.root)
        if not b:
            print(' NO MATCH=', rec)
        m = (r == a)
        if not m:
            print('     FAIL=', rec, '!= <' + a + '>')
            nfail += 1

    print(nfail, 'examples failed')
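
Each test case above packs an [ input , expected output ] pair into one whitespace-separated string. A self-contained sketch of the same pass/fail pattern, with a stub matcher standing in for pre.match() (the prefix list here is hypothetical):

def stubMatch(word):
    # pretend prefix matcher: mark a known prefix boundary with '+'
    for p in ('tele', 'trans', 'pseudo'):
        if word.startswith(p):
            return '+' + word[len(p):]
    return word

nfail = 0
for x in ['telegraph +graph', 'telephone +phone', 'pseudopod +pod']:
    w, r = x.split()
    a = stubMatch(w)
    if a != r:
        print('     FAIL=', [w, r], '!= <' + a + '>')
        nfail += 1
print(nfail, 'examples failed')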
Example 5
    def _lookUpNext(self):
        """
        look up next segment in input buffer by various means

        arguments:
            self

        returns:
            True on success, False otherwise
        """

        self.sbu.skipSpaces()  # skip leading spaces
        s = self.sbu.buffer

        if len(s) == 0:  # check for end of input
            return False  # if so, done

        if self.trs != None:  # preanalysis of number expressions
            self.trs.rewriteNumber(s)

        self.sbu.expand()  # apply macro substitutions

        s = self.sbu.buffer

        #       print ( 'expanded len=' , len(s) )
        #       print ( 'sbu=' , s )

        if len(s) == 0: return True  # macros can empty out buffer

        k = self.sbu.findBreak()  # try to find first component for lookup
        if k == 0:
            k = 1  # must have at least one char in token

        kl = len(s)
        if k + 1 < kl and s[k] == '+' and s[k + 1] == ' ':
            k += 1  # recognize possible prefix

#       print ( 'len(s)=' , kl , 'k=' , k , 's=', s )

        mr = self._scanText(k)  # text matching
        mx = mr[0]
        mty = mr[1]
        chs = mr[2]  # any vocabulary element matched
        suf = mr[3]  # any suffix removed in matching
        s = self.sbu.buffer
        #       print ( 'mx=' , mx , 'len(s)=' , len(s), 'k=' , k )
        #       print ( 's=' , s )

        if k < mx or k == mx and suf != '':  # next token cannot be as long as already seen?
            if len(chs) > 0:
                self.sbu.skip(mx)
                if suf != '':
                    self.sbu.prepend(suf)
            else:
                chs = self.sbu.extract(mx)
            to = ellyToken.EllyToken(''.join(chs))
            self.tks.append([mty, to])
            return True

        wsk = self.sbu.buffer[:k]
        #       print ( 'wsk=' , wsk )
        rws = ''.join(wsk).lower()
        found = rws in self.gtb.dctn

        if found:
            #           print ( 'found internally' )
            mty += 'Id'

        if found or mx > 0:
            self.sbu.skip(k)
            to = ellyToken.EllyToken(rws)
            if len(suf) > 1:  # change token to show suffix properly
                #               print ( 'suf=' , suf )
                cs = suf[1]  # first char in suffix after '-'
                rt = to.root  # this is a list!
                lk = -1  # start at last char in token
                while rt[lk] != cs:
                    lk -= 1
                sn = len(rt) + lk  # where to divide suffix from root
                #               print ( 'sn=' , sn , rt )
                to.root = rt[:sn]  # root without suffix
                self.sbu.prepend(suf)  # restore suffix to input for processing
            self.tks.append([mty, to])
            return True

#       print ( 'extract token' )
        self._extractToken(mx, mty)  # single-word matching with analysis
        return True
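
The suffix-division arithmetic in the middle of the method is easy to miss: lk scans backward through the root list for the first suffix character, and len(rt) + lk turns that negative offset into a split point. A standalone sketch, assuming a suffix of the form '-xx' as produced by PyElly matching:

rt = list('running')  # token root is a list, as in EllyToken
suf = '-ing'          # suffix removed during matching
cs = suf[1]           # first suffix char after '-'
lk = -1               # scan backward from the last char
while rt[lk] != cs:
    lk -= 1
sn = len(rt) + lk     # split point: 7 + (-3) = 4
print(''.join(rt[:sn]), '+', suf)  # runn + -ing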
Example 6
        if intr: sys.stdout.write('> ')
        lin = sys.stdin.readline()
        if len(lin) == 0:  # EOF check
            break
        lin = lin.strip()
        if len(lin) == 0:
            if intr: break  # empty line quits loop on interactive input
            else: continue  # but otherwise continues
        lin = lin.split(' ')
        if len(lin) < 2:
            lin.append('-')  # should be [ word , root ] tuple
        w = lin[0]  # unpack
        r = lin[1]
        if r != '-':
            ntry += 1  # input pair to be added to testing
        t = ellyToken.EllyToken(w)
        #       print '0o t=' , t
        try:
            inf.apply(t)  # apply inflectional stemming
        except ellyException.StemmingError:
            print('stemming error')
            sys.exit(1)
#       print '1i t=' , t
        if not mor.analyze(t):  # apply morphological stemming
            msg = 'no morphological change'
        else:
            msg = ''
#       print '2m t=' , t
        nr = ''.join(t.root)  # resulting root
        if nr != r and r != '-':
            nfail += 1
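
A self-contained sketch of the scoring convention above: each line is a 'word expected-root' pair, with '-' (supplied by default) meaning there is nothing to check. The stub table stands in for the inflectional and morphological stemmers:

stub = {'running': 'run', 'cats': 'cat'}  # replaces inf.apply() + mor.analyze()
ntry = nfail = 0
for line in ['running run', 'cats cat', 'hello']:
    lin = line.split(' ')
    if len(lin) < 2:
        lin.append('-')   # no expected root given
    w, r = lin[0], lin[1]
    if r != '-':
        ntry += 1         # pair counts toward testing
    nr = stub.get(w, w)   # resulting root
    if nr != r and r != '-':
        nfail += 1
print(nfail, 'failed of', ntry)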
Example 7
    def __init__(self):
        """
        create environment for testing semantic procedure

        arguments:
            self
        """

        stb = symbolTable.SymbolTable()  # empty
        hry = conceptualHierarchy.ConceptualHierarchy()  # empty
        ctx = interpretiveContext.InterpretiveContext(stb, {}, {}, hry)
        self.context = ctx  # make available

        ptb = parseTreeBase.ParseTreeBase()  # just for generating phrases

        self.toknL = ellyToken.EllyToken('uvwxxyz')  # insert dummy data that might
        self.toknR = ellyToken.EllyToken('abcdefg')  # be replaced from outside

        ctx.addTokenToListing(self.toknL)  # put a token in first position
        ctx.addTokenToListing(self.toknR)  # and a token in second

        x = ctx.syms.getSyntaxTypeIndexNumber('x')  # for consistency, define two
        y = ctx.syms.getSyntaxTypeIndexNumber('y')  # syntactic categories for rules

        fbs = ellyBits.EllyBits(symbolTable.FMAX)  # zero feature bits

        exL = grammarRule.ExtendingRule(x, fbs)  # dummy rules as a place for
        exR = grammarRule.ExtendingRule(x, fbs)  # attaching semantic procedures
        spl = grammarRule.SplittingRule(y, fbs)  # for testing

        # dummy semantic procedures
        gX = ["left", "right"]  # generative
        gL = ["obtain"]  #
        gR = ["obtain"]  #

        gP = ["append did it!"]  # for standalone generative subprocedure

        cX = []  # cognitive
        cL = [">> +1"]  #
        cR = [">> -1"]  #

        ctx.pushStack()  # needed for local variables usable in testing
        ctx.setLocalVariable("vl", "LLLL")  # make two variables available to work with
        ctx.setLocalVariable("vr", "RRRR")  #

        ctx.setProcedure('do', self._genp(gP))  # define procedure 'do'

        exL.gens = self._genp(gL)  # assign semantic procedures to rules
        exL.cogs = self._cogp(cL)  #

        exR.gens = self._genp(gR)  #
        exR.cogs = self._cogp(cR)  #

        spl.gens = self._genp(gX)  #
        spl.cogs = self._cogp(cX)  #

        phr = ptb.makePhrase(0, spl)  # make phrase for splitting plus
        phr.krnl.lftd = ptb.makePhrase(0, exL)  # left and right descendants
        phr.krnl.rhtd = ptb.makePhrase(1, exR)  # defined by left and right
        # extending rules from above
        phr.ntok = 1

        stb.getFeatureSet('!one,two', True)  # define semantic feature
        print(stb.smindx)
        smx = stb.smindx['!']  #
        ix = smx['one']  #
        print('ix=', ix)
        phr.krnl.semf.set(ix)  # turn on feature for phrase
        ix = smx['two']  #
        print('ix=', ix)
        phr.krnl.semf.set(ix)  # turn on feature for phrase
        print('semf=', phr.krnl.semf)

        self.phrase = phr  # make phrase available
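
The feature lookup near the end goes through two levels: smindx maps the feature-set id '!' to a dictionary from feature name to bit index. A plain-dict sketch of that lookup (the indices shown are hypothetical; SymbolTable assigns the real ones):

smindx = {'!': {'one': 0, 'two': 1}}  # set id -> feature name -> bit index
smx = smindx['!']
for name in ('one', 'two'):
    ix = smx[name]
    print('ix=', ix)  # the bit turned on with phr.krnl.semf.set(ix)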
Example 8
    gtbu = grammarTable.GrammarTable(stbu, rdr)
    ctxu = Ctx()
    tksu = ctxu.tokns

    tree = ParseTreeWithDisplay(stbu, gtbu, None, ctxu)
    print()
    print(tree)
    print()
    print(dir(tree))
    print()

    cat = stbu.getSyntaxTypeIndexNumber('num')
    fbs = ellyBits.EllyBits(symbolTable.FMAX)
    tree.addLiteralPhrase(cat, fbs)
    tree.digest()
    tksu.append(ellyToken.EllyToken('66'))
    tree.restartQueue()

    ws = ['nn', 'b', 'aj']  # from test.g.elly
    wu = ['ww', 'wx', 'wy', 'wz']  # unknown terms

    for w in ws:

        tree.createPhrasesFromDictionary(w, False, False)
        #       print ( '**** to' , tree.phlim , tree.lastph , 'rule=' , tree.lastph.krnl.rule.seqn )
        tree.digest()
        #       print ( '**** to' , tree.phlim , tree.lastph , 'rule=' , tree.lastph.krnl.rule.seqn )
        tksu.append(ellyToken.EllyToken(w))
        tree.restartQueue()

    for w in wu:
Example 9
    filn = sys.argv[1] if len(sys.argv) > 1 else 'default'

    basn = ellyConfiguration.baseSource + '/'
    dfn = ellyDefinitionReader.EllyDefinitionReader(basn + filn + '.stl.elly')
    if dfn.error != None:
        print(dfn.error, file=sys.stderr)
        sys.exit(1)
    print(dfn.linecount(), 'definition lines for', filn + '.stl.elly')

    try:
        inf = inflectionStemmerEN.InflectionStemmerEN()
        suf = SuffixTreeLogic(dfn)
    except ellyException.TableFailure:
        print('cannot load stemming tables', file=sys.stderr)
        sys.exit(1)
    suf.infl = inf
    #   print 'suf=' , suf
    #   print 'index=' , map(lambda x: ellyChar.toChar(ellyChar.toIndex(x)) , suf.indx.keys())
    print('')

    while True:
        sys.stdout.write('> ')
        wrd = sys.stdin.readline().strip()
        if len(wrd) == 0: break
        t = ellyToken.EllyToken(wrd)
        b = suf.match(t)
        a = ''.join(t.root)
        print(t.getPrefixes(), a, t.getSuffixes(), ': status=', b)
    sys.stdout.write('\n')
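
The read-match-report loop at the bottom is a pattern the PyElly test mains share. A generic sketch of that loop without the suffix logic (sys.stdout.flush() is added here so the prompt appears under Python 3 buffering):

import sys

while True:
    sys.stdout.write('> ')
    sys.stdout.flush()
    wrd = sys.stdin.readline().strip()
    if len(wrd) == 0:   # empty line or EOF ends the session
        break
    print('would match:', wrd)
sys.stdout.write('\n')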
Example 10
    sta = tree.createPhrasesFromDictionary(sgm, False, False)
    print(sgm, ':', sta, ', phlim=', tree.phlim, 'lastph=', tree.lastph)

    print('----')
    sgm = 'abcd'  # test example not in dictionary
    sta = tree.createPhrasesFromDictionary(sgm, False, False)
    print(sgm, ':', sta, ', phlim=', tree.phlim, 'lastph=', tree.lastph)

    print('----')
    sgm = 'xyz'  # test example in dictionary
    sta = tree.createPhrasesFromDictionary(sgm, False, False)
    print(sgm, ':', sta, ', phlim=', tree.phlim, 'lastph=', tree.lastph)

    print('----')
    sgm = 'pqr'  # test example not in dictionary
    sta = tree.createUnknownPhrase(ellyToken.EllyToken(sgm))
    print(sgm, ':', sta, ', phlim=', tree.phlim, 'lastph=', tree.lastph)

    print('----')
    sgm = '.'  # test example not in dictionary
    tree.gbits[0].clear()
    sta = tree.addLiteralPhrase(tree.gtb.PUNC, fbbs)
    print(sgm, ':', sta, ', phlim=', tree.phlim, 'lastph=', tree.lastph)
    tree.gbits[0].set(tree.gtb.PUNC)
    sta = tree.addLiteralPhrase(tree.gtb.PUNC, fbbs)
    print(sgm, ':', sta, ', phlim=', tree.phlim, 'lastph=', tree.lastph)

    print('')
    print('ambiguities:')
    for a in tree.ambig:
        while a != None:
Example 11
    def _lookUpNext(self):
        """
        look up possible next segments in input buffer by various means,
        keeping tokens only for the LONGEST segment

        arguments:
            self

        returns:
            True on successful lookup, False otherwise

        exceptions:
            ParseOverflow
        """

        self.sbu.skipSpaces()  # skip leading spaces
        s = self.sbu.buffer
        #       print ( '_lookUp@0 buffer=' , s )

        if len(s) == 0:  # check for end of input
            return False  # if so, done

#       print ( 'in =' , str(self.sbu) )
        if self.trs != None:  # preanalysis of number expressions
            self.trs.rewriteNumber(s)

#       print ( '_lookUp@1 buffer=' , self.sbu.buffer )
#       print ( 'macro expansion s[0]=' , s[0] )
        self.sbu.expand()  # apply macro substitutions
        #       print ( 'macro expanded  s[0]=' , s[0] )
        #       print ( '_lookUp@2 buffer=' , self.sbu.buffer )

        s = self.sbu.buffer

        #       print ( 'expanded len=' , len(s) )
        if len(s) == 0: return True  # macros can empty out buffer

        k = self.sbu.findBreak()  # find extent of first component for lookup
        if k == 0:
            k = 1  # must have at least one char in token

#       print ( 'break at k=' , k )
        kl = len(s)
        if k + 1 < kl and s[k] == '+' and s[k + 1] == ' ':
            k += 1  # recognize possible prefix

#       print ( 'len(s)=' , kl , 'k=' , k , 's=', s )

#       print ( '_lookUp@3 buffer=' , self.sbu.buffer )
        mr = self._scanText(k)  # text matching in various ways
        mx = mr[0]  # overall maximum match length
        chs = mr[1]  # any vocabulary element matched
        suf = mr[2]  # any suffix removed in matching
        #       print ( '_lookUp@4 buffer=' , self.sbu.buffer )
        s = self.sbu.buffer
        #       print ( 'k=' , k )
        #       print ( 'scan result mx=' , mx , 'chs=' , chs , 'suf=' , suf )
        #       print ( 'len(s)=' , len(s) , 's=' , s )

        if k < mx or k == mx and suf != '':  # next word cannot produce token as long as already seen?

            #           print ( 'queue:' , len(self.ptr.queue) )
            #           print ( 'chs=' , chs )
            if len(chs) > 0:  # any vocabulary matches?
                #               print ( 'put back' , suf , mx , s )
                self.sbu.skip(mx)  # if so, they supersede
                if suf != '':  # handle any suffix removal
                    self.sbu.prepend(list(suf))
#                   print ( 'suf=' , suf )
            else:
                chs = self.sbu.extract(mx)

#           print ( 'extract chs=' , chs )
            to = ellyToken.EllyToken(chs)
            #           print ( 'token=' , str(to) )
            self.ctx.addTokenToListing(to)
            if suf != '':
                if not ellyChar.isApostrophe(suf[1]):
                    to.dvdd = True  # must note suffix removal for token!
#           print ( 'only queue:' , len(self.ptr.queue) )
            return True

#       print ( 'mx=' , mx )
#       print ( 'plus queue:' , len(self.ptr.queue) )
        wsk = self.sbu.buffer[:k]
        cap = ellyChar.isUpperCaseLetter(wsk[0])
        #       print ( 'wsk=' , wsk )
        rws = ''.join(wsk)
        found = self.ptr.createPhrasesFromDictionary(rws.lower(), False, cap)
        if not found:
            if k > mx and k > 1 and ellyChar.isEmbeddedCombining(rws[-1]):
                k -= 1
                rws = rws[:-1]
                found = self.ptr.createPhrasesFromDictionary(rws.lower(), False, cap)
#       print ( rws , 'found in dictionary=' , found )
        if found or mx > 0:  # match found in dictionary or by text scan
            if not found:
                k = mx  # if by text scan, must make token longer
                rws = rws[:k]  # if mx > k
            self.sbu.skip(k)
            #           print ( 'next=' , self.sbu.buffer[self.sbu.index:] )
            #           print ( 'queue after =' , len(self.ptr.queue) )
            to = ellyToken.EllyToken(rws[:k])
            if len(suf) > 1:  # change token to show suffix properly
                #               print ( 'suf=' , suf )
                cs = suf[1]  # first char in suffix after '-'
                rt = to.root  # this is a list!
                lk = -1  # start at last char in token
                while rt[lk] != cs:
                    lk -= 1
                sn = len(rt) + lk  # where to divide suffix from root
                #               print ( 'sn=' , sn , rt )
                to.root = rt[:sn]  # root without suffix
                self.sbu.prepend(suf)  # restore suffix to input for processing
            else:  # no suffix
                chx = self.sbu.peek()  # look at next char after match
                if chx == '-':  # if hyphen, need to separate it
                    self.sbu.skip()
                    if ellyChar.isLetter(self.sbu.peek()):
                        self.sbu.prepend(' ')
                    self.sbu.prepend('-')
#           print ( 'add' , str(to) )
            self.ctx.addTokenToListing(to)  # add token to listing for sentence
            return True

#       print ( '[' + rws + ']' , 'still unrecognized' )

        chx = rws[0]  # special hyphen check
        if chx == '-' and k > 1:
            #           print ( 'look in  internal dictionary' )
            if self.ptr.createPhrasesFromDictionary(chx, False, False):
                #               print ( 'found!' )
                to = ellyToken.EllyToken(chx)  # treat hyphen as token
                self.ctx.addTokenToListing(to)  # add it to token list
                self.sbu.skip()  # remove from input
                return True

        to = self._extractToken(mx)  # single-word matching with analysis and lookup

        #       print ( 'extracted to=' , str(to) )
        if to == None:  # if no match, we are done and will return
            #           print ( 'mx=' , mx )
            return False if mx == 0 else True  # still success if _scanText() found something
        if self.ptr.lastph != None:
            self.ptr.lastph.lens = to.getLength()

#       print ( 'to=' , str(to) , 'len(s)=' , len(s) , s )
#       posn = self.ctx.countTokensInListing()
#       print ( 'at', posn , 'in token list' )
        self.ctx.addTokenToListing(to)  # add token to listing for sentence
        #       tol = self.ctx.getNthTokenInListing(-1)
        #       print ( 'last token root=' , tol.root )
        return True  # successful lookup
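
One subtle step above is the hyphen handling taken when a match has no removed suffix: a hyphen right after the match becomes its own piece of input, with a space inserted when a letter follows. A list-based sketch imitating the buffer operations:

buffer = list('-like terms')   # input remaining after a match
if buffer and buffer[0] == '-':
    del buffer[0]              # sbu.skip()
    if buffer and buffer[0].isalpha():
        buffer.insert(0, ' ')  # sbu.prepend(' ')
    buffer.insert(0, '-')      # sbu.prepend('-')
print(''.join(buffer))         # '- like terms'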
Example 12
    def _getRaw(self):
        """
        obtain next raw token from buffer

        arguments:
            self

        returns:
            EllyToken on success, None otherwise
        """

        #       print ( '_getRaw() from' , len(self.buffer) , 'chars' )
        #       print ( 'before skipping spaces, buffer=' , self.buffer )
        self.skipSpaces()
        ln = len(self.buffer)
        #       print ( "after skip=",ln )
        if ln == 0:
            return None

        ## get length of next token and if it has
        ## initial - or +, check for word fragment

#       print ( 'buffer start=' , self.buffer[0] )

        k = 0  # number of chars for next token

        cz = ' ' if ln == 0 else self.buffer[0]
        if cz in [MIN, PLS]:
            k = self.findSeparator(1)
        elif cz == APO:
            if ln > 2 and self.buffer[1].lower() == 's' and self.buffer[2] in separators:
                k = 2
            else:
                k = 1
        elif cz in [COM, DOT, UELP]:  # these can be tokens by themselves
            k = 1
        else:
            #           print ( 'full token extraction' )
            k = self.findSeparator()
            #           print ( 'k=' , k , 'ln=' , ln )
            if k < 0:  # break multi-char token at next separator
                k = ln  # if no separator, go up to end of buffer
            elif k == 0:
                k = 1  # immediate break in scanning
            else:
                while k < ln:  # look at any separator and following context
                    x = self.buffer[k]
                    if x != MIN and x != COM:
                        break  # no further check if separator not hyphen or comma
                    if k + 1 >= ln or not ellyChar.isDigit(self.buffer[k + 1]):
                        #                       print ( 'x=' , x , 'buf=' , self.buffer[k:] )
                        break  # accept hyphen or comma if NOT followed by digit
                    else:  # otherwise, look for another separator
                        k = self.findSeparator(k + 2)
                        if k < 0:  #
                            k = ln

        ## if token not delimited, take rest of buffer as
        ## will fit into token working area

        if k < 0: k = ln

        #       print ( "take",k,"chars from",len(self.buffer),self.buffer )

        buf = self.extract(k)  # get k characters

        ## special check for hyphen next in buffer after extraction

        if self.match(MIN):  # hyphen immediately following?
            self.skip()  # if so, take it
            if self.atSpace():  # when followed by space
                buf.append(MIN)  # append hyphen to candidate token
                k += 1
            else:
                if not self.match(MIN):  # when not followed by another hyphen
                    self.prepend(ellyChar.SPC)  # put back a space
                else:
                    self.skip()  # double hyphen = dash
                    self.prepend(ellyChar.SPC)  # put back space after dash
                    self.prepend(MIN)  # put back second hyphen
                self.prepend(MIN)  # put back first
                self.prepend(ellyChar.SPC)  # put extra space before hyphen or dash

        ## fill preallocated token for current position from working area

#       print ( "raw text buf=" , buf )

        to = ellyToken.EllyToken(''.join(buf))

        #       print ( "EllyBuffer token before=" , str(to) )

        ## strip off trailing non-token chars from token and put back in buffer

        km = k - 1
        while km > 0:
            x = buf[km]
            if ellyChar.isLetterOrDigit(x) or x == MIN or x == PLS or x == UNS:
                break
#           print ( 'trailing x=' , x )
            if x == APO or x == APX:
                if km > 0 and buf[km - 1] == 's':
                    break
            self.prepend(x)
            km -= 1
        km += 1
        if km < k:
            to.shortenBy(k - km, both=True)

#       print ( "EllyBuffer token=" , strx(to) )
#       print ( "next in buffer=" , self.buffer )
        return to
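
The trailing cleanup at the end scans backward from the last character of the raw token, peeling off anything that is not a letter, digit, hyphen, plus, or underscore so it can go back into the buffer. A standalone sketch with str.isalnum() standing in for ellyChar.isLetterOrDigit() (the possessive apostrophe special case is omitted):

buf = list('word!?')       # raw token as extracted
km = len(buf) - 1
putBack = []
while km > 0:
    x = buf[km]
    if x.isalnum() or x in '-+_':
        break
    putBack.insert(0, x)   # returns to the buffer in original order
    km -= 1
km += 1
print(''.join(buf[:km]), '| back to buffer:', ''.join(putBack))  # word | back to buffer: !?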