def getNext(self):

    """
    get single Chinese character

    arguments:
        self

    returns:
        a token or None if buffer is empty

    exceptions:
        StemmingError
    """

#   print ( super(EllyBufferZH,self) , 'ZH getNext' )
    ln = len(self.buffer)
    if ln == 0:
        return None
#   print ( 'buffer=' , self.buffer )
    n = 1
    if ellyChar.isDigit(self.buffer[0]):       # group consecutive digits
        while n < ln and ellyChar.isDigit(self.buffer[n]):
            n += 1
    w = ellyToken.EllyToken(self.extract(n))
#   print ( 'return token=' , w )
#   print ( 'ZH extracted' )
#   print ( 'buffer=' , self.buffer )
    return w
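# A minimal sketch of draining single-character tokens from the buffer.
# The no-argument constructor and loading input by assigning to the
# 'buffer' list directly are assumptions made only for illustration.

buf = EllyBufferZH()
buf.buffer = list('电话123')
while True:
    tok = buf.getNext()      # one Hanzi per token, but digits grouped
    if tok is None:
        break
    print(tok)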
def simplify(self, strg):

    """
    apply inflectional stemming to string

    arguments:
        self -
        strg - input Unicode string

    returns:
        stemmed Unicode string
    """

    if len(strg) < 4:
        return strg
    if strg[-1] == "s" and ellyChar.isApostrophe(strg[-2]):
        return strg[:-2]
    else:
        t = ellyToken.EllyToken(strg)
        self.apply(t)
        return t.toUnicode()
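# A minimal usage sketch for simplify(), assuming it is available on
# the InflectionStemmerEN class constructed in the other test drivers
# in this collection.

import inflectionStemmerEN

stemmer = inflectionStemmerEN.InflectionStemmerEN()
print(stemmer.simplify("cat's"))   # possessive stripped -> "cat"
print(stemmer.simplify("dogs"))    # stemmed by apply(), e.g. "dog"
print(stemmer.simplify("ox"))      # shorter than 4 chars -> unchanged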
def stemTest(stemmer, suffix=None):

    """
    test stemmer with examples from standard input

    arguments:
        stemmer - must be of class with apply(x) method
        suffix  - suffix to report in output
    """

    out = ''
    print("testing ", stemmer)
    if suffix != None:
        out = '[-' + suffix + ']'
        print('suffix', out)
    print("enter words to stem:")
    while True:
        try:
            sys.stdout.write("> ")
            sys.stdout.flush()
            line = sys.stdin.readline()
        except KeyboardInterrupt:
            break
        w = line.rstrip()
        if len(w) == 0:
            break                                  # stop upon empty line for EOF
        print('"%s"' % w, end=' ')
        #
        tok = ellyToken.EllyToken(w)               # make new token
        try:
            sta = stemmer.apply(tok)               # apply stemmer
        except ellyException.StemmingError:
            print('stemming error!', file=sys.stderr)
            sys.exit(1)
        print("-->>", ''.join(tok.root), end=' ')  # stemming result
        if suffix == None:
            print(tok.getSuffixes(), end=' ')      # list of suffixes removed
        else:
            print(out, end=' ')
        print(" success code=", sta)
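# A hypothetical standalone driver for stemTest(); InflectionStemmerEN
# takes no constructor arguments in the other test scripts here, so the
# same form is assumed.

if __name__ == '__main__':
    import inflectionStemmerEN
    stemTest(inflectionStemmerEN.InflectionStemmerEN())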
print('pre=', pre)
treeLogic.dumpLT(pre.indx)

xs = [                             # test cases
    'telegraph +graph',
    'telephone +phone',
    'transportation +portation',
    'pseudopod +pod'
]

nfail = 0
for x in xs:
    rec = x.strip().split()        # next test case
    if len(rec) != 2:
        continue                   # better be [ input , expected output ]
    w = rec[0]                     # get separate components
    r = rec[1]
    t = ellyToken.EllyToken(w)     # make token for matching
    b = pre.match(t)
    a = ''.join(t.root)
    if not b:
        print(' NO MATCH=', rec)
    m = (r == a)
    if not m:
        print(' FAIL=', rec, '!= <' + a + '>')
        nfail += 1

print(nfail, 'examples failed')
def _lookUpNext(self):

    """
    look up next segment in input buffer by various means

    arguments:
        self

    returns:
        True on success, False otherwise
    """

    self.sbu.skipSpaces()          # skip leading spaces
    s = self.sbu.buffer

    if len(s) == 0:                # check for end of input
        return False               # if so, done

    if self.trs != None:           # preanalysis of number expressions
        self.trs.rewriteNumber(s)
    self.sbu.expand()              # apply macro substitutions
    s = self.sbu.buffer
#   print ( 'expanded len=' , len(s) )
#   print ( 'sbu=' , s )

    if len(s) == 0:
        return True                # macros can empty out buffer

    k = self.sbu.findBreak()       # try to find first component for lookup
    if k == 0:
        k = 1                      # must have at least one char in token

    kl = len(s)
    if k + 1 < kl and s[k] == '+' and s[k + 1] == ' ':
        k += 1                     # recognize possible prefix
#   print ( 'len(s)=' , kl , 'k=' , k , 's=', s )

    mr = self._scanText(k)         # text matching
    mx = mr[0]
    mty = mr[1]
    chs = mr[2]                    # any vocabulary element matched
    suf = mr[3]                    # any suffix removed in matching
    s = self.sbu.buffer
#   print ( 'mx=' , mx , 'len(s)=' , len(s), 'k=' , k )
#   print ( 's=' , s )

    if (k < mx or
            k == mx and suf != ''):   # next token cannot be as long as already seen?
        if len(chs) > 0:
            self.sbu.skip(mx)
            if suf != '':
                self.sbu.prepend(suf)
        else:
            chs = self.sbu.extract(mx)
        to = ellyToken.EllyToken(''.join(chs))
        self.tks.append([mty, to])
        return True

    wsk = self.sbu.buffer[:k]
#   print ( 'wsk=' , wsk )
    rws = ''.join(wsk).lower()
    found = rws in self.gtb.dctn
    if found:
#       print ( 'found internally' )
        mty += 'Id'
    if found or mx > 0:
        self.sbu.skip(k)
        to = ellyToken.EllyToken(rws)
        if len(suf) > 1:           # change token to show suffix properly
#           print ( 'suf=' , suf )
            cs = suf[1]            # first char in suffix after '-'
            rt = to.root           # this is a list!
            lk = -1                # start at last char in token
            while rt[lk] != cs:
                lk -= 1
            sn = len(rt) + lk      # where to divide suffix from root
#           print ( 'sn=' , sn , rt )
            to.root = rt[:sn]      # root without suffix
            self.sbu.prepend(suf)  # restore suffix to input for processing
        self.tks.append([mty, to])
        return True

#   print ( 'extract token' )
    self._extractToken(mx, mty)    # single-word matching with analysis
    return True
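# A toy restatement of the precedence test above: a vocabulary or text
# scan match supersedes the simple break scan when it is strictly
# longer, or equally long with a suffix removed. Names here are
# illustrative only, not part of the module.

def scanSupersedes(k, mx, suf):
    return k < mx or (k == mx and suf != '')

assert scanSupersedes(3, 5, '')        # longer scan match wins
assert scanSupersedes(4, 4, '-ing')    # tie broken by removed suffix
assert not scanSupersedes(6, 4, '')    # break scan keeps control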
if intr:
    sys.stdout.write('> ')
lin = sys.stdin.readline()
if len(lin) == 0:                  # EOF check
    break
lin = lin.strip()
if len(lin) == 0:
    if intr:
        break                      # empty line quits loop on interactive input
    else:
        continue                   # but otherwise continues
lin = lin.split(' ')
if len(lin) < 2:
    lin.append('-')                # should be [ word , root ] tuple
w = lin[0]                         # unpack
r = lin[1]
if r != '-':
    ntry += 1                      # input pair to be added to testing
t = ellyToken.EllyToken(w)
# print ( '0o t=' , t )
try:
    inf.apply(t)                   # apply inflectional stemming
except ellyException.StemmingError:
    print('stemming error')
    sys.exit(1)
# print ( '1i t=' , t )
if not mor.analyze(t):             # apply morphological stemming
    msg = 'no morphological change'
else:
    msg = ''
# print ( '2m t=' , t )
nr = ''.join(t.root)               # resulting root
if nr != r and r != '-':
    nfail += 1
def __init__(self):

    """
    create environment for testing semantic procedure

    arguments:
        self
    """

    stb = symbolTable.SymbolTable()                    # empty
    hry = conceptualHierarchy.ConceptualHierarchy()    # empty
    ctx = interpretiveContext.InterpretiveContext(stb, {}, {}, hry)
    self.context = ctx                                 # make available

    ptb = parseTreeBase.ParseTreeBase()                # just for generating phrases

    self.toknL = ellyToken.EllyToken('uvwxxyz')        # insert dummy data that might
    self.toknR = ellyToken.EllyToken('abcdefg')        # be replaced from outside
    ctx.addTokenToListing(self.toknL)                  # put a token in first position
    ctx.addTokenToListing(self.toknR)                  # and a token in second

    x = ctx.syms.getSyntaxTypeIndexNumber('x')         # for consistency, define two
    y = ctx.syms.getSyntaxTypeIndexNumber('y')         # syntactic categories for rules

    fbs = ellyBits.EllyBits(symbolTable.FMAX)          # zero feature bits

    exL = grammarRule.ExtendingRule(x, fbs)            # dummy rules as a place for
    exR = grammarRule.ExtendingRule(x, fbs)            # attaching semantic procedures
    spl = grammarRule.SplittingRule(y, fbs)            # for testing

    # dummy semantic procedures

    gX = ["left", "right"]             # generative
    gL = ["obtain"]                    #
    gR = ["obtain"]                    #
    gP = ["append did it!"]            # for standalone generative subprocedure

    cX = []                            # cognitive
    cL = [">> +1"]                     #
    cR = [">> -1"]                     #

    ctx.pushStack()                    # needed for local variables usable in testing
    ctx.setLocalVariable("vl", "LLLL") # make two variables available to work with
    ctx.setLocalVariable("vr", "RRRR") #
    ctx.setProcedure('do', self._genp(gP))   # define procedure 'do'

    exL.gens = self._genp(gL)          # assign semantic procedures to rules
    exL.cogs = self._cogp(cL)          #
    exR.gens = self._genp(gR)          #
    exR.cogs = self._cogp(cR)          #
    spl.gens = self._genp(gX)          #
    spl.cogs = self._cogp(cX)          #

    phr = ptb.makePhrase(0, spl)             # make phrase for splitting plus
    phr.krnl.lftd = ptb.makePhrase(0, exL)   # left and right descendants
    phr.krnl.rhtd = ptb.makePhrase(1, exR)   # defined by left and right
                                             # extending rules from above
    phr.ntok = 1

    stb.getFeatureSet('!one,two', True)      # define semantic feature
    print(stb.smindx)
    smx = stb.smindx['!']              #
    ix = smx['one']                    #
#   print ( 'ix=' , ix )
    phr.krnl.semf.set(ix)              # turn on feature for phrase
    ix = smx['two']                    #
#   print ( 'ix=' , ix )
    phr.krnl.semf.set(ix)              # turn on feature for phrase
    print('semf=', phr.krnl.semf)

    self.phrase = phr                  # make phrase available
gtbu = grammarTable.GrammarTable(stbu, rdr)
ctxu = Ctx()
tksu = ctxu.tokns

tree = ParseTreeWithDisplay(stbu, gtbu, None, ctxu)
print()
print(tree)
print()
print(dir(tree))
print()

cat = stbu.getSyntaxTypeIndexNumber('num')
fbs = ellyBits.EllyBits(symbolTable.FMAX)
tree.addLiteralPhrase(cat, fbs)
tree.digest()
tksu.append(ellyToken.EllyToken('66'))
tree.restartQueue()

ws = ['nn', 'b', 'aj']            # from test.g.elly
wu = ['ww', 'wx', 'wy', 'wz']     # unknown terms

for w in ws:
    tree.createPhrasesFromDictionary(w, False, False)
#   print ( '**** to' , tree.phlim , tree.lastph , 'rule=' , tree.lastph.krnl.rule.seqn )
    tree.digest()
#   print ( '**** to' , tree.phlim , tree.lastph , 'rule=' , tree.lastph.krnl.rule.seqn )
    tksu.append(ellyToken.EllyToken(w))
    tree.restartQueue()

for w in wu:
filn = sys.argv[1] if len(sys.argv) > 1 else 'default'
basn = ellyConfiguration.baseSource + '/'
dfn = ellyDefinitionReader.EllyDefinitionReader(basn + filn + '.stl.elly')
if dfn.error != None:
    print(dfn.error, file=sys.stderr)
    sys.exit(1)
print(dfn.linecount(), 'definition lines for', filn + '.stl.elly')

try:
    inf = inflectionStemmerEN.InflectionStemmerEN()
    suf = SuffixTreeLogic(dfn)
except ellyException.TableFailure:
    print('cannot load stemming tables', file=sys.stderr)
    sys.exit(1)
suf.infl = inf
# print ( 'suf=' , suf )
# print ( 'index=' , map(lambda x: ellyChar.toChar(ellyChar.toIndex(x)) , suf.indx.keys()) )

print('')
while True:
    sys.stdout.write('> ')
    wrd = sys.stdin.readline().strip()
    if len(wrd) == 0:
        break
    t = ellyToken.EllyToken(wrd)
    b = suf.match(t)
    a = ''.join(t.root)
    print(t.getPrefixes(), a, t.getSuffixes(), ': status=', b)
sys.stdout.write('\n')
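# Hedged invocation note: the script file name below is an assumption,
# but the argument handling follows the code above.
#     python3 suffixTreeLogic.py suffix
# loads <ellyConfiguration.baseSource>/suffix.stl.elly, then prompts
# for words to match; an empty line ends the session.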
sta = tree.createPhrasesFromDictionary(sgm, False, False)
print(sgm, ':', sta, ', phlim=', tree.phlim, 'lastph=', tree.lastph)
print('----')
sgm = 'abcd'                       # test example not in dictionary
sta = tree.createPhrasesFromDictionary(sgm, False, False)
print(sgm, ':', sta, ', phlim=', tree.phlim, 'lastph=', tree.lastph)
print('----')
sgm = 'xyz'                        # test example in dictionary
sta = tree.createPhrasesFromDictionary(sgm, False, False)
print(sgm, ':', sta, ', phlim=', tree.phlim, 'lastph=', tree.lastph)
print('----')
sgm = 'pqr'                        # test example not in dictionary
sta = tree.createUnknownPhrase(ellyToken.EllyToken(sgm))
print(sgm, ':', sta, ', phlim=', tree.phlim, 'lastph=', tree.lastph)
print('----')
sgm = '.'                          # test example not in dictionary
tree.gbits[0].clear()
sta = tree.addLiteralPhrase(tree.gtb.PUNC, fbbs)
print(sgm, ':', sta, ', phlim=', tree.phlim, 'lastph=', tree.lastph)
tree.gbits[0].set(tree.gtb.PUNC)
sta = tree.addLiteralPhrase(tree.gtb.PUNC, fbbs)
print(sgm, ':', sta, ', phlim=', tree.phlim, 'lastph=', tree.lastph)

print('')
print('ambiguities:')
for a in tree.ambig:
    while a != None:
def _lookUpNext(self):

    """
    look up possible next segments in input buffer by various means,
    keeping tokens only for the LONGEST segment

    arguments:
        self

    returns:
        True on successful lookup, False otherwise

    exceptions:
        ParseOverflow
    """

    self.sbu.skipSpaces()          # skip leading spaces
    s = self.sbu.buffer
#   print ( '_lookUp@0 buffer=' , s )

    if len(s) == 0:                # check for end of input
        return False               # if so, done
#   print ( 'in =' , str(self.sbu) )

    if self.trs != None:           # preanalysis of number expressions
        self.trs.rewriteNumber(s)

#   print ( '_lookUp@1 buffer=' , self.sbu.buffer )
#   print ( 'macro expansion s[0]=' , s[0] )
    self.sbu.expand()              # apply macro substitutions
#   print ( 'macro expanded s[0]=' , s[0] )
#   print ( '_lookUp@2 buffer=' , self.sbu.buffer )
    s = self.sbu.buffer
#   print ( 'expanded len=' , len(s) )

    if len(s) == 0:
        return True                # macros can empty out buffer

    k = self.sbu.findBreak()       # find extent of first component for lookup
    if k == 0:
        k = 1                      # must have at least one char in token
#   print ( 'break at k=' , k )

    kl = len(s)
    if k + 1 < kl and s[k] == '+' and s[k + 1] == ' ':
        k += 1                     # recognize possible prefix
#   print ( 'len(s)=' , kl , 'k=' , k , 's=', s )
#   print ( '_lookUp@3 buffer=' , self.sbu.buffer )

    mr = self._scanText(k)         # text matching in various ways
    mx = mr[0]                     # overall maximum match length
    chs = mr[1]                    # any vocabulary element matched
    suf = mr[2]                    # any suffix removed in matching
#   print ( '_lookUp@4 buffer=' , self.sbu.buffer )
    s = self.sbu.buffer
#   print ( 'k=' , k )
#   print ( 'scan result mx=' , mx , 'chs=' , chs , 'suf=' , suf )
#   print ( 'len(s)=' , len(s) , 's=' , s )

    if (k < mx or
            k == mx and suf != ''):   # next word cannot produce token as long as already seen?
#       print ( 'queue:' , len(self.ptr.queue) )
#       print ( 'chs=' , chs )
        if len(chs) > 0:           # any vocabulary matches?
#           print ( 'put back' , suf , mx , s )
            self.sbu.skip(mx)      # if so, they supersede
            if suf != '':          # handle any suffix removal
                self.sbu.prepend(list(suf))
#               print ( 'suf=' , suf )
        else:
            chs = self.sbu.extract(mx)
#           print ( 'extract chs=' , chs )
            to = ellyToken.EllyToken(chs)
#           print ( 'token=' , str(to) )
            self.ctx.addTokenToListing(to)
            if suf != '':
                if not ellyChar.isApostrophe(suf[1]):
                    to.dvdd = True # must note suffix removal for token!
#       print ( 'only queue:' , len(self.ptr.queue) )
        return True

#   print ( 'mx=' , mx )
#   print ( 'plus queue:' , len(self.ptr.queue) )

    wsk = self.sbu.buffer[:k]
    cap = ellyChar.isUpperCaseLetter(wsk[0])
#   print ( 'wsk=' , wsk )
    rws = ''.join(wsk)
    found = self.ptr.createPhrasesFromDictionary(rws.lower(), False, cap)
    if not found:
        if k > mx and k > 1 and ellyChar.isEmbeddedCombining(rws[-1]):
            k -= 1
            rws = rws[:-1]
            found = self.ptr.createPhrasesFromDictionary(rws.lower(), False, cap)
#   print ( rws , 'found in dictionary=' , found )

    if found or mx > 0:            # match found in dictionary or by text scan
        if not found:
            k = mx                 # if by text scan, must make token longer
            rws = rws[:k]          # if mx > k
        self.sbu.skip(k)
#       print ( 'next=' , self.sbu.buffer[self.sbu.index:] )
#       print ( 'queue after =' , len(self.ptr.queue) )
        to = ellyToken.EllyToken(rws[:k])
        if len(suf) > 1:           # change token to show suffix properly
#           print ( 'suf=' , suf )
            cs = suf[1]            # first char in suffix after '-'
            rt = to.root           # this is a list!
            lk = -1                # start at last char in token
            while rt[lk] != cs:
                lk -= 1
            sn = len(rt) + lk      # where to divide suffix from root
#           print ( 'sn=' , sn , rt )
            to.root = rt[:sn]      # root without suffix
            self.sbu.prepend(suf)  # restore suffix to input for processing
        else:                      # no suffix
            chx = self.sbu.peek()  # look at next char after match
            if chx == '-':         # if hyphen, need to separate it
                self.sbu.skip()
                if ellyChar.isLetter(self.sbu.peek()):
                    self.sbu.prepend(' ')
                self.sbu.prepend('-')
#       print ( 'add' , str(to) )
        self.ctx.addTokenToListing(to)   # add token to listing for sentence
        return True

#   print ( '[' + rws + ']' , 'still unrecognized' )

    chx = rws[0]                   # special hyphen check
    if chx == '-' and k > 1:
#       print ( 'look in internal dictionary' )
        if self.ptr.createPhrasesFromDictionary(chx, False, False):
#           print ( 'found!' )
            to = ellyToken.EllyToken(chx)    # treat hyphen as token
            self.ctx.addTokenToListing(to)   # add it to token list
            self.sbu.skip()                  # remove from input
            return True

    to = self._extractToken(mx)    # single-word matching with analysis and lookup
#   print ( 'extracted to=' , str(to) )
    if to == None:                 # if no match, we are done and will return
#       print ( 'mx=' , mx )
        return False if mx == 0 else True    # still success if _scanText() found something

    if self.ptr.lastph != None:
        self.ptr.lastph.lens = to.getLength()

#   print ( 'to=' , str(to) , 'len(s)=' , len(s) , s )
#   posn = self.ctx.countTokensInListing()
#   print ( 'at', posn , 'in token list' )
    self.ctx.addTokenToListing(to) # add token to listing for sentence
#   tol = self.ctx.getNthTokenInListing(-1)
#   print ( 'last token root=' , tol.root )
    return True                    # successful lookup
def _getRaw(self):

    """
    obtain next raw token from buffer

    arguments:
        self

    returns:
        EllyToken on success, None otherwise
    """

#   print ( '_getRaw() from' , len(self.buffer) , 'chars' )
#   print ( 'before skipping spaces, buffer=' , self.buffer )
    self.skipSpaces()
    ln = len(self.buffer)
#   print ( "after skip=",ln )
    if ln == 0:
        return None

    ## get length of next token and if it has
    ## initial - or +, check for word fragment

#   print ( 'buffer start=' , self.buffer[0] )

    k = 0                          # number of chars for next token
    cz = ' ' if ln == 0 else self.buffer[0]
    if cz in [MIN, PLS]:
        k = self.findSeparator(1)
    elif cz == APO:
        if ln > 2 and self.buffer[1].lower() == 's' and self.buffer[2] in separators:
            k = 2
        else:
            k = 1
    elif cz in [COM, DOT, UELP]:   # these can be tokens by themselves
        k = 1
    else:
#       print ( 'full token extraction' )
        k = self.findSeparator()
#       print ( 'k=' , k , 'ln=' , ln )
        if k < 0:                  # break multi-char token at next separator
            k = ln                 # if no separator, go up to end of buffer
        elif k == 0:
            k = 1                  # immediate break in scanning
        else:
            while k < ln:          # look at any separator and following context
                x = self.buffer[k]
                if x != MIN and x != COM:
                    break          # no further check if separator not hyphen or comma
                if k + 1 >= ln or not ellyChar.isDigit(self.buffer[k + 1]):
#                   print ( 'x=' , x , 'buf=' , self.buffer[k:] )
                    break          # accept hyphen or comma if NOT followed by digit
                else:              # otherwise, look for another separator
                    k = self.findSeparator(k + 2)
                    if k < 0:      #
                        k = ln

    ## if token not delimited, take rest of buffer as
    ## will fit into token working area

    if k < 0:
        k = ln

#   print ( "take",k,"chars from",len(self.buffer),self.buffer )
    buf = self.extract(k)          # get k characters

    ## special check for hyphen next in buffer after extraction

    if self.match(MIN):            # hyphen immediately following?
        self.skip()                # if so, take it
        if self.atSpace():         # when followed by space
            buf.append(MIN)        # append hyphen to candidate token
            k += 1
        else:
            if not self.match(MIN):          # when not followed by another hyphen
                self.prepend(ellyChar.SPC)   # put back a space
            else:
                self.skip()                  # double hyphen = dash
                self.prepend(ellyChar.SPC)   # put back space after dash
                self.prepend(MIN)            # put back second hyphen
            self.prepend(MIN)                # put back first
            self.prepend(ellyChar.SPC)       # put extra space before hyphen or dash

    ## fill preallocated token for current position from working area

#   print ( "raw text buf=" , buf )
    to = ellyToken.EllyToken(''.join(buf))
#   print ( "EllyBuffer token before=" , str(to) )

    ## strip off trailing non-token chars from token and put back in buffer

    km = k - 1
    while km > 0:
        x = buf[km]
        if ellyChar.isLetterOrDigit(x) or x == MIN or x == PLS or x == UNS:
            break
#       print ( 'trailing x=' , x )
        if x == APO or x == APX:
            if km > 0 and buf[km - 1] == 's':
                break
        self.prepend(x)
        km -= 1
    km += 1
    if km < k:
        to.shortenBy(k - km, both=True)
#   print ( "EllyBuffer token=" , strx(to) )
#   print ( "next in buffer=" , self.buffer )
    return to
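# A standalone illustration of the trailing-character rule at the end
# of _getRaw(): punctuation is peeled off the right of a candidate
# token, but the apostrophe of a plural possessive survives. This toy
# mirrors only the idea, not PyElly's actual character classes.

def stripTrailing(buf):
    km = len(buf) - 1
    while km > 0:
        x = buf[km]
        if x.isalnum() or x in '-+_':
            break                                # token chars stop the scan
        if x in "'\u2019" and buf[km - 1] == 's':
            break                                # keep plural possessive mark
        km -= 1
    return ''.join(buf[:km + 1]), ''.join(buf[km + 1:])

print(stripTrailing(list("dogs'")))    # ("dogs'", '')  apostrophe kept
print(stripTrailing(list('end...')))   # ('end', '...') dots put back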