def demo():
    gfile = GrammarFile.read_file('test.cfg')
    cp = gfile.earley_parser()
    sent = 'the police read the solutions that Poirot sent'
    tokens = list(tokenize.whitespace(sent))
    trees = cp.parse_n(tokens)
    for tree in trees:
        print tree
def processWhitespacesWithoutStopWords(self, corpus, caseSensitive):
    # initialise token buffer
    tokens = []
    # get tokens separated by whitespaces
    tokenizedCorpus = tokenize.whitespace(corpus)
    # compile regular expression for matching whitespaces
    whitespaces = re.compile(r'\s\ \;')
    # go through each token in corpus
    for token in tokenizedCorpus:
        # if case-sensitive handling of tokens
        if caseSensitive == 1:
            pass
        else:
            token = token.lower()
        # remove white spaces at beginning
        token = whitespaces.sub('', token)
        # append token to list
        tokens.append(token)
    # return tokens
    return tokens
def processWhitespaces(self, corpus, stopWordList, caseSensitive, minimumTokenLength=3, maximumTokenLength=25):
    # initialise token list
    tokens = []
    # initialise token buffer
    tokenBuffer = ''
    # get tokens separated by whitespaces
    tokenizedCorpus = tokenize.whitespace(corpus)
    # compile regular expression for matching special characters
    specialCharacters = re.compile(r'\&.+\;')
    # compile regular expression for matching whitespaces
    whitespaces = re.compile(r'\s|\ \;')
    # compile regular expression for sentence-boundary matching
    sentenceBoundary = re.compile(r'[\.\:\!\?\,]')
    # go through each token in corpus
    for token in tokenizedCorpus:
        # get token length
        tokenLength = len(token)
        # see, if token contains special character
        specialCharacterMatches = specialCharacters.findall(token)
        # reduce special characters to size one
        if specialCharacterMatches:
            for match in specialCharacterMatches:
                tokenLength -= (len(match) - 1)
        # if case-sensitive handling of tokens
        if caseSensitive == 1:
            pass
        else:
            token = token.lower()
        # remove white spaces at beginning and end
        token = whitespaces.sub('', token)
        # write token to buffer and remove punctuation
        tokenBuffer = sentenceBoundary.sub('', token)
        # mark stop words
        if (tokenLength < minimumTokenLength or tokenLength > maximumTokenLength
                or tokenBuffer in stopWordList or tokenBuffer.lower() in stopWordList):
            tokens.append(token + '<STOPWORD>')
        else:
            tokens.append(token)
    # return tokens
    return tokens
def text_parse(grammar, sent, trace=2, drawtrees=False, latex=False):
    parser = grammar.earley_parser(trace=trace)
    print parser._grammar
    tokens = list(tokenize.whitespace(sent))
    trees = parser.parse_n(tokens)
    if drawtrees:
        from treeview import TreeView
        TreeView(trees)
    else:
        for tree in trees:
            if latex:
                print tree.latex_qtree()
            else:
                print tree
def _demo_stemmer(stemmer):
    # Tokenize a sample text.
    from nltk_lite import tokenize
    text = "John was eating icecream"
    tokens = tokenize.whitespace(text)

    # Print the results.
    print stemmer
    for word in tokens:
        print "%20s => %s" % (word, stemmer.stem(word))
    print
def demo():
    import sys, time

    S = GrammarCategory.parse('S')
    VP = GrammarCategory.parse('VP')
    NP = GrammarCategory.parse('NP')
    PP = GrammarCategory.parse('PP')
    V = GrammarCategory.parse('V')
    N = GrammarCategory.parse('N')
    P = GrammarCategory.parse('P')
    Name = GrammarCategory.parse('Name')
    Det = GrammarCategory.parse('Det')
    DetSg = GrammarCategory.parse('Det[-pl]')
    DetPl = GrammarCategory.parse('Det[+pl]')
    NSg = GrammarCategory.parse('N[-pl]')
    NPl = GrammarCategory.parse('N[+pl]')

    # Define some grammatical productions.
    grammatical_productions = [
        cfg.Production(S, (NP, VP)),
        cfg.Production(PP, (P, NP)),
        cfg.Production(NP, (NP, PP)),
        cfg.Production(VP, (VP, PP)),
        cfg.Production(VP, (V, NP)),
        cfg.Production(VP, (V,)),
        cfg.Production(NP, (DetPl, NPl)),
        cfg.Production(NP, (DetSg, NSg))]

    # Define some lexical productions.
    lexical_productions = [
        cfg.Production(NP, ('John',)),
        cfg.Production(NP, ('I',)),
        cfg.Production(Det, ('the',)),
        cfg.Production(Det, ('my',)),
        cfg.Production(Det, ('a',)),
        cfg.Production(NSg, ('dog',)),
        cfg.Production(NSg, ('cookie',)),
        cfg.Production(V, ('ate',)),
        cfg.Production(V, ('saw',)),
        cfg.Production(P, ('with',)),
        cfg.Production(P, ('under',)),
        ]

    earley_grammar = cfg.Grammar(S, grammatical_productions)
    earley_lexicon = {}
    for prod in lexical_productions:
        earley_lexicon.setdefault(prod.rhs()[0].upper(), []).append(prod.lhs())

    sent = 'I saw John with a dog with my cookie'
    print "Sentence:\n", sent
    from nltk_lite import tokenize
    tokens = list(tokenize.whitespace(sent))
    t = time.time()
    cp = FeatureEarleyChartParse(earley_grammar, earley_lexicon, trace=1)
    trees = cp.parse_n(tokens)
    print "Time: %s" % (time.time() - t)
    for tree in trees:
        print tree
def raw(files="english-kjv"):
    """
    @param files: One or more genesis corpus files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{string}
    """

    # Just one file to process? If so convert to a tuple so we can iterate
    if type(files) is str:
        files = (files,)

    for file in files:
        path = os.path.join(get_basedir(), "genesis", file + ".txt")
        s = open(path).read()
        for t in tokenize.whitespace(s):
            yield t
def getNGramStructure(sourceFile):
    # initialise n-gram dictionary
    ngrams = {}
    # read file
    corpus = sourceFile.read()
    # get tokens separated by whitespaces
    tokenizedCorpus = tokenize.whitespace(corpus)
    # go through each token
    for token in tokenizedCorpus:
        # split token in single characters
        characters = list(token)
        # copy character list
        charactersBuffer = list(characters)
        # initialise buffer
        buffer1 = ""
        # go through character list
        for char1 in characters:
            # write each n-gram to list
            buffer1 += char1
            ngrams[buffer1] = ngrams.get(buffer1, 0) + 1
            # shift from character list copy
            charactersBuffer.pop(0)
            # initialise buffer
            buffer2 = ""
            # go through copy of character list
            for char2 in charactersBuffer:
                buffer2 += char2
                ngrams[buffer2] = ngrams.get(buffer2, 0) + 1
    # return n-grams
    return ngrams
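For illustration, a hypothetical standalone call to getNGramStructure (the StringIO wrapper is an assumption; the function only needs a file-like object and the module's nltk_lite tokenize import). For a single token it ends up counting every contiguous substring once:

# illustrative sketch only -- assumes `from nltk_lite import tokenize`
# is in scope, as getNGramStructure requires
from StringIO import StringIO

counts = getNGramStructure(StringIO("abc"))
# expected: each contiguous substring of "abc" counted once, i.e.
# {'a': 1, 'ab': 1, 'abc': 1, 'b': 1, 'bc': 1, 'c': 1}
print counts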
def demo():
    """
    A demonstration of the recursive descent parser.
    """
    from nltk_lite.parse import cfg

    # Define some nonterminals
    S, VP, NP, PP = cfg.nonterminals('S, VP, NP, PP')
    V, N, P, Name, Det = cfg.nonterminals('V, N, P, Name, Det')

    # Define a grammar.
    productions = (
        # Syntactic Productions
        cfg.Production(S, [NP, 'saw', NP]),
        cfg.Production(S, [NP, VP]),
        cfg.Production(NP, [Det, N]),
        cfg.Production(VP, [V, NP, PP]),
        cfg.Production(NP, [Det, N, PP]),
        cfg.Production(PP, [P, NP]),

        # Lexical Productions
        cfg.Production(NP, ['I']),
        cfg.Production(Det, ['the']),
        cfg.Production(Det, ['a']),
        cfg.Production(N, ['man']),
        cfg.Production(V, ['saw']),
        cfg.Production(P, ['in']),
        cfg.Production(P, ['with']),
        cfg.Production(N, ['park']),
        cfg.Production(N, ['dog']),
        cfg.Production(N, ['telescope'])
        )
    grammar = cfg.Grammar(S, productions)

    # Tokenize a sample sentence.
    sent = list(tokenize.whitespace('I saw a man in the park'))

    # Define a parser and print its parses.
    parser = RecursiveDescent(grammar)
    parser.trace()
    for p in parser.get_parse_list(sent):
        print p
def demo():
    """
    Create a recursive descent parser demo, using a simple grammar and text.
    """
    from nltk_lite.parse import cfg
    grammar = cfg.parse_grammar("""
    # Grammatical productions.
        S -> NP VP
        NP -> Det N PP | Det N
        VP -> V NP PP | V NP | V
        PP -> P NP
    # Lexical productions.
        NP -> 'I'
        Det -> 'the' | 'a'
        N -> 'man' | 'park' | 'dog' | 'telescope'
        V -> 'ate' | 'saw'
        P -> 'in' | 'under' | 'with'
    """)

    sent = list(tokenize.whitespace('the dog saw a man in the park'))

    RecursiveDescentDemo(grammar, sent).mainloop()
def demo():
    """
    Create a shift reduce parser demo, using a simple grammar and text.
    """
    from nltk_lite.parse import cfg

    nonterminals = 'S VP NP PP P N Name V Det'
    (S, VP, NP, PP, P, N, Name, V, Det) = [cfg.Nonterminal(s) for s in nonterminals.split()]

    productions = (
        # Syntactic Productions
        cfg.Production(S, [NP, VP]),
        cfg.Production(NP, [Det, N]),
        cfg.Production(NP, [NP, PP]),
        cfg.Production(VP, [VP, PP]),
        cfg.Production(VP, [V, NP, PP]),
        cfg.Production(VP, [V, NP]),
        cfg.Production(PP, [P, NP]),

        # Lexical Productions
        cfg.Production(NP, ['I']),
        cfg.Production(Det, ['the']),
        cfg.Production(Det, ['a']),
        cfg.Production(N, ['man']),
        cfg.Production(V, ['saw']),
        cfg.Production(P, ['in']),
        cfg.Production(P, ['with']),
        cfg.Production(N, ['park']),
        cfg.Production(N, ['dog']),
        cfg.Production(N, ['statue']),
        cfg.Production(Det, ['my']),
        )
    grammar = cfg.Grammar(S, productions)

    # tokenize the sentence
    sent = list(tokenize.whitespace('my dog saw a man in the park with a statue'))

    ShiftReduceDemo(grammar, sent).mainloop()
def string2words(s, sep="/"): return [tag2tuple(t, sep)[0] for t in tokenize.whitespace(s)]
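A hypothetical call, assuming tag2tuple splits a 'word/TAG' string on the given separator as its use here suggests (the tagged sentence below is made up for illustration):

# illustrative only
print string2words('The/AT dog/NN barked/VBD')
# expected: ['The', 'dog', 'barked']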
words = []
sentences = []
rowID = 0

# open file
file = open(path + category, 'r')
# add each line to corpus
for line in file:
    corpus += line
# close file pointer
file.close()

# get tokens from corpus
tokenizedCorpus = tokenize.whitespace(corpus)
# go through tokens
for token in tokenizedCorpus:
    # add token to sentence
    words.append(tag.sub('', token))
    # if sentence-boundary has been found in this token
    if sentenceBoundary.findall(token):
        # recompose sentence
        for word in words:
            sentenceString += word + ' '
        # add to sentence string list
        sentences.append(sentenceString)
from nltk_lite import tokenize
from nltk_lite.parse import cfg
from nltk_lite.draw.rdparser import RecursiveDescentDemo

productions = """
NP -> NP AND NP
NP -> N
N -> "cabbages"
N -> "kings"
AND -> "and"
"""

grammar = cfg.parse_grammar(productions)
text = list(tokenize.whitespace('cabbages and kings'))
RecursiveDescentDemo(grammar, text).mainloop()
def _chunk_parse(files, chunk_types, top_node, partial_match, collapse_partials, cascade):
    # allow any kind of bracketing for flexibility
    L_BRACKET = re.compile(r"[\(\[\{<]")
    R_BRACKET = re.compile(r"[\)\]\}>]")

    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "ycoe/psd", file + ".psd")
        s = open(path).read()
        data = _parse(s)
        for s in data:
            bracket = 0
            itmType = None
            stack = [tree.Tree(top_node, [])]
            inTag = []
            for itm in list(tokenize.whitespace(s)):
                if L_BRACKET.match(itm[0]):
                    bracket += 1
                    itm = itm[1:]
                    matched = False
                    if partial_match == True:
                        for eachItm in chunk_types:
                            if len(eachItm) <= len(itm) and eachItm == itm[:len(eachItm)]:
                                matched = True
                                if collapse_partials == True:
                                    itm = eachItm
                    else:
                        if chunk_types is not None and itm in chunk_types:
                            matched = True
                    if matched == True:  # and inTag == 0:
                        chunk = tree.Tree(itm, [])
                        if cascade == True:
                            stack.append(chunk)
                            inTag += [bracket]
                        else:
                            if len(inTag) == 0:
                                stack[-1].append(chunk)
                                inTag += [bracket]
                    itmType = itm
                if R_BRACKET.match(itm[-1]):
                    tmpItm = split(itm, itm[-1])
                    if tmpItm != "":
                        if len(inTag) > 0 and inTag[-1] <= bracket:  # inTag <= bracket:
                            if cascade == True:
                                stack[-1].append((itmType, tmpItm[0]))
                            else:
                                stack[-1][-1].append((itmType, tmpItm[0]))
                        else:
                            if cascade == True:
                                if len(stack) > 1:
                                    stack[-2].append(stack[-1])
                                    stack = stack[:-1]
                            stack[-1].append((itmType, tmpItm[0]))
                            inTag = [] + inTag[:-2]
                    bracket -= len(tmpItm) - 1
                    while len(inTag) > 0 and bracket < inTag[-1]:
                        if cascade == True:
                            if len(stack) > 1:
                                stack[-2].append(stack[-1])
                                stack = stack[:-1]
                        inTag = [] + inTag[:-2]
            yield stack
def _list_sent(sent): return [tokenize.whitespace(line) for line in tokenize.line(sent)]
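A hypothetical call, assuming tokenize.line yields one string per input line (as its name suggests); since tokenize.whitespace returns generators, each element still needs to be forced with list():

# illustrative only
sents = _list_sent('the cat sat\nthe dog ran')
print [list(s) for s in sents]
# expected: [['the', 'cat', 'sat'], ['the', 'dog', 'ran']]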
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results is displayed.
    """
    import sys, time
    from nltk_lite import tokenize
    from nltk_lite.parse import cfg, pcfg, pchart

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [('I saw John with my cookie', pcfg.toy1),
             ('the boy saw Jack with Bob under the table with a telescope', pcfg.toy2)]

    # Ask the user which demo they want to use.
    print
    for i in range(len(demos)):
        print '%3s: %s' % (i+1, demos[i][0])
        print '     %r' % demos[i][1]
        print
    print 'Which demo (%d-%d)? ' % (1, len(demos)),
    try:
        snum = int(sys.stdin.readline().strip())-1
        sent, grammar = demos[snum]
    except:
        print 'Bad sentence number'
        return

    # Tokenize the sentence.
    tokens = list(tokenize.whitespace(sent))

    # Define a list of parsers.  We'll use all parsers.
    parsers = [
        pchart.InsideParse(grammar),
        pchart.RandomParse(grammar),
        pchart.UnsortedParse(grammar),
        pchart.LongestParse(grammar),
        pchart.BeamParse(len(tokens)+1, grammar)
        ]

    # Run the parsers on the tokenized sentence.
    times = []
    average_p = []
    num_parses = []
    all_parses = {}
    for parser in parsers:
        print '\ns: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar)
        parser.trace(3)
        t = time.time()
        parses = parser.get_parse_list(tokens)
        times.append(time.time()-t)
        if parses:
            p = reduce(lambda a, b: a+b.prob(), parses, 0)/len(parses)
        else:
            p = 0
        average_p.append(p)
        num_parses.append(len(parses))
        for p in parses:
            all_parses[p.freeze()] = 1

    # Print some summary statistics
    print
    print '       Parser      | Time (secs)   # Parses   Average P(parse)'
    print '-------------------+------------------------------------------'
    for i in range(len(parsers)):
        print '%18s |%11.4f%11d%19.14f' % (parsers[i].__class__.__name__,
                                           times[i], num_parses[i], average_p[i])
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a, b: a+b.prob(), parses, 0)/len(parses)
    else:
        p = 0
    print '-------------------+------------------------------------------'
    print '%18s |%11s%11d%19.14f' % ('(All Parses)', 'n/a', len(parses), p)

    # Ask the user if we should draw the parses.
    print
    print 'Draw parses (y/n)? ',
    if sys.stdin.readline().strip().lower().startswith('y'):
        from nltk_lite.draw.tree import draw_trees
        print '  please wait...'
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print
    print 'Print parses (y/n)? ',
    if sys.stdin.readline().strip().lower().startswith('y'):
        for parse in parses:
            print parse
def demo():
    """
    A demonstration of the probabilistic parsers.  The user is
    prompted to select which demo to run, and how many parses should
    be found; and then each parser is run on the same demo, and a
    summary of the results is displayed.
    """
    import sys, time
    from nltk_lite import tokenize
    from nltk_lite.parse import cfg, pcfg, ViterbiParse

    # Define two demos.  Each demo has a sentence and a grammar.
    demos = [('I saw John with my cookie', pcfg.toy1),
             ('the boy saw Jack with Bob under the table with a telescope', pcfg.toy2)]

    # Ask the user which demo they want to use.
    print
    for i in range(len(demos)):
        print '%3s: %s' % (i+1, demos[i][0])
        print '     %r' % demos[i][1]
        print
    print 'Which demo (%d-%d)? ' % (1, len(demos)),
    try:
        snum = int(sys.stdin.readline().strip())-1
        sent, grammar = demos[snum]
    except:
        print 'Bad sentence number'
        return

    # Tokenize the sentence.
    tokens = list(tokenize.whitespace(sent))

    parser = ViterbiParse(grammar)
    all_parses = {}

    print '\nsent: %s\nparser: %s\ngrammar: %s' % (sent, parser, grammar)
    parser.trace(3)
    t = time.time()
    parses = parser.get_parse_list(tokens)
    time = time.time()-t
    if parses:
        average = reduce(lambda a, b: a+b.prob(), parses, 0)/len(parses)
    else:
        average = 0
    num_parses = len(parses)
    for p in parses:
        all_parses[p.freeze()] = 1

    # Print some summary statistics
    print
    print 'Time (secs)   # Parses   Average P(parse)'
    print '-----------------------------------------'
    print '%11.4f%11d%19.14f' % (time, num_parses, average)
    parses = all_parses.keys()
    if parses:
        p = reduce(lambda a, b: a+b.prob(), parses, 0)/len(parses)
    else:
        p = 0
    print '------------------------------------------'
    print '%11s%11d%19.14f' % ('n/a', len(parses), p)

    # Ask the user if we should draw the parses.
    print
    print 'Draw parses (y/n)? ',
    if sys.stdin.readline().strip().lower().startswith('y'):
        from nltk_lite.draw.tree import draw_trees
        print '  please wait...'
        draw_trees(*parses)

    # Ask the user if we should print the parses.
    print
    print 'Print parses (y/n)? ',
    if sys.stdin.readline().strip().lower().startswith('y'):
        for parse in parses:
            print parse
def set_sentence(self, sentence):
    self._sent = list(tokenize.whitespace(sentence))  #[XX] use tagged?
    self.reset()
# initialise co-occurrence matrix
coOccurrences = {}

# open file
file = open(path + category, 'r')
# add each line to corpus
for line in file:
    corpus += line
# close file pointer
file.close()

# get tokens from corpus
tokenizedCorpus = tokenize.whitespace(corpus)
# go through tokens
for token in tokenizedCorpus:
    # add token to sentence
    words.append(tag.sub('', token))
    # if sentence-boundary has been found in this token
    if sentenceBoundary.findall(token):
        # recompose sentence
        for word in words:
            sentenceString += word + ' '
        # add to sentence string list
        sentences.append(sentenceString)