def __init__(self, llwl='Brown', llNL=2, percen=80, NE=True, Col=True, Gram=True, Chu=True):
    """Build the full term-extraction pipeline, loading each component in turn.

    @param llwl: log-likelihood corpus name ('Brown', 'AmE06', 'BE06')
    @param llNL: log-likelihood n-gram length
    @param percen: precision/percentile of output kept by Select
        (NOTE(review): default is 80, not 20 as the original docstring claimed)
    @param NE: use named entities (default True)
    @param Col: use collocations (default True)
    @param Gram: use n-grams (default True)
    @param Chu: use chunking (default True)
    """
    self.NEs = NE
    self.Col = Col
    self.Gram = Gram
    self.Chu = Chu
    self.p = percen
    # Python 2 print statements were SyntaxErrors next to the print() calls
    # used elsewhere in this file; converted to the print() function form.
    print('Starting to build', llwl)
    self.LL = LogLikelihood(wordlist=llwl, NLength=llNL)
    print('LL Loaded')
    self.POS = POS()
    print('POS Loaded')
    self.GD = GetData()
    print('GD Loaded')
    self.Cu = Chunker(self.POS)
    print('Cu Loaded')
    self.FL = Filter()
    print('FL Loaded')
    self.CC = Collocation(self.POS)
    print('CC Loaded')
    self.Ng = NGram()
    print('Ng Loaded')
    self.S = Select(percentil=self.p)
    print('S Loaded')
    self.To = Tokenize(self.FL)
    print('To Loaded')
# Convert the grammar trees in the corpus into a CFG (Context-Free Grammar).
grammar = funcs.InduceNonTerminal(grammarTrain)
# Save the grammar file. A context manager closes the handle deterministically;
# the original pickle.dump(grammar, open(..., "wb")) leaked the file object.
with open("grammar.txt", "wb") as grammar_file:
    pickle.dump(grammar, grammar_file)
print("Grammar induction finished.")

'''========= Part IV: Chunking ========'''
# In this part, we chunk sentences into different phrases using the IOB
# (Inside-Outside-Beginning) tags. There are 3 kinds of phrases: noun phrases
# (NP), verb phrases (VP) and preposition phrases (PP).

# Load the train and test dataset for chunking.
chunkTrain = nltk.corpus.conll2000.chunked_sents("train.txt")
chunkTest = nltk.corpus.conll2000.chunked_sents("test.txt")
# Initiate a Chunker object. Use the training corpus to train the chunker.
chunker = Chunker(chunkTrain)
# Evaluate the chunker's performance on the test corpus.
print(chunker.evaluate(chunkTest))
# Use the trained chunker to chunk our own texts.
chunkedSents = funcs.ChunkSents(tokens, chunker)
# Save the chunked texts, again closing the file handle promptly.
with open("chunked_sents.txt", "wb") as chunked_file:
    pickle.dump(chunkedSents, chunked_file)
print("Chunking finished.")

'''======== Part V: Deep parsing ========'''
# In this part, we used the grammar induced in the previous step to parse our
# texts. Basically we used a shift-reduce parsing algorithm to parse the texts
# and find out if there are larger phrases built on small phrases.

# Initiate a parser object. Load it with grammar.