def test_interactive(self):
    """Interactive smoke test: preprocess the first cleansed document and print it."""
    cursor = self.source.find()
    cursor.batch_size(1000)
    for index, document in enumerate(clean_html.doc_iter(cursor)):
        # Spot-check only: stop after inspecting the first document.
        processed = pos.preprocess(document["cleansed_text"])
        print(processed)
        break
def simple_np_bgram(documents):
    """Yield, for each document, the list of bigram-chunker parses of its sentences."""
    # Train the chunker once, up front, on the CoNLL-2000 training split.
    chunker = BigramChunker(conll2000.chunked_sents('train.txt'))
    for document in documents:
        parsed = [chunker.parse(sentence) for sentence in pos.preprocess(document)]
        yield parsed
def simple_np_ugram(documents):
    """Yield, for each document, the list of unigram-chunker parses of its sentences.

    String sentences get split up into a datastructure.
    """
    # Fix: the string above previously sat mid-body as a no-op expression
    # statement; it is now the function's docstring.
    ugram = UnigramChunker(conll2000.chunked_sents('train.txt'))
    for doc in documents:
        buf = []
        for sent in pos.preprocess(doc):
            buf.append(ugram.parse(sent))
        yield buf
import nltk
import rdt.nlp.pos as pos
from nltk.corpus import conll2000


class UnigramChunker(nltk.ChunkParserI):
    """NP chunker backed by a unigram tagger mapping POS tags to chunk tags."""

    def __init__(self, train_sents):
        """Train the unigram tagger on (POS-tag, chunk-tag) pairs.

        train_sents: iterable of chunked sentence trees
        (e.g. the output of ``conll2000.chunked_sents``).
        """
        train_data = [
            [(tag, chunk) for _word, tag, chunk in nltk.chunk.tree2conlltags(sent)]
            for sent in train_sents
        ]
        self.tagger = nltk.UnigramTagger(train_data)

    def parse(self, sentence):
        """Chunk a POS-tagged sentence ([(word, tag), ...]) into an NP chunk tree."""
        # Locals renamed: the original comprehension variable ``pos`` shadowed
        # the module-level ``rdt.nlp.pos as pos`` import.
        tags = [tag for (_word, tag) in sentence]
        tagged_tags = self.tagger.tag(tags)
        chunk_tags = [chunk for (_tag, chunk) in tagged_tags]
        conll_tags = [(word, tag, chunk)
                      for ((word, tag), chunk) in zip(sentence, chunk_tags)]
        return nltk.chunk.conlltags2tree(conll_tags)


if __name__ == "__main__":
    # Evaluate against the CoNLL-2000 test split, then show a sample parse.
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    unigram_chunker = UnigramChunker(train_sents)
    print(unigram_chunker.evaluate(test_sents))
    d = [unigram_chunker.parse(sent)
         for sent in pos.preprocess("The dog went to the park.")]
    print(d)