def test_interactive(self):
    """Interactive smoke test: preprocess the first cleansed document and print it."""
    cursor = self.source.find()
    cursor.batch_size(1000)
    for index, document in enumerate(clean_html.doc_iter(cursor)):
        # Spot-check only: stop after inspecting the first document.
        processed = pos.preprocess(document["cleansed_text"])
        print(processed)
        break
def simple_np_bgram(documents):
    """Yield, for each document, the list of bigram-chunker parses of its sentences."""
    # Train the chunker once, up front, on the CoNLL-2000 training split.
    chunker = BigramChunker(conll2000.chunked_sents('train.txt'))
    for document in documents:
        parsed = [chunker.parse(sentence) for sentence in pos.preprocess(document)]
        yield parsed
def simple_np_ugram(documents):
    """Yield, for each document, the list of unigram-chunker parses of its sentences.

    String sentences get split up into a datastructure.
    """
    # Fix: the string above previously sat mid-body as a no-op expression
    # statement; it is now the function's docstring.
    ugram = UnigramChunker(conll2000.chunked_sents('train.txt'))
    for doc in documents:
        buf = []
        for sent in pos.preprocess(doc):
            buf.append(ugram.parse(sent))
        yield buf
import nltk
import rdt.nlp.pos as pos
from nltk.corpus import conll2000


class UnigramChunker(nltk.ChunkParserI):
    """NP chunker backed by a unigram tagger mapping POS tags to chunk tags."""

    def __init__(self, train_sents):
        """Train the unigram tagger on (POS-tag, chunk-tag) pairs.

        train_sents: iterable of chunked sentence trees
        (e.g. the output of ``conll2000.chunked_sents``).
        """
        train_data = [
            [(tag, chunk) for _word, tag, chunk in nltk.chunk.tree2conlltags(sent)]
            for sent in train_sents
        ]
        self.tagger = nltk.UnigramTagger(train_data)

    def parse(self, sentence):
        """Chunk a POS-tagged sentence ([(word, tag), ...]) into an NP chunk tree."""
        # Locals renamed: the original comprehension variable ``pos`` shadowed
        # the module-level ``rdt.nlp.pos as pos`` import.
        tags = [tag for (_word, tag) in sentence]
        tagged_tags = self.tagger.tag(tags)
        chunk_tags = [chunk for (_tag, chunk) in tagged_tags]
        conll_tags = [(word, tag, chunk)
                      for ((word, tag), chunk) in zip(sentence, chunk_tags)]
        return nltk.chunk.conlltags2tree(conll_tags)


if __name__ == "__main__":
    # Evaluate against the CoNLL-2000 test split, then show a sample parse.
    test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])
    train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
    unigram_chunker = UnigramChunker(train_sents)
    print(unigram_chunker.evaluate(test_sents))
    d = [unigram_chunker.parse(sent)
         for sent in pos.preprocess("The dog went to the park.")]
    print(d)