def test_interactive(self):
    docs = self.source.find()
    docs.batch_size(1000)
    for ind, doc in enumerate(clean_html.doc_iter(docs)):
        new_doc = pos.preprocess(doc["cleansed_text"])
        print(new_doc)
        break

def test_interactive(self):
    docs = self.source.find()
    docs.batch_size(1000)
    nt = NamesTagger()
    for ind, doc in enumerate(clean_html.doc_iter(docs)):
        del doc["_id"]
        break

def test_interactive(self):
    docs = self.source.find()
    docs.batch_size(1000)
    for ind, doc in enumerate(clean_html.doc_iter(docs)):
        del doc["_id"]
        dc = bigrams.collocationFinder(doc["cleansed_text"])
        print("\n")
        print(dc)
        break

def test_interactive(self):
    docs = self.source.find()
    docs.batch_size(1000)
    tagger = ngrams.make_backoff_tagger()
    # Flatten the word-tokenized sentences of a document into one token list.
    chain = lambda x: list(itertools.chain(*pos.tokenize_words(pos.tokenize_sents(x))))
    for ind, doc in enumerate(clean_html.doc_iter(docs)):
        print(tagger.tag(chain(doc["cleansed_text"])))
        if ind == 10:
            break

def test_small_corpus(self):
    print(self.source.most_recent_created_utc())
    count = 0
    docs = self.source.find()
    docs.batch_size(10000)
    for ind, doc in enumerate(clean_html.doc_iter(docs)):
        print(doc["cleansed_text"])
        print("VS")
        print(self.body_or_selftext(doc))
        print("----------")

def find_clean(self, *args, **kwargs):
    """Works like a normal find(), but cleans the documents as they are
    iterated. The extra keyword arguments ``batch_size`` and ``skip_none``
    are consumed here rather than being forwarded to find().
    """
    batch_size = kwargs.pop("batch_size", 100)
    skip_none = kwargs.pop("skip_none", False)
    docs = self.find(*args, **kwargs)
    docs.batch_size(batch_size)
    if skip_none is True:
        return clean.skip_iter(docs)
    else:
        return clean.doc_iter(docs)

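# A minimal usage sketch for find_clean, assuming a Source configured the same
# way as in the driver script below (conf_key="source_test"). The batch size of
# 500 and skip_none=True are illustrative choices only, and skip_iter is assumed
# to drop documents that could not be cleaned.
import rdt.data.mongo.source as rdtcorp

source = rdtcorp.Source(conf_key="source_test")
for doc in source.find_clean(batch_size=500, skip_none=True):
    print(doc["cleansed_text"])
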
def test_interactive(self):
    docs = self.source.find()
    docs.batch_size(1000)
    tagger = ngrams.make_backoff_tagger()
    for ind, doc in enumerate(clean_html.doc_iter(docs)):
        sentences = pos.tokenize_sents(doc["cleansed_text"])
        tags = pos.tokenize_words(sentences)
        for sent in tags:
            tagged_sent = tagger.tag(sent)
            d = ne_chunk(tagged_sent)
            chunks = tree2conlltags(d)
            print(chunks)
        if ind == 10:
            break

import rdt.data.clean.html as clean
import rdt.data.mongo.source as rdtcorp
import rdt.nlp.ngrams as ngrams
import rdt.nlp.pos as pos
from nltk.chunk import ne_chunk
from nltk.chunk.util import tree2conlltags
import rdt.nlp.conll_get as cnll

if __name__ == "__main__":
    source = rdtcorp.Source(conf_key="source_test")
    annotated = rdtcorp.Source(conf_key="annotated_test")
    docs = source.find()
    docs.batch_size(1000)
    tagger = ngrams.make_backoff_tagger()
    buf = []  # buffer of annotated documents awaiting insertion
    for ind, doc in enumerate(clean.doc_iter(docs)):
        del doc["_id"]
        sentences = pos.tokenize_sents(doc["cleansed_text"])
        tags = pos.tokenize_words(sentences)
        doc["conlltags"] = []
        doc["nouns"] = []
        doc["named_entities"] = []
        # Tag each sentence, chunk named entities, and keep CoNLL-style IOB tags.
        for sent in tags:
            tagged_sent = tagger.tag(sent)
            d = ne_chunk(tagged_sent)
            chunks = tree2conlltags(d)
            doc["conlltags"].append(chunks)
            doc["nouns"].extend(cnll.get_nouns(chunks))
            doc["named_entities"].extend(cnll.get_ne(chunks))
        buf.append(doc)
        if ind % 1000:

def test_no_bad_characters(self):
    print("\n")
    remov = re.compile("[0-9]")
    for doc in html.doc_iter(self.source.find().limit(1000)):
        print("".join(filter(lambda x: x in string.printable, doc["cleansed_text"])))