Example #1
	def test_interactive(self):
		docs = self.source.find()
		docs.batch_size(1000)
		for ind, doc in enumerate(clean_html.doc_iter(docs)):
			new_doc = pos.preprocess(doc["cleansed_text"])
			print(new_doc)
			break
Example #2
	def test_interactive(self):
		docs = self.source.find()
		docs.batch_size(1000)
		nt = NamesTagger()
		for ind, doc in enumerate(clean_html.doc_iter(docs)):
			del doc["_id"]
			
			break
Example #3
	def test_interactive(self):
		docs = self.source.find()
		docs.batch_size(1000)
		for ind, doc in enumerate(clean_html.doc_iter(docs)):
			del doc["_id"]
			dc = bigrams.collocationFinder(doc["cleansed_text"])
			print("\n")
			print(dc)
			break
Example #4
	def test_interactive(self):
		docs = self.source.find()
		docs.batch_size(1000)
		tagger = ngrams.make_backoff_tagger()
		chain = lambda x : list(itertools.chain(*pos.tokenize_words(pos.tokenize_sents(x))))
		for ind, doc in enumerate(clean_html.doc_iter(docs)):
			print(tagger.tag(chain(doc["cleansed_text"])))
			if ind == 10:
				break
Example #5
	def test_small_corpus(self):
		print(self.source.most_recent_created_utc())
		count = 0
		docs = self.source.find()
		docs.batch_size(10000)
		for ind, doc in enumerate(clean_html.doc_iter(docs)):
			print(doc["cleansed_text"])
			print("VS")
			print(self.body_or_selftext(doc))
			print("----------")
Example #6
	def find_clean(self, *args, **kwargs):
		"""Works just like a normal find, but cleans the documents as they
		are iterated.

		Two extra keyword arguments are recognized: ``batch_size`` (cursor
		batch size, default 100) and ``skip_none`` (if True, iterate with
		``clean.skip_iter`` instead of ``clean.doc_iter``).
		"""
		# pop the extra keywords so they are not forwarded to find()
		batch_size = kwargs.pop("batch_size", 100)
		skip_none = kwargs.pop("skip_none", False)

		docs = self.find(*args, **kwargs)
		docs.batch_size(batch_size)
		if skip_none:
			return clean.skip_iter(docs)
		return clean.doc_iter(docs)
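A minimal usage sketch for find_clean, assuming it is defined on the same Source class used in the other examples; the conf_key value, the batch_size/skip_none keywords, and the cleansed_text field are taken from the snippets in this listing:

import rdt.data.mongo.source as rdtcorp

if __name__ == "__main__":
    # hypothetical usage: assumes find_clean is available on rdtcorp.Source,
    # configured with the same conf_key as the __main__ script later in this listing
    source = rdtcorp.Source(conf_key="source_test")
    # skip_none=True routes iteration through clean.skip_iter instead of
    # clean.doc_iter; batch_size is applied to the underlying cursor
    for ind, doc in enumerate(source.find_clean(batch_size=500, skip_none=True)):
        print(doc["cleansed_text"])
        if ind == 5:
            break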
Example #7
	def test_interactive(self):
		docs = self.source.find()
		docs.batch_size(1000)
		tagger = ngrams.make_backoff_tagger()
		for ind, doc in enumerate(clean_html.doc_iter(docs)):
			sentences = pos.tokenize_sents(doc["cleansed_text"])
			tags = pos.tokenize_words(sentences)
			for sent in tags:
				tagged_sent = tagger.tag(sent)
				d = ne_chunk(tagged_sent)
				chunks = tree2conlltags(d)
				print(chunks)
			if ind == 10:
				break
Example #8
import rdt.data.clean.html as clean
import rdt.data.mongo.source as rdtcorp
import rdt.nlp.ngrams as ngrams
import rdt.nlp.pos as pos
from nltk.chunk import ne_chunk
from nltk.chunk.util import tree2conlltags
import rdt.nlp.conll_get as cnll

if __name__ == "__main__":
    source = rdtcorp.Source(conf_key="source_test")
    annotated = rdtcorp.Source(conf_key="annotated_test")
    docs = source.find()
    docs.batch_size(1000)
    tagger = ngrams.make_backoff_tagger()
    buf = []
    for ind, doc in enumerate(clean.doc_iter(docs)):
        del doc["_id"]
        sentences = pos.tokenize_sents(doc["cleansed_text"])
        tags = pos.tokenize_words(sentences)
        doc["conlltags"] = []
        doc["nouns"] = []
        doc["named_entities"] = []
        for sent in tags:
            tagged_sent = tagger.tag(sent)
            d = ne_chunk(tagged_sent)
            chunks = tree2conlltags(d)
            doc["conlltags"].append(chunks)
            doc["nouns"].extend(cnll.get_nouns(chunks))
            doc["named_entities"].extend(cnll.get_ne(chunks))
        buf.append(doc)
        if ind % 1000 == 0:
            # truncated in the original snippet; presumably buf would be
            # written out to the annotated source and cleared here
            pass
Example #9
	def test_no_bad_characters(self):
		print("\n")
		remov = re.compile("[0-9]")
		for doc in html.doc_iter(self.source.find().limit(1000)):
			print("".join(list(filter(lambda x : x in string.printable, doc["cleansed_text"]))))