def test_interactive(self):
    docs = self.source.find()
    docs.batch_size(1000)
    for ind, doc in enumerate(clean_html.doc_iter(docs)):
        new_doc = pos.preprocess(doc["cleansed_text"])
        print(new_doc)
        break

def test_interactive(self):
    docs = self.source.find()
    docs.batch_size(1000)
    nt = NamesTagger()
    for ind, doc in enumerate(clean_html.doc_iter(docs)):
        del doc["_id"]
        break

def test_interactive(self):
    docs = self.source.find()
    docs.batch_size(1000)
    for ind, doc in enumerate(clean_html.doc_iter(docs)):
        del doc["_id"]
        dc = bigrams.collocationFinder(doc["cleansed_text"])
        print("\n")
        print(dc)
        break

def test_interactive(self):
    docs = self.source.find()
    docs.batch_size(1000)
    tagger = ngrams.make_backoff_tagger()
    # Flatten the word-tokenized sentences of a document into one token list.
    chain = lambda x: list(itertools.chain(*pos.tokenize_words(pos.tokenize_sents(x))))
    for ind, doc in enumerate(clean_html.doc_iter(docs)):
        print(tagger.tag(chain(doc["cleansed_text"])))
        if ind == 10:
            break

def test_small_corpus(self):
    print(self.source.most_recent_created_utc())
    count = 0
    docs = self.source.find()
    docs.batch_size(10000)
    for ind, doc in enumerate(clean_html.doc_iter(docs)):
        print(doc["cleansed_text"])
        print("VS")
        print(self.body_or_selftext(doc))
        print("----------")

def find_clean(self, *args, **kwargs):
    """Works like a normal find(), but cleans the documents as they are
    iterated. The extra keyword arguments ``batch_size`` and ``skip_none``
    are consumed here rather than being forwarded to find().
    """
    batch_size = kwargs.pop("batch_size", 100)
    skip_none = kwargs.pop("skip_none", False)
    docs = self.find(*args, **kwargs)
    docs.batch_size(batch_size)
    if skip_none is True:
        return clean.skip_iter(docs)
    else:
        return clean.doc_iter(docs)

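# A minimal usage sketch for find_clean, assuming a Source configured the same
# way as in the driver script below (conf_key="source_test"). The batch size of
# 500 and skip_none=True are illustrative choices only, and skip_iter is assumed
# to drop documents that could not be cleaned.
import rdt.data.mongo.source as rdtcorp

source = rdtcorp.Source(conf_key="source_test")
for doc in source.find_clean(batch_size=500, skip_none=True):
    print(doc["cleansed_text"])
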
def test_interactive(self):
    docs = self.source.find()
    docs.batch_size(1000)
    tagger = ngrams.make_backoff_tagger()
    for ind, doc in enumerate(clean_html.doc_iter(docs)):
        sentences = pos.tokenize_sents(doc["cleansed_text"])
        tags = pos.tokenize_words(sentences)
        for sent in tags:
            tagged_sent = tagger.tag(sent)
            d = ne_chunk(tagged_sent)
            chunks = tree2conlltags(d)
            print(chunks)
        if ind == 10:
            break

import rdt.data.clean.html as clean
import rdt.data.mongo.source as rdtcorp
import rdt.nlp.ngrams as ngrams
import rdt.nlp.pos as pos
from nltk.chunk import ne_chunk
from nltk.chunk.util import tree2conlltags
import rdt.nlp.conll_get as cnll

if __name__ == "__main__":
    source = rdtcorp.Source(conf_key="source_test")
    annotated = rdtcorp.Source(conf_key="annotated_test")
    docs = source.find()
    docs.batch_size(1000)
    tagger = ngrams.make_backoff_tagger()
    buf = []  # buffer of annotated documents awaiting insertion
    for ind, doc in enumerate(clean.doc_iter(docs)):
        del doc["_id"]
        sentences = pos.tokenize_sents(doc["cleansed_text"])
        tags = pos.tokenize_words(sentences)
        doc["conlltags"] = []
        doc["nouns"] = []
        doc["named_entities"] = []
        # Tag each sentence, chunk named entities, and keep CoNLL-style IOB tags.
        for sent in tags:
            tagged_sent = tagger.tag(sent)
            d = ne_chunk(tagged_sent)
            chunks = tree2conlltags(d)
            doc["conlltags"].append(chunks)
            doc["nouns"].extend(cnll.get_nouns(chunks))
            doc["named_entities"].extend(cnll.get_ne(chunks))
        buf.append(doc)
        if ind % 1000:

def test_no_bad_characters(self):
    print("\n")
    remov = re.compile("[0-9]")
    for doc in html.doc_iter(self.source.find().limit(1000)):
        print("".join(filter(lambda x: x in string.printable, doc["cleansed_text"])))