def test_interactive(self):
    """Interactively print POS-tagged token streams for sample documents.

    Pulls documents from ``self.source``, flattens each document's
    cleansed text into one token list, tags it with a backoff tagger,
    and prints the result for the first eleven documents.
    """
    docs = self.source.find()
    docs.batch_size(1000)
    tagger = ngrams.make_backoff_tagger()

    def flatten(text):
        # Sentence-split, word-tokenize, then flatten into a single list.
        return list(itertools.chain(*pos.tokenize_words(pos.tokenize_sents(text))))

    for ind, doc in enumerate(clean_html.doc_iter(docs)):
        print(tagger.tag(flatten(doc["cleansed_text"])))
        if ind == 10:
            break
def collocationFinder(document, nbest=4):
    """Find the top bigram collocations in a document.

    Tokenizes the document, filters out stopwords and tokens shorter
    than three characters, and ranks bigrams by likelihood ratio.

    :param document: list of sentence strings to analyze
    :param nbest: number of top-scoring bigrams to return (default 4)
    :returns: list of (word, word) tuples ranked by likelihood ratio
    """
    chain = lambda x: list(itertools.chain(*pos.tokenize_words(pos.tokenize_sents(x))))
    stopset = set(stopwords.words('english'))
    # Drop very short tokens and English stopwords before scoring.
    filter_stops = lambda w: len(w) < 3 or w in stopset
    bcf = BigramCollocationFinder.from_words(chain(document))
    bcf.apply_word_filter(filter_stops)
    # Bug fix: the count was hard-coded to 4, silently ignoring the
    # caller-supplied nbest parameter.
    return bcf.nbest(BigramAssocMeasures.likelihood_ratio, nbest)
def test_interactive(self):
    """Print CoNLL-style chunk tags for the first eleven documents.

    Each document's cleansed text is sentence-split, word-tokenized,
    POS-tagged, NE-chunked, and printed as conll tag triples.
    """
    docs = self.source.find()
    docs.batch_size(1000)
    tagger = ngrams.make_backoff_tagger()
    for ind, doc in enumerate(clean_html.doc_iter(docs)):
        tokenized = pos.tokenize_words(pos.tokenize_sents(doc["cleansed_text"]))
        for sentence in tokenized:
            tree = ne_chunk(tagger.tag(sentence))
            print(tree2conlltags(tree))
        if ind == 10:
            break
def filter_words(text):
    """Prepare a reddit document for bag-of-words processing.

    Tokenizes the text into words, flattens the sentence structure,
    and returns lowercased words with English stopwords removed.

    :param text: blob of text
    :type text: [str]
    :returns: flat list of lowercase, non-stopword tokens
    """
    stop = stopwords.words('english')
    words = []
    # Flatten the list of tokenized sentences into one word list.
    for sentence in pos.tokenize_words(pos.tokenize_sents(text)):
        words.extend(sentence)
    # Drop stopwords (case-insensitive test), then normalize case —
    # same two-step contract as before, expressed in one pass.
    return [w.lower() for w in words if w.lower() not in stop]
def test_interactive(self):
    """Print chunks, named entities, and nouns for sample documents.

    Iterates the first eleven cleansed documents and, per sentence,
    prints the conll chunk triples plus the named entities and nouns
    extracted from them.
    """
    docs = self.source.find_clean(batch_size=1000)
    tagger = ngrams.make_backoff_tagger()
    print()
    for ind, doc in docs:
        for sentence in pos.tokenize_words(pos.tokenize_sents(doc["cleansed_text"])):
            conll = tree2conlltags(ne_chunk(tagger.tag(sentence)))
            print("CHUNKS" + str(conll))
            print("NE" + str(cnll.get_ne(conll)))
            print("NOUNS" + str(cnll.get_nouns(conll)))
        if ind == 10:
            break
def clean_dict(doc, tagger=nltk.pos_tag):
    """Processes NLP features from cleansed_text. All other functions
    wrap this one. Serves to act as the NLP-front end for reddit corpus
    parsing. Dictionaries and json strings are accepted and return
    dictionaries containing additional information. The processing done
    here represents the general annotations. The following are the new
    fields added to the dictionary. Classifiers will work to modify or
    wrap these methods.

    ::

        { conlltags : [[(word, pos, BIO)]],
          nouns : [word],
          named_entities : [[word, pos, BIO]],
          cleansed_text : [[word]] }

    :param doc: dictionary of reddit corpus.
    :type doc: dict
    :param tagger: A pos tagger.
    :type tagger: Tagger
    :returns: dict
    """
    # Mongo's _id is not JSON-serializable and must not leak downstream.
    if "_id" in doc:
        del doc["_id"]
    sentences = pos.tokenize_sents(doc["cleansed_text"])
    tags = pos.tokenize_words(sentences) or []
    doc["conlltags"] = []
    doc["nouns"] = []
    doc["named_entities"] = []
    for sent in tags:
        # Bug fix: honor the caller-supplied tagger; the body previously
        # called nltk.pos_tag unconditionally, making the parameter dead.
        tagged_sent = tagger(sent) or []
        d = ne_chunk(tagged_sent) or []
        chunks = tree2conlltags(d)
        doc["conlltags"].append(chunks)
        doc["nouns"].extend(cnll.get_nouns(chunks))
        doc["named_entities"].extend(cnll.get_ne(chunks))
    return doc
import rdt.nlp.ngrams as ngrams
import rdt.nlp.pos as pos
from nltk.chunk import ne_chunk
from nltk.chunk.util import tree2conlltags
import rdt.nlp.conll_get as cnll

if __name__ == "__main__":
    # NOTE(review): `rdtcorp` and `clean` are referenced below but not
    # imported in this chunk — confirm they are imported elsewhere.
    source = rdtcorp.Source(conf_key="source_test")
    annotated = rdtcorp.Source(conf_key="annotated_test")
    docs = source.find()
    docs.batch_size(1000)
    tagger = ngrams.make_backoff_tagger()
    buf = []
    for ind, doc in enumerate(clean.doc_iter(docs)):
        del doc["_id"]
        sentences = pos.tokenize_sents(doc["cleansed_text"])
        tags = pos.tokenize_words(sentences)
        doc["conlltags"] = []
        doc["nouns"] = []
        doc["named_entities"] = []
        for sent in tags:
            tagged_sent = tagger.tag(sent)
            d = ne_chunk(tagged_sent)
            chunks = tree2conlltags(d)
            doc["conlltags"].append(chunks)
            doc["nouns"].extend(cnll.get_nouns(chunks))
            doc["named_entities"].extend(cnll.get_ne(chunks))
        buf.append(doc)
        # Bug fix: the original `if ind % 1000:` is truthy for every
        # index NOT divisible by 1000, so it flushed a near-empty buffer
        # almost every iteration. Flush in fixed-size batches instead.
        if len(buf) == 1000:
            annotated.insert(buf)
            buf = []
    # Bug fix: flush the trailing partial batch, which the original
    # silently dropped when the loop ended.
    if buf:
        annotated.insert(buf)