def test_interactive(self):
    docs = self.source.find()
    docs.batch_size(1000)
    tagger = ngrams.make_backoff_tagger()
    # Split text into sentences, tokenize each sentence, then flatten into one word list.
    chain = lambda x: list(itertools.chain(*pos.tokenize_words(pos.tokenize_sents(x))))
    for ind, doc in enumerate(clean_html.doc_iter(docs)):
        print(tagger.tag(chain(doc["cleansed_text"])))
        if ind == 10:
            break

def ner_feature(document, tagger=None):
    if tagger is None:
        tagger = ngrams.make_backoff_tagger()
    sents = annotate.dirty_dict(document, tagger=tagger)
    fts = {}
    for noun in sents["nouns"]:
        nn = "contains_noun(" + noun + ")"
        fts[nn] = True
    return fts

def test_interactive(self):
    docs = self.source.find()
    docs.batch_size(1000)
    tagger = ngrams.make_backoff_tagger()
    for ind, doc in enumerate(clean_html.doc_iter(docs)):
        sentences = pos.tokenize_sents(doc["cleansed_text"])
        tags = pos.tokenize_words(sentences)
        for sent in tags:
            tagged_sent = tagger.tag(sent)
            d = ne_chunk(tagged_sent)
            chunks = tree2conlltags(d)
            print(chunks)
        if ind == 10:
            break

def test_interactive(self):
    docs = self.source.find_clean(batch_size=1000)
    tagger = ngrams.make_backoff_tagger()
    print()
    for ind, doc in enumerate(docs):
        sentences = pos.tokenize_sents(doc["cleansed_text"])
        tags = pos.tokenize_words(sentences)
        for sent in tags:
            tagged_sent = tagger.tag(sent)
            d = ne_chunk(tagged_sent)
            chunks = tree2conlltags(d)
            print("CHUNKS" + str(chunks))
            print("NE" + str(cnll.get_ne(chunks)))
            print("NOUNS" + str(cnll.get_nouns(chunks)))
        if ind == 10:
            break

def test_classbased(self):
    tagger = ngrams.make_backoff_tagger()
    params = {
        "corpora": self.source,
        "labeled_set": lambda: self.source.find_clean(
            {"subreddit": "fitness"}, batch_size=1000, limit=2000),
        "unlabeled_set": lambda: self.source.find_clean(
            {"subreddit": "AskReddit"}, batch_size=1000, limit=2000),
        "feature": lambda x: cls.ner_feature(x, tagger=tagger),
        "exit": lambda self: self.corpora.exit(),
    }
    pnb_a = annotate.PNBAnnotater(**params)
    pnb_a.train()
    pnb_a.describe()
    ct = 0
    for doc, annotation in pnb_a.classify_iter(self.source.find_clean()):
        ct += 1
        # print(doc)
        print(annotation)
        if ct == 10:
            break
        print("------------")

def test_evaluate(self):
    # The data sources.
    ukr = self.source.find_clean({"subreddit": "UkrainianConflict"}, limit=2500, batch_size=1000)
    askr = self.source.find_clean({"subreddit": "AskReddit"}, limit=2500, batch_size=1000)
    alll = self.source.find_clean(limit=10000)
    tagger = ngrams.make_backoff_tagger()
    featuresets = [(cls.ner_feature(doc, tagger=tagger), "YES") for doc in ukr]
    featuresets.extend([(cls.ner_feature(doc, tagger=tagger), "NO") for doc in askr])
    random.shuffle(featuresets)
    trainset, testset = featuresets[1250:], featuresets[:1250]
    classifier = NaiveBayesClassifier.train(trainset)
    f = open("./UkrainianConflictNVM", "w")
    for doc in alll:
        del doc["_id"]
        # classify() returns the label string, so compare against "YES" explicitly.
        truthiness = classifier.classify(cls.ner_feature(doc, tagger=tagger))
        if truthiness == "YES":
            f.write(json.dumps(doc) + "\n")
    f.close()
    print(nltk.classify.accuracy(classifier, testset))

def setUp(self):
    self.t = self.assertTrue
    self.inst = self.assertIsInstance
    self.source = source.Source(host="localhost", port=27017,
                                database="reddit_stream_test",
                                collection="combined")
    self.tagger = tagger.make_backoff_tagger()

import rdt.data.clean.html as clean
import rdt.data.mongo.source as rdtcorp
import rdt.nlp.ngrams as ngrams
import rdt.nlp.pos as pos
from nltk.chunk import ne_chunk
from nltk.chunk.util import tree2conlltags
import rdt.nlp.conll_get as cnll

if __name__ == "__main__":
    source = rdtcorp.Source(conf_key="source_test")
    annotated = rdtcorp.Source(conf_key="annotated_test")
    docs = source.find()
    docs.batch_size(1000)
    tagger = ngrams.make_backoff_tagger()
    buf = []
    for ind, doc in enumerate(clean.doc_iter(docs)):
        del doc["_id"]
        sentences = pos.tokenize_sents(doc["cleansed_text"])
        tags = pos.tokenize_words(sentences)
        doc["conlltags"] = []
        doc["nouns"] = []
        doc["named_entities"] = []
        # POS-tag, NE-chunk, and collect CoNLL triples for every sentence.
        for sent in tags:
            tagged_sent = tagger.tag(sent)
            d = ne_chunk(tagged_sent)
            chunks = tree2conlltags(d)
            doc["conlltags"].append(chunks)
            doc["nouns"].extend(cnll.get_nouns(chunks))
            doc["named_entities"].extend(cnll.get_ne(chunks))
        buf.append(doc)
        if ind % 1000:
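            # The listing breaks off at this point. What follows is a hedged
            # completion sketch, not the original code: it assumes the intent is
            # to bulk-write the buffer to the "annotated" collection and that
            # rdtcorp.Source exposes an insert() method (an assumption about the
            # rdt API). Note that the guard as written fires on every index *not*
            # divisible by 1000; a periodic flush would normally check
            # ind % 1000 == 0.
            annotated.insert(buf)
            buf = []
    # Write out any documents still buffered once the cursor is exhausted.
    if buf:
        annotated.insert(buf)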