def test_evaluate(self):
    """Train a Naive Bayes classifier separating UkrainianConflict posts
    from AskReddit posts, dump the positively-classified documents to a
    file, and print the held-out accuracy.
    """
    # The data sources: labeled positives, labeled negatives, and the
    # unlabeled pool to classify.
    ukr = self.source.find_clean({"subreddit" : "UkrainianConflict"},
                                 limit=2500, batch_size=1000)
    askr = self.source.find_clean({"subreddit" : "AskReddit"},
                                  limit=2500, batch_size=1000)
    alll = self.source.find_clean(limit=10000)
    tagger = ngrams.make_backoff_tagger()
    # Build labeled NER feature sets: "YES" for conflict posts, "NO" for AskReddit.
    featuresets = [(cls.ner_feature(doc, tagger=tagger), "YES") for doc in ukr]
    featuresets.extend([(cls.ner_feature(doc, tagger=tagger), "NO") for doc in askr])
    random.shuffle(featuresets)
    # 75/25 train/test split (1250 of the ~5000 shuffled examples held out).
    trainset, testset = featuresets[1250:], featuresets[:1250]
    classifier = NaiveBayesClassifier.train(trainset)
    # FIX: use a context manager so the file is closed even if classification
    # raises partway through the loop.
    with open("./UkrainianConflictNVM", "w") as f:
        for doc in alll:
            # _id is presumably a Mongo ObjectId and not JSON-serializable
            # — TODO confirm against the corpora source.
            del doc["_id"]
            label = classifier.classify(cls.ner_feature(doc, tagger=tagger))
            # FIX: classify() returns the label string; both "YES" and "NO"
            # are truthy, so the original `if truthiness:` wrote every doc.
            # Compare against the positive label explicitly.
            if label == "YES":
                f.write(json.dumps(doc) + "\n")
    print(nltk.classify.accuracy(classifier, testset))
def test_classbased(self):
    """Exercise the PNBAnnotater class: train it on fitness (labeled)
    vs. AskReddit (unlabeled) posts, describe the resulting model, and
    print the annotations produced for the first ten documents.
    """
    tagger = ngrams.make_backoff_tagger()
    # Constructor arguments for the annotater; the set accessors are
    # deferred via lambdas so the cursors are created on demand.
    params = {
        "corpora": self.source,
        "labeled_set": lambda: self.source.find_clean(
            {"subreddit" : "fitness"}, batch_size=1000, limit=2000),
        "unlabeled_set": lambda: self.source.find_clean(
            {"subreddit" : "AskReddit"}, batch_size=1000, limit=2000),
        "feature": lambda x: cls.ner_feature(x, tagger=tagger),
        "exit": lambda self: self.corpora.exit(),
    }
    annotater = annotate.PNBAnnotater(**params)
    annotater.train()
    annotater.describe()
    # Spot-check: print the annotation for the first ten classified docs.
    seen = 0
    for doc, annotation in annotater.classify_iter(self.source.find_clean()):
        seen += 1
        # print(doc)
        print(annotation)
        if seen == 10:
            break
    print("------------")