class SubredditClassifierTestCase(unittest.TestCase):
    """Trains an nltk NaiveBayesClassifier to separate r/UkrainianConflict posts
    from other posts, using bigram-presence features pulled from MongoDB.

    Requires a live MongoDB at localhost:27017 with the reddit_stream database
    (collections: features, big_combined) — this is an integration test.
    """

    def setUp(self):
        # Short aliases for the assertion helpers used elsewhere in the suite.
        self.t = self.assertTrue
        self.inst = self.assertIsInstance
        # Precomputed per-subreddit feature documents (bigram counts).
        self.feature = Features(host="localhost",port=27017,database="reddit_stream",collection="features")
        # Annotated source posts with per-document feature dicts.
        self.source = AnnotatedSource(host="localhost",port=27017,database="reddit_stream",collection="big_combined")

    def tearDown(self):
        pass

    def test_bigram(self):
        """Build labeled feature sets, train NaiveBayes, and eyeball its hits.

        Pipeline: (1) load the UkrainianConflict bigram list, (2) turn each
        source doc's feature dict into bigram/word-presence features, (3) mix in
        unlabeled negatives, (4) train/split 50-50, (5) print docs from r/news
        the classifier tags as UkrainianConflict.
        """
        # First (only) bigram-feature document for the target subreddit;
        # bg["bigrams"] appears to be (bigram_tuple, count) pairs — keep tuples only.
        bg = list(self.feature.find({"subreddit" : "UkrainianConflict"}, to_tuples=True,field="bigrams"))[0]
        bg = [ d[0] for d in bg["bigrams"] ]
        # Flatten the bigram tuples into a deduplicated vocabulary of words.
        words = [d[0] for d in bg]
        words.extend([d[1] for d in bg])
        words = list(set(words))
        # print(words)
        # print(bg)
        yay = []
        # Positive examples: every UkrainianConflict doc's feature dict.
        for doc, ft in self.source.find_ft({"subreddit" : "UkrainianConflict"},batch_size=1000):
            # ft keys are presumably bigram tuples; collect the words they contain.
            tups = ft.keys()
            the_words = list(set([d[0] for d in tups] + [d[1] for d in tups]))
            # is identifying words in the_words
            # Add a boolean contains(word) feature per vocabulary word.
            for word in words:
                ft["contains(" + word + ")"] = word in the_words
            # NOTE(review): this filter keeps only keys present in bg (bigram
            # tuples). The "contains(...)" string keys just added are never in
            # bg, so they all get deleted here, making the loop above a no-op —
            # looks unintentional; confirm intended feature set.
            to_dump = []
            for key in ft.keys():
                if key not in bg:
                    to_dump.append(key)
            for dump in to_dump:
                del ft[dump]
            # Skip docs whose feature dict was emptied by the filter.
            if len(ft.keys()) > 0:
                yay.append((ft,"UkrainianConflict"))
        #print()
        #print(bg)
        # Negative examples: first 6000 docs of the whole collection.
        # NOTE(review): the {} query also matches UkrainianConflict docs, so
        # some positives get a second, contradictory "Not ..." label — confirm
        # this noise is acceptable or exclude the subreddit in the query.
        for doc, ft in self.source.find_ft({}, limit=6000,batch_size=1000):
            yay.append((ft, "Not UkrainianConflict"))
        # 50/50 shuffled split; train on the first half.
        random.shuffle(yay)
        test_set, train_set = yay[int(len(yay)/2):], yay[:int(len(yay)/2)]
        classifier = nltk.NaiveBayesClassifier.train(train_set)
        classifier.show_most_informative_features()
        # Manual inspection: print r/news docs (past the training window) that
        # the classifier labels as UkrainianConflict.
        for doc, ft in self.source.find_ft({"subreddit" : "news"}, skip=6000,batch_size=1000):
            if classifier.classify(ft) == "UkrainianConflict":
                print("YAY", doc)
class AutoClassifierTestCase(unittest.TestCase): def setUp(self): self.t = self.assertTrue self.inst = self.assertIsInstance self.ft_db = Features(host='localhost',port=27017,database="reddit_stream",collection="features") self.source = AnnotatedSource(host="localhost",port=27017,database="reddit_stream",collection="big_combined") def tearDown(self): pass def test_classifier(self): bgram_doc = list(self.ft_db.find({"subreddit" : "UkrainianConflict"},to_tuples=True,field="bigrams"))[0] allbgram_doc = list(self.ft_db.find({"subreddit" : "all"}, to_tuples=True, field='bigrams'))[0] pos_fts = { d[0]:True for d in bgram_doc["bigrams"] } neu_fts = { d[0]:True for d in allbgram_doc["bigrams"] } ukr = [] neu = [] for doc, fts in self.source.find_ft({"subreddit" : "UkrainianConflict"}): nomore = [] for key in fts.keys(): if key not in pos_fts: nomore = [] for n in nomore: del fts[n] if len(fts.keys()) > 0: ukr.append(fts) for doc, fts in self.source.find_ft(limit=6000): neu.append(fts) nvb = PositiveNaiveBayesClassifier.train(ukr,neu) for do, fts in self.source.find_ft(skip=6000,limit=10): print(nvb.classify(fts)) nvb.show_most_informative_features() """ukr = []
def setUp(self): self.t = self.assertTrue self.inst = self.assertIsInstance self.feature = Features(host="localhost",port=27017,database="reddit_stream",collection="features") self.source = AnnotatedSource(host="localhost",port=27017,database="reddit_stream",collection="big_combined")