Example no. 1
0
class SubredditClassifierTestCase(unittest.TestCase):
	"""Train an NLTK NaiveBayesClassifier to separate UkrainianConflict
	posts from other subreddits, using bigram features stored in MongoDB.
	"""

	def setUp(self):
		# Short aliases for frequently used assertions.
		self.t = self.assertTrue
		self.inst = self.assertIsInstance
		# Per-subreddit aggregated features (top bigrams per subreddit).
		self.feature = Features(host="localhost", port=27017, database="reddit_stream", collection="features")
		# Annotated posts; find_ft yields (document, feature-dict) pairs.
		self.source = AnnotatedSource(host="localhost", port=27017, database="reddit_stream", collection="big_combined")

	def tearDown(self):
		pass

	def test_bigram(self):
		# Top bigrams for the target subreddit.  Each stored entry looks
		# like ((w1, w2), score), so d[0] is the bigram tuple itself.
		bg_doc = list(self.feature.find({"subreddit" : "UkrainianConflict"}, to_tuples=True, field="bigrams"))[0]
		bg = [d[0] for d in bg_doc["bigrams"]]
		# Flatten the bigrams into a unique vocabulary of single words.
		words = [d[0] for d in bg]
		words.extend([d[1] for d in bg])
		words = list(set(words))
		yay = []
		for doc, ft in self.source.find_ft({"subreddit" : "UkrainianConflict"}, batch_size=1000):
			# All words appearing in this document's bigram features,
			# computed while ft still holds its full key set.
			tups = ft.keys()
			the_words = list(set([d[0] for d in tups] + [d[1] for d in tups]))
			# Keep only the bigram features that are among the subreddit's
			# top bigrams.  BUGFIX: this filter must run BEFORE the
			# "contains(...)" features are added - a string key can never
			# be a member of the bigram-tuple list, so the original code
			# deleted every contains() feature right after creating it.
			to_dump = [key for key in ft if key not in bg]
			for dump in to_dump:
				del ft[dump]
			if len(ft.keys()) > 0:
				# Add word-presence indicator features only for documents
				# that retain at least one top bigram (preserves the
				# original inclusion criterion).
				for word in words:
					ft["contains(" + word + ")"] = word in the_words
				yay.append((ft, "UkrainianConflict"))
		# Negative examples sampled from the whole corpus.
		# NOTE(review): the unfiltered query may also return
		# UkrainianConflict posts, mislabeling them as negatives - confirm.
		for doc, ft in self.source.find_ft({}, limit=6000, batch_size=1000):
			yay.append((ft, "Not UkrainianConflict"))

		random.shuffle(yay)
		# 50/50 split after shuffling.
		test_set, train_set = yay[int(len(yay) / 2):], yay[:int(len(yay) / 2)]
		classifier = nltk.NaiveBayesClassifier.train(train_set)

		classifier.show_most_informative_features()

		# Spot-check: flag /r/news posts the classifier thinks belong to
		# UkrainianConflict.
		for doc, ft in self.source.find_ft({"subreddit" : "news"}, skip=6000, batch_size=1000):
			if classifier.classify(ft) == "UkrainianConflict":
				print("YAY", doc)
class AutoClassifierTestCase(unittest.TestCase):

	def setUp(self):
		"""Wire up assertion aliases and the MongoDB-backed collections."""
		self.t = self.assertTrue
		self.inst = self.assertIsInstance
		# Both collections live in the same local reddit_stream database.
		self.ft_db = Features(host="localhost", port=27017,
		                      database="reddit_stream", collection="features")
		self.source = AnnotatedSource(host="localhost", port=27017,
		                              database="reddit_stream", collection="big_combined")
	def tearDown(self):	
		# No per-test cleanup required.
		pass

	def test_classifier(self):
		"""Train a PositiveNaiveBayesClassifier on UkrainianConflict posts
		(positive set) vs. a sample of the whole corpus (unlabeled set),
		then spot-check a few held-out documents.
		"""
		# Top bigrams for the target subreddit and for the whole corpus;
		# each stored entry is ((w1, w2), score), so d[0] is the bigram.
		bgram_doc = list(self.ft_db.find({"subreddit" : "UkrainianConflict"},to_tuples=True,field="bigrams"))[0]
		allbgram_doc = list(self.ft_db.find({"subreddit" : "all"}, to_tuples=True, field='bigrams'))[0]

		pos_fts = { d[0]:True for d in bgram_doc["bigrams"] }
		# NOTE(review): neu_fts is currently unused - presumably intended
		# to filter the unlabeled set the same way; verify with the author.
		neu_fts = { d[0]:True for d in allbgram_doc["bigrams"] }

		ukr = []
		neu = []

		for doc, fts in self.source.find_ft({"subreddit" : "UkrainianConflict"}):
			# Drop every feature that is not one of the subreddit's top
			# bigrams.  BUGFIX: the original reset `nomore = []` instead of
			# appending the offending key, and ran the deletion loop inside
			# the iteration over fts.keys() - so no feature was ever
			# removed (and any removal would have raised RuntimeError).
			nomore = [key for key in fts if key not in pos_fts]
			for n in nomore:
				del fts[n]
			if len(fts.keys()) > 0:
				ukr.append(fts)

		# Unlabeled sample for the positive-only training scheme.
		for doc, fts in self.source.find_ft(limit=6000):
			neu.append(fts)

		nvb = PositiveNaiveBayesClassifier.train(ukr, neu)
		# Spot-check a handful of held-out documents.
		for do, fts in self.source.find_ft(skip=6000, limit=10):
			print(nvb.classify(fts))
		nvb.show_most_informative_features()

		"""ukr = []
Example no. 3
0
	def setUp(self):
		# NOTE(review): the enclosing class header is outside this excerpt.
		# Short aliases for frequently used assertions.
		self.t = self.assertTrue
		self.inst = self.assertIsInstance
		# MongoDB-backed collections: aggregated per-subreddit features and
		# the annotated post source used by the tests.
		self.feature = Features(host="localhost",port=27017,database="reddit_stream",collection="features")
		self.source = AnnotatedSource(host="localhost",port=27017,database="reddit_stream",collection="big_combined")