Example #1
import unittest

from nltk.classify import PositiveNaiveBayesClassifier

from rdt.data.mongo.features import Features
from rdt.job import AnnotatedSource


class FeaturesDBTestCase(unittest.TestCase):

	def setUp(self):
		self.t = self.assertTrue
		self.inst = self.assertIsInstance
		self.fts_db = Features(host="localhost",port=27017,database="reddit_stream",collection="features")
	def tearDown(self):	
		pass

	def test_convert_list_to_tuples(self):
		for doc in self.fts_db.find({"subreddit" : "UkrainianConflict"}, to_tuples=True, field="bigrams"):
			print(doc)
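
BSON has no tuple type, so PyMongo hands stored bigram pairs back as lists; the to_tuples flag above presumably converts them so they can serve as dict keys and NLTK feature names. A sketch of that assumed conversion (the helper name is hypothetical):

def lists_to_tuples(doc, field):
	# Hypothetical helper: rebuild [["w1", "w2"], count] entries,
	# which BSON decodes as lists, into (("w1", "w2"), count) tuples.
	doc[field] = [(tuple(pair), count) for pair, count in doc[field]]
	return doc
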
class AutoClassifierTestCase(unittest.TestCase):

	def setUp(self):
		self.t = self.assertTrue
		self.inst = self.assertIsInstance
		self.ft_db = Features(host='localhost',port=27017,database="reddit_stream",collection="features")
		self.source = AnnotatedSource(host="localhost",port=27017,database="reddit_stream",collection="big_combined")
	def tearDown(self):	
		pass

	def test_classifier(self):
		bgram_doc = list(self.ft_db.find({"subreddit" : "UkrainianConflict"},to_tuples=True,field="bigrams"))[0]
		allbgram_doc = list(self.ft_db.find({"subreddit" : "all"}, to_tuples=True, field='bigrams'))[0]

		pos_fts = { d[0]:True for d in bgram_doc["bigrams"] }
		neu_fts = { d[0]:True for d in allbgram_doc["bigrams"] }
		
		ukr = []
		neu = []

		for doc, fts in self.source.find_ft({"subreddit" : "UkrainianConflict"}):
			# Keep only bigram features that also appear in the subreddit's
			# own top-bigram list; collect keys first so the dict is not
			# mutated while it is being iterated.
			nomore = []
			for key in fts.keys():
				if key not in pos_fts:
					nomore.append(key)
			for n in nomore:
				del fts[n]
			if fts:
				ukr.append(fts)

		for doc, fts in self.source.find_ft(limit=6000):
			neu.append(fts)

		nvb = PositiveNaiveBayesClassifier.train(ukr,neu)
		for doc, fts in self.source.find_ft(skip=6000,limit=10):
			print(nvb.classify(fts))
		nvb.show_most_informative_features()

		"""ukr = []
Example #3
import random
import unittest

import nltk

from rdt.data.mongo.features import Features
from rdt.job import AnnotatedSource


class SubredditClassifierTestCase(unittest.TestCase):

	def setUp(self):
		self.t = self.assertTrue
		self.inst = self.assertIsInstance
		self.feature = Features(host="localhost",port=27017,database="reddit_stream",collection="features")
		self.source = AnnotatedSource(host="localhost",port=27017,database="reddit_stream",collection="big_combined")
	def tearDown(self):
		pass

	def test_bigram(self):
		# Top bigrams for the subreddit, stored as (bigram, score) pairs.
		bg = list(self.feature.find({"subreddit" : "UkrainianConflict"}, to_tuples=True, field="bigrams"))[0]
		bg = [d[0] for d in bg["bigrams"]]
		# Unique individual words appearing in any of the top bigrams.
		words = [d[0] for d in bg]
		words.extend([d[1] for d in bg])
		words = list(set(words))
		# print(words)
		# print(bg)
		yay = []
		for doc, ft in self.source.find_ft({"subreddit" : "UkrainianConflict"},batch_size=1000):
			tups = ft.keys()
			the_words = list(set([d[0] for d in tups] + [d[1] for d in tups]))
			# Add a contains(word) feature for every top-bigram word.
			for word in words:
				ft["contains(" + word + ")"] = word in the_words
			# Drop features whose keys are not among the top bigrams
			# (this also removes the contains(...) keys added above).
			to_dump = []
			for key in ft.keys():
				if key not in bg:
					to_dump.append(key)
			for dump in to_dump:
				del ft[dump]
			if len(ft.keys()) > 0:
				yay.append((ft,"UkrainianConflict"))
			#print()
			#print(bg)
		for doc, ft in self.source.find_ft({}, limit=6000,batch_size=1000):
			yay.append((ft, "Not UkrainianConflict"))

		random.shuffle(yay)
		half = len(yay) // 2
		test_set, train_set = yay[half:], yay[:half]
		classifier = nltk.NaiveBayesClassifier.train(train_set)

		classifier.show_most_informative_features()

		for doc, ft in self.source.find_ft({"subreddit" : "news"}, skip=6000,batch_size=1000):
			if classifier.classify(ft) == "UkrainianConflict":
				print("YAY", doc)
Example #4

from rdt.data.mongo.features import Features
import sys

if __name__ == "__main__":
	subreddit = sys.argv[1]
	fts = Features(host="localhost",port=27017,database="reddit_stream",collection="features")
	bgrams = list(fts.find({"subreddit" : subreddit}))[0]["bigrams"]
	# bgrams = list(filter(lambda x : True if x[0]))
	while True:
		print(bgrams)
		print("what do you want to remove?")
		word1 = input("enter the first word: ")
		word2 = input("enter the second word: ")
		# Keep every pair except the exact (word1, word2) bigram.
		bgrams = list(filter(lambda x : x[0][0] != word1 or x[0][1] != word2, bgrams))
		action = input("(w)rite, (q)uit, (c)ontinue: ")
		if action == "w":
			fts.upsert({"subreddit" : subreddit}, {"bigrams" : bgrams})
		if action == "q":
			break
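
A quick sanity check of that filter predicate on toy data (entries assumed to be in the stored (bigram, count) shape):

sample = [(("kiev", "ukraine"), 10), (("kiev", "russia"), 7)]
kept = list(filter(lambda x : x[0][0] != "kiev" or x[0][1] != "ukraine", sample))
print(kept)  # only the exact ("kiev", "ukraine") pair is removed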
Example #5
	def setUp(self):
		self.t = self.assertTrue
		self.inst = self.assertIsInstance
		self.feature = Features(host="localhost",port=27017,database="reddit_stream",collection="features")
		self.source = AnnotatedSource(host="localhost",port=27017,database="reddit_stream",collection="big_combined")
Example #6
	def setUp(self):
		self.t = self.assertTrue
		self.inst = self.assertIsInstance
		self.fts_db = Features(host="localhost",port=27017,database="reddit_stream",collection="features")
Example #7

from rdt.data.mongo.features import Features
import rdt.job as job, nltk, sys
from nltk.corpus import stopwords

if __name__ == "__main__":
	stops = stopwords.words('english') + ['-', 'https', '%', '[', ']', "''", "``", '--', "'s", ",", ".", "(", ")", ":", "n't", "?", "!"]
	ft_db = Features(host="localhost", port=27017, database="reddit_stream", collection="features")
	source = job.AnnotatedSource(host="localhost", port=27017, database="reddit_stream", collection="big_combined")
	gen = source.to_words({}, remove_stopwords=True, limit=6000)
	finder = source.bigram_collocation_finder(gen)
	finder.apply_freq_filter(4)
	finder.apply_word_filter(lambda w: w in stops)
	bigram_measures = nltk.collocations.BigramAssocMeasures()
	scored = finder.score_ngrams(bigram_measures.raw_freq)
	# Store bigrams sorted by descending count (then alphabetically) under the "all" pseudo-subreddit.
	ft_db.upsert({"subreddit" : "all"}, {"bigrams" : sorted(finder.ngram_fd.items(), key=lambda t : (-t[1], t[0]))})
	# print(sorted(finder.ngram_fd.items(), key=lambda t:(-t[1], t[0]))[:10])
	# print(len(finder.ngram_fd.items()))
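
For reference, the same collocation pipeline runs on any token list without the rdt wrappers; a minimal self-contained sketch using nltk's own finder constructor:

from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

tokens = "the quick brown fox jumps over the lazy dog the quick brown fox".split()
finder = BigramCollocationFinder.from_words(tokens)
finder.apply_freq_filter(2)                     # keep bigrams seen at least twice
finder.apply_word_filter(lambda w: w == "the")  # drop bigrams containing "the"
print(finder.score_ngrams(BigramAssocMeasures().raw_freq))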