Ejemplo n.º 1
0
from rdt.data.mongo.features import Features
import sys

def remove_bigram(bigrams, first, second):
	"""Return a new list of ``bigrams`` without the entries matching the pair.

	Each entry is indexed as ``entry[0][0]`` (first word) and ``entry[0][1]``
	(second word); anything after ``entry[0]`` is carried along untouched.

	An entry is dropped only when it matches BOTH requested words. A blank
	word acts as a wildcard, so ``("new", "")`` drops every bigram starting
	with "new". When both words are blank nothing is dropped.
	"""
	if not first and not second:
		return list(bigrams)

	def is_target(entry):
		pair = entry[0]
		return (not first or pair[0] == first) and (not second or pair[1] == second)

	return [entry for entry in bigrams if not is_target(entry)]


# Interactive tool: load the stored bigrams for one subreddit, let the user
# strike out unwanted pairs, and optionally write the result back.
if __name__ == "__main__":
	if len(sys.argv) < 2:
		sys.exit("usage: %s <subreddit>" % sys.argv[0])
	subreddit = sys.argv[1]
	fts = Features(host="localhost",port=27017,database="reddit_stream",collection="features")
	matches = list(fts.find({"subreddit" : subreddit}))
	if not matches:
		# Without this guard the [0] lookup below dies with a bare IndexError.
		sys.exit("no features document found for subreddit %r" % subreddit)
	bgrams = matches[0]["bigrams"]
	while True:
		print(bgrams)
		print("what do you want remove?")
		word1 = input("enter the first word: ")
		word2 = input("enter the second word: ")
		# Remove only the requested pair, not every bigram sharing one word with it.
		bgrams = remove_bigram(bgrams, word1, word2)
		action = input("(w)rite, (q)uit, (c)ontinue")
		if action == "w":
			fts.upsert({"subreddit" : subreddit}, {"bigrams" : bgrams})
		elif action == "q":
			break
		# Any other answer (including "c") simply loops for another removal.
from rdt.data.mongo.features import Features
import rdt.job as job, nltk, sys
from nltk.corpus import stopwords

def rank_bigrams(freq_items):
	"""Return ``(bigram, count)`` pairs sorted by descending count.

	Ties on count are broken by the bigram itself in ascending order, so the
	result is deterministic for a given input.
	"""
	return sorted(freq_items, key=lambda item: (-item[1], item[0]))


# Batch job: collect bigrams from the "big_combined" collection and store the
# ranked frequency list under the {"subreddit": "all"} features document.
if __name__ == "__main__":
	# A set gives O(1) membership for the per-word filter below. The distinct
	# name keeps the imported `stopwords` corpus object from being rebound.
	stop_set = set(stopwords.words('english'))
	stop_set.update(['-','https', '%','[', ']', "''", "``",'--', "'s", ",", ".","-","(",")",":","n't", "?","!"])
	ft_db = Features(host="localhost",port=27017,database="reddit_stream",collection="features")
	# Named `source` so the imported `job` module is not shadowed.
	source = job.AnnotatedSource(host="localhost",port=27017,database="reddit_stream",collection="big_combined")
	gen = source.to_words({}, remove_stopwords=True, limit=6000)
	# NOTE(review): presumably an nltk BigramCollocationFinder; confirm in rdt.job.
	finder = source.bigram_collocation_finder(gen)
	finder.apply_freq_filter(4)
	finder.apply_word_filter(lambda w: w in stop_set)
	ft_db.upsert({"subreddit" : "all"}, {"bigrams" : rank_bigrams(finder.ngram_fd.items())})