コード例 #1
0
"""
This script fills the subreddit_id field 
for wayback submissions by extracting the 
subreddit name from the submission's url. 
"""

# Example: http://www.reddit.com/r/mildlyinteresting/comments/... --> mildlyinteresting
def extract_subreddit(url):
	"""Return the subreddit name embedded in a reddit URL.

	The name is the path segment that follows the first "/r/" marker,
	e.g. ".../r/pics/comments/..." yields "pics".

	Args:
		url: URL string to inspect.

	Returns:
		The subreddit name, or an empty string when the URL contains no
		"/r/" marker.
	"""
	marker = "/r/"
	start = url.find(marker)
	if start == -1:
		# Without the marker there is no subreddit to extract; slicing from
		# find()'s -1 would return an unrelated piece of the URL.
		return ""
	start += len(marker)
	end = url.find("/", start)
	if end == -1:
		# No slash after the name: it runs to the end of the string.
		# Slicing up to -1 would silently drop the last character.
		return url[start:]
	return url[start:end]

from redditDB import RedditDB

# NOTE(review): the connection credentials are hard-coded in this script;
# consider loading them from configuration instead.
rdb = RedditDB("mciot", "r3dd1tmorgane", "blacksun.cs.mcgill.ca", 31050, "reddit_topics")

# Walk every wayback submission and back-fill subreddit_id where it is missing.
for submission in rdb.get_wayback_submissions():
	comment_url = submission.get("comment_url")
	if not comment_url:
		# Nothing to extract a subreddit name from.
		continue

	known_subreddit = rdb.subreddit_exists(extract_subreddit(comment_url))
	# Skip when the subreddit is unknown or the submission already has an id.
	if not known_subreddit or submission.get("subreddit_id") is not None:
		continue

	rdb.update_wayback_submission(submission.get("_id"), "subreddit_id", known_subreddit.get("_id"))



コード例 #2
0
"""
Get db stats of submissions/comments we have per-month. 
"""

from redditDB import RedditDB
from datetime import datetime 

# Open the database connection used for the whole script.
# NOTE(review): the connection credentials are hard-coded here.
rdb = RedditDB("mciot", "r3dd1tmorgane", "blacksun.cs.mcgill.ca", 31050, "reddit_topics")

# Accumulator for the statistics; presumably keyed by month (see the module
# docstring) -- the code that fills it is not visible in this excerpt.
stats = {}

# Number of submissions that carry no 'created' value.
num_no_created = 0
# Number of submissions visited so far (used for progress output).
counter = 0

submissions = rdb.get_wayback_submissions()

for submission in submissions:
	counter += 1
	# Progress indicator: print the running count every 100 submissions.
	if counter % 100 == 0:
		print counter 

	created = submission.get(u'created')
	if created is None:
		# No creation value at all: count it and move on.
		num_no_created += 1
		continue 

	# 'created' is handled per type: a float is treated as a POSIX timestamp
	# (datetime.fromtimestamp yields a naive local-time datetime), while a
	# datetime is used as-is.
	if type(created) == float:
		date = datetime.fromtimestamp(created)
	elif type(created) == datetime:
		date = created
	# Any other type falls through to the else branch, whose body lies
	# outside this excerpt.
	else:
コード例 #3
0
ファイル: stats.py プロジェクト: morganecf/topic-modeling
# Blank line separating this report from the output printed before it.
print ""


# TODO: Do this straight with mongodb aggregation.

# --- Popular domains ---
# Report the most frequent domains for the current collection, then for the
# wayback collection. Each report prints a banner, a header row, and one
# "domain <tab> count" line per entry.
# NOTE(review): the variables are named top_20_* but top=100 is requested --
# confirm which limit is intended.
print "=========Popular domains:current collection========="
print "Finding frequencies of all domains..."
domain_freqs = domain_frequencies(rdb.submission_list())
print "Calculating top domains..."
top_20_domains = top_domains(domain_freqs, top=100)
print "DOMAIN\tCOUNT"
# top_domains() is iterated as (domain, count) pairs.
for domain, num in top_20_domains:
	print domain, "\t", num
print ""
# Same report, computed over the wayback submissions.
print "=========Popular domains:wayback collection========="
print "Finding frequencies of all domains..."
domain_freqs_wb = domain_frequencies(rdb.get_wayback_submissions())
print "Calculating top domains..."
top_20_domains_wb = top_domains(domain_freqs_wb, top=100)
print "DOMAIN\tCOUNT"
for domain, num in top_20_domains_wb:
	print domain, "\t", num
print ""

# --- Logging ---
# For every type returned by rdb.log_types(), print the type and the value
# of rdb.logged_errors_count() for it (one database call per type).
print "=========logging information========="
types = rdb.log_types()
print 'TYPE', '\t', 'COUNT'
for typ in types:
	print typ, '\t', rdb.logged_errors_count(typ)