コード例 #1
0
def createDataset(r, subreddits, startDate=None, endDate=None,
                  nCommentsPerSubmission=100, dbName='reddit',
                  fineScale=12, nPostsPerFineScale=200):
    """
    Crawl the given subreddits and store submissions plus their top comments
    in a RedditDB database.

    :param r: reddit object (not used in this function; kept for interface
        compatibility with callers)
    :param subreddits: list of subreddit objects to grab
    :param startDate: start date in format yymmddHHMMSS; defaults to 7 days
        before the call (computed per call, see bug-fix note below)
    :param endDate: end date in format yymmddHHMMSS; defaults to now
    :param nCommentsPerSubmission: number of comments to grab per submission.
        Default is 100.
    :param dbName: base of database name
    :param fineScale: scale of database in hours
    :param nPostsPerFineScale: number of posts per fine scale
    :return: None
    """
    # BUG FIX: the old signature evaluated datetime.now() in the default
    # arguments, so the date window froze at module-import time. Compute the
    # defaults at call time instead.
    if startDate is None:
        startDate = (datetime.datetime.now() - datetime.timedelta(days=7)).strftime('%y%m%d%H%M%S')
    if endDate is None:
        endDate = datetime.datetime.now().strftime('%y%m%d%H%M%S')

    # initialize database
    dbObj = RedditDB(dbName=dbName)

    # loop through each subreddit
    for sub in subreddits:

        print('Processing subreddit: ' + sub.title.encode('utf-8'))

        # get submissions within the date range
        matchingPosts = getAllPostsWithinRangeFineScale(
            sub, startDate=startDate, endDate=endDate, fineScale=fineScale,
            nPostsPer=nPostsPerFineScale)

        # loop through each post and get top comments
        for post in matchingPosts:

            print('Processing post: ' + post.title.encode('utf-8'))

            # save post
            dbObj.saveSubmission(post)

            # get comments, retrying up to 10 times on transient HTTP errors
            # BUG FIX: 'comments' used to be unbound (NameError) when all
            # retries failed; now it defaults to an empty list.
            comments = []
            numTries = 0
            gotComments = False
            while not gotComments and numTries < 10:
                try:
                    comments = getCommentsFromSubmission(post, nCommentsPerSubmission)
                    gotComments = True
                except HTTPError:
                    time.sleep(2)
                    numTries += 1

            # save comment data for comments which have not been deleted;
            # a plain loop replaces the old side-effecting list comprehension
            for com in comments:
                if isinstance(com, praw.objects.Comment) and com.author is not None:
                    dbObj.saveCommentData(com)

    dbObj.closeConnection()
    print('\nData collection complete!')
コード例 #2
0
from redditDB import RedditDB
import time

# NOTE(review): credentials are masked in this listing.
username = "******"
password = "******"

rdb = RedditDB(username, password, "blacksun.cs.mcgill.ca", 31050, "reddit_topics")

# Earlier pass over the regular submission collection, kept for reference:
#
# data = {}
#
# f1 = open("../data/subreddit_popularity.txt", "w")
#
# for subreddit in rdb.get_subreddits():
# 	name = subreddit.get("subreddit_name")
# 	num = rdb.num_submissions(subreddit=name)
# 	data[name] = num
# 	print name, num
# 	f1.write(str(num) + '\t' + name + '\n')
#
# f1.close()
#

# Output file for per-subreddit wayback-submission counts.
# NOTE(review): never closed in the visible portion of this script.
f2 = open("../data/subreddit_popularity_wayback.txt", "w")

num_submissions = rdb.num_wayback_submissions()

data = {}

# Bookkeeping counters — presumably updated by the remainder of the script,
# which is not visible in this excerpt.
counter = 0
num_not_belonging = 0
コード例 #3
0
# Compute the LLDA classification accuracy for one trained model: for each
# test document, guess the topic with the highest probability mass and
# compare it to the gold label, then store the accuracy in the results db.
modelkey = sys.argv[1]

# NOTE(review): "distributuions" is misspelled in the actual output filename,
# so the typo is preserved here intentionally.
base_path = "../data/llda-cvb0-cee38496-3-dd7c685c-46a60775/01000"
topic_distrib_path = base_path + "llda_test_" + modelkey + "-document-topic-distributuions.csv"
labels_path = base_path + "label-index.txt"

# FIX: use context managers so both file handles are closed (the old
# open(...).read() calls leaked them).
with open(labels_path) as labels_file:
	labels = labels_file.read().splitlines()
with open(topic_distrib_path) as distrib_file:
	f = distrib_file.read().splitlines()

correct = 0

for line in f:
	parts = line.split(",")
	label = parts[0]	# Actual (gold) label
	# List comprehension instead of map(lambda ...) — clearer and py3-safe.
	distribution = [float(x) for x in parts[1:]]
	# Predicted topic = index of the highest probability.
	guess = distribution.index(max(distribution))
	if labels[guess] == label:
		correct += 1

# TODO: Compute false positives, negatives, etc.

# NOTE(review): raises ZeroDivisionError if the distributions file is empty.
accuracy = float(correct) / float(len(f))
print('Accuracy: ' + str(accuracy * 100) + ' %')

# Persist the result keyed by the model id.
rdb = RedditDB()
result_doc = {"metadata_id": modelkey, "llda_accuracy": accuracy}
rdb.add_result(result_doc)


コード例 #4
0
'''
Dump the praw ids ("reddit_id") of all wayback submissions in the
database to a flat text file, one id per line.
'''

import time
import praw
from redditDB import RedditDB

# Connect  (credentials masked in this listing)
username = '******'
password = '******'
rdb = RedditDB(username, password, "blacksun.cs.mcgill.ca", 31050, "reddit_topics")

reddit = praw.Reddit(user_agent="my super fun project")

# Bookkeeping; counter/log_interval/start appear unused here and are
# presumably consumed by a later part of this script not shown in this
# excerpt — do not remove.
num_submissions = rdb.num_wayback_submissions()
counter = 0
log_interval = 10000

start = time.time()

# NOTE(review): this handle is never closed in the visible portion.
f = open("../data/wayback_submission_praw_ids.txt", "w")

# Collect submissions
for submission in rdb.wayback_submission_list():
	# Get submission's praw_id
	praw_id = submission.get("reddit_id")

	# Skip records that were never matched to a reddit id.
	if praw_id is None:
		continue

	f.write(praw_id + '\n')
コード例 #5
0
"""
Get db stats of submissions/comments we have per-month. 
"""

from redditDB import RedditDB
from datetime import datetime 

rdb = RedditDB("mciot", "r3dd1tmorgane", "blacksun.cs.mcgill.ca", 31050, "reddit_topics")

stats = {}

num_no_created = 0
counter = 0

submissions = rdb.get_wayback_submissions()

for submission in submissions:
	counter += 1
	if counter % 100 == 0:
		print counter 

	created = submission.get(u'created')
	if created is None:
		num_no_created += 1
		continue 

	if type(created) == float:
		date = datetime.fromtimestamp(created)
	elif type(created) == datetime:
		date = created
	else:
コード例 #6
0
"""
This script fills the subreddit_id field 
for wayback submissions by extracting the 
subreddit name from the submission's url. 
"""

# Example: http://www.reddit.com/r/mildlyinteresting/comments/... --> mildlyinteresting
def extract_subreddit(url):
	"""Return the subreddit name embedded in a reddit url.

	:param url: a url expected to contain an "/r/<name>" segment
	:return: the subreddit name, or None if the url has no "/r/" segment
	"""
	start = url.find("/r/")
	# FIX: find() returns -1 when "/r/" is absent; the old code then sliced
	# from index 2 and returned garbage. Report the failure explicitly.
	if start == -1:
		return None
	end = url.find("/", start + 3)
	# FIX: with no slash after the name, end == -1 made url[start+3:-1]
	# silently drop the name's last character. Take the rest of the string.
	if end == -1:
		return url[start + 3:]
	return url[start + 3:end]

from redditDB import RedditDB

# NOTE(review): credentials are hardcoded in source — move them to a config
# file or environment variables.
rdb = RedditDB("mciot", "r3dd1tmorgane", "blacksun.cs.mcgill.ca", 31050, "reddit_topics")

# For every wayback submission with a comment_url, derive the subreddit name
# from the url and, if that subreddit exists in the db, backfill the
# submission's subreddit_id reference.
for submission in rdb.get_wayback_submissions():
	url = submission.get("comment_url")
	if url:
		subreddit = extract_subreddit(url)
		subreddit_obj = rdb.subreddit_exists(subreddit)
		# If the subreddit exists and the submission doesn't have an id, update 
		if subreddit_obj and submission.get("subreddit_id") is None:
			rdb.update_wayback_submission(submission.get("_id"), "subreddit_id", subreddit_obj.get("_id"))



コード例 #7
0
ファイル: stats.py プロジェクト: morganecf/topic-modeling
# Leftover commented-out DokuWiki experimentation (kept as-is):
# except DokuWikiError as err:
# 	print err
# 	sys.exit(1)

# => 'Release 2012-10-13 "Adora Belle"'
#print wiki.version 

# print wiki.pages.list() # list all pages of the wiki
# print wiki.pages.list('my:namespace') # list all pages in the given namespace
# print wiki.pages.get('my:namespace:page') # print the content of the page


from redditDB import RedditDB
from utils import domain_frequencies, top_domains

# NOTE(review): credentials are hardcoded in source — move them to a config
# file or environment variables.
rdb = RedditDB("mciot", "r3dd1tmorgane", "blacksun.cs.mcgill.ca", 31050, "reddit_topics")

### subreddit_collection ####
print "=========subreddit_collection========="
num_subreddits = rdb.num_subreddits()
print "Total number of subreddits:", num_subreddits
print ""

#### submission_collection ####
print "=========submission_collection========="
num_submissions = rdb.num_submissions()
print "Total number of submissions:", num_submissions
# NOTE(review): raises ZeroDivisionError if the subreddit collection is empty.
print "Average number of submissions/subreddit:", (float(num_submissions) / float(num_subreddits))
print ""
print "Number of cross-posted submissions:", rdb.num_xposts()
print "Number of subreddits involved in xposts: TODO" 
コード例 #8
0
mongodb. Also store words, date range, and id list of all 
subreddits used in creating word distribution. 
"""

import time 
import utils
import preprocess as P
from redditDB import RedditDB

"""
Parameters: for now, are hardcoded,
but will usually get these from a 
user-specified configuration file. 
"""

rdb = RedditDB()

# LIST OF TOPICS
# FIX: two earlier assignments to `topics` were dead stores (immediately
# overwritten below); they are kept here only as reference alternatives.
# topics = ('gaming', 'AskReddit', 'worldnews', 'news', 'WTF', 'aww', 'technology', 'science', 'Music',
# 	'movies', 'books', 'EarthPorn', 'television', 'LifeProTips', 'Showerthoughts', 'food', 'Jokes',
# 	'firstworldanarchists', 'FoodPorn', 'HistoryPorn', 'trees', 'leagueoflegends', 'pokemon', '4chan', 
# 	'MakeupAddiction', 'pcmasterrace', 'gentlemanboners', 'politics', 'Bitcoin', 'Games', 'atheism', 'nba')
# topics = ('gaming', 'food', 'atheism', 'politics', 'Bitcoin', 'MakeupAddiction', 'nba', 'trees', 
# 	'pcmasterrace', 'movies', 'firstworldanarchists', 'HistoryPorn', 'news', 'science', 'Music')
topics = ('gaming', 'food', 'atheism', 'science', 'Bitcoin')

# COMMENT LEVEL AT WHICH TO STOP
comment_level = 1

# NUMBER OF DOCUMENTS TO USE PER TOPIC
num_docs = 100