def createDataset(r, subreddits, startDate=None, endDate=None, nCommentsPerSubmission=100, dbName='reddit', fineScale=12, nPostsPerFineScale=200):
    """
    Collect submissions (and their top comments) for a set of subreddits
    within a date range and save them into a Reddit database.

    :param r: reddit object
    :param subreddits: list of subreddits to grab
    :param startDate: start date in format yymmddHHMMSS; defaults to one
        week before the time of the call (computed at call time)
    :param endDate: end date in format yymmddHHMMSS; defaults to the time
        of the call
    :param nCommentsPerSubmission: number of comments to grab per submission. Default is 100.
    :param dbName: base of database name
    :param fineScale: scale of database in hours
    :param nPostsPerFineScale: number of posts per fine scale
    :return: None
    """
    # Compute the date defaults here rather than in the signature: default
    # expressions in a 'def' line run once at import time, which would have
    # frozen both dates at whenever the module was first loaded.
    if startDate is None:
        startDate = (datetime.datetime.now() - datetime.timedelta(days=7)).strftime('%y%m%d%H%M%S')
    if endDate is None:
        endDate = datetime.datetime.now().strftime('%y%m%d%H%M%S')

    # initialize database
    dbObj = RedditDB(dbName=dbName)

    # loop through each subreddit
    for sub in subreddits:
        print('Processing subreddit: ' + sub.title.encode('utf-8'))

        # get submissions within the date range
        matchingPosts = getAllPostsWithinRangeFineScale(sub, startDate=startDate, endDate=endDate,
                                                        fineScale=fineScale, nPostsPer=nPostsPerFineScale)

        # loop through each post and get top comments
        for post in matchingPosts:
            print('Processing post: ' + post.title.encode('utf-8'))

            # save post
            dbObj.saveSubmission(post)

            # Fetch comments, retrying up to 10 times on transient HTTP
            # errors.  'comments' is pre-initialized so that the save loop
            # below is safe even if every attempt fails (previously the
            # variable was left unbound in that case -> NameError).
            comments = []
            numTries = 0
            gotComments = False
            while not gotComments and numTries < 10:
                try:
                    comments = getCommentsFromSubmission(post, nCommentsPerSubmission)
                    gotComments = True
                except HTTPError:
                    time.sleep(2)
                    numTries += 1

            # save comment data for comments which have not been deleted
            # (a deleted comment has author == None); a plain loop replaces
            # the side-effect-only list comprehension
            for com in comments:
                if isinstance(com, praw.objects.Comment) and com.author is not None:
                    dbObj.saveCommentData(com)

    dbObj.closeConnection()
    print('\nData collection complete!')
"""
Count, per subreddit, how many submissions the database holds
("popularity").  The per-subreddit pass is kept commented out below;
the active code works on the wayback submission collection.
"""
from redditDB import RedditDB
import time

# NOTE(review): placeholder credentials -- real values must be filled in
# before running.
username = "******"
password = "******"

rdb = RedditDB(username, password, "blacksun.cs.mcgill.ca", 31050, "reddit_topics")

# data = {}
#
# f1 = open("../data/subreddit_popularity.txt", "w")
#
# for subreddit in rdb.get_subreddits():
#     name = subreddit.get("subreddit_name")
#     num = rdb.num_submissions(subreddit=name)
#     data[name] = num
#     print name, num
#     f1.write(str(num) + '\t' + name + '\n')
#
# f1.close()

# Output file for the wayback popularity counts.
f2 = open("../data/subreddit_popularity_wayback.txt", "w")

num_submissions = rdb.num_wayback_submissions()

# Accumulators for the wayback counting pass (the loop that fills these
# lies beyond this excerpt).
data = {}
counter = 0
num_not_belonging = 0
# Evaluate an L-LDA run: for each test document, compare the argmax of its
# predicted topic distribution against the true label, print the overall
# accuracy, and store it in the database under the model's metadata id.
modelkey = sys.argv[1]  # model/metadata id, also part of the result file name

base_path = "../data/llda-cvb0-cee38496-3-dd7c685c-46a60775/01000"
# NOTE(review): base_path has no trailing "/" -- both concatenations below
# produce names like ".../01000llda_test_...".  Looks like a missing path
# separator; confirm against the actual file layout before changing.
topic_distrib_path = base_path + "llda_test_" + modelkey + "-document-topic-distributuions.csv"
labels_path = base_path + "label-index.txt"

# label-index.txt: one label per line, line index == topic index.
labels = open(labels_path).read().splitlines()
f = open(topic_distrib_path).read().splitlines()

correct = 0
for line in f:
    # Each CSV row: true label, then one probability per topic.
    parts = line.split(",")
    label = parts[0]  # Actual label
    distribution = map(lambda x: float(x), parts[1:])
    # Predicted topic = index of the highest probability.
    guess = distribution.index(max(distribution))
    if labels[guess] == label:
        correct += 1
    # TODO: Compute false positives, negatives, etc.

accuracy = float(correct) / float(len(f))
print "Accuracy:", str(accuracy * 100), "%"

# Persist the accuracy alongside the model's metadata record.
rdb = RedditDB()
result_doc = {
    "metadata_id": modelkey,
    "llda_accuracy": accuracy
}
rdb.add_result(result_doc)
'''
Write the praw id ("reddit_id") of every wayback submission in the
database to a flat text file, one id per line.
'''
import time
import praw
from redditDB import RedditDB

# Connect
# NOTE(review): placeholder credentials -- fill in before running.
username = '******'
password = '******'
rdb = RedditDB(username, password, "blacksun.cs.mcgill.ca", 31050, "reddit_topics")
reddit = praw.Reddit(user_agent="my super fun project")

num_submissions = rdb.num_wayback_submissions()
counter = 0
log_interval = 10000  # progress-report frequency (used beyond this excerpt)
start = time.time()

f = open("../data/wayback_submission_praw_ids.txt", "w")

# Collect submissions
for submission in rdb.wayback_submission_list():
    # Get submission's praw_id
    praw_id = submission.get("reddit_id")
    if praw_id is None:
        # This submission was never matched to a live reddit post; skip it.
        continue
    f.write(praw_id + '\n')
""" Get db stats of submissions/comments we have per-month. """ from redditDB import RedditDB from datetime import datetime rdb = RedditDB("mciot", "r3dd1tmorgane", "blacksun.cs.mcgill.ca", 31050, "reddit_topics") stats = {} num_no_created = 0 counter = 0 submissions = rdb.get_wayback_submissions() for submission in submissions: counter += 1 if counter % 100 == 0: print counter created = submission.get(u'created') if created is None: num_no_created += 1 continue if type(created) == float: date = datetime.fromtimestamp(created) elif type(created) == datetime: date = created else:
""" This script fills the subreddit_id field for wayback submissions by extracting the subreddit name from the submission's url. """ # Example: http://www.reddit.com/r/mildlyinteresting/comments/... --> mildlyinteresting def extract_subreddit(url): start = url.find("/r/") end = url.find("/", start + 3) return url[start + 3:end] from redditDB import RedditDB rdb = RedditDB("mciot", "r3dd1tmorgane", "blacksun.cs.mcgill.ca", 31050, "reddit_topics") for submission in rdb.get_wayback_submissions(): url = submission.get("comment_url") if url: subreddit = extract_subreddit(url) subreddit_obj = rdb.subreddit_exists(subreddit) # If the subreddit exists and the submission doesn't have an id, update if subreddit_obj and submission.get("subreddit_id") is None: rdb.update_wayback_submission(submission.get("_id"), "subreddit_id", subreddit_obj.get("_id"))
# except DokuWikiError as err: # print err # sys.exit(1) # => 'Release 2012-10-13 "Adora Belle"' #print wiki.version # print wiki.pages.list() # list all pages of the wiki # print wiki.pages.list('my:namespace') # list all pages in the given namespace # print wiki.pages.get('my:namespace:page') # print the content of the page from redditDB import RedditDB from utils import domain_frequencies, top_domains rdb = RedditDB("mciot", "r3dd1tmorgane", "blacksun.cs.mcgill.ca", 31050, "reddit_topics") ### subreddit_collection #### print "=========subreddit_collection=========" num_subreddits = rdb.num_subreddits() print "Total number of subreddits:", num_subreddits print "" #### submission_collection #### print "=========submission_collection=========" num_submissions = rdb.num_submissions() print "Total number of submissions:", num_submissions print "Average number of submissions/subreddit:", (float(num_submissions) / float(num_subreddits)) print "" print "Number of cross-posted submissions:", rdb.num_xposts() print "Number of subreddits involved in xposts: TODO"
mongodb. Also store words, date range, and id list of all subreddits used
in creating word distribution. """
import time
import utils
import preprocess as P
from redditDB import RedditDB

"""
Parameters: for now, are hardcoded, but will usually get these from a
user-specified configuration file.
"""
rdb = RedditDB()

# LIST OF TOPICS
# NOTE(review): 'topics' is assigned three times; only the last (5-topic)
# tuple takes effect.  The earlier tuples look like progressively narrowed
# experiment configurations kept for reference.
topics = ('gaming', 'AskReddit', 'worldnews', 'news', 'WTF', 'aww', 'technology', 'science',
          'Music', 'movies', 'books', 'EarthPorn', 'television', 'LifeProTips', 'Showerthoughts',
          'food', 'Jokes', 'firstworldanarchists', 'FoodPorn', 'HistoryPorn', 'trees',
          'leagueoflegends', 'pokemon', '4chan', 'MakeupAddiction', 'pcmasterrace',
          'gentlemanboners', 'politics', 'Bitcoin', 'Games', 'atheism', 'nba')
topics = ('gaming', 'food', 'atheism', 'politics', 'Bitcoin', 'MakeupAddiction', 'nba', 'trees',
          'pcmasterrace', 'movies', 'firstworldanarchists', 'HistoryPorn', 'news', 'science', 'Music')
topics = ('gaming', 'food', 'atheism', 'science', 'Bitcoin')

# COMMENT LEVEL AT WHICH TO STOP
comment_level = 1

# NUMBER OF DOCUMENTS TO USE PER TOPIC
num_docs = 100