def test_submissions_between_order(self):
    all_subs = list(submissions_between(self.r,
                                        self.sr,
                                        highest_timestamp=time.time(),
                                        verbosity=self.verbosity))
    for i in range(len(all_subs) - 1):
        self.assertGreaterEqual(all_subs[i].created_utc,
                                all_subs[i + 1].created_utc)

    sr_obj = self.r.get_subreddit(self.sr)
    all_subs_sr_object = list(
        submissions_between(self.r, sr_obj, verbosity=self.verbosity)
    )
    self.assertEqual(all_subs, all_subs_sr_object)

    all_subs_reversed = list(submissions_between(self.r,
                                                 sr_obj,
                                                 newest_first=False,
                                                 verbosity=self.verbosity))
    self.assertEqual(all_subs, list(reversed(all_subs_reversed)))

def test_submissions_between_raises_correctly(self):
    with self.assertRaises(PRAWException):
        list(
            submissions_between(self.r, self.sr,
                                extra_cloudsearch_fields={'self': 'yes'},
                                verbosity=self.verbosity))

def test_submissions_between_special_subreddit(self):
    for submission in submissions_between(
            self.r,
            "all",
            # so it doesn't take forever
            highest_timestamp=1121000000,
            verbosity=self.verbosity):
        self.assertLessEqual(submission.created_utc, 1121000000)
        self.assertGreaterEqual(submission.created_utc, 0)

def main(*args):
    """Main loop of the scraping iteration.

    Arguments:
        terms: a string of search terms
        subreddit: a string naming the subreddit to search in

    Returns:
        None (results are written to a file instead)
    """
    if len(args) == 1:
        terms = args[0]
        subreddit = 'all'
    elif len(args) == 2:
        terms = args[0]
        subreddit = args[1]
    else:
        print ('Usage: python reddit_scrape.py "<search terms>" '
               '[<subreddit>]')
        return

    output_file = "__".join(['_'.join(terms.split()), subreddit]) + '.json'

    # initialize Reddit session (see praw.ini for OAuth details)
    r = praw.Reddit(user_agent='Full Scraper by '
                               '/u/<your_username_here> version 1.0',
                    site_name='my_app')

    # gather refresh token from praw.ini
    try:
        r.refresh_access_information()
    except OAuthAppRequired:
        print "Check your OAuth login information (see ./praw.ini)."

    # get list of search results with the given terms
    search_results = list(submissions_between(
        r, subreddit,
        extra_cloudsearch_fields={'selftext': terms}))

    # get comments for each submission above
    # and pull the ones that match the search
    search_results_with_comments = get_comments(search_results)

    # get list of cleaned search results
    cleaned_results = clean_results(search_results_with_comments, terms)

    # write results to file in directory ./output/
    write_results(cleaned_results, terms, output_file)

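# A minimal entry-point sketch (not part of the original snippet) showing how
# main() could be invoked from the command line, matching the usage string
# above; the sys.argv handling here is an assumption.
if __name__ == '__main__':
    import sys
    main(*sys.argv[1:])
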
def get_past_comments(lowest_timestamp=None, highest_timestamp=None,
                      from_oldest=False):
    """Gets comments that were made before the given time."""
    if from_oldest:
        highest_timestamp = find_oldest()
    subs = submissions_between(r, "soccer",
                               highest_timestamp=highest_timestamp,
                               lowest_timestamp=lowest_timestamp)
    for sub in subs:
        for comment in sub.comments:
            if not isinstance(comment, MoreComments):
                yield Comment(comment)
            # MoreComments stubs are skipped; the original branch reassigned the
            # outer comments generator here, which had no effect on iteration.

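# Illustrative usage sketch (assumed, not from the original module): materialise
# every comment made before an arbitrary cutoff timestamp into a list. The
# cutoff value below is a placeholder; this relies on the same module-level
# Reddit session `r` that get_past_comments() uses.
def collect_past_comments(highest_timestamp=1450000000):
    return list(get_past_comments(highest_timestamp=highest_timestamp))
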
def get_subreddit_stats(sublist, subreddit_stats):
    for sub in sublist:
        if str(sub) in subreddit_stats:
            continue
        downvoted_submissions = 0
        total_submissions = 0
        for submission in submissions_between(r, sub,
                                               lowest_timestamp=lowest_timestamp,
                                               highest_timestamp=highest_timestamp,
                                               verbosity=0):
            assert submission.created_utc <= highest_timestamp \
                and submission.created_utc >= lowest_timestamp
            if submission.score <= 0:
                downvoted_submissions += 1
            total_submissions += 1
        subreddit_stats[sub] = {"downvoted_submissions": downvoted_submissions,
                                "total_submissions": total_submissions}
        subreddit_stats.sync()
        print sub, total_submissions, downvoted_submissions, \
            downvoted_submissions / float(total_submissions) if total_submissions else 0

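# Assumed usage sketch: the .sync() call above suggests subreddit_stats is a
# shelve database, so this shows one way it might be opened and populated.
# The filename and subreddit list are illustrative placeholders, and the
# module-level r, lowest_timestamp and highest_timestamp globals used by
# get_subreddit_stats() are expected to exist already.
import shelve

def build_stats(subreddits=("soccer", "nba")):
    stats = shelve.open("subreddit_stats.db")
    try:
        get_subreddit_stats(list(subreddits), stats)
    finally:
        stats.close()
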
def get_flair_stats():
    user = os.environ['REDDIT_USERNAME']
    user_agent = 'Calculating ignored bugs by {}'.format(user)
    r = praw.Reddit(user_agent)
    flair_stats = defaultdict(lambda: defaultdict(lambda: 0))
    for s in submissions_between(r, 'bugs', lowest_timestamp=1400000000):
        created = datetime.utcfromtimestamp(s.created_utc)
        month = (created.year, created.month)
        # They started to add flairs in January, 2015
        if month < (2015, 2):
            break
        # The current month has incomplete data
        if month == (date.today().year, date.today().month):
            continue
        # Submissions without flairs seem to be mainly duplicate submissions removed by mods.
        # They are not viewable in the interface, so we aren't counting them.
        if not s.link_flair_text:
            print "IGNORING POST WITHOUT A FLAIR", s.permalink, s.title
            continue
        flair_stats[month][s.link_flair_text] += 1
    return flair_stats

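# Illustrative usage (assumed, not part of the original snippet): print a simple
# per-month breakdown of the flair counts returned by get_flair_stats().
if __name__ == '__main__':
    stats = get_flair_stats()
    for month in sorted(stats):
        print month, dict(stats[month])
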
def test_submissions_between_with_filters(self):
    all_subs = list(submissions_between(self.r,
                                        self.sr,
                                        verbosity=self.verbosity))
    t1 = 1420000000
    t2 = 1441111111
    t1_t2_subs = list(submissions_between(self.r,
                                           self.sr,
                                           lowest_timestamp=t1,
                                           highest_timestamp=t2,
                                           verbosity=self.verbosity))

    def filter_subs(subs, lowest_timestamp=0, highest_timestamp=10**10,
                    criterion=None):
        filtered = [s for s in subs
                    if s.created_utc <= highest_timestamp
                    and s.created_utc >= lowest_timestamp
                    and (criterion is None or criterion(s))]
        # make sure we never accidentally craft a bad test case
        self.assertGreater(len(filtered), 0)
        return filtered

    t1_t2_subs_canon = filter_subs(all_subs, t1, t2)
    self.assertEqual(t1_t2_subs, t1_t2_subs_canon)

    self_subs = list(
        submissions_between(self.r, self.sr,
                            extra_cloudsearch_fields={"self": "1"},
                            verbosity=self.verbosity)
    )
    self_subs_canon = filter_subs(all_subs, criterion=lambda s: s.is_self)
    self.assertEqual(self_subs, self_subs_canon)

    def wa_criterion(s):
        return not s.is_self and \
            urlparse(s.url).netloc == "web.archive.org"

    wa_cs_fields = {"self": "0", "site": "web.archive.org"}
    subs_wa = list(
        submissions_between(self.r, self.sr,
                            extra_cloudsearch_fields=wa_cs_fields,
                            verbosity=self.verbosity)
    )
    subs_wa_canon = filter_subs(all_subs, criterion=wa_criterion)
    self.assertEqual(subs_wa, subs_wa_canon)

    patu_cs_fields = {"self": "1",
                      "author": "PyAPITestUser2",
                      "title": "test"}

    def patu_criterion(s):
        return s.is_self and \
            s.author.name == "PyAPITestUser2" and \
            "test" in s.title.lower()

    subs_patu = list(
        submissions_between(self.r, self.sr,
                            extra_cloudsearch_fields=patu_cs_fields,
                            verbosity=self.verbosity)
    )
    subs_patu_canon = filter_subs(all_subs, criterion=patu_criterion)
    self.assertEqual(subs_patu, subs_patu_canon)

def scrape():
    reddit_obj = praw.Reddit(user_agent="Fake Images Scraper")
    psbattle = reddit_obj.get_subreddit("photoshopbattles")  # subreddit of interest
    #submissions = psbattle.get_top_from_year(limit=None)#, params={"after" : "t3_4cd8qr"})
    submissions = submissions_between(
        reddit_obj, psbattle,
        highest_timestamp=1416475603.0
    )  # leave lowest_timestamp and highest_timestamp blank to get all submissions

    img_count = 0
    count_submissions = 1
    img_count_train = 0
    img_count_test = 0
    link_file_train = open(LINK_FILE_TRAIN, "w")
    link_file_test = open(LINK_FILE_TEST, "w")

    while img_count < NUM_LINKS_TO_GET:  # TODO need this?
        for submission in submissions:  # go through each submission
            try:
                if img_count > NUM_LINKS_TO_GET:
                    break
                print "next submission: ", submission
                if not submission:
                    print "Not a submission"
                else:
                    print "submission id: ", submission.id
                    print "submission timestamp: ", submission.created_utc

                    # decide if images from this submission will be for training or test
                    link_file = None
                    testing = False  # are we adding images to testing or training?
                    # add the first images to test (avoid divide by 0),
                    # then add to test whenever there are more than 5x as many training images as testing.
                    # NOTE: add all images from a submission to either train or test,
                    # otherwise the classifier might have very similar images in the training dataset
                    if img_count_test == 0 or img_count_train / img_count_test >= 5:
                        testing = True
                        link_file = link_file_test
                    else:
                        link_file = link_file_train

                    # get all comments for now (note: may cause dataset imbalance?
                    # also takes longer because of more API calls)
                    submission.replace_more_comments(limit=None, threshold=0)  # limit=None for all comments
                    comments = [comment for comment in submission.comments if
                                not isinstance(comment, praw.objects.MoreComments)
                                and comment.is_root][:NUM_IMGS_PER_COMMENT]  # look for at most 10 images a comment

                    count_comments = 1
                    for comment in comments:  # each (root) comment (containing an image)
                        # if we've made it this far, assume the image is original
                        links = find_links(comment.body)  # get links (presumably of images)
                        # record each valid link for later download
                        for link in links:
                            if "](" in link:
                                # correct a mistake made by the regex: sometimes we get
                                # http://imgur.com...](http://imgur.com...)
                                link = link[:link.index("]")]  # keep only up to, not including, the ]
                            link = link.replace(")", "")  # because sometimes imgur links have a trailing )
                            link_file.write(link + " ")
                            link_file.write(submission.id + " ")
                            link_file.write(str(count_comments) + " ")
                            link_file.write(str(img_count + 1) + "\n")
                            img_count += 1
                            if testing:
                                img_count_test += 1
                            else:
                                img_count_train += 1
                        count_comments += 1  # count comment

                    count_submissions += 1  # count this submission as a new one
                    print("%d valid comments on submission %d. Now %d image links total: %d train, %d test"
                          % (count_comments - 1, count_submissions - 1, img_count,
                             img_count_train, img_count_test))
            except Exception as e:
                print "exception: ", e
        break

    # finish up
    link_file_train.close()
    link_file_test.close()
    print("%d image links scraped in total" % img_count)

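# Assumed entry point: the original snippet defines scrape() but does not show
# how it is called; a plain invocation like this is one plausible driver.
if __name__ == '__main__':
    scrape()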