def user_liked_saved(username, scan_upvoted=True, scan_saved=True, scan_sub=None):
    """ Gets all the upvoted/saved comments and/or submissions for the given User. Allows filtering by Subreddit. """
    params = {'sr': scan_sub} if scan_sub else None
    try:
        if _user.name.lower() == username.lower():
            redditor = _user
        else:
            redditor = _reddit.redditor(username)
        if scan_saved:
            for saved in redditor.saved(limit=None, params=params):
                re = RedditElement(saved)
                yield re
        if scan_upvoted:
            for upvoted in redditor.upvoted(limit=None, params=params):
                re = RedditElement(upvoted)
                yield re
    except prawcore.exceptions.NotFound:
        stringutil.error('Cannot locate comments or submissions for nonexistent user: %s' % username)
    except prawcore.exceptions.Forbidden:
        stringutil.error('Cannot load Upvoted/Saved Posts from the User "%s", because they are private!' % username)
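A minimal usage sketch for the generator above, assuming the module-level `_reddit`/`_user` session has already been authenticated elsewhere; the username is a placeholder:

# Hypothetical caller: drain the generator and count elements by type.
# RedditElement exposes a `.type` attribute (e.g. 'Comment', per the tests below).
from collections import Counter

counts = Counter()
for element in user_liked_saved('some_username', scan_upvoted=True, scan_saved=False):
    counts[element.type] += 1
print(dict(counts))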
def user_posts(username, find_submissions, find_comments, find_limit=None, deep_find_submissions=False, deep_find_comments=False):
    """ Generator for all the posts made by the given Redditor. """
    try:
        limit = find_limit if (find_limit is not None and find_limit > 0) else None
        if find_comments or deep_find_comments:
            for c in _reddit.redditor(username).comments.new(limit=limit):
                yield RedditElement(c)
        if deep_find_comments:
            # "Deep" scans also walk the "top" listing, which can surface older posts the "new" listing caps out before reaching.
            for c in _reddit.redditor(username).comments.top(limit=limit):
                yield RedditElement(c)
        if find_submissions or deep_find_submissions:
            for c in _reddit.redditor(username).submissions.new(limit=limit):
                yield RedditElement(c)
        if deep_find_submissions:
            for c in _reddit.redditor(username).submissions.top(limit=limit):
                yield RedditElement(c)
    except prawcore.exceptions.NotFound:
        stringutil.error('Cannot locate comments or submissions for nonexistent user: %s' % username)
        with open('user.nonexistent.log', 'a+', newline='\n') as f:
            f.write(username + '\n')
    except prawcore.exceptions.Forbidden:
        stringutil.error('Cannot locate posts from a suspended user account: %s' % username)
        with open('user.suspended.log', 'a+', newline='\n') as f:
            f.write(username + '\n')
def get_elements(self):
    ps = PushshiftAPI()
    for user in self.data['users'].split(','):
        user = user.replace('/u/', '', 1).strip()
        _params = {'author': user}
        if self.data['limit']:
            _params['limit'] = self.data['limit']
        if self.data['scan_submissions']:
            for post in ps.search_submissions(**_params):
                p = RedditElement(post)
                if self.check_filters(p):
                    yield p
        if self.data['scan_comments']:
            for post in ps.search_comments(**_params):
                parents = list(ps.search_submissions(ids=post.link_id.replace('t3_', '', 1), limit=1))
                if not len(parents):
                    print("PushShift Warning: Unable to locate parent Submission:", post.link_id)
                    continue
                submission = parents[0]
                p = RedditElement(post, ext_submission_obj=submission)
                if self.check_filters(p):
                    yield p
def get_elements(self):
    url = self.data['url']
    submission = re.search(r'\/comments\/([a-zA-Z0-9]+)\/?', url)
    comment = re.search(r'\/comments\/.+?\/.+?\/([a-zA-Z0-9]+)\/?', url)
    ps = PushshiftAPI()
    if comment:
        for post in ps.search_comments(ids=[comment.group(1)]):
            parents = list(ps.search_submissions(ids=post.link_id.replace('t3_', '', 1), limit=1))
            if not len(parents):
                raise AssertionError("PushShift Warning: Unable to locate direct parent Submission: %s" % post.link_id)
            submission = parents[0]
            p = RedditElement(post, ext_submission_obj=submission)
            if self.check_filters(p):
                yield p
    elif submission:
        for post in ps.search_submissions(ids=[submission.group(1).replace('t3_', '', 1)], limit=1):
            p = RedditElement(post)
            if self.check_filters(p):
                yield p
    else:
        raise TypeError('Invalid Reddit URL provided! "%s"' % url)
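The two patterns above distinguish comment permalinks from plain submission permalinks: the comment pattern only matches when two more path segments follow the submission id. A quick standalone check, using a hypothetical URL:

import re

sub_pat = re.compile(r'\/comments\/([a-zA-Z0-9]+)\/?')
com_pat = re.compile(r'\/comments\/.+?\/.+?\/([a-zA-Z0-9]+)\/?')

url = 'https://www.reddit.com/r/test/comments/abc123/some_title/def456/'
print(sub_pat.search(url).group(1))  # 'abc123' -> the parent Submission id
print(com_pat.search(url).group(1))  # 'def456' -> the Comment id; this pattern fails on submission-only URLs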
def get_subs(self, subs):
    for post in self._ps.search_submissions(ids=subs, limit=len(subs)):
        # Drop each id as it is found, so only unresolved ids remain for the fallback.
        subs = list(filter(lambda s: s != post.id.replace('t3_', '', 1), subs))
        p = RedditElement(post)
        if self.check_filters(p):
            yield p
    if self.data['slow_fallback'] and len(subs):
        for comm in get_info(['t3_' + c for c in subs]):
            yield RedditElement(comm)
def user_posts(username, find_submissions, find_comments):
    """ Generator for all the posts made by the given Redditor. """
    try:
        if find_comments:
            for c in _reddit.redditor(username).comments.new():
                yield RedditElement(c)
        if find_submissions:
            for c in _reddit.redditor(username).submissions.new():
                yield RedditElement(c)
    except prawcore.exceptions.NotFound:
        stringutil.error('Cannot locate comments or submissions for nonexistent user: %s' % username)
def test_submission_gallery(self):
    """ Parse all gallery links """
    p = pw.get_submission(t3_id='t3_hrrh23')
    re = RedditElement(p)
    self.assertEqual(len(re.get_urls()), 3, msg='Got incorrect image count from reddit gallery submission!')
    for url in re.get_urls():
        self.assertIn('https', url, msg='Failed to extract valid gallery URL: %s' % url)
def test_gallery(self):
    """ Should load all gallery images """
    ps = PushshiftAPI()
    post, = ps.search_submissions(limit=1, ids=['t3_hrrh23'])
    re = RedditElement(post)
    self.assertEqual(len(re.get_urls()), 3, msg='Got incorrect image count from PSAW gallery submission!')
    for url in re.get_urls():
        self.assertIn('https', url, msg='Failed to extract valid gallery URL: %s' % url)
def process_posts(self):
    """ Iterate through all the located PendingPosts, and process them. """
    submissions = [p.reddit_id for p in self.posts.values() if p.reddit_id.startswith('t3_')]
    comments = [p.reddit_id for p in self.posts.values() if p.reddit_id.startswith('t1_')]
    submissions = [RedditElement(s) for s in batch_submission_lookup(submissions)]
    comments = list(batch_comment_lookup(comments))
    total = len(self.posts)
    for idx, pend in enumerate(self.posts.values()):
        r = pend.ele
        if not r:
            try:
                found = [re for re in submissions + comments if re.id == pend.reddit_id]
                if not found:
                    raise Exception('Unable to locate via PushShift!')
                r = found[0]
            except Exception as ex:
                self.failures.append(FailedPost(pend.reddit_id, pend.title, pend.files, reason="Error parsing: %s" % ex))
                continue
        r.source_alias = pend.source + '-imported'
        post = self.session.query(sql.Post).filter(sql.Post.reddit_id == r.id).first()
        if not post:
            post = sql.Post.convert_element_to_post(r)
        self.find_missing_files(pending=pend, post=post)
        print("\n\nFinished Post: %s/%s" % (idx + 1, total), ' :: ', "%s%%" % round(((idx + 1) / total) * 100, 2))
def _praw_apply_filter(praw_object, order_by='new', limit=None, time='all'):
    """ Accepts a Praw object (subreddit/multireddit/user posts/etc) and applies filters to it. Returns a Generator. """
    if order_by == 'best':
        print('Sorting submissions by "best" is no longer supported. Use "top" instead.')
        order_by = 'top'
        time = 'day'
    order = [o for o in post_orders() if o[0] == order_by]
    assert len(order) > 0  # The order must be a valid value.
    assert time in time_filters()
    if limit is not None and limit < 1:
        limit = None
    order = order[0]
    try:
        if not order[1]:
            gen = getattr(praw_object, order[0])(limit=limit)
        else:
            gen = getattr(praw_object, order[0])(limit=limit, time_filter=time)
        for g in gen:
            yield RedditElement(g)
    except TypeError as e:
        stringutil.error('Invalid Praw order configuration! [%s]' % order_by)
        print(order)
        print(e)
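A usage sketch, assuming `_reddit` is the module's authenticated praw.Reddit instance (not shown here) and 'pics' is a placeholder subreddit:

# Hypothetical caller: top posts of the day from a single subreddit.
subreddit = _reddit.subreddit('pics')
for element in _praw_apply_filter(subreddit, order_by='top', limit=10, time='day'):
    print(element.id, element.title)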
def get_elements(self):
    ps = PushshiftAPI()
    for user in self.data['users'].split(','):
        user = user.replace('/u/', '', 1).strip()
        _params = {'author': user}
        if self.data['limit']:
            _params['limit'] = self.data['limit']
        if self.data['scan_submissions']:
            for post in ps.search_submissions(**_params):
                p = RedditElement(post)
                if self.check_filters(p):
                    yield p
        if self.data['scan_comments']:
            for post in ps.search_comments(**_params):
                # Note: unlike the guarded variant above, this raises IndexError if PushShift can't find the parent Submission.
                sub = list(ps.search_submissions(ids=post.link_id.replace('t3_', '', 1), limit=1))[0]
                p = RedditElement(post, ext_submission_obj=sub)
                if self.check_filters(p):
                    yield p
def test_load_submission(self):
    """ Load submission directly by ID """
    p = pw.get_submission(t3_id='t3_6es0u8')
    re = RedditElement(p)
    self.assertEqual(re.author, 'theshadowmoose', msg='Submission has invalid Author!')
    self.assertEqual(re.title, 'Test Direct Link', msg='Submission has invalid Title!')
def get_elements(self):
    ps = PushshiftAPI()
    for sub in self.data['subreddit'].split(','):
        sub = sub.replace('/r/', '', 1).strip()
        _params = {'subreddit': sub}
        if self.data['limit']:
            _params['limit'] = self.data['limit']
        for post in ps.search_submissions(**_params):
            p = RedditElement(post)
            if self.check_filters(p):
                yield p
def get_comments(self, comments):
    found = list(self._ps.search_comments(ids=comments, limit=len(comments)))
    subs = list(self._ps.search_submissions(limit=len(found), ids=[c.link_id.replace('t3_', '', 1) for c in found]))
    for s in subs:
        # Match each located Submission back to the comment that referenced it.
        search = list(filter(lambda c: c.link_id.replace('t3_', '', 1) == s.id, found))
        if not search:
            print('Failed to locate comment using parent ID!', s.id)
            continue
        com = search[0]
        found.remove(com)
        # Drop the resolved id, so only unresolved comments remain for the fallback.
        comments = list(filter(lambda c: c.replace('t1_', '', 1) != com.id.replace('t1_', '', 1), comments))
        yield RedditElement(com, ext_submission_obj=s)
    if self.data['slow_fallback'] and len(comments):
        for comm in get_info(['t1_' + c for c in comments]):
            yield RedditElement(comm)
def batch_comment_lookup(comments):
    # Split the id list into PushShift-sized batches.
    chunks = [comments[x:x + ps_query_size] for x in range(0, len(comments), ps_query_size)]
    found = []
    for idx, ch in enumerate(chunks):
        print("Scanning comments... Batch: %s/%s" % (idx + 1, len(chunks)))
        for com in ps.search_comments(limit=len(ch), ids=list(ch)):
            found.append(com)
    subs = [s for s in batch_submission_lookup([c.link_id for c in found])]
    for s in subs:
        search = list(filter(lambda c: c.link_id.replace('t3_', '', 1) == s.id, found))
        if not search:
            continue
        com = search[0]
        found.remove(com)
        yield RedditElement(com, ext_submission_obj=s)
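The slice expression above is a standard fixed-size chunker. A worked example, assuming a hypothetical `ps_query_size` of 500 (the real constant is defined elsewhere in the module):

ids = ['t1_%x' % i for i in range(1200)]
ps_query_size = 500  # assumed value, for illustration only
chunks = [ids[x:x + ps_query_size] for x in range(0, len(ids), ps_query_size)]
print([len(c) for c in chunks])  # [500, 500, 200]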
def get_elements(self):
    ps = PushshiftAPI()
    term = self.data['search_term']
    sub_list = self.data['subreddits'] or None
    subs = sub_list.split(',') if sub_list else [None]
    for sub in subs:
        if sub:
            sub = sub.strip()
        gen = ps.search_submissions(q=term, subreddit=sub, limit=self.data['limit'])
        for post in gen:
            p = RedditElement(post)
            if self.check_filters(p):
                yield p
def get_elements(self):
    check_last = self.data['check_last_seen_posts']
    if not check_last or check_last < 1:
        check_last = None
    ps = PushshiftAPI()
    for user in self.data['users'].split(','):
        user = user.replace('/u/', '', 1).strip()
        _params = {'author': user}
        if self.data['limit']:
            _params['limit'] = self.data['limit']
        if self.data['scan_submissions']:
            if check_last is not None:
                last_seen = get_last_seen_posts(user, check_last, self.data['check_last_seen_utc'])
            else:
                last_seen = None
            last_seen_i = None
            for post in ps.search_submissions(**_params):
                p = RedditElement(post)
                if check_last is not None:
                    res = self.is_new_post(p, last_seen)
                    if type(res) is int:
                        # An int marks the first overlap with the previously-seen window.
                        if last_seen_i is None:
                            print("Reached start of last seen posts at: (%s/%s) [%s] %s %s" % (res, len(last_seen), p.strf_created_utc(), p.author, p.id), debug=True)
                            last_seen_i = res
                    elif res is False:
                        # False means the scan has walked past the window entirely; stop scanning this user.
                        print("Reached end of last seen posts at: (%s/%s) [%s] %s %s" % (last_seen_i, len(last_seen), p.strf_created_utc(), p.author, p.id), debug=True)
                        break
                if self.check_filters(p):
                    yield p
        if self.data['scan_comments']:
            for post in ps.search_comments(**_params):
                parents = list(ps.search_submissions(ids=post.link_id.replace('t3_', '', 1), limit=1))
                if not len(parents):
                    print("PushShift Warning: Unable to locate parent Submission:", post.link_id)
                    continue
                submission = parents[0]
                p = RedditElement(post, ext_submission_obj=submission)
                if self.check_filters(p):
                    yield p
def get_elements(self):
    ps = PushshiftAPI()
    for sub in self.data['subreddit'].split(','):
        sub = sub.replace('/r/', '', 1).strip()
        _params = {
            'subreddit': sub,
            'sort_type': self.data['sort_by'],
            'after': self.convert_offset()
        }
        if self.data['limit']:
            _params['limit'] = self.data['limit']
        if 'desc' in self.data['sort_order'].lower():
            _params['sort'] = 'desc'
        else:
            _params['sort'] = 'asc'
        print(_params)
        for post in ps.search_submissions(**_params):
            p = RedditElement(post)
            if self.check_filters(p):
                yield p
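For reference, a fully-assembled `_params` dict from this builder might look like the following (all values hypothetical; 'after' is whatever convert_offset() returns for the configured window, and PushShift accepts both epoch timestamps and relative offsets like '7d'):

_params = {
    'subreddit': 'pics',         # placeholder subreddit
    'sort_type': 'created_utc',  # from self.data['sort_by']
    'after': '7d',               # from self.convert_offset(), assumed relative offset
    'limit': 100,                # from self.data['limit']
    'sort': 'desc',              # derived from self.data['sort_order']
}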
def test_load_comment(self):
    """ Load Comment directly by ID """
    com = pw.get_comment(t1_id='t1_dxz6n80')
    re = RedditElement(com)
    vals = {
        "_urls": ['https://stackoverflow.com/a/23709194'],
        "type": 'Comment',
        "id": 't1_dxz6n80',
        "title": 'Reddit Media Downloader is now Threaded - Scrape all the subreddits, *much* faster now.',
        "author": 'theshadowmoose',
        "parent": 't3_8ewkx2',
        "subreddit": 'DataHoarder',
        "over_18": False,
        "created_utc": 1524705293.0,
        "link_count": 1,
        "source_alias": None,
    }
    for k, v in vals.items():
        self.assertEqual(getattr(re, k), v, msg='%s was not properly set in Comment!' % k.title())
def __init__(self, reddit_id, files, source, title, reddit_ele=None):
    self.reddit_id = reddit_id
    self.title = title
    self.files = files
    self.source = source
    self.ele = RedditElement(reddit_ele) if reddit_ele else None
def test_missing_author(self):
    """ Really old posts should still work without old author data """
    ps = PushshiftAPI()
    post, = ps.search_submissions(limit=1, ids=['t3_otfrw'])
    re = RedditElement(post)
    self.assertEqual(re.author, 'Deleted')