Example #1
0
def user_liked_saved(username,
                     scan_upvoted=True,
                     scan_saved=True,
                     scan_sub=None):
    """
    Generator over the upvoted and/or saved posts of the given user, as RedditElements.

    :param username: Name of the Redditor to scan.
    :param scan_upvoted: If True, include the user's upvoted posts.
    :param scan_saved: If True, include the user's saved posts.
    :param scan_sub: Optional subreddit name to filter the results by.
    """
    # Only send the 'sr' filter when a subreddit was actually requested.
    params = {'sr': scan_sub} if scan_sub else None
    try:
        # Saved/upvoted listings are private, so reuse the authenticated
        # account object when the target is the logged-in user.
        if _user.name.lower() == username.lower():
            redditor = _user
        else:
            redditor = _reddit.redditor(username)
        if scan_saved:
            for saved in redditor.saved(limit=None, params=params):
                yield RedditElement(saved)
        if scan_upvoted:
            for upvoted in redditor.upvoted(limit=None, params=params):
                yield RedditElement(upvoted)
    except prawcore.exceptions.NotFound:
        stringutil.error(
            'Cannot locate comments or submissions for nonexistent user: %s' %
            username)
    # Use the same exceptions module path as the NotFound handler above,
    # matching the convention used by the other handlers in this file.
    except prawcore.exceptions.Forbidden:
        stringutil.error(
            'Cannot load Upvoted/Saved Posts from the User "%s", because they are private!'
            % username)
Example #2
0
def user_posts(username,
               find_submissions,
               find_comments,
               find_limit=None,
               deep_find_submissions=False,
               deep_find_comments=False):  #vy
    """ Generator for all the posts made by the given Redditor. """
    try:
        # Treat a missing or non-positive limit as "no limit".
        if find_limit is None or find_limit <= 0:
            limit = None
        else:
            limit = find_limit
        # Collect the listings to scan, in the same order as before:
        # new comments, top comments, new submissions, top submissions.
        listings = []
        if find_comments or deep_find_comments:
            listings.append(_reddit.redditor(username).comments.new)
        if deep_find_comments:
            listings.append(_reddit.redditor(username).comments.top)
        if find_submissions or deep_find_submissions:
            listings.append(_reddit.redditor(username).submissions.new)
        if deep_find_submissions:
            listings.append(_reddit.redditor(username).submissions.top)
        for listing in listings:
            for post in listing(limit=limit):
                yield RedditElement(post)
    except prawcore.exceptions.NotFound:
        stringutil.error(
            'Cannot locate comments or submissions for nonexistent user: %s' %
            username)
        with open('user.nonexistent.log', 'a+', newline='\n') as f:
            f.write(username + '\n')
    except prawcore.exceptions.Forbidden:
        stringutil.error(
            'Cannot locate posts from a suspended user account: %s' % username)
        with open('user.suspended.log', 'a+', newline='\n') as f:
            f.write(username + '\n')
 def get_elements(self):
     """ Yield filtered RedditElements for each configured user's submissions and/or comments, via PushShift. """
     api = PushshiftAPI()
     for raw_name in self.data['users'].split(','):
         author = raw_name.replace('/u/', '', 1).strip()
         query = {'author': author}
         if self.data['limit']:
             query['limit'] = self.data['limit']
         if self.data['scan_submissions']:
             for found in api.search_submissions(**query):
                 element = RedditElement(found)
                 if self.check_filters(element):
                     yield element
         if self.data['scan_comments']:
             for found in api.search_comments(**query):
                 # Comments need their parent submission to build a full element.
                 parent_id = found.link_id.replace('t3_', '', 1)
                 parents = list(api.search_submissions(ids=parent_id, limit=1))
                 if not parents:
                     print(
                         "PushShift Warning: Unable to locate parent Submission:",
                         found.link_id)
                     continue
                 element = RedditElement(found, ext_submission_obj=parents[0])
                 if self.check_filters(element):
                     yield element
Example #4
0
    def get_elements(self):
        """ Yield the RedditElement(s) referenced by the configured Reddit URL (a comment or a submission link). """
        url = self.data['url']
        # A comment URL contains an extra path segment after the submission ID.
        submission_match = re.search(r'\/comments\/([a-zA-Z0-9]+)\/?', url)
        comment_match = re.search(r'\/comments\/.+?\/.+?\/([a-zA-Z0-9]+)\/?', url)
        api = PushshiftAPI()

        if comment_match:
            for post in api.search_comments(ids=[comment_match.group(1)]):
                parent_id = post.link_id.replace('t3_', '', 1)
                parents = list(api.search_submissions(ids=parent_id, limit=1))
                if not parents:
                    raise AssertionError(
                        "PushShift Warning: Unable to locate direct parent Submission:",
                        post.link_id)
                p = RedditElement(post, ext_submission_obj=parents[0])
                if self.check_filters(p):
                    yield p
        elif submission_match:
            sub_id = submission_match.group(1).replace('t3_', '', 1)
            for post in api.search_submissions(ids=[sub_id], limit=1):
                p = RedditElement(post)
                if self.check_filters(p):
                    yield p
        else:
            raise TypeError('Invalid Reddit URL provided! "%s"' % url)
 def get_subs(self, subs):
     """ Yield elements for the given submission IDs, falling back to a slow lookup for any PushShift misses. """
     remaining = subs
     for post in self._ps.search_submissions(ids=subs, limit=len(subs)):
         # Cross the returned submission off the pending-ID list.
         found_id = post.id.replace('t3_', '', 1)
         remaining = [sub_id for sub_id in remaining if sub_id != found_id]
         element = RedditElement(post)
         if self.check_filters(element):
             yield element
     # Anything PushShift failed to return can optionally be fetched the slow way.
     if self.data['slow_fallback'] and remaining:
         for raw in get_info(['t3_' + sub_id for sub_id in remaining]):
             yield RedditElement(raw)
def user_posts(username, find_submissions, find_comments):
	"""
	Generator for all the posts made by the given Redditor.

	:param username: The Redditor to scan.
	:param find_submissions: If True, yield the user's submissions.
	:param find_comments: If True, yield the user's comments.
	"""
	try:
		if find_comments:
			for c in _reddit.redditor(username).comments.new():
				yield RedditElement(c)
		if find_submissions:
			for c in _reddit.redditor(username).submissions.new():
				yield RedditElement(c)
	except prawcore.exceptions.NotFound:
		stringutil.error('Cannot locate comments or submissions for nonexistent user: %s' % username)
	except prawcore.exceptions.Forbidden:
		# Suspended/private accounts raise Forbidden; log it instead of crashing the scan,
		# matching the handling in the extended user_posts variant.
		stringutil.error('Cannot locate posts from a suspended user account: %s' % username)
 def test_submission_gallery(self):
     """ Parse all gallery links """
     submission = pw.get_submission(t3_id='t3_hrrh23')
     element = RedditElement(submission)
     urls = element.get_urls()
     self.assertEqual(
         len(urls),
         3,
         msg='Got incorrect image count from reddit gallery submission!')
     for url in urls:
         self.assertIn('https',
                       url,
                       msg='Failed to extract valid gallery URL: %s' % url)
 def test_gallery(self):
     """ Should load all gallery images """
     api = PushshiftAPI()
     post, = api.search_submissions(limit=1, ids=['t3_hrrh23'])
     element = RedditElement(post)
     urls = element.get_urls()
     self.assertEqual(
         len(urls),
         3,
         msg='Got incorrect image count from PSAW gallery submission!')
     for url in urls:
         self.assertIn('https',
                       url,
                       msg='Failed to extract valid gallery URL: %s' % url)
	def process_posts(self):
		"""
		Iterate through all the located PendingPosts, and process them.

		Resolves each pending post to a RedditElement (via the batch PushShift lookups),
		records failures, upserts the matching sql.Post, and checks for missing files.
		"""
		submissions = [p.reddit_id for p in self.posts.values() if p.reddit_id.startswith('t3_')]
		comments = [p.reddit_id for p in self.posts.values() if p.reddit_id.startswith('t1_')]

		# Submission lookup returns raw posts that need wrapping;
		# batch_comment_lookup already yields RedditElements.
		submissions = [RedditElement(s) for s in batch_submission_lookup(submissions)]
		comments = list(batch_comment_lookup(comments))

		total = len(self.posts)  # invariant; hoisted out of the loop
		# enumerate from 1 so progress reads "1/total" .. "total/total" (100%),
		# instead of the old off-by-one "0/total" .. "(total-1)/total".
		for idx, pend in enumerate(self.posts.values(), start=1):
			r = pend.ele
			if not r:
				try:
					found = [ele for ele in submissions + comments if ele.id == pend.reddit_id]
					if not found:
						raise Exception('Unable to locate via PushShift!')
					r = found[0]
				except Exception as ex:
					self.failures.append(FailedPost(pend.reddit_id, pend.title, pend.files, reason="Error parsing: %s" % ex))
					continue
			r.source_alias = pend.source + '-imported'
			post = self.session.query(sql.Post).filter(sql.Post.reddit_id == r.id).first()
			if not post:
				post = sql.Post.convert_element_to_post(r)
			self.find_missing_files(pending=pend, post=post)
			print("\n\nFinished Post: %s/%s" % (idx, total), ' :: ', "%s%%" % round((idx/total)*100, 2))
Example #10
0
def _praw_apply_filter(praw_object, order_by='new', limit=None, time='all'):
    """ Accepts a Praw object (subreddit/multireddit/user posts/etc) and applies filters to it. Returns a Generator. """
    if order_by == 'best':
        print(
            'Sorting submissions by "best" is no longer supported. Use "top" instead.'
        )
        order_by = 'top'
        time = 'day'
    order = [o for o in post_orders() if o[0] == order_by]
    assert len(order) > 0  # The order must be a valid value.
    assert time in time_filters()
    # BUG FIX: 'limit' defaults to None, and 'None < 1' raises TypeError on
    # Python 3 — guard the comparison so the default actually works.
    if limit is not None and limit < 1:
        limit = None
    order = order[0]
    try:
        # order[1] flags whether this listing accepts a time_filter (e.g. 'top').
        if not order[1]:
            gen = getattr(praw_object, order[0])(limit=limit)
        else:
            gen = getattr(praw_object, order[0])(limit=limit, time_filter=time)
        for g in gen:
            yield RedditElement(g)
    except TypeError as e:
        stringutil.error('Invalid Praw order configuration! [%s]' % order_by)
        print(order)
        print(e)
	def get_elements(self):
		""" Yield filtered RedditElements for each configured user's submissions and/or comments, via PushShift. """
		ps = PushshiftAPI()
		for user in self.data['users'].split(','):
			user = user.replace('/u/', '', 1).strip()
			_params = {'author': user}
			if self.data['limit']:
				_params['limit'] = self.data['limit']
			if self.data['scan_submissions']:
				for post in ps.search_submissions(**_params):
					p = RedditElement(post)
					if self.check_filters(p):
						yield p
			if self.data['scan_comments']:
				for post in ps.search_comments(**_params):
					# BUG FIX: indexing [0] crashed with IndexError whenever the parent
					# submission could not be found; warn and skip instead, matching
					# the other PushShift user-scan implementations in this file.
					parents = list(ps.search_submissions(ids=post.link_id.replace('t3_', '', 1), limit=1))
					if not parents:
						print("PushShift Warning: Unable to locate parent Submission:", post.link_id)
						continue
					p = RedditElement(post, ext_submission_obj=parents[0])
					if self.check_filters(p):
						yield p
 def test_load_submission(self):
     """ Load submission directly by ID """
     element = RedditElement(pw.get_submission(t3_id='t3_6es0u8'))
     self.assertEqual(element.author,
                      'theshadowmoose',
                      msg='Submission has invalid Author!')
     self.assertEqual(element.title,
                      'Test Direct Link',
                      msg='Submission has invalid Title!')
	def get_elements(self):
		""" Yield filtered RedditElements for every configured subreddit, via PushShift. """
		api = PushshiftAPI()
		for name in self.data['subreddit'].split(','):
			name = name.replace('/r/', '', 1).strip()
			query = {'subreddit': name}
			if self.data['limit']:
				query['limit'] = self.data['limit']
			for post in api.search_submissions(**query):
				element = RedditElement(post)
				if self.check_filters(element):
					yield element
 def get_comments(self, comments):
     """
     Yield a RedditElement for each of the given comment IDs, paired with its
     parent submission, looking both up through PushShift in bulk.
     Comments that cannot be resolved are optionally retried via get_info()
     when 'slow_fallback' is enabled.

     :param comments: List of comment IDs (with or without the 't1_' prefix).
     """
     # Bulk-fetch all requested comments in a single PushShift call.
     found = list(
         self._ps.search_comments(ids=comments, limit=len(comments)))
     # Bulk-fetch every parent submission referenced by the found comments.
     subs = list(
         self._ps.search_submissions(
             limit=len(found),
             ids=[c.link_id.replace('t3_', '', 1) for c in found]))
     for s in subs:
         # Match this submission back to the first still-unmatched comment
         # whose link_id points at it.
         search = list(
             filter(lambda c: c.link_id.replace('t3_', '', 1) == s.id,
                    found))
         if not search:
             print('Failed to locate comment using parent ID!', s.id)
             continue
         com = search[0]
         # Remove the matched comment from both working lists, so each
         # comment is yielded at most once and 'comments' ends up holding
         # only the IDs PushShift could not resolve.
         found.remove(com)
         comments = list(
             filter(
                 lambda c: c.replace('t1_', '', 1) != com.id.replace(
                     't1_', '', 1), comments))
         yield RedditElement(com, ext_submission_obj=s)
     # Any IDs left over were not resolved above; fall back to the slower
     # direct lookup if configured to do so.
     if self.data['slow_fallback'] and len(comments):
         for comm in get_info(['t1_' + c for c in comments]):
             yield RedditElement(comm)
def batch_comment_lookup(comments):
	"""
	Look up the given comment IDs through PushShift in fixed-size batches,
	then yield a RedditElement for each comment paired with its parent submission.

	:param comments: List of comment IDs to resolve.
	"""
	# Split the ID list into PushShift-sized query chunks.
	chunks = [comments[x:x+ps_query_size] for x in range(0, len(comments), ps_query_size)]
	found = []
	for idx, ch in enumerate(chunks):
		print("Scanning comments... Batch: %s/%s" % (idx+1, len(chunks)))
		# 'ids=ch' replaces the pointless '[c for c in ch]' copy;
		# extend() replaces the manual append loop.
		found.extend(ps.search_comments(limit=len(ch), ids=ch))
	# 'list(...)' replaces the identity comprehension over the generator.
	subs = list(batch_submission_lookup([c.link_id for c in found]))
	for s in subs:
		# Pair this submission with the first still-unmatched comment under it.
		# NOTE(review): link_ids are passed with duplicates above, so multiple
		# comments under one parent presumably produce duplicate submissions —
		# verify against batch_submission_lookup.
		search = list(filter(lambda c: c.link_id.replace('t3_', '', 1) == s.id, found))
		if not search:
			continue
		com = search[0]
		found.remove(com)
		yield RedditElement(com, ext_submission_obj=s)
Example #16
0
 def get_elements(self):
     """ Search PushShift for the configured term, optionally restricted to a set of subreddits. """
     api = PushshiftAPI()
     term = self.data['search_term']
     raw_subs = self.data['subreddits'] or None
     # When no subreddits are configured, a single unrestricted search runs.
     targets = raw_subs.split(',') if raw_subs else [None]
     for target in targets:
         if target:
             target = target.strip()
         results = api.search_submissions(q=term,
                                          subreddit=target,
                                          limit=self.data['limit'])
         for post in results:
             element = RedditElement(post)
             if self.check_filters(element):
                 yield element
Example #17
0
    def get_elements(self):
        """
        Yield filtered RedditElements for each configured user via PushShift,
        optionally stopping a submission scan early once previously-seen posts
        are reached (controlled by 'check_last_seen_posts').
        """
        check_last = self.data['check_last_seen_posts']
        # Disable the "last seen" early-exit when the setting is unset or non-positive.
        if not check_last or check_last < 1:
            check_last = None

        ps = PushshiftAPI()
        for user in self.data['users'].split(','):
            user = user.replace('/u/', '', 1).strip()
            _params = {'author': user}
            if self.data['limit']:
                _params['limit'] = self.data['limit']
            if self.data['scan_submissions']:
                if check_last is not None:
                    last_seen = get_last_seen_posts(
                        user, check_last, self.data['check_last_seen_utc'])
                else:
                    last_seen = None

                # Index into last_seen once the scan enters already-seen territory.
                last_seen_i = None
                for post in ps.search_submissions(**_params):
                    p = RedditElement(post)
                    if check_last is not None:
                        # NOTE(review): is_new_post appears to return an int index
                        # while inside the last-seen window, and False once the
                        # scan has moved past it — confirm against its definition.
                        res = self.is_new_post(p, last_seen)
                        if type(res) is int:
                            if last_seen_i is None:
                                print(
                                    "Reached start of last seen posts at: (%s/%s) [%s] %s %s"
                                    % (res, len(last_seen),
                                       p.strf_created_utc(), p.author, p.id),
                                    debug=True)
                            last_seen_i = res
                        elif res is False:
                            print(
                                "Reached end of last seen posts at: (%s/%s) [%s] %s %s"
                                % (last_seen_i, len(last_seen),
                                   p.strf_created_utc(), p.author, p.id),
                                debug=True)
                            break
                    if self.check_filters(p):
                        yield p
            if self.data['scan_comments']:
                for post in ps.search_comments(**_params):
                    # Comments need their parent submission to build a full element.
                    parents = list(
                        ps.search_submissions(ids=post.link_id.replace(
                            't3_', '', 1),
                                              limit=1))
                    if not len(parents):
                        print(
                            "PushShift Warning: Unable to locate parent Submission:",
                            post.link_id)
                        continue
                    submission = parents[0]
                    p = RedditElement(post, ext_submission_obj=submission)
                    if self.check_filters(p):
                        yield p
 def get_elements(self):
     """
     Yield filtered RedditElements for each configured subreddit via PushShift,
     using the configured sort field, sort order, and time offset.
     """
     ps = PushshiftAPI()
     for sub in self.data['subreddit'].split(','):
         sub = sub.replace('/r/', '', 1).strip()
         _params = {
             'subreddit': sub,
             'sort_type': self.data['sort_by'],
             'after': self.convert_offset()
         }
         if self.data['limit']:
             _params['limit'] = self.data['limit']
         # PushShift expects 'asc'/'desc'; anything not containing 'desc' sorts ascending.
         if 'desc' in self.data['sort_order'].lower():
             _params['sort'] = 'desc'
         else:
             _params['sort'] = 'asc'
         # Removed a leftover debug print of the raw query parameters here.
         for post in ps.search_submissions(**_params):
             p = RedditElement(post)
             if self.check_filters(p):
                 yield p
 def test_load_comment(self):
     """ Load Comment directly by ID """
     element = RedditElement(pw.get_comment(t1_id='t1_dxz6n80'))
     expected = {
         "_urls": ['https://stackoverflow.com/a/23709194'],
         "type": 'Comment',
         "id": 't1_dxz6n80',
         "title":
         'Reddit Media Downloader is now Threaded - Scrape all the subreddits, *much* faster now.',
         "author": 'theshadowmoose',
         "parent": 't3_8ewkx2',
         "subreddit": 'DataHoarder',
         "over_18": False,
         "created_utc": 1524705293.0,
         "link_count": 1,
         "source_alias": None,
     }
     for attr, want in expected.items():
         self.assertEqual(getattr(element, attr),
                          want,
                          msg='%s was not properly set in Comment!' %
                          attr.title())
 def __init__(self, reddit_id, files, source, title, reddit_ele=None):
     """ Record a pending post; wraps any provided raw reddit object in a RedditElement. """
     self.reddit_id = reddit_id
     self.title = title
     self.files = files
     self.source = source
     # Only wrap when a raw reddit object was actually supplied.
     if reddit_ele:
         self.ele = RedditElement(reddit_ele)
     else:
         self.ele = None
 def test_missing_author(self):
     """ Really old posts should still work without old author data """
     api = PushshiftAPI()
     post, = api.search_submissions(limit=1, ids=['t3_otfrw'])
     element = RedditElement(post)
     self.assertEqual(element.author, 'Deleted')