forked from makuto/Liked-Saved-Image-Downloader
-
Notifications
You must be signed in to change notification settings - Fork 0
/
redditScraper.py
132 lines (98 loc) · 5.4 KB
/
redditScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# -*- coding: utf-8 -*-
import logger
import praw
import submission
from submission import Submission
#import pprint
# User-agent string handed to praw.Reddit when the API client is created
user_agent = 'Python Script: v2.0: Reddit Liked Saved Image Downloader (by /u/makuto9)'
def percentageComplete(currentItem, numItems):
    """Return progress through a list as a whole-number percentage string.

    currentItem is the zero-based index of the item just handled, so one is
    added before dividing: index 0 of 10 items is reported as '10%'.

    Returns 'Invalid' when numItems is falsy (0 or None), which also avoids
    dividing by zero.
    """
    if not numItems:
        return 'Invalid'

    # Multiply before dividing and use floor division so the result is exact
    # for integer inputs. The float form int((29.0 / 100.0) * 100) truncates
    # 28.999999999999996 down to 28 and under-reports the progress by one.
    return str(int((currentItem + 1) * 100 // numItems)) + '%'
def _newRedditSubmission(redditItem):
    """Create a Submission holding the fields that reddit posts and comments share."""
    newSubmission = Submission()
    newSubmission.source = u'reddit'
    newSubmission.author = redditItem.author.name if redditItem.author else u'no_author'
    newSubmission.subreddit = redditItem.subreddit.url
    newSubmission.subredditTitle = redditItem.subreddit.title
    return newSubmission


def getSubmissionsFromRedditList(redditList, source, earlyOutPoint = None, unlikeUnsave = False):
    """Convert a list of praw items into Submission objects.

    Args:
        redditList: A sized sequence (len() is taken) of praw items. Items that
            are praw.models.Submission are treated as posts; every other item
            is treated as a saved comment.
        source: 'liked' makes unlikeUnsave clear the vote on each post; any
            other value makes it unsave the post instead.
        earlyOutPoint: Optional sequence whose first element has a postUrl.
            Processing stops once a post with that postUrl has been handled.
            That post is still included in the returned submissions.
        unlikeUnsave: When true, each post is unliked/unsaved (see source)
            right after it has been recorded. Comments are never touched.

    Returns:
        A tuple (submissions, comments) of two lists of Submission objects.
    """
    submissions = []
    comments = []

    numTotalSubmissions = len(redditList)
    for currentSubmissionIndex, singleSubmission in enumerate(redditList):
        if currentSubmissionIndex and currentSubmissionIndex % 100 == 0:
            logger.log('Got {} submissions...'.format(currentSubmissionIndex))

        newSubmission = _newRedditSubmission(singleSubmission)

        if not isinstance(singleSubmission, praw.models.Submission):
            # The item is actually a saved comment. The bottom of
            # https://praw.readthedocs.io/en/latest/getting_started/quick_start.html
            # shows how to enumerate the attributes an item provides.
            newSubmission.title = u'Comment on ' + singleSubmission.link_title
            newSubmission.body = singleSubmission.body
            newSubmission.bodyUrl = singleSubmission.permalink
            newSubmission.postUrl = singleSubmission.link_permalink
            comments.append(newSubmission)
            continue

        newSubmission.title = singleSubmission.title
        newSubmission.body = singleSubmission.selftext
        newSubmission.bodyUrl = singleSubmission.url
        newSubmission.postUrl = singleSubmission.permalink
        submissions.append(newSubmission)

        logger.log(percentageComplete(currentSubmissionIndex, numTotalSubmissions))

        if unlikeUnsave:
            if source == 'liked':
                singleSubmission.clear_vote()
            else:
                singleSubmission.unsave()

            logger.log('Unsaved/cleared vote on submission ' + singleSubmission.permalink)

        # Check to see if we've already downloaded this submission; if so, early out.
        # This runs after the append on purpose: the matching post stays in the results.
        if (earlyOutPoint
                and earlyOutPoint[0]
                and newSubmission.postUrl == earlyOutPoint[0].postUrl):
            logger.log('Found early out point after ' + str(len(submissions)) + ' new submissions.'
                       ' If you e.g. changed your total requests value and want to go deeper, set'
                       ' Reddit_Try_Request_Only_New to False in your settings.txt')
            break

    return submissions, comments
def getRedditUserLikedSavedSubmissions(user_name, user_password, client_id, client_secret,
                                       request_limit = 10, saveLiked = True, saveSaved = True,
                                       earlyOutPointSaved = None, earlyOutPointLiked = None,
                                       unlikeLiked = False, unsaveSaved = False):
    """Fetch the logged-in user's saved and/or upvoted reddit items.

    Args:
        user_name, user_password, client_id, client_secret: Credentials passed
            to praw.Reddit.
        request_limit: Passed as the limit of each listing request.
        saveLiked: Fetch and convert the user's upvoted items.
        saveSaved: Fetch and convert the user's saved items.
        earlyOutPointSaved, earlyOutPointLiked: Forwarded to
            getSubmissionsFromRedditList as the early-out point of each list.
        unlikeLiked, unsaveSaved: Forwarded to getSubmissionsFromRedditList as
            its unlikeUnsave flag for the liked / saved list respectively.

    Returns:
        A tuple (submissions, comments, (newEarlyOutSaved, newEarlyOutLiked)).
        Saved entries come before liked entries in both lists. Each new
        early-out value is the first submission of its list, or None when
        that list is empty.
    """
    redditClient = praw.Reddit(client_id=client_id,
                               client_secret=client_secret,
                               username=user_name,
                               password=user_password,
                               user_agent=user_agent)

    logger.log('\n\nCommunicating with reddit. This should only take a minute...\n')

    # Both listings are downloaded up front, before any of them is converted
    savedItems = None
    if saveSaved:
        logger.log('\tGetting saved links...')
        savedItems = list(redditClient.user.me().saved(limit=request_limit))

    likedItems = None
    if saveLiked:
        logger.log('\tGetting liked links...')
        likedItems = list(redditClient.user.me().upvoted(limit=request_limit))

    savedSubmissions, savedComments = [], []
    likedSubmissions, likedComments = [], []

    if saveSaved:
        logger.log('\n\nRetrieving your saved submissions. This can take several minutes...\n')
        savedSubmissions, savedComments = getSubmissionsFromRedditList(
            savedItems, 'saved', earlyOutPointSaved, unsaveSaved)

    if saveLiked:
        logger.log('\n\nRetrieving your liked submissions. This can take several minutes...\n')
        likedSubmissions, likedComments = getSubmissionsFromRedditList(
            likedItems, 'liked', earlyOutPointLiked, unlikeLiked)

    # The first entry of each list becomes that list's new early-out point
    newEarlyOutSaved = savedSubmissions[0] if savedSubmissions else None
    newEarlyOutLiked = likedSubmissions[0] if likedSubmissions else None

    # Liked comments probably never occur, but they are merged in regardless
    return (savedSubmissions + likedSubmissions,
            savedComments + likedComments,
            (newEarlyOutSaved, newEarlyOutLiked))