import praw
import pytumblr
from py3pin.Pinterest import Pinterest

# Project-local helpers (logger, Submission, percentageComplete, signedCrc32,
# loadPinterestCache, savePinterestCache) are assumed to come from this
# project's own modules.


def getSubmissionsFromRedditList(redditList, source,
                                 earlyOutPoint=None, unlikeUnsave=False):
    submissions = []
    comments = []

    numTotalSubmissions = len(redditList)
    for currentSubmissionIndex, singleSubmission in enumerate(redditList):
        if currentSubmissionIndex and currentSubmissionIndex % 100 == 0:
            logger.log('Got {} submissions...'.format(currentSubmissionIndex))

        if type(singleSubmission) is praw.models.Submission:
            newSubmission = Submission()
            newSubmission.source = u'reddit'
            newSubmission.title = singleSubmission.title
            newSubmission.author = (singleSubmission.author.name
                                    if singleSubmission.author else u'no_author')
            newSubmission.subreddit = singleSubmission.subreddit.url
            newSubmission.subredditTitle = singleSubmission.subreddit.title
            newSubmission.body = singleSubmission.selftext
            newSubmission.bodyUrl = singleSubmission.url
            newSubmission.postUrl = singleSubmission.permalink
            submissions.append(newSubmission)

            logger.log(percentageComplete(currentSubmissionIndex, numTotalSubmissions))

            if unlikeUnsave:
                if source == 'liked':
                    singleSubmission.clear_vote()
                else:
                    singleSubmission.unsave()
                logger.log('Unsaved/cleared vote on submission '
                           + singleSubmission.permalink)

            # Check whether we've already downloaded this submission; if so, early out
            if (earlyOutPoint and earlyOutPoint[0]
                    and newSubmission.postUrl == earlyOutPoint[0].postUrl):
                logger.log('Found early out point after ' + str(len(submissions))
                           + ' new submissions.'
                           ' If you e.g. changed your total requests value and want to go deeper, set'
                           ' Reddit_Try_Request_Only_New to False in your settings.txt')
                break

        # The submission is actually a saved comment
        else:
            # See the very bottom of
            # https://praw.readthedocs.io/en/latest/getting_started/quick_start.html
            # for how to enumerate what information a submission can provide
            newSubmission = Submission()
            newSubmission.source = u'reddit'
            newSubmission.title = u'Comment on ' + singleSubmission.link_title
            newSubmission.author = (singleSubmission.author.name
                                    if singleSubmission.author else u'no_author')
            newSubmission.subreddit = singleSubmission.subreddit.url
            newSubmission.subredditTitle = singleSubmission.subreddit.title
            newSubmission.body = singleSubmission.body
            newSubmission.bodyUrl = singleSubmission.permalink
            newSubmission.postUrl = singleSubmission.link_permalink
            comments.append(newSubmission)

    return submissions, comments
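
# Usage sketch (hypothetical): fetch a user's saved listing with PRAW and hand
# it to getSubmissionsFromRedditList. The credential placeholders and the
# 'saved' source label are assumptions for illustration; praw.Reddit and
# Redditor.saved() are real PRAW APIs.
def exampleGetSavedSubmissions():
    reddit = praw.Reddit(client_id='YOUR_CLIENT_ID',          # placeholder
                         client_secret='YOUR_CLIENT_SECRET',  # placeholder
                         username='YOUR_USERNAME',            # placeholder
                         password='YOUR_PASSWORD',            # placeholder
                         user_agent='liked-saved-downloader example')
    # saved() yields both praw.models.Submission and praw.models.Comment
    # objects, which is why getSubmissionsFromRedditList branches on the type
    savedListing = list(reddit.user.me().saved(limit=100))
    return getSubmissionsFromRedditList(savedListing, source='saved',
                                        unlikeUnsave=False)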
def getTumblrUserLikedSubmissions(clientId, clientSecret, tokenId, tokenSecret,
                                  likeRequestLimit=100, requestOnlyNewCache=None):
    tumblrClient = pytumblr.TumblrRestClient(clientId, clientSecret,
                                             tokenId, tokenSecret)

    # This is an annoying limit the API seems to impose
    POSTS_PER_PAGE = 50

    oldestPageTimestamp = 0
    totalRequests = 0
    submissions = []
    foundOldSubmission = False
    while totalRequests < likeRequestLimit:
        # oldestPageTimestamp is only used as a "past the first page" flag
        # here; paging is actually done via the offset parameter
        if oldestPageTimestamp:
            tumblrLikes = tumblrClient.likes(**{'limit': POSTS_PER_PAGE,
                                                'offset': totalRequests})
        else:
            tumblrLikes = tumblrClient.likes(**{'limit': POSTS_PER_PAGE})

        numPostsThisPage = len(tumblrLikes['liked_posts'])
        if not numPostsThisPage:
            break

        logger.log(str(numPostsThisPage) + ' Tumblr likes requested. Total likes: '
                   + str(tumblrLikes['liked_count']))

        for postIndex, post in reversed(list(enumerate(tumblrLikes['liked_posts']))):
            if 'photos' in post:
                for photoIndex, photo in enumerate(post['photos']):
                    newSubmission = Submission()
                    newSubmission.source = u'Tumblr'
                    # Tumblr submissions don't have titles, so make one from a
                    # CRC of the post URL, plus the photo index if the post has
                    # multiple photos. This'll look ugly in the file browser,
                    # unfortunately
                    newSubmission.title = str(signedCrc32(post['short_url'].encode()))
                    if len(post['photos']) > 1:
                        newSubmission.title += u'_' + str(photoIndex)
                    newSubmission.author = post['blog_name']
                    newSubmission.subreddit = post['short_url']
                    newSubmission.subredditTitle = post['blog_name'] + '_Tumblr'
                    newSubmission.body = post['caption']
                    newSubmission.bodyUrl = photo['original_size']['url']
                    newSubmission.postUrl = post['short_url']
                    submissions.append(newSubmission)

                    if (requestOnlyNewCache and requestOnlyNewCache[0]
                            and newSubmission.postUrl == requestOnlyNewCache[0].postUrl):
                        logger.log('Found early out point after ' + str(len(submissions))
                                   + ' new submissions.'
                                   ' If you e.g. changed your total requests value and want to go deeper, set'
                                   ' Tumblr_Try_Request_Only_New to False in your settings.txt')
                        foundOldSubmission = True
                        break
            else:
                logger.log('Skipped ' + post['short_url'] + ' (does not have images)')

            if foundOldSubmission:
                break

        if foundOldSubmission:
            break

        oldestPageTimestamp = tumblrLikes['liked_posts'][-1]['liked_timestamp']

        # If we didn't get a full page's worth of posts, we're on the last page.
        # Sometimes pages return slightly fewer than POSTS_PER_PAGE posts, so
        # allow a little slack
        RANDOM_PAGE_TOLERANCE = 10
        if numPostsThisPage < POSTS_PER_PAGE - RANDOM_PAGE_TOLERANCE:
            break

        totalRequests += numPostsThisPage

    newEarlyOut = submissions[0] if len(submissions) else None
    return submissions, newEarlyOut
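
# signedCrc32 is a project-local helper; a minimal sketch of what it could
# look like, assuming it wraps zlib.crc32 and folds the result into a signed
# 32-bit integer so titles stay short and deterministic (the actual
# implementation may differ):
import zlib

def signedCrc32Sketch(data):
    # zlib.crc32 returns an unsigned 32-bit value in Python 3; reinterpret the
    # top bit as a sign bit to get the signed form
    crc = zlib.crc32(data)
    return crc - (1 << 32) if crc & (1 << 31) else crc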
def getPinterestUserPinnedSubmissions(email, username, password, cacheFileName):
    submissions = []
    lastIds = {} if not cacheFileName else loadPinterestCache(cacheFileName)
    # Note: this aliases lastIds rather than copying it, so ids recorded below
    # are also visible to the 'not in lastIds' check, which deduplicates
    # repeated pins within this run
    updatedLastIds = lastIds

    pinterest = Pinterest(email=email,
                          password=password,
                          username=username,
                          cred_root='pinterest_creds')
    logger.log("Logging in to Pinterest...")
    pinterest.login()

    boards = pinterest.boards(username=username)
    for board in boards:
        # Get all pins for the board. board_feed() appears to page internally,
        # returning the next batch on each call and an empty list once the
        # board is exhausted, which is what terminates this loop
        board_pins = []
        pin_batch = pinterest.board_feed(board_id=board['id'])
        while len(pin_batch) > 0:
            for pin in pin_batch:
                if pin['id'] not in lastIds:
                    # Only using the dict for its key lookup
                    updatedLastIds[pin['id']] = 1
                    board_pins.append(pin)
            pin_batch = pinterest.board_feed(board_id=board['id'])

        for pin in board_pins:
            # I'm not sure how important it is to support these
            if pin['type'] == 'story':
                continue
            newSubmission = Submission()
            newSubmission.source = u'Pinterest'
            # While pins do have titles, 90% of the time they seem useless
            newSubmission.title = pin['id']
            # There is probably a way to figure out who the original pinner is, but oh well
            newSubmission.author = 'N/A'
            newSubmission.subreddit = board['url']
            newSubmission.subredditTitle = board['name'] + '_Pinterest'
            if 'rich_summary' in pin and pin['rich_summary']:
                if 'display_description' in pin['rich_summary']:
                    newSubmission.body = pin['rich_summary']['display_description']
                else:
                    newSubmission.body = 'N/A'
                newSubmission.postUrl = pin['rich_summary']['url']
            # What is actually downloaded
            newSubmission.bodyUrl = pin['images']['orig']['url']
            submissions.append(newSubmission)

    if cacheFileName:
        savePinterestCache(cacheFileName, updatedLastIds)

    logger.log("Found {} new Pinterest submissions".format(len(submissions)))
    return submissions
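
# loadPinterestCache/savePinterestCache are project-local helpers; a minimal
# sketch of one plausible implementation, assuming the cache is a JSON file
# mapping already-seen pin ids to 1 (the actual on-disk format is an
# assumption, not confirmed by this code):
import json
import os

def loadPinterestCacheSketch(cacheFileName):
    # Return an empty dict if no cache has been written yet
    if not os.path.exists(cacheFileName):
        return {}
    with open(cacheFileName, 'r') as cacheFile:
        return json.load(cacheFile)

def savePinterestCacheSketch(cacheFileName, lastIds):
    with open(cacheFileName, 'w') as cacheFile:
        json.dump(lastIds, cacheFile)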