def get_upvoted_wallpapers(subs, upvotes_data): """ Get the urls for upvotes in a specific subset of subreddits. This is the list of candidate upvotes from which we try to extract an actionable url to either download directly from or we pass to a plugin for the purpose or parsing and eventual odwnload link extraction. :param list subs: A list of strings naming each subreddit to get images from. :param list upvotes_data: the list of dictionaries returned from :func: get_upvotes. The 'url' key of these dictionaries can go to any webpage but need an explicit handler for albums or to parse non-direct links. See the included plugins for examples. :returns list: A list of dictionaries where each key 'url' is a direct link to an image """ if [] == subs or '*' == subs: subs = [child['data']['subreddit'].lower() for child in upvotes_data] print 'Getting candidates...\n' #make the list of subreddits lowercase for easier comparisons subs = map(string.lower, subs) matches = 0 candidates = [] print '\nCandidates found so far:\n' for child in upvotes_data: if child['data']['subreddit'].lower() in subs: candidates.append(child) matches += 1 so_far = '%d: [%s] %s (%s)' % ( matches, child['data']['subreddit'], child['data']['title'], child['data']['url']) if child['data']['over_18']: so_far += ' [NSFW]' so_far += '\n' print ensure_ascii(so_far) return candidates
def check_unhandled_posts(self):
    """Record the posts that no plugin handled, keyed by link domain.

    Compares the original candidate set (``self.candidates_backup``)
    against the posts the plugins reported as handled and stores
    ``(domain, "title (url)")`` tuples in ``self.unhandled_posts``; the
    summary printed at the end of the run uses these to help target
    plugin development/maintenance.
    """
    handled_posts = self.handled_posts.keys()
    unhandled_posts = self.candidates_backup.difference(handled_posts)
    for each in unhandled_posts:
        # BUG FIX: ensure_ascii was called with two arguments and its
        # single return value fed to a two-slot '%s (%s)' format, which
        # would raise at runtime (every other call site passes one
        # argument).  Build the string first, then sanitize it whole.
        self.unhandled_posts.add(
            (extract_domain(each.url),
             ensure_ascii('%s (%s)' % (each.title, each.url))))
def hand_off_to_plugins(self): """Calls each plugin module and hand the CandidateList off to it. """ for plugin in loaded_plugins: print 'Loading plugin: %s.\n' % plugin.__name__ plug_inst = plugin(self.database, self.candidates, self.output, self.type, self.config, self.categorize, self.nsfw) self.handled_posts.update(plug_inst.handled_posts) #lazy instantiation so we only get it on the first loop if len(self.candidates_backup) == 0: # candidates backup is the original list of candidates self.candidates_backup.update(plug_inst.candidates_backup) #trim down the candidates from what got parsed self.candidates = plug_inst.revised #this shouldn't(?) change so assigning them each time is fine self.image_urls_already_fetched = \ plug_inst.image_urls_already_fetched print '%s handled the following posts:\n' % plugin.__name__ if len(plug_inst.handled_posts): for post in plug_inst.handled_posts: print '%s (%s)' % \ (ensure_ascii(post.title, post.url)) print '\n\t...which provided the following image urls:\n' for link in plug_inst.handled_posts[post]: if link.duplicate: print '\t%s (Duplicate)\n' % ensure_ascii(link.url) elif link.skipped: print '\t%s (Skipped)\n' % ensure_ascii(link.url) else: print '\t%s\n' % ensure_ascii(link.url) else: print 'None.' print '\n'
def acquire(self): """ Handles the calls to the database and the requests out to the world for the image candidates. This handles images returned directly as well as gzipped images and does all of the reads and writes to and from the database. Images that are already in the database are skipped. This is done on a filename only basis and is not very smart if say there are two different images called "image1.jpg" for example. :param list candidates: The list of dictionaries that is returned from :func: get_upvoted_wallpapers, where each 'url' key s a direct link to an image. :param str output: The location to save the downloaded images to as a string """ if not os.path.exists(self.output): os.makedirs(self.output) #parse through links once and try to remove any unneeded plugins # (saves time once you have a lot of plugins) self.remove_unneeded_plugins() #parse links through plugins print '\nProcessing: parse links through plugins...' self.hand_off_to_plugins() print 'The following posts had links that were unhandled:' self.check_unhandled_posts() if len(self.unhandled_posts) > 0: #iterating through these sorted puts them in alpha order by domain #so you should be able to see which domains you want or need to # target for uh in sorted(self.unhandled_posts): print uh[0], '\t', ensure_ascii(uh[1]) print '\n' else: print 'None\n' def filter_dupes(handled): unduped = set() for h in handled: if not h.duplicate: unduped.add(h) return unduped def filter_new(handled): duped = set() for h in handled: if h.duplicate: duped.add(h) return duped len_posts = len(self.handled_posts) len_urls = sum([len(self.handled_posts[p]) for p in self.handled_posts]) len_new = sum([len(filter_dupes(self.handled_posts[p])) for p in self.handled_posts]) len_dupes = sum([len(filter_new(self.handled_posts[p])) for p in self.handled_posts]) c = PluginExceptionCounter.Instance() len_bad = c.get_count() print '\nComplete.' \ '\n%d posts were processed.' \ '\n%d urls were attempted.' 
\ '\n%d new images were acquired this run.' \ '\n%d were duplicate images.' \ '\n%d were not handled or invalid.\n' \ % (len_posts, len_urls, len_new, len_dupes, len_bad) #emit the exit code based on whether everything went well or not exit(len_bad)