def get_upvoted_wallpapers(subs, upvotes_data): """ Get the urls for upvotes in a specific subset of subreddits. This is the list of candidate upvotes from which we try to extract an actionable url to either download directly from or we pass to a plugin for the purpose or parsing and eventual odwnload link extraction. :param list subs: A list of strings naming each subreddit to get images from. :param list upvotes_data: the list of dictionaries returned from :func: get_upvotes. The 'url' key of these dictionaries can go to any webpage but need an explicit handler for albums or to parse non-direct links. See the included plugins for examples. :returns list: A list of dictionaries where each key 'url' is a direct link to an image """ if [] == subs or '*' == subs: subs = [child['data']['subreddit'].lower() for child in upvotes_data] print 'Getting candidates...\n' #make the list of subreddits lowercase for easier comparisons subs = map(string.lower, subs) matches = 0 candidates = [] print '\nCandidates found so far:\n' for child in upvotes_data: if child['data']['subreddit'].lower() in subs: candidates.append(child) matches += 1 so_far = '%d: [%s] %s (%s)' % ( matches, child['data']['subreddit'], child['data']['title'], child['data']['url']) if child['data']['over_18']: so_far += ' [NSFW]' so_far += '\n' print ensure_ascii(so_far) return candidates
def check_unhandled_posts(self):
    """Record the posts that no plugin handled, keyed by link domain.

    Compares the original candidate set (``self.candidates_backup``)
    against the posts the plugins reported as handled and stores
    ``(domain, "title (url)")`` tuples in ``self.unhandled_posts``; the
    summary printed at the end of the run uses these to help target
    plugin development/maintenance.
    """
    handled_posts = self.handled_posts.keys()
    unhandled_posts = self.candidates_backup.difference(handled_posts)
    for each in unhandled_posts:
        # BUG FIX: ensure_ascii was called with two arguments and its
        # single return value fed to a two-slot '%s (%s)' format, which
        # would raise at runtime (every other call site passes one
        # argument).  Build the string first, then sanitize it whole.
        self.unhandled_posts.add(
            (extract_domain(each.url),
             ensure_ascii('%s (%s)' % (each.title, each.url))))
def hand_off_to_plugins(self): """Calls each plugin module and hand the CandidateList off to it. """ for plugin in loaded_plugins: print 'Loading plugin: %s.\n' % plugin.__name__ plug_inst = plugin(self.database, self.candidates, self.output, self.type, self.config, self.categorize, self.nsfw) self.handled_posts.update(plug_inst.handled_posts) #lazy instantiation so we only get it on the first loop if len(self.candidates_backup) == 0: # candidates backup is the original list of candidates self.candidates_backup.update(plug_inst.candidates_backup) #trim down the candidates from what got parsed self.candidates = plug_inst.revised #this shouldn't(?) change so assigning them each time is fine self.image_urls_already_fetched = \ plug_inst.image_urls_already_fetched print '%s handled the following posts:\n' % plugin.__name__ if len(plug_inst.handled_posts): for post in plug_inst.handled_posts: print '%s (%s)' % \ (ensure_ascii(post.title, post.url)) print '\n\t...which provided the following image urls:\n' for link in plug_inst.handled_posts[post]: if link.duplicate: print '\t%s (Duplicate)\n' % ensure_ascii(link.url) elif link.skipped: print '\t%s (Skipped)\n' % ensure_ascii(link.url) else: print '\t%s\n' % ensure_ascii(link.url) else: print 'None.' print '\n'
def acquire(self): """ Handles the calls to the database and the requests out to the world for the image candidates. This handles images returned directly as well as gzipped images and does all of the reads and writes to and from the database. Images that are already in the database are skipped. This is done on a filename only basis and is not very smart if say there are two different images called "image1.jpg" for example. :param list candidates: The list of dictionaries that is returned from :func: get_upvoted_wallpapers, where each 'url' key s a direct link to an image. :param str output: The location to save the downloaded images to as a string """ if not os.path.exists(self.output): os.makedirs(self.output) #parse through links once and try to remove any unneeded plugins # (saves time once you have a lot of plugins) self.remove_unneeded_plugins() #parse links through plugins print '\nProcessing: parse links through plugins...' self.hand_off_to_plugins() print 'The following posts had links that were unhandled:' self.check_unhandled_posts() if len(self.unhandled_posts) > 0: #iterating through these sorted puts them in alpha order by domain #so you should be able to see which domains you want or need to # target for uh in sorted(self.unhandled_posts): print uh[0], '\t', ensure_ascii(uh[1]) print '\n' else: print 'None\n' def filter_dupes(handled): unduped = set() for h in handled: if not h.duplicate: unduped.add(h) return unduped def filter_new(handled): duped = set() for h in handled: if h.duplicate: duped.add(h) return duped len_posts = len(self.handled_posts) len_urls = sum([len(self.handled_posts[p]) for p in self.handled_posts]) len_new = sum([len(filter_dupes(self.handled_posts[p])) for p in self.handled_posts]) len_dupes = sum([len(filter_new(self.handled_posts[p])) for p in self.handled_posts]) c = PluginExceptionCounter.Instance() len_bad = c.get_count() print '\nComplete.' \ '\n%d posts were processed.' \ '\n%d urls were attempted.' 
\ '\n%d new images were acquired this run.' \ '\n%d were duplicate images.' \ '\n%d were not handled or invalid.\n' \ % (len_posts, len_urls, len_new, len_dupes, len_bad) #emit the exit code based on whether everything went well or not exit(len_bad)