def download_images(self):
        logger.info('Beginning download_images()')
        logger.debug("Downloading images using {} threads".format(self.workers))
        workpool = WorkerPool(size=self.workers)

        def create_download_jobs(key_func):
            for image_url, group in itertools.groupby(sorted(self.emotes, key=key_func), key_func):
                if not image_url:
                    continue

                file_path = get_file_path(image_url, rootdir=self.reddit_cache)
                if not path.isfile(file_path):
                    workpool.put(DownloadJob(self._requests,
                                         urlparse.urljoin('https://s3.amazonaws.com/', image_url),
                                             retry=5,
                                             rate_limit_lock=self.rate_limit_lock,
                                             callback=self._callback_download_image,
                                             **{'image_path': file_path}))

        with self.mutex:
            create_download_jobs(lambda e: e['background-image'])
            create_download_jobs(lambda e: e.get('hover-background-image'))

        workpool.shutdown()
        workpool.join()
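
Every example on this page drives the same two helpers, which come from each surrounding project and are never shown here. Below is a minimal sketch of the interface the call sites appear to assume (a fixed-size pool of worker threads consuming jobs from a queue, and a job object that fetches a URL and hands the result to a callback). All names, signatures, and behavior in this sketch are inferred from usage alone, not taken from any of the original repositories.

import queue
import threading


class WorkerPool(object):
    """Fixed-size thread pool executing queued jobs (inferred interface)."""

    def __init__(self, size=8):
        self._queue = queue.Queue()
        self._threads = [threading.Thread(target=self._work) for _ in range(size)]
        for thread in self._threads:
            thread.start()

    def put(self, job):
        self._queue.put(job)

    def shutdown(self):
        # One sentinel per worker signals that no more jobs are coming.
        for _ in self._threads:
            self._queue.put(None)

    def join(self):
        for thread in self._threads:
            thread.join()

    def _work(self):
        while True:
            job = self._queue.get()
            if job is None:
                return
            job.run()


class DownloadJob(object):
    """Fetches a URL and passes the response to a callback (inferred interface)."""

    def __init__(self, session, url, retry=1, rate_limit_lock=None,
                 callback=None, **callback_kwargs):
        self.session = session              # assumed to be a requests.Session
        self.url = url
        self.retry = retry
        self.rate_limit_lock = rate_limit_lock
        self.callback = callback
        self.callback_kwargs = callback_kwargs

    def run(self):
        for _ in range(self.retry):
            if self.rate_limit_lock is not None:
                # Assumed to be a rate limiter whose acquire() blocks until
                # a request slot is free, not a plain mutex.
                self.rate_limit_lock.acquire()
            response = self.session.get(self.url)
            if response.status_code == 200:
                break
        # Exact callback signatures vary between the projects above.
        self.callback(response, **self.callback_kwargs)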
Example 2
    def _fetch_css(self):
        logger.debug("Fetching css using {} threads".format(self.workers))
        workpool = WorkerPool(size=self.workers)

        if not os.path.exists(self.cache_dir):
            os.makedirs(self.cache_dir)

        for subreddit in self.subreddits:
            if subreddit in self.legacy_subreddits:
                legacy_file = '{}/../legacy_css/{}.css'.format(
                    os.path.dirname(__file__), subreddit)
                if os.path.exists(legacy_file):
                    with open(legacy_file) as fh:
                        css = fh.read()
                        self._process_stylesheet_response(
                            200, css, "text/css", subreddit)
                else:
                    logger.error(
                        "No css file found for legacy subreddit {}".format(
                            subreddit))
            else:
                workpool.put(
                    DownloadJob(
                        self._requests,
                        'https://old.reddit.com/r/{}/stylesheet'.format(
                            subreddit),
                        retry=5,
                        rate_limit_lock=self.rate_limit_lock,
                        callback=self._callback_fetch_stylesheet,
                        **{'subreddit': subreddit}))

        workpool.shutdown()
        workpool.join()
Example 3
    def _download_images(self):
        logger.debug("Downloading images using {} threads".format(
            self.workers))
        workpool = WorkerPool(size=self.workers)

        # cache emotes
        key_func = lambda e: e['background-image']
        with self.mutex:
            for image_url, group in itertools.groupby(
                    sorted(self.emotes, key=key_func), key_func):
                if not image_url:
                    continue

                file_path = self.get_file_path(image_url,
                                               rootdir=self.cache_dir)
                if not os.path.isfile(file_path):
                    # Temporary workaround: fetch APNGs directly from Amazon S3
                    # instead of the broken copies served through CloudFlare.
                    image_url = re.sub(r'^(https?:)?//',
                                       'https://s3.amazonaws.com/', image_url)
                    workpool.put(
                        DownloadJob(self._requests,
                                    image_url,
                                    retry=5,
                                    rate_limit_lock=self.rate_limit_lock,
                                    callback=self._callback_download_image,
                                    **{'image_path': file_path}))

        workpool.shutdown()
        workpool.join()
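
A detail shared by all of these snippets: the sorted() call in front of itertools.groupby is load-bearing. groupby only merges consecutive equal keys, so without sorting, a background-image URL that appears non-consecutively would be yielded as several groups and queued for download more than once. A quick illustration:

import itertools

urls = ['a.png', 'b.png', 'a.png']

# Unsorted input: 'a.png' shows up as two separate groups.
print([k for k, _ in itertools.groupby(urls)])          # ['a.png', 'b.png', 'a.png']

# Sorted input: one group (and hence one download job) per distinct URL.
print([k for k, _ in itertools.groupby(sorted(urls))])  # ['a.png', 'b.png']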
Example 4
    def _fetch_css(self):
        logger.debug("Fetching css using {} threads".format(self.workers))
        workpool = WorkerPool(size=self.workers)

        if not os.path.exists(self.cache_dir):
            os.makedirs(self.cache_dir)

        for subreddit in self.subreddits:
            css_path = os.path.sep.join([self.cache_dir, subreddit + ".css"])
            if self.prefer_cache and os.path.exists(css_path):
                with open(css_path) as css_file:
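                    # Note: read().decode("utf8") is a Python 2 idiom; a
                    # text-mode read on Python 3 already returns str, which
                    # has no .decode() method.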
                    css = css_file.read().decode("utf8")
                    self._handle_css(css, subreddit)
            else:
                workpool.put(
                    DownloadJob(
                        self._requests,
                        "http://www.reddit.com/r/{}/stylesheet".format(subreddit),
                        retry=5,
                        rate_limit_lock=self.rate_limit_lock,
                        callback=self._callback_fetch_stylesheet,
                        **{"subreddit": subreddit}
                    )
                )

        workpool.shutdown()
        workpool.join()
Example 5
    def _download_images(self):
        logger.debug("Downloading images using {} threads".format(
            self.workers))
        workpool = WorkerPool(size=self.workers)

        # cache emotes
        key_func = lambda e: e['background-image']
        with self.mutex:
            for image_url, group in itertools.groupby(
                    sorted(self.emotes, key=key_func), key_func):
                if not image_url:
                    continue

                file_path = self.get_file_path(image_url,
                                               rootdir=self.cache_dir)
                if not os.path.isfile(file_path):
                    workpool.put(
                        DownloadJob(self._requests,
                                    image_url,
                                    retry=5,
                                    rate_limit_lock=self.rate_limit_lock,
                                    callback=self._callback_download_image,
                                    **{'image_path': file_path}))

        workpool.shutdown()
        workpool.join()
Example 6
    def _download_images(self):
        logger.debug("Downloading images using {} threads".format(self.workers))
        workpool = WorkerPool(size=self.workers)

        # cache emotes
        key_func = lambda e: e["background-image"]
        with self.mutex:
            for image_url, group in itertools.groupby(sorted(self.emotes, key=key_func), key_func):
                if not image_url:
                    continue

                file_path = self.get_file_path(image_url, rootdir=self.cache_dir)
                if not os.path.isfile(file_path):
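                    # Normalize protocol-relative and http:// URLs to https://.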
                    image_url = re.sub(r"^(https?:)?//", "https://", image_url)
                    workpool.put(
                        DownloadJob(
                            self._requests,
                            image_url,
                            retry=5,
                            rate_limit_lock=self.rate_limit_lock,
                            callback=self._callback_download_image,
                            **{"image_path": file_path}
                        )
                    )

        workpool.shutdown()
        workpool.join()
Example 7
    def download_images(self):
        logger.info('Beginning download_images()')
        logger.debug("Downloading images using {} threads".format(
            self.workers))
        workpool = WorkerPool(size=self.workers)

        def create_download_jobs(key_func):
            for image_url, group in itertools.groupby(
                    sorted(self.emotes, key=key_func), key_func):
                if not image_url:
                    continue

                file_path = get_file_path(image_url, rootdir=self.reddit_cache)
                if not path.isfile(file_path):
                    workpool.put(
                        DownloadJob(self._requests,
                                    urlparse.urljoin(
                                        'https://s3.amazonaws.com/',
                                        image_url),
                                    retry=5,
                                    rate_limit_lock=self.rate_limit_lock,
                                    callback=self._callback_download_image,
                                    **{'image_path': file_path}))

        with self.mutex:
            create_download_jobs(lambda e: e['background-image'])
            create_download_jobs(lambda e: e.get('hover-background-image'))

        workpool.shutdown()
        workpool.join()
Example 8
    def fetch_css(self):
        logger.info('Beginning fetch_css()')

        logger.debug("Fetching css using {} threads".format(self.workers))
        workpool = WorkerPool(size=self.workers)

        for subreddit in self.subreddits:
            try:
                css_subreddit_path = path.join(self.session_cache,
                                               subreddit.lower()) + '.css'
                # Probe the session cache; skip this subreddit if its
                # stylesheet is already present and readable.
                with open(css_subreddit_path, 'r'):
                    pass
            except IOError:
                workpool.put(
                    DownloadJob(
                        self._requests,
                        'https://pay.reddit.com/r/{}/stylesheet'.format(
                            subreddit),
                        retry=5,
                        rate_limit_lock=self.rate_limit_lock,
                        callback=self._callback_fetch_stylesheet,
                        **{'subreddit': subreddit}))

        workpool.shutdown()
        workpool.join()
Example 9
    def _fetch_css(self):
        logger.debug("Fetching css using {} threads".format(self.workers))
        workpool = WorkerPool(size=self.workers)

        for subreddit in self.subreddits:
            workpool.put(DownloadJob(self._requests,
                                     'http://www.reddit.com/r/{}/stylesheet'.format(subreddit),
                                     retry=5,
                                     rate_limit_lock=self.rate_limit_lock,
                                     callback=self._callback_fetch_stylesheet,
                                     **{'subreddit': subreddit}))

        workpool.shutdown()
        workpool.join()
Example 10
    def _process_emotes(self):
        logger.debug("Processing emotes using {} threads".format(self.workers))
        workpool = WorkerPool(self.workers)

        key_func = lambda e: e['background-image']
        with self.mutex:
            for image_url, group in itertools.groupby(sorted(self.emotes, key=key_func), key_func):
                if not image_url:
                    continue

                workpool.put(self.processor_factory.new_processor(scraper=self, image_url=image_url, group=list(group)))

        workpool.shutdown()
        workpool.join()
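
Note the list(group) in the processor call above: the group iterators handed out by itertools.groupby share one underlying iterator, so each group is only usable until the next one is requested. Materializing the group is therefore required before passing it to a worker that will consume it later:

import itertools

data = [1, 1, 2, 2]

# Collecting the raw group iterators first exhausts them.
groups = [g for _, g in itertools.groupby(data)]
print([list(g) for g in groups])                       # [[], []]

# Materializing each group while it is current keeps the values.
print([list(g) for _, g in itertools.groupby(data)])   # [[1, 1], [2, 2]]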
Example 11
    def _fetch_css(self):
        logger.debug("Fetching css using {} threads".format(self.workers))
        workpool = WorkerPool(size=self.workers)

        for subreddit in self.subreddits:
            workpool.put(
                DownloadJob(
                    self._requests,
                    'http://www.reddit.com/r/{}/stylesheet'.format(subreddit),
                    retry=5,
                    rate_limit_lock=self.rate_limit_lock,
                    callback=self._callback_fetch_stylesheet,
                    **{'subreddit': subreddit}))

        workpool.shutdown()
        workpool.join()
Example 12
    def _process_emotes(self):
        logger.debug("Processing emotes using {} threads".format(self.workers))
        workpool = WorkerPool(self.workers)

        key_func = lambda e: e['background-image']
        with self.mutex:
            for image_url, group in itertools.groupby(
                    sorted(self.emotes, key=key_func), key_func):
                if not image_url:
                    continue

                workpool.put(
                    self.processor_factory.new_processor(scraper=self,
                                                         image_url=image_url,
                                                         group=list(group)))

        workpool.shutdown()
        workpool.join()
Example 13
    def fetch_css(self):
        logger.info('Beginning fetch_css()')

        logger.debug("Fetching css using {} threads".format(self.workers))
        workpool = WorkerPool(size=self.workers)

        for subreddit in self.subreddits:
            try:
                css_subreddit_path = path.join(self.session_cache, subreddit.lower()) + '.css'
                # Probe the session cache; skip this subreddit if its
                # stylesheet is already present and readable.
                with open(css_subreddit_path, 'r'):
                    pass
            except IOError:
                workpool.put(DownloadJob(self._requests,
                                         'https://pay.reddit.com/r/{}/stylesheet'.format(subreddit),
                                         retry=5,
                                         rate_limit_lock=self.rate_limit_lock,
                                         callback=self._callback_fetch_stylesheet,
                                         **{'subreddit': subreddit}))

        workpool.shutdown()
        workpool.join()
Example 14
    def _download_images(self):
        logger.debug("Downloading images using {} threads".format(self.workers))
        workpool = WorkerPool(size=self.workers)

        # cache emotes
        key_func = lambda e: e['background-image']
        with self.mutex:
            for image_url, group in itertools.groupby(sorted(self.emotes, key=key_func), key_func):
                if not image_url:
                    continue

                file_path = self.get_file_path(image_url, rootdir=self.cache_dir)
                if not os.path.isfile(file_path):
                    # Temporary workaround: fetch APNGs directly from Amazon S3
                    # instead of the broken copies served through CloudFlare.
                    image_url = image_url.replace('http://', 'https://s3.amazonaws.com/')
                    workpool.put(DownloadJob(self._requests,
                                             image_url,
                                             retry=5,
                                             rate_limit_lock=self.rate_limit_lock,
                                             callback=self._callback_download_image,
                                             **{'image_path': file_path}))

        workpool.shutdown()
        workpool.join()
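
Across these examples the CloudFlare workaround appears in three variants: urlparse.urljoin against https://s3.amazonaws.com/ (Examples 1 and 7), re.sub on the scheme prefix (Example 3; Example 6 uses the same regex with a plain https:// target), and str.replace (Example 14). They are not interchangeable: the urljoin form only prepends the S3 host when the emote URL is relative, since an absolute or protocol-relative URL keeps its own host, and str.replace only matches a literal 'http://'. A quick comparison of the last two, using a hypothetical protocol-relative URL for illustration:

import re

def rewrite_sub(url):
    return re.sub(r'^(https?:)?//', 'https://s3.amazonaws.com/', url)

def rewrite_replace(url):
    return url.replace('http://', 'https://s3.amazonaws.com/')

url = '//example.cloudflare-host.com/emote.png'   # hypothetical host
print(rewrite_sub(url))      # https://s3.amazonaws.com/example.cloudflare-host.com/emote.png
print(rewrite_replace(url))  # //example.cloudflare-host.com/emote.png  (unchanged)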