def _fetch_css(self):
    """Fetch every subreddit stylesheet, serving from the cache when allowed."""
    logger.debug("Fetching css using {} threads".format(self.workers))
    workpool = WorkerPool(size=self.workers)

    if not os.path.exists(self.cache_dir):
        os.makedirs(self.cache_dir)

    for subreddit in self.subreddits:
        css_path = os.path.sep.join([self.cache_dir, subreddit + ".css"])
        if self.prefer_cache and os.path.exists(css_path):
            # Cached copy exists and caching is preferred: skip the download.
            with open(css_path) as css_file:
                css = css_file.read().decode("utf8")
            self._handle_css(css, subreddit)
        else:
            job = DownloadJob(
                self._requests,
                "http://www.reddit.com/r/{}/stylesheet".format(subreddit),
                retry=5,
                rate_limit_lock=self.rate_limit_lock,
                callback=self._callback_fetch_stylesheet,
                subreddit=subreddit)
            workpool.put(job)

    workpool.shutdown()
    workpool.join()
def _download_images(self):
    """Download each distinct emote image that is missing from the cache."""
    logger.debug("Downloading images using {} threads".format(self.workers))
    workpool = WorkerPool(size=self.workers)

    # cache emotes: group by spritesheet URL so each image downloads once
    key_func = lambda e: e["background-image"]
    with self.mutex:
        grouped = itertools.groupby(sorted(self.emotes, key=key_func), key_func)
        for image_url, group in grouped:
            if not image_url:
                continue
            file_path = self.get_file_path(image_url, rootdir=self.cache_dir)
            if os.path.isfile(file_path):
                continue
            # Normalize http:// and protocol-relative // URLs to https://.
            image_url = re.sub(r"^(https?:)?//", "https://", image_url)
            workpool.put(DownloadJob(
                self._requests,
                image_url,
                retry=5,
                rate_limit_lock=self.rate_limit_lock,
                callback=self._callback_download_image,
                image_path=file_path))

    workpool.shutdown()
    workpool.join()
def _download_images(self):
    """Download all emote spritesheets not already present on disk."""
    logger.debug("Downloading images using {} threads".format(
        self.workers))
    workpool = WorkerPool(size=self.workers)

    # cache emotes
    key_func = lambda e: e['background-image']
    with self.mutex:
        for image_url, group in itertools.groupby(
                sorted(self.emotes, key=key_func), key_func):
            if not image_url:
                continue
            file_path = self.get_file_path(image_url,
                                           rootdir=self.cache_dir)
            if os.path.isfile(file_path):
                continue
            # Temp workaround for downloading apngs straight from amazon
            # instead of broken ones from cloudflare
            image_url = re.sub(r'^(https?:)?//',
                               'https://s3.amazonaws.com/', image_url)
            job = DownloadJob(self._requests,
                              image_url,
                              retry=5,
                              rate_limit_lock=self.rate_limit_lock,
                              callback=self._callback_download_image,
                              image_path=file_path)
            workpool.put(job)

    workpool.shutdown()
    workpool.join()
def fetch_css(self):
    """Queue stylesheet downloads for subreddits missing from the session cache.

    For each subreddit, probe ``<session_cache>/<subreddit>.css``; when the
    probe fails, queue a DownloadJob for the subreddit's stylesheet.

    Bug fix: the probe previously used a bare ``except:``, which silently
    swallowed *every* exception (KeyboardInterrupt, AttributeError, ...).
    Only a failed ``open()`` should trigger a download, so we now catch
    ``IOError`` (aliased to ``OSError`` on Python 3) specifically.
    """
    logger.info('Beginning fetch_css()')
    logger.debug("Fetching css using {} threads".format(self.workers))
    workpool = WorkerPool(size=self.workers)
    for subreddit in self.subreddits:
        try:
            css_subreddit_path = path.join(
                self.session_cache, subreddit.lower()) + '.css'
            # Existence/readability probe only; content is read elsewhere.
            with open(css_subreddit_path, 'r') as f:
                pass
        except IOError:
            workpool.put(
                DownloadJob(
                    self._requests,
                    'https://pay.reddit.com/r/{}/stylesheet'.format(
                        subreddit),
                    retry=5,
                    rate_limit_lock=self.rate_limit_lock,
                    callback=self._callback_fetch_stylesheet,
                    **{'subreddit': subreddit}))
    workpool.shutdown()
    workpool.join()
def download_images(self):
    """Download every emote image (normal and hover) missing from the cache."""
    logger.info('Beginning download_images()')
    logger.debug("Downloading images using {} threads".format(self.workers))
    workpool = WorkerPool(size=self.workers)

    def create_download_jobs(key_func):
        # One job per distinct image URL; files already on disk are skipped.
        grouped = itertools.groupby(sorted(self.emotes, key=key_func), key_func)
        for image_url, group in grouped:
            if not image_url:
                continue
            file_path = get_file_path(image_url, rootdir=self.reddit_cache)
            if path.isfile(file_path):
                continue
            # Protocol-relative URLs are resolved against the amazon host.
            full_url = urlparse.urljoin('https://s3.amazonaws.com/', image_url)
            workpool.put(DownloadJob(self._requests,
                                     full_url,
                                     retry=5,
                                     rate_limit_lock=self.rate_limit_lock,
                                     callback=self._callback_download_image,
                                     image_path=file_path))

    with self.mutex:
        create_download_jobs(lambda e: e['background-image'])
        create_download_jobs(lambda e: e.get('hover-background-image'))

    workpool.shutdown()
    workpool.join()
def download_images(self):
    """Fetch normal and hover emote images that are not yet cached locally."""
    logger.info('Beginning download_images()')
    logger.debug("Downloading images using {} threads".format(
        self.workers))
    workpool = WorkerPool(size=self.workers)

    def create_download_jobs(key_func):
        # Group emotes sharing a spritesheet so each URL is queued once.
        for image_url, group in itertools.groupby(
                sorted(self.emotes, key=key_func), key_func):
            if not image_url:
                continue
            file_path = get_file_path(image_url,
                                      rootdir=self.reddit_cache)
            if path.isfile(file_path):
                # Already cached; nothing to do.
                continue
            job = DownloadJob(self._requests,
                              urlparse.urljoin(
                                  'https://s3.amazonaws.com/', image_url),
                              retry=5,
                              rate_limit_lock=self.rate_limit_lock,
                              callback=self._callback_download_image,
                              image_path=file_path)
            workpool.put(job)

    def primary_image(e):
        return e['background-image']

    def hover_image(e):
        return e.get('hover-background-image')

    with self.mutex:
        create_download_jobs(primary_image)
        create_download_jobs(hover_image)

    workpool.shutdown()
    workpool.join()
def _fetch_css(self):
    """Fetch subreddit stylesheets, using bundled css for legacy subreddits."""
    logger.debug("Fetching css using {} threads".format(self.workers))
    workpool = WorkerPool(size=self.workers)

    if not os.path.exists(self.cache_dir):
        os.makedirs(self.cache_dir)

    for subreddit in self.subreddits:
        if subreddit not in self.legacy_subreddits:
            # Live subreddit: download the stylesheet from old.reddit.com.
            workpool.put(
                DownloadJob(
                    self._requests,
                    'https://old.reddit.com/r/{}/stylesheet'.format(
                        subreddit),
                    retry=5,
                    rate_limit_lock=self.rate_limit_lock,
                    callback=self._callback_fetch_stylesheet,
                    subreddit=subreddit))
            continue

        # Legacy subreddit: its css ships with the package.
        legacy_file = '{}/../legacy_css/{}.css'.format(
            os.path.dirname(__file__), subreddit)
        if not os.path.exists(legacy_file):
            logger.error(
                "No css file found for legacy subreddit {}".format(
                    subreddit))
        else:
            with open(legacy_file) as fh:
                css = fh.read()
            self._process_stylesheet_response(
                200, css, "text/css", subreddit)

    workpool.shutdown()
    workpool.join()
def _download_images(self):
    """Download each distinct emote spritesheet not already cached."""
    logger.debug("Downloading images using {} threads".format(
        self.workers))
    workpool = WorkerPool(size=self.workers)

    # cache emotes
    by_image = lambda e: e['background-image']
    with self.mutex:
        for image_url, group in itertools.groupby(
                sorted(self.emotes, key=by_image), by_image):
            if not image_url:
                continue
            file_path = self.get_file_path(image_url,
                                           rootdir=self.cache_dir)
            if not os.path.isfile(file_path):
                job = DownloadJob(self._requests,
                                  image_url,
                                  retry=5,
                                  rate_limit_lock=self.rate_limit_lock,
                                  callback=self._callback_download_image,
                                  image_path=file_path)
                workpool.put(job)

    workpool.shutdown()
    workpool.join()
def _fetch_css(self):
    """Queue a stylesheet download for every configured subreddit."""
    logger.debug("Fetching css using {} threads".format(self.workers))
    workpool = WorkerPool(size=self.workers)
    for subreddit in self.subreddits:
        url = 'http://www.reddit.com/r/{}/stylesheet'.format(subreddit)
        workpool.put(DownloadJob(self._requests,
                                 url,
                                 retry=5,
                                 rate_limit_lock=self.rate_limit_lock,
                                 callback=self._callback_fetch_stylesheet,
                                 subreddit=subreddit))
    workpool.shutdown()
    workpool.join()
def _process_emotes(self):
    """Run a processor job over each distinct emote spritesheet image."""
    logger.debug("Processing emotes using {} threads".format(self.workers))
    workpool = WorkerPool(self.workers)

    key_func = lambda e: e['background-image']
    with self.mutex:
        grouped = itertools.groupby(sorted(self.emotes, key=key_func), key_func)
        for image_url, group in grouped:
            if not image_url:
                continue
            processor = self.processor_factory.new_processor(
                scraper=self, image_url=image_url, group=list(group))
            workpool.put(processor)

    workpool.shutdown()
    workpool.join()
def main(): DOMAIN = "benchmark" conn = boto.connect_sdb() domain = conn.get_domain(DOMAIN) # Prepare item list items = [] now = time.time() for i in domain: items.append(i) elapsed = time.time() - now if not items: print "No items found." return msg = "Fetched manifest of %d items in %f seconds, proceeding." print msg % (len(items), elapsed) # THE REAL MEAT: # Prepare the pool print "Initializing pool." def toolbox_factory(): return SDBToolBox(DOMAIN) def worker_factory(job_queue): return EquippedWorker(job_queue, toolbox_factory) pool = WorkerPool(size=20, worker_factory=worker_factory) print "Starting to fetch items..." now = time.time() # Insert jobs results_queue = Queue() for i in items: j = SdbJob(results_queue, boto.sdb.domain.Domain.get_item, [i]) pool.put(j) # Fetch results r = [results_queue.get() for i in items] elapsed = time.time() - now print "Fetched %d items paralleled in %f seconds." % (len(r), elapsed) pool.shutdown()
def _fetch_css(self):
    """Download the stylesheet of each subreddit through the worker pool."""
    logger.debug("Fetching css using {} threads".format(self.workers))
    workpool = WorkerPool(size=self.workers)
    for subreddit in self.subreddits:
        job = DownloadJob(
            self._requests,
            'http://www.reddit.com/r/{}/stylesheet'.format(subreddit),
            retry=5,
            rate_limit_lock=self.rate_limit_lock,
            callback=self._callback_fetch_stylesheet,
            subreddit=subreddit)
        workpool.put(job)
    workpool.shutdown()
    workpool.join()
def _process_emotes(self):
    """Dispatch one processor per distinct spritesheet, with its emote group."""
    logger.debug("Processing emotes using {} threads".format(self.workers))
    workpool = WorkerPool(self.workers)

    def spritesheet_url(e):
        return e['background-image']

    with self.mutex:
        for image_url, group in itertools.groupby(
                sorted(self.emotes, key=spritesheet_url), spritesheet_url):
            if not image_url:
                # Emotes without an image cannot be processed.
                continue
            workpool.put(
                self.processor_factory.new_processor(scraper=self,
                                                     image_url=image_url,
                                                     group=list(group)))

    workpool.shutdown()
    workpool.join()
def fetch_css(self):
    """Queue stylesheet downloads for subreddits not present in the session cache.

    Probes ``<session_cache>/<subreddit>.css`` for each subreddit and queues
    a DownloadJob only when the probe fails.

    Bug fix: the probe previously used a bare ``except:``, which swallowed
    every exception type (KeyboardInterrupt, AttributeError, ...) and could
    mask real bugs as cache misses.  Only a failed ``open()`` should trigger
    a download, so we now catch ``IOError`` (``OSError`` on Python 3).
    """
    logger.info('Beginning fetch_css()')
    logger.debug("Fetching css using {} threads".format(self.workers))
    workpool = WorkerPool(size=self.workers)
    for subreddit in self.subreddits:
        try:
            css_subreddit_path = path.join(
                self.session_cache, subreddit.lower()) + '.css'
            # Probe the cache; the file's content is consumed elsewhere.
            with open(css_subreddit_path, 'r') as f:
                pass
        except IOError:
            workpool.put(DownloadJob(
                self._requests,
                'https://pay.reddit.com/r/{}/stylesheet'.format(subreddit),
                retry=5,
                rate_limit_lock=self.rate_limit_lock,
                callback=self._callback_fetch_stylesheet,
                **{'subreddit': subreddit}))
    workpool.shutdown()
    workpool.join()
def _download_images(self):
    """Download emote spritesheets that are not yet in the local cache.

    Bug fix: the amazon rewrite previously used
    ``image_url.replace('http://', 'https://s3.amazonaws.com/')``.
    ``str.replace`` substitutes *every* occurrence of ``http://`` in the
    string -- including one embedded in a path or query component -- not
    just the scheme.  The rewrite now applies only to the leading scheme.
    """
    logger.debug("Downloading images using {} threads".format(self.workers))
    workpool = WorkerPool(size=self.workers)
    # cache emotes
    key_func = lambda e: e['background-image']
    with self.mutex:
        for image_url, group in itertools.groupby(
                sorted(self.emotes, key=key_func), key_func):
            if not image_url:
                continue
            file_path = self.get_file_path(image_url, rootdir=self.cache_dir)
            if not os.path.isfile(file_path):
                # Temp workaround for downloading apngs straight from amazon
                # instead of broken ones from cloudflare
                if image_url.startswith('http://'):
                    image_url = ('https://s3.amazonaws.com/' +
                                 image_url[len('http://'):])
                workpool.put(DownloadJob(self._requests,
                                         image_url,
                                         retry=5,
                                         rate_limit_lock=self.rate_limit_lock,
                                         callback=self._callback_download_image,
                                         **{'image_path': file_path}))
    workpool.shutdown()
    workpool.join()
class Send(Job): def __init__(self, user, message): self.user = user self.message = message def run(self): api.CreateChatWith(self.user).SendMessage(self.message) if __name__ == "__main__": pool = WorkerPool(size=pool_size) # create new pool message = open(message).read() print "Sending message..." total = api.Friends.Count print "Total: %s" % total current = 0 for user in list(api.Friends): job = Send(user.Handle, message) pool.put(job) print "Sending... %3.2f" % (current * 100 / float(total)) current += 1 print "Shutting down..." pool.shutdown() # close pool pool.wait() # wait to finish print "Done."