def is_green_pool_free(pool):
    """Return True if the provided green pool can accept more work, False otherwise.

    :param pool: an ``eventlet.GreenPool`` or ``gevent.pool.Pool`` instance,
        matching whichever backend ``CONCURRENCY_LIBRARY`` selects.
    :rtype: bool
    :raises ValueError: if ``CONCURRENCY_LIBRARY`` is neither ``"eventlet"``
        nor ``"gevent"``.
    """
    if CONCURRENCY_LIBRARY == "eventlet":
        # eventlet's GreenPool.free() returns the *number* of free slots,
        # not a bool -- compare explicitly so the documented True/False
        # contract actually holds (truthiness is unchanged for callers).
        return pool.free() > 0
    elif CONCURRENCY_LIBRARY == "gevent":
        # gevent pools expose full() instead; free means "not full".
        return not pool.full()
    else:
        raise ValueError("Unsupported concurrency library")
def crawler(u):
    # Fetch *u*, print its HTTP status, and recursively spawn crawls for
    # every absolute link found in the body, using the module-level gevent
    # ``pool`` and the global ``crawled`` page counter.
    # NOTE(review): formatting reconstructed -- original source was
    # collapsed onto one line.
    global crawled
    response = requests.get(u)
    print response.status_code, u
    # Naive regex link extraction; only absolute http(s) hrefs match.
    for link in re.findall('<a href="(http.*?)"', response.content):
        # Cap at 10 scheduled pages and silently drop links when the
        # pool has no free slot.
        if crawled < 10 and not pool.full():
            crawled += 1
            pool.spawn(crawler, link)
def crawler(u): '''A very simple pooled gevent web crawler''' global crawled # Crawl the page, print the status response = requests.get(u) print response.status_code, u # Extract some links to follow for link in re.findall('<a href="(http.*?)"', response.content): # Limit to 10 pages (ignores links when the pool is already full) if crawled < 10 and not pool.full(): crawled += 1 pool.spawn(crawler, link)
def scrape_base_url():
    # Seed the work queue from the index page, drain it through the
    # gevent ``pool``, then dump the accumulated global ``data`` to disk.
    # NOTE(review): formatting reconstructed -- original source was
    # collapsed onto one line; confirm nesting against history.
    global data
    startTime = datetime.now()
    tree = html.fromstring(session.get(base_url).text)
    # Each queue item is (worker_func, kwargs-dict) for one company page.
    func = lambda x: queue.put_nowait((parse_comp, {
        'url': domain + x.xpath('./@href')[0],
        'name': x.xpath('./text()')[0]
    }))
    # Side-effecting list comprehension: enqueue every anchor that has text.
    [func(x) for x in tree.xpath('//div[@class="st-text"]//td/a')
     if x.xpath('./text()') != []]
    while not queue.empty() and not pool.full():
        for x in xrange(0, min(queue.qsize(), pool.free_count())):
            t = queue.get_nowait()
            # NOTE(review): pool.spawn() already adds the greenlet to the
            # pool and starts it; wrapping it in pool.start() looks like a
            # redundant double-add -- confirm against gevent docs.
            pool.start(pool.spawn(t[0], t[1]))
    pool.join()
    print 'Time Taken : ', datetime.now() - startTime
    with open('data.json', 'w') as fp:
        json.dump(data, fp)
def green_producer(iterator, map_func, queue, consumer_instance, pool_size,
                   consumer_args=(), consumer_kwargs=None):
    """A map function:
    Map pool.spawn(func, iterated_element) on iterator until the gevent
    pool is full or the iterator is exhausted. Put the return value (a
    gevent.event.AsyncResult) of each spawned func call into the queue.
    Whenever the pool is full, switch control to the consumer_instance
    greenlet. When control switches back to this instance, fill up pool
    again.

    :param iterator: source of elements to map over.
    :param map_func: callable spawned once per element.
    :param queue: receives the Greenlet/AsyncResult of each spawn.
    :param consumer_instance: greenlet switched to whenever the pool fills.
    :param pool_size: concurrency limit for the internal gevent pool
        (previously ignored -- the pool was hard-coded to size 7).
    :param consumer_args: positional args for the initial consumer switch.
    :param consumer_kwargs: keyword args for the initial consumer switch.
    """
    # Avoid the shared-mutable-default pitfall: build a fresh dict per call.
    if consumer_kwargs is None:
        consumer_kwargs = {}
    log.debug('initializing producer. This will only appear once per greenlet')
    consumer_instance.switch(*consumer_args, **consumer_kwargs)
    # Honor the caller-supplied size instead of the hard-coded 7.
    pool = gevent.pool.Pool(pool_size)
    for elem in iterator:
        log.debug("Queued job for elem {elem}".format(**locals()))
        queue.put(pool.spawn(map_func, elem))
        if pool.full():
            consumer_instance.switch()
    # Wait for every outstanding greenlet; the Pool API's join() is the
    # canonical way to do this (joinall(pool) relied on Pool iteration).
    pool.join()
def wait_available(pool, pool_name):
    """Block until *pool* can accept another job.

    Bumps the ``<pool_name>.pool.full`` statsd counter whenever the pool
    was saturated at the moment of the check.
    """
    client = stats.get_statsd_client()
    saturated = pool.full()
    if saturated:
        client.incr('%s.pool.full' % pool_name)
    pool.wait_available()
def _spawn_gm_worker(self):
    """Spawn one more Gearman worker greenlet into the shared pool.

    Does nothing when the pool is already saturated.
    """
    global pool
    if pool.full():
        return
    worker = _GMWorker(self.parent)
    worker.register_task('message.send', send)
    pool.add(gevent.spawn(worker.work, time_to_work=20))
def _download_res(
        self, filepath, rate, uploading=True, callback=None,
        cb_kwargs=None):
    """Download the peer-shared resource into *filepath* via a gevent pool.

    NOTE(review): reconstructed formatting -- the original source was
    collapsed onto (two) physical lines, so the exact nesting below is a
    best-effort reading; confirm against history before relying on it.

    :param filepath: local path backing the :class:`PieceFile`.
    :param rate: download rate limit; falls back to ``self.download_rate``
        when falsy, and feeds a ``TokenBucket`` when set.
    :param uploading: accepted but never read in this body -- presumably
        kept for interface compatibility; TODO confirm.
    :param callback: optional callable invoked with ``cb_kwargs`` after a
        successful download.
    :raises NoPeersFound: re-raised when no peers exist and no fallback
        origin URL is configured.
    """
    try:
        # Phase 1: probe peers one at a time with a 1-byte Range request
        # just to learn the total resource size from Content-Range.
        peers = self.get_peers()
        self._record_get_peer_ts()
        peers_num = len(peers)
        count = 0
        # just get resource size
        while True:
            ip, port = peers[count]
            logger.info('get resource size')
            try:
                ret = self._requests_session.get(
                    "{protocol}{ip}:{port}/?{res}"
                    .format(
                        protocol=PROTOCOL, ip=ip, port=port,
                        res=urlencode({'res_url': self.upload_res_url})),
                    stream=True,
                    headers={"Range": "bytes=0-0"},
                    timeout=1)
                if ret.ok:
                    #: bytes=0-1/17)
                    # Total size is the part after the '/' in Content-Range.
                    content_range = ret.headers.get("Content-Range")
                    res_length = content_range.split('/')[-1]
                    break
                else:
                    # Bad status: drop this peer from the tracker and try
                    # the next one.
                    logger.warn(
                        'get piece from ip: %s port: %s error, code: %s '
                        % (ip, port, ret.status_code))
                    count += 1
                    self.del_from_tracker(ip=ip, peer_port=port)
            except ConnectionError:
                logger.warn(
                    'get piece from ip: %s port: %s error ConnectionError'
                    % (ip, port))
                count += 1
                self.del_from_tracker(ip=ip, peer_port=port)
            except Timeout:
                logger.warn(
                    'get piece from ip: %s port: %s error Timeout'
                    % (ip, port))
                count += 1
                self.del_from_tracker(ip=ip, peer_port=port)
            finally:
                # Exhausted the current peer list: refresh it and restart
                # the scan from index 0.
                if count >= peers_num:
                    logger.warn("No peers avaliable")
                    peers = self.get_peers()
                    peers_num = len(peers)
                    count = 0
        logger.info('%s is size of %s'
                    % (self.upload_res_url, sizeof_fmt_human(res_length)))
        # Phase 2: piece-wise parallel download through a gevent pool.
        self.piece_file = PieceFile(res_length, filepath)
        pool_work_num = 15
        pool_q_size = pool_work_num * 2
        pool = gevent.pool.Pool(pool_work_num)
        self.start_ready_upload_thread()
        # An explicit rate overrides the stored one; either way a truthy
        # rate enables token-bucket throttling.
        if rate:
            self.download_rate = rate
        else:
            rate = self.download_rate
        if rate:
            self.token_bucket = TokenBucket(rate)
        while self.piece_file.has_unalloc():
            # Queue work for peers we do not already hold a connection to.
            args_list = list()
            for peer in peers:
                if peer not in self._peer_in_conn:
                    args_list.append((peer, None))
            # Side-effecting list comprehension; NOTE(review): *args
            # unpacks (peer, None) into apply_async's (args, kwds)
            # positions -- so ``peer`` is passed as the args tuple.
            # Confirm this is intentional.
            [pool.apply_async(self._download_piece_thread, *args)
             for args in args_list[:pool_q_size]]
            # update peers if peer run out
            while pool.full():
                gevent.sleep(0.2)
            if not self.piece_file.has_empty():
                # Everything is allocated to a worker; wait for completion
                # and then re-check for pieces that failed mid-flight.
                pool.join()
                logger.debug(
                    'test get_empty_block: %s'
                    % self.piece_file.get_empty_piece())
                logger.debug('peer in connection: %s' % self._peer_in_conn)
                if self.piece_file.has_unalloc():
                    try:
                        # Throttle tracker polling to GET_PEER_INTERVAL.
                        tv = self._get_last_get_peer_tv()
                        if tv < GET_PEER_INTERVAL:
                            gevent.sleep(GET_PEER_INTERVAL - tv)
                        g = gevent.spawn(self.get_peers)
                        peers = g.get()
                        self._record_get_peer_ts()
                    except NoPeersFound:
                        # if pool.workRequests:
                        if pool_work_num - pool.free_count() > 0:
                            # some remained piece maybe on the way
                            pool.join()
                            if self.piece_file.has_unalloc():
                                # NOTE(review): with tv > GET_PEER_INTERVAL
                                # the sleep below gets a *negative* delay;
                                # the mirrored branch above tests
                                # ``tv < GET_PEER_INTERVAL`` -- confirm
                                # which comparison is intended.
                                tv = self._get_last_get_peer_tv()
                                if tv > GET_PEER_INTERVAL:
                                    gevent.sleep(GET_PEER_INTERVAL - tv)
                                g = gevent.spawn(self.get_peers)
                                peers = g.get()
                                self._record_get_peer_ts()
                            else:
                                break
                        else:
                            logger.error(
                                "no worker running, and get no peers")
                            raise
                else:
                    break
        logger.info('File is complete, size: %s'
                    % self.piece_file.get_real_filesize())
    except NoPeersFound:
        # No peers at all: optionally fall back to a plain HTTP download
        # from the origin URL, otherwise deactivate and propagate.
        if self.fallback:
            logger.info('Use fallback way to get resouce')
            try:
                res_length = get_res_length(self.res_url)
            except ConnectionError:
                raise OriginURLConnectError(self.res_url)
            logger.info(
                'get resource length %s' % sizeof_fmt_human(res_length))
            if not self.piece_file:
                self.piece_file = PieceFile(res_length, filepath)
            self.start_ready_upload_thread()
            http_download_to_piecefile(
                self.res_url, self.piece_file)
        else:
            self.deactivate()
            raise
    # self.piece_file.tofile()
    self.res_is_downloaded = True
    if callback:
        logger.info('Run callback')
        callback(**cb_kwargs)
def wait_available(pool, pool_name):
    """Block until *pool* has a free slot; report whether to keep running.

    Bumps the ``<pool_name>.pool.full`` statsd counter when the pool was
    saturated, then returns True unless a shutdown has been requested via
    the global ``STATE``.
    """
    statsd_client = stats.get_statsd_client()
    if pool.full():
        statsd_client.incr('%s.pool.full' % pool_name)
    pool.wait_available()
    return not STATE['shutdown']
# NOTE(review): fragment -- the leading ``if result != '':`` block reads
# like the tail of a scrape/worker function whose header is not visible
# here (it references result, link, tag, imgs and i). Formatting below is
# reconstructed from a single collapsed line; confirm nesting.
if result != '':
    print 'Found [%s][%s] in %s' % (result, link, tag)
    a += 1
    # Group (result, link, image) triples by tag in the json_dict.
    if tag in json_dict:
        json_dict[tag].append((result, link, imgs[i]))
    else:
        json_dict[tag] = list()
        json_dict[tag].append((result, link, imgs[i]))
# Script body: scrape the menu anchors, enqueue (href, text) work items,
# drain them through the gevent pool, then dump results to data.json.
r = session.get(url)
tree = html.fromstring(r.text)
a_tags = tree.xpath('//li[@class="menu-item"]//a')
tags = [(x.xpath('.//@href'), repr(x.xpath('.//text()'))) for x in a_tags]
for t in tags:
    url = t[0]
    result = regex.findall(t[1])
    # print url, result
    # scrape(url[0], result[0])
    queue.put((url[0], result[0]))
while not queue.empty() and not pool.full():
    for x in xrange(0, min(queue.qsize(), pool.free_count())):
        pool.spawn(worker)
pool.join()
print a
print 'Time Taken : ', datetime.now() - start_time
with open('data.json', 'w') as fp:
    json.dump(json_dict, fp)