def run(self):
    """Run the daemon's poll/action loop forever.

    This method should not be extended or overwritten. Concrete daemons
    customise behaviour by implementing 'get_message()' and
    'handle_message()' instead.

    Each message is handed to a gevent Pool that caps greenthread
    concurrency at ``self.pool_size``. gevent only switches into another
    greenthread on a cooperative yield, and by default this loop yields
    only when there is no message to process and when the Pool has
    allocated the maximum allowed greenthreads. Set ``aggressive_yield``
    to True to yield after every dispatched message as well.
    """
    workers = gevent.pool.Pool(size=self.pool_size)

    while True:
        message = self.get_message()
        LOG.debug(
            "Daemon (%r) got message (%r).",
            self.pidfile,
            message
        )

        if message is not None:
            free_slots = workers.free_count()
            LOG.debug(
                "Daemon (%r) attempting to start new greenthread with (%r) "
                "active and (%r) free",
                self.pidfile,
                workers.size - free_slots,
                free_slots
            )
            workers.spawn(self.handle_message, message)

            # Only an explicit True enables the per-message yield.
            if self.aggressive_yield is True:
                gevent.sleep(0)
        else:
            # Nothing to do: go idle before polling again.
            LOG.debug(
                "Daemon (%r) received no message. Going idle for (%r).",
                self.pidfile,
                self.idle_time
            )
            self.sleep(self.idle_time)
def cron(pool):
    """
    Periodic maintenance loop, run by a dedicated worker, that keeps the
    network-wide connections alive.

    [Master]
    1) Checks for a new snapshot
    2) Loads new reachable nodes into the reachable set in Redis
    3) Signals listener to get reachable nodes from opendata set
    4) Sets bestblockhash in Redis

    [Master/Slave]
    1) Spawns workers to establish and maintain connection with reachable
       nodes
    """
    channel = 'snapshot:{}'.format(hexlify(CONF['magic_number']))
    current_snapshot = None

    while True:
        if CONF['master']:
            latest_snapshot = get_snapshot()

            if latest_snapshot != current_snapshot:
                nodes = get_nodes(latest_snapshot)
                node_count = len(nodes)
                if node_count == 0:
                    # Retry straight away; the snapshot stays "new".
                    continue

                logging.info("New snapshot: %s", latest_snapshot)
                current_snapshot = latest_snapshot

                logging.info("Nodes: %d", node_count)

                newly_reachable = set_reachable(nodes)
                logging.info("New reachable nodes: %d", newly_reachable)

                # Allow connections to stabilize before publishing snapshot
                gevent.sleep(CONF['socket_timeout'])
                published_at = int(time.time())
                REDIS_CONN.publish(channel, published_at)

            open_count = REDIS_CONN.scard('open')
            logging.info("Connections: %d", open_count)

            set_bestblockhash()

        # Never spawn more tasks than there are reachable nodes or free slots.
        reachable_total = REDIS_CONN.scard('reachable')
        to_spawn = min(reachable_total, pool.free_count())
        for _ in xrange(to_spawn):
            pool.spawn(task)

        busy = CONF['workers'] - pool.free_count()
        logging.info("Workers: %d", busy)

        gevent.sleep(CONF['cron_delay'])
def test_stderr_raising(self):
    """A crash while reporting an error to stderr must not cost the pool a slot."""
    if greentest.PYPY:
        # Does not work on PyPy
        return

    import sys
    pool = self.klass(size=1)

    # Swap stderr for a FakeFile so that the traceback.print_exc in
    # safe_apply raises an exception and thus exits _main_loop.
    saved_stderr = sys.stderr
    try:
        sys.stderr = FakeFile()
        crashed = pool.spawn(crash)
        with gevent.Timeout(2):
            self.assertRaises(RuntimeError, crashed.get)

        # The waiter has returned, so the pool should have a free slot.
        # pool.Pool change: if an exception is raised during execution of
        # a link, the remaining links are scheduled for the next hub
        # iteration. That delays the update of pool.sem, so
        # pool.free_count() would still report 0 - hence the sleep.
        gevent.sleep(0)
        self.assertEqual(pool.free_count(), 1)

        # Applying a trivial job must not block.
        with gevent.Timeout.start_new(0.1):
            pool.apply(gevent.sleep, (0, ))
    finally:
        sys.stderr = saved_stderr
        pool.join()
def test_stderr_raising(self):
    """A crash while reporting an error to stderr must not cost the pool a slot."""
    import sys
    pool = self.klass(size=1)

    # Swap stderr for a FakeFile so that the traceback.print_exc in
    # safe_apply raises an exception and thus exits _main_loop.
    saved_stderr = sys.stderr
    try:
        sys.stderr = FakeFile()
        crashed = pool.spawn(crash)
        with gevent.Timeout(2):
            self.assertRaises(RuntimeError, crashed.get)

        # The waiter has returned, so the pool should have a free slot.
        # pool.Pool change: if an exception is raised during execution of
        # a link, the remaining links are scheduled for the next hub
        # iteration. That delays the update of pool.sem, so
        # pool.free_count() would still report 0 - hence the sleep.
        gevent.sleep(0)
        free_slots = pool.free_count()
        self.assertEqual(free_slots, 1)

        # Applying a trivial job must not block.
        with gevent.Timeout.start_new(0.1):
            pool.apply(gevent.sleep, (0, ))
    finally:
        sys.stderr = saved_stderr
        pool.join()
def cron(pool):
    """
    Periodic maintenance loop, run by a dedicated worker, that keeps the
    network-wide connections alive.

    [Master]
    1) Checks for a new snapshot
    2) Loads new reachable nodes into the reachable set in Redis
    3) Signals listener to get reachable nodes from opendata set

    [Master/Slave]
    1) Spawns workers to establish and maintain connection with reachable
       nodes
    """
    current_snapshot = None

    while True:
        if SETTINGS['master']:
            latest_snapshot = get_snapshot()

            if latest_snapshot != current_snapshot:
                nodes = get_nodes(latest_snapshot)
                node_count = len(nodes)
                if node_count == 0:
                    # Retry straight away; the snapshot stays "new".
                    continue

                logging.info("New snapshot: {}".format(latest_snapshot))
                current_snapshot = latest_snapshot

                logging.info("Nodes: {}".format(node_count))

                newly_reachable = set_reachable(nodes)
                logging.info(
                    "New reachable nodes: {}".format(newly_reachable))

                # Allow connections to stabilize before publishing snapshot
                gevent.sleep(SETTINGS['cron_delay'])
                published_at = int(time.time())
                REDIS_CONN.publish('snapshot', published_at)

            open_count = REDIS_CONN.scard('open')
            logging.info("Connections: {}".format(open_count))

        # Never spawn more tasks than there are reachable nodes or free slots.
        reachable_total = REDIS_CONN.scard('reachable')
        to_spawn = min(reachable_total, pool.free_count())
        for _ in xrange(to_spawn):
            pool.spawn(task)

        busy = SETTINGS['workers'] - pool.free_count()
        logging.info("Workers: {}".format(busy))

        gevent.sleep(SETTINGS['cron_delay'])
def init():
    """Keep handing queued work to the pool until the queue is empty, then wait.

    Every 0.8 seconds, while the module-level queue still holds items, up
    to one ``getData`` worker per queued item is spawned, limited by the
    number of free pool slots.
    """
    print('start crwaling')

    while not queue.empty():
        gevent.sleep(0.8)
        batch_size = min(queue.qsize(), pool.free_count())
        for _ in range(batch_size):
            pool.spawn(getData)

    # wait for everything to complete
    pool.join()
def cron(pool):
    """
    Periodic maintenance loop, run by a dedicated worker, that keeps the
    network-wide connections alive:
    1) Checks for a new snapshot
    2) Loads new reachable nodes into the reachable set in Redis
    3) Spawns workers to establish and maintain connection with reachable
       nodes
    4) Signals listener to get reachable nodes from opendata set
    """
    current_snapshot = None

    while True:
        logging.debug("")

        latest_snapshot = get_snapshot()
        if latest_snapshot == current_snapshot:
            # Nothing new yet; wait for the next tick.
            gevent.sleep(SETTINGS['cron_delay'])
            continue

        logging.info("New snapshot: {}".format(latest_snapshot))

        nodes = get_nodes(latest_snapshot)
        node_count = len(nodes)
        if node_count == 0:
            # Retry straight away; the snapshot stays "new".
            continue
        logging.info("Nodes: {}".format(node_count))

        current_snapshot = latest_snapshot

        reachable_count = set_reachable(nodes)
        logging.info("Reachable nodes: {}".format(reachable_count))

        # int(None) raises TypeError when 'elapsed' is not set in Redis;
        # the previous keepalive value is kept in that case.
        try:
            SETTINGS['keepalive'] = int(REDIS_CONN.get('elapsed'))
        except TypeError as err:
            logging.warning(err)
        logging.debug("Keepalive: {}".format(SETTINGS['keepalive']))

        for _ in xrange(reachable_count):
            pool.spawn(task)

        gevent.sleep(SETTINGS['cron_delay'])

        published_at = int(time.time())
        REDIS_CONN.publish('snapshot', published_at)

        busy = SETTINGS['workers'] - pool.free_count()
        logging.info("Workers: {}".format(busy))

        open_count = REDIS_CONN.scard('open')
        logging.info("Connections: {}".format(open_count))
def cron(pool):
    """
    Assigned to a worker to perform the following tasks periodically to
    maintain a continuous network-wide connections:
    1) Checks for a new snapshot
    2) Loads new reachable nodes into the reachable set in Redis
    3) Spawns workers to establish and maintain connection with reachable
       nodes
    4) Signals listener to get reachable nodes from opendata set

    ``pool`` is the gevent pool used to spawn ``task`` workers. The function
    loops forever and never returns.
    """
    snapshot = None

    while True:
        logging.debug("")

        new_snapshot = get_snapshot()
        if new_snapshot != snapshot:
            logging.info("New snapshot: {}".format(new_snapshot))

            nodes = get_nodes(new_snapshot)
            if len(nodes) == 0:
                # Snapshot has no nodes; retry without recording it as seen.
                continue
            logging.info("Nodes: {}".format(len(nodes)))

            snapshot = new_snapshot

            reachable_nodes = set_reachable(nodes)
            logging.info("Reachable nodes: {}".format(reachable_nodes))

            # REDIS_CONN.get() yields None when 'elapsed' has not been set
            # yet, and int(None) raises TypeError. Log it and keep the
            # current keepalive instead of letting the cron worker die.
            # NOTE(review): assumes SETTINGS already holds a 'keepalive'
            # default for the debug line below - confirm.
            try:
                SETTINGS['keepalive'] = int(REDIS_CONN.get('elapsed'))
            except TypeError as err:
                logging.warning(err)
            logging.debug("Keepalive: {}".format(SETTINGS['keepalive']))

            for _ in xrange(reachable_nodes):
                pool.spawn(task)

            # Wait one cron interval before signalling the listener.
            gevent.sleep(SETTINGS['cron_delay'])
            REDIS_CONN.publish('snapshot', int(time.time()))

            workers = SETTINGS['workers'] - pool.free_count()
            logging.info("Workers: {}".format(workers))
            logging.info("Connections: {}".format(REDIS_CONN.scard('open')))
        else:
            gevent.sleep(SETTINGS['cron_delay'])
def scrape_base_url(): global data startTime = datetime.now() tree = html.fromstring(session.get(base_url).text) func = lambda x: queue.put_nowait((parse_comp, { 'url': domain + x.xpath('./@href')[0], 'name': x.xpath('./text()')[0] })) [ func(x) for x in tree.xpath('//div[@class="st-text"]//td/a') if x.xpath('./text()') != [] ] while not queue.empty() and not pool.full(): for x in xrange(0, min(queue.qsize(), pool.free_count())): t = queue.get_nowait() pool.start(pool.spawn(t[0], t[1])) pool.join() print 'Time Taken : ', datetime.now() - startTime with open('data.json', 'w') as fp: json.dump(data, fp)
def scheduler():
    """Coordinate downloading in greenlet threads.

    URLs are taken from the input queue and handed to pool workers. When
    the input queue is empty the scheduler waits for a running worker to
    finish; once it is empty and no worker is active, the pool is joined
    and True is returned.
    """
    while True:
        # Reap greenlets that have already finished.
        finished = [greenlet for greenlet in pool if greenlet.dead]
        for greenlet in finished:
            pool.discard(greenlet)

        try:
            url = inq.get_nowait()
        except queue.Empty:
            # No urls remaining
            if pool.free_count() == pool.size:
                # No workers left, shutting down.
                pool.join()
                return True
            # Workers are still busy: wait for one to finish, then re-check.
            worker_finished.wait()
            worker_finished.clear()
            continue

        # spawn worker for url
        pool.spawn(worker, url)
if result != '': print 'Found [%s][%s] in %s' % (result, link, tag) a += 1 if tag in json_dict: json_dict[tag].append((result, link, imgs[i])) else: json_dict[tag] = list() json_dict[tag].append((result, link, imgs[i])) r = session.get(url) tree = html.fromstring(r.text) a_tags = tree.xpath('//li[@class="menu-item"]//a') tags = [(x.xpath('.//@href'), repr(x.xpath('.//text()'))) for x in a_tags] for t in tags: url = t[0] result = regex.findall(t[1]) # print url, result # scrape(url[0], result[0]) queue.put((url[0], result[0])) while not queue.empty() and not pool.full(): for x in xrange(0, min(queue.qsize(), pool.free_count())): pool.spawn(worker) pool.join() print a print 'Time Taken : ', datetime.now() - start_time with open('data.json', 'w') as fp: json.dump(json_dict, fp)
break print "job done" handler.log("job done") print "so far crawled %s pages" % crawled handler.log("so far crawled %s pages" % crawled) queue.put(start_url_1) queue.put(start_url_2) pool.spawn(crawler) handler = Handler() print 'starting Crawler...' handler.log('starting Crawler...') while not queue.empty() and not pool.free_count() == workers_count: gevent.sleep(0.8) for x in xrange(0, min(queue.qsize(), pool.free_count())): pool.spawn(crawler) #wait for jobs to finish pool.join() print "Done" handler.log("Done+\n") print '\n' print "collected %s imgs" % ITEMS_COUNT handler.log("collected %s imgs" % ITEMS_COUNT) print "see generated output and log files" handler.close() #close the IO files
def log_qsize(wait):
    """Log the number of busy pool slots every ``wait`` seconds, forever."""
    template = u'{}: number of concurrent delete tasks = {}'
    while True:
        message = template.format(group_id, pool_size - pool.free_count())
        log.info(message)
        gevent.sleep(wait)
def _download_res(
        self, filepath, rate, uploading=True,
        callback=None, cb_kwargs=None):
    """Download the resource piece by piece from peers into ``filepath``.

    The resource size is first probed from a peer with a one-byte Range
    request, then pieces are fetched concurrently on a gevent pool until
    the piece file has nothing left unallocated. If no peers can be found
    and ``self.fallback`` is set, the resource is downloaded from
    ``self.res_url`` instead.

    :param filepath: passed to ``PieceFile`` as the destination.
    :param rate: download rate; when falsy, ``self.download_rate`` is used.
        A truthy rate is stored on ``self.download_rate`` and used to
        build ``self.token_bucket``.
    :param uploading: unused by this method.
    :param callback: optional callable run after the download completes.
    :param cb_kwargs: keyword arguments for ``callback``; ``None`` means
        no keyword arguments.
    :raises NoPeersFound: no peers are available and fallback is disabled
        (``self.deactivate()`` is called first).
    :raises OriginURLConnectError: the fallback size probe hit a
        ConnectionError.
    """
    try:
        peers = self.get_peers()
        self._record_get_peer_ts()
        peers_num = len(peers)
        count = 0

        # just get resource size
        while True:
            ip, port = peers[count]
            logger.info('get resource size')
            try:
                ret = self._requests_session.get(
                    "{protocol}{ip}:{port}/?{res}"
                    .format(
                        protocol=PROTOCOL,
                        ip=ip, port=port,
                        res=urlencode({'res_url': self.upload_res_url})),
                    stream=True,
                    headers={"Range": "bytes=0-0"},
                    timeout=1)

                if ret.ok:
                    # Content-Range looks like "bytes 0-0/<total>"; the
                    # total length is the part after the slash.
                    content_range = ret.headers.get("Content-Range")
                    res_length = content_range.split('/')[-1]
                    break
                else:
                    logger.warn(
                        'get piece from ip: %s port: %s error, code: %s ' %
                        (ip, port, ret.status_code))
                    count += 1
                    self.del_from_tracker(ip=ip, peer_port=port)
            except ConnectionError:
                logger.warn(
                    'get piece from ip: %s port: %s error ConnectionError'
                    % (ip, port))
                count += 1
                self.del_from_tracker(ip=ip, peer_port=port)
            except Timeout:
                logger.warn(
                    'get piece from ip: %s port: %s error Timeout' %
                    (ip, port))
                count += 1
                self.del_from_tracker(ip=ip, peer_port=port)
            finally:
                # Every known peer has been tried: fetch a fresh peer list
                # and start over from its first entry.
                if count >= peers_num:
                    logger.warn("No peers avaliable")
                    peers = self.get_peers()
                    peers_num = len(peers)
                    count = 0

        logger.info('%s is size of %s' %
                    (self.upload_res_url, sizeof_fmt_human(res_length)))

        self.piece_file = PieceFile(res_length, filepath)

        pool_work_num = 15
        pool_q_size = pool_work_num * 2
        pool = gevent.pool.Pool(pool_work_num)
        self.start_ready_upload_thread()

        if rate:
            self.download_rate = rate
        else:
            rate = self.download_rate

        if rate:
            self.token_bucket = TokenBucket(rate)

        while self.piece_file.has_unalloc():
            # Only peers that are not already being downloaded from.
            candidates = [
                peer for peer in peers if peer not in self._peer_in_conn]
            for peer in candidates[:pool_q_size]:
                # The peer tuple is passed as apply_async's positional
                # ``args`` and None as ``kwds``, exactly as before.
                pool.apply_async(self._download_piece_thread, peer, None)

            # update peers if peer run out
            while pool.full():
                gevent.sleep(0.2)

            if not self.piece_file.has_empty():
                pool.join()

            logger.debug(
                'test get_empty_block: %s' %
                self.piece_file.get_empty_piece())

            logger.debug('peer in connection: %s' % self._peer_in_conn)
            if self.piece_file.has_unalloc():
                try:
                    # Throttle peer lookups to one per GET_PEER_INTERVAL.
                    tv = self._get_last_get_peer_tv()
                    if tv < GET_PEER_INTERVAL:
                        gevent.sleep(GET_PEER_INTERVAL - tv)
                    g = gevent.spawn(self.get_peers)
                    peers = g.get()
                    self._record_get_peer_ts()
                except NoPeersFound:
                    if pool_work_num - pool.free_count() > 0:
                        # some remained piece maybe on the way
                        pool.join()
                        if self.piece_file.has_unalloc():
                            # Same throttle as above. The comparison used
                            # to be inverted (tv > GET_PEER_INTERVAL),
                            # which only ever requested a negative sleep.
                            tv = self._get_last_get_peer_tv()
                            if tv < GET_PEER_INTERVAL:
                                gevent.sleep(GET_PEER_INTERVAL - tv)
                            g = gevent.spawn(self.get_peers)
                            peers = g.get()
                            self._record_get_peer_ts()
                        else:
                            break
                    else:
                        logger.error("no worker running, and get no peers")
                        # Re-raised to the outer handler (fallback path).
                        raise
            else:
                break

        logger.info('File is complete, size: %s' %
                    self.piece_file.get_real_filesize())

    except NoPeersFound:
        if self.fallback:
            logger.info('Use fallback way to get resouce')
            try:
                res_length = get_res_length(self.res_url)
            except ConnectionError:
                raise OriginURLConnectError(self.res_url)
            logger.info(
                'get resource length %s' %
                sizeof_fmt_human(res_length))
            if not self.piece_file:
                self.piece_file = PieceFile(res_length, filepath)
            self.start_ready_upload_thread()
            http_download_to_piecefile(
                self.res_url, self.piece_file)
        else:
            self.deactivate()
            raise

    # self.piece_file.tofile()
    self.res_is_downloaded = True
    if callback:
        logger.info('Run callback')
        # cb_kwargs defaults to None, which cannot be **-unpacked.
        callback(**(cb_kwargs or {}))
def log_qsize(wait):
    """Log the number of busy pool slots every ``wait`` seconds, forever."""
    template = u'{}: number of concurrent delete tasks = {}'
    while True:
        message = template.format(group_id, pool_size - pool.free_count())
        log.info(message)
        gevent.sleep(wait)
(response.status_code, url)) except gevent.queue.Empty: print('queue empty') break if __name__ == '__main__': if len(sys.argv) != 3: print('USAGE:\n\t%s <base_url> <entry_path>' % sys.argv[0]) sys.exit(1) if validators.url(sys.argv[1]) != True: print('Invalid Url') sys.exit(1) queue.put(getUrl(sys.argv[2])) pool.spawn(crawler) while 1: if queue.empty() and pool.free_count() == WORKER_COUNT: print('No more links left and nothing running') break for x in range(0, min(queue.qsize(), pool.free_count())): pool.spawn(crawler) gevent.sleep(0.1) # Wait for everything to complete pool.join()