def test_readline_size_zero():
    """Exercise ``readline`` on a pipe with size 0, size 1 and no size.

    A writer greenlet keeps writing ``hello`` lines until a write raises
    ``IOError``; a reader greenlet performs the three ``readline`` checks and
    then closes its end of the pipe.
    """
    read_end, write_end = pipe()

    def writer():
        try:
            print("writing lines...")
            while True:
                write_end.write("hello\n")
        except IOError:
            # The loop only ends when a write fails.
            pass
        print("writer bye bye")

    def reader():
        print("reading line of size 0...")
        assert read_end.readline(0) == ""
        print("reading line of size 1...")
        assert read_end.readline(1) == "h"
        print("reading line normal...")
        assert read_end.readline() == "ello\n"
        print("reader close")
        read_end.close()
        print("reader bye bye")

    workers = Pool()
    for job in (reader, writer):
        workers.spawn(job)
    # raise_error=True surfaces assertion failures from either greenlet.
    workers.join(raise_error=True)
def test_raw_events_queue_consumer_waits_streams(self, _get_all_streams_exhausted, _get_min_timestamp, sleep):
    """The consumer emits queued events in timestamp order and sleeps twice.

    The three extra parameters have ``side_effect`` / ``call_args_list`` used
    on them, so they are presumably mocks injected by decorators outside this
    view.
    """
    _get_min_timestamp.side_effect = [5, 5, 6, 7, 8, 9, 10]
    _get_all_streams_exhausted.side_effect = [False] * 5 + [True] * 2
    self.aws.stream_status = {
        ('A', 'B'): self.aws.ACTIVE,
        ('A', 'C'): self.aws.EXHAUSTED,
    }

    # Events are deliberately queued out of timestamp order.
    queued = (
        (8, {'message': 'Hello 8'}),
        (7, {'message': 'Hello 7'}),
        (9, {'message': 'Hello 9'}),
        (6, {'message': 'Hello 6'}),
    )
    for entry in queued:
        self.aws.raw_events_queue.put(entry)

    consumer_pool = Pool(size=1)
    consumer_pool.spawn(self.aws._raw_events_queue_consumer)
    consumer_pool.join()

    for expected in ('Hello 6\n', 'Hello 7\n', 'Hello 8\n', 'Hello 9\n', NO_MORE_EVENTS):
        self.assertEqual(self.aws.events_queue.get(), expected)
    self.assertTrue(self.aws.events_queue.empty())
    self.assertEqual(sleep.call_args_list, [call(0.3), call(0.3)])
class GServer(ProtoBufRPCServer):
    """RPC server on a ZeroMQ ROUTER socket; each request runs in a greenlet pool."""

    def __init__(self, host, port, service, poolsize=128):
        """Bind a ROUTER socket on ``tcp://host:port``.

        :param host: interface to bind to
        :param port: TCP port to bind to
        :param service: service object stored on ``self.service``
        :param poolsize: maximum number of concurrently handled requests
        """
        self.gpool = Pool(poolsize)
        self.stop_event = Event()
        context = zmq.Context()
        self.port = port
        self.socket = context.socket(zmq.ROUTER)
        self.socket.bind("tcp://%s:%s" % (host, port))
        self.service = service

    def serve_forever(self,):
        """Receive multipart messages until ``shutdown`` is called.

        :raises zmq.ZMQError: if receiving fails while the socket is still open
        """
        while not self.stop_event.is_set():
            try:
                msg = self.socket.recv_multipart()
            except zmq.ZMQError:
                # A closed socket is the normal shutdown path.
                if self.socket.closed:
                    break
                # Bare raise: the original ``raise e`` referenced an unbound
                # name and masked the real error with a NameError.
                raise
            self.gpool.spawn(self.handle_request, msg)

    def shutdown(self,):
        """Close the socket and flag the serve loop to stop."""
        self.socket.close()
        self.stop_event.set()

    def handle_request(self, msg):
        """Handle one ``[id, '', request]`` message and send the serialized reply."""
        assert len(msg) == 3
        (id_, null, request) = msg
        assert null == ''
        response = self.handle(request)
        self.socket.send_multipart([id_, null, response.SerializeToString()])
def test_big_data():
    """Push roughly 1 MB through a pipe and read it back in 4096-byte chunks."""
    pr, pw = pipe()
    big = "x" * 1024 * 962
    big += "END"
    print("big data size %d" % len(big))

    def writer():
        print("writing...")
        pw.write(big)
        print("writer bye bye")

    def reader():
        print("reading...")
        # divmod makes the floor division explicit instead of relying on
        # Python 2's integer ``/``.
        full_chunks, remainder = divmod(len(big), 4096)
        chunks = [pr.read(4096) for _ in xrange(full_chunks)]
        chunks.append(pr.read(remainder))
        # Join once rather than growing a string with += on every chunk.
        data = "".join(chunks)
        assert len(data) == len(big)
        assert data[-3:] == "END"
        print("reader bye bye")

    p = Pool()
    p.spawn(reader)
    p.spawn(writer)
    # raise_error=True surfaces assertion failures from the reader greenlet.
    p.join(raise_error=True)
def test_raw_events_queue_consumer_exit_if_exhausted(self):
    """With every stream exhausted, the consumer emits only NO_MORE_EVENTS."""
    self.aws.stream_status = {('A', 'B'): self.aws.EXHAUSTED}

    consumer_pool = Pool(size=1)
    consumer_pool.spawn(self.aws._raw_events_queue_consumer)
    consumer_pool.join()

    events = self.aws.events_queue
    self.assertEqual(events.get(), NO_MORE_EVENTS)
    self.assertTrue(events.empty())
def main():
    """Spawn a ``_warm`` job for every key under ``search/v2/`` in the image bucket.

    Logs an error and returns without doing anything unless exactly one bucket
    named ``stamped.com.static.images`` is found.
    """
    options, args = parseCommandLine()

    bucket_name = 'stamped.com.static.images'
    conn = S3Connection(keys.aws.AWS_ACCESS_KEY_ID, keys.aws.AWS_SECRET_KEY)

    matches = [b for b in conn.get_all_buckets() if b.name == bucket_name]
    if 1 != len(matches):
        utils.log("error finding bucket to warm cache with")
        return

    pool = Pool(64)
    bucket = matches[0]

    result = list(bucket.list(prefix='search/v2/'))
    # The original logged len(reslt), an undefined name, and crashed here.
    utils.log("warming %d keys" % len(result))

    for count, key in enumerate(result, 1):
        pool.spawn(_warm, key, options)
        if 0 == (count % 100):
            utils.log("warmed %d keys" % count)

    pool.join()
def start_subscribers(self):
    """Spawn the subscriber listeners and the message-processing workers.

    One listener greenlet is spawned per entry of ``self.sockets`` (for the
    first ``self.num_of_processes`` indexes) and 5000 worker greenlets running
    ``send_msg_to_user_socket``; all of them share one unbounded queue. The
    method does not join either pool.
    """
    shared_queue = gevent.queue.Queue(None)

    listener_pool = GPool(self.num_of_processes)
    for index in xrange(self.num_of_processes):
        listener_pool.spawn(self.start_listening_subscriber,
                            self.sockets[index],
                            shared_queue)

    worker_pool = GPool(5000)
    for _ in xrange(5000):
        worker_pool.spawn(self.send_msg_to_user_socket, shared_queue)
def main():
    """Spawn ``func1`` into a 20-slot pool every two seconds, forever."""
    print(rclient.get('f1'))

    workers = Pool(20)
    start('f1')

    # Runs until the process is interrupted.
    while 1:
        workers.spawn(func1)
        free_slots = workers.free_count()
        print(free_slots)
        gevent.sleep(2)
def concu():
    """Run ``down`` for every entry of ``urls`` and wait for all of them."""
    downloads = Pool(10)  # at most 10 concurrent greenlets
    for target in urls:
        downloads.spawn(down, target)
    downloads.join()
def gevent_case():
    """Run ``ping`` through a pool ``map`` and then through ten bare greenlets."""
    workers = Pool(8)
    # NOTE(review): the iterable is a one-element tuple, so ping is called once
    # with the whole range(8) object - confirm that this is intended rather
    # than mapping over range(8) itself.
    single_batch = (range(8),)
    workers.map(ping, single_batch)

    greenlets = []
    for index in range(10):
        greenlets.append(gevent.spawn(ping, str(index)))
    gevent.joinall(greenlets)
def watch_pr():
    """Yield pull requests returned by ``get_new_pr`` for every watched repo.

    The stored ``github_pr.last_updated`` timestamp is read and immediately
    replaced with the current UTC time. Nothing is yielded on the first run
    (no stored timestamp) or when no chat watches any repository. Each yielded
    PR dict gets a ``chat_ids`` entry listing the chats watching its repo.
    """
    state_key = 'github_pr.last_updated'
    date_format = '%Y-%m-%dT%H:%M:%SZ'

    state = State()
    stored = state.get(state_key)
    state.set(state_key, datetime.utcnow().strftime(date_format))
    if stored is None:
        # first run
        return
    last_updated = datetime.strptime(stored, date_format)

    # Map each watched repo (as a tuple) to the chats watching it.
    watched = defaultdict(list)
    for chat_id in get_module_chat_ids(module_name):
        conf = get_chat_conf(chat_id, module_name, default_options)
        for repo in conf['watch_list']:
            watched[tuple(repo)].append(chat_id)
    if not watched:
        return

    fetch_since = partial(get_new_pr, last_updated)
    for batch in Pool(10).imap_unordered(fetch_since, watched):
        for pr in batch:
            repo_key = get_repo(pr['base']['repo']['full_name'])
            pr['chat_ids'] = watched[repo_key]
            yield pr
def crawl_listing(addrs):
    """Spread listing-crawl RPC calls for leaf categories across ``addrs``.

    One zerorpc client is created per address; each matching category document
    is sent to a randomly chosen client, with up to 30 concurrent calls per
    address.
    """
    pool = Pool(30 * len(addrs))
    clients = []
    for addr in addrs:
        clients.append(zerorpc.Client(addr))

    query = {'leaf': 1, 'num': {'$exists': True}}
    for category in col_cats.find(query, fields=['url', 'catstr']):
        remote_call = random.choice(clients).crawl_listing
        pool.spawn(remote_call, category['url'], category['catstr'])
        progress()
    pool.join()
def process(s):
    """Convert an HTML fragment to the target markup and rehost its images.

    Resets the module-level ``urls`` dict, applies the tag rules to ``s``,
    optionally rehosts every collected URL, substitutes the URLs back into the
    text and returns the result with HTML entities decoded and whitespace
    stripped.

    NOTE(review): ``urls`` is reset here but never filled in this function; it
    is presumably populated by ``proctag`` during substitution - confirm.
    """
    global urls
    urls = {}
    # Hackity hack.
    # Keeps only the text before the first 'class="attach' and drops the
    # trailing (unfinished) tag that preceded it.
    s = s.split('class="attach')[0].split('<')
    s.pop()
    s = '<'.join(s)
    # Cut out bad tags.
    for t in SKIP_TAGS:
        s = re.sub(FLAGS + '\s*<(?P<tag>' + t + ').*?</(?P=tag)>\s*', '', s)
    # Apply simple rules.
    for (k, r) in SIMPLE_RULES:
        s = re.sub(FLAGS + k, r, s)
    # Close tags that should be closed, leave already closed as-is
    for t in CLOSED_TAGS:
        s = re.sub(FLAGS + r'<({0}[^>]*?)/?>'.format(t), r'<\1/>', s)
        # Maybe this is overkill, but why not.
        s = s.replace('</{0}>'.format(t), '')
    # Apply complex rules.
    (s, n) = ntag_re.subn(proctag, s)
    # m accumulates the total replacement count; n is forced to 1 so that the
    # loop below runs at least once and then repeats until nothing matches.
    m, n = n, 1
    while n > 0:
        (s, n) = ptag_re.subn(proctag, s)
        m += n
    # Strip out any HTML leftovers.
    s = re.sub('<[^>]+>','',s)
    if m > 0:
        print('Replaced {0} tags'.format(m))
    if not args.no_rehost and len(urls) > 0:
        def print_urls(a, b):
            # Only report URLs that actually changed.
            if a != b:
                print('{0} >> {1}'.format(a, b))
        print('Processing {0} URLs...'.format(len(urls)))
        # Rehost images.
        if gevent:
            pool = Pool(POOL_SIZE)
            def fin(h, url):
                # Build the completion callback for one greenlet; h and url are
                # bound per call so each callback updates its own entry.
                def f(g):
                    urls[h] = g.value
                    print_urls(url, g.value)
                return f
            for h, url in urls.iteritems():
                j = pool.spawn(rehost, url, image=True, referer=target_root)
                j.link_value(fin(h, url))
            pool.join()
        else:
            # Sequential fallback when gevent is falsy.
            for h, url in urls.iteritems():
                new_url = rehost(url, image=True, referer=target_root)
                urls[h] = new_url
                print_urls(url, new_url)
    # Bring URLs back in places.
    imgs = 0
    for p, url in urls.iteritems():
        if hashurl(url) != p:
            imgs += 1
        s = s.replace(p, urls[p])
    if imgs > 0:
        print('Found and replaced {0} images'.format(imgs))
    return decode_html_entities(s).strip()
def test_proxies(proxies, timeout=10, single_url=None, many_urls=None, call_back=None):
    """Probe every proxy concurrently and return the ones that answered 200.

    :type proxies: list
    :param proxies: proxies to test
    :param timeout: seconds allowed for each probe
    :param single_url: URL to request when ``many_urls`` is not given
    :param many_urls: list of URLs; one is picked at random for each probe
    :param call_back: called as ``call_back(url, source)`` on a 200 response
    :return: list of proxies that did not fail
    """
    candidates = set(proxies)
    failed = set()
    workers = Pool(100)

    def probe(proxy):
        status = None
        if many_urls is not None:
            target = random.choice(many_urls)
        else:
            target = single_url
        began = time.time()
        try:
            with gevent.Timeout(seconds=timeout, exception=Exception('[Connection Timeout]')):
                _headers['User-Agent'] = random.choice(_user_agents)
                routes = {
                    'http': 'http://{}'.format(proxy.strip()),
                    'https': 'https://{}'.format(proxy.strip()),
                }
                res = requests.get(target, proxies=routes, headers=_headers)
                status = res.status_code
                source = res.text
                _log('[Proxy: {:d} {:s}]'.format(status, proxy))
                # callback
                if source is not None and call_back is not None and status == 200:
                    call_back(target, source)
                if status != 200:
                    failed.add(proxy)
        except Exception:
            # Any failure, including the timeout above, marks the proxy as bad.
            failed.add(proxy)
        finished = time.time()
        # Elapsed time is only recorded when a status code was obtained.
        escaped = finished - began if status else None
        store_in_db(proxy, escaped=escaped, status_code=status)

    for proxy in candidates:
        workers.spawn(probe, proxy)
    workers.join()

    alive = candidates - failed
    _log('[HTTP Proxies] Available:{:d} Deprecated:{:d}'.format(len(alive), len(failed)))
    return list(alive)
class Task:
    """Feed argument tuples from a queue into a bounded greenlet pool."""

    def __init__(self, queue, pool_max=100):
        """Store the queue and create a pool of ``pool_max`` slots."""
        self.work = None
        self.pool_max = pool_max
        self.pool = Pool(pool_max)
        self.queue = queue

    def initTaskWork(self, func):
        """Set the callable that ``start`` spawns for each queued item."""
        self.work = func

    def start(self):
        """Spawn ``self.work(*item)`` for each queued item.

        Returns once the queue is empty and either the pool is completely free
        or the queue reports ``isLock``; otherwise yields and checks again.
        """
        while True:
            if self.queue.empty():
                pool_idle = self.pool.free_count() == self.pool.size
                if pool_idle or self.queue.isLock:
                    return
                # Queue is empty but jobs are still running: yield and re-check.
                sleep(0)
                continue
            job_args = self.queue.pop()
            self.pool.spawn(self.work, *job_args)

    def stop(self):
        """Lock the queue, push back the args of pooled jobs, then save and clear."""
        # Locked: items may still be pushed, but nothing is taken out any more.
        self.queue.lock(True)
        for running in self.pool:
            self.queue.push(list(running.args))
        self.queue.save()
        self.queue.clear()
def parallel_map(func, iterable, args=None, kwargs=None, workers=None):
    """Apply ``func`` to every element of ``iterable`` in gevent greenlets.

    :param func: function applied on iterable elements
    :type func: function
    :param iterable: elements to map the function over
    :type iterable: iterable
    :param args: extra positional arguments passed to func after the element
    :type args: tuple
    :param kwargs: keyword arguments passed to func
    :type kwargs: dict
    :param workers: maximum number of greenlets running at once; unbounded
        when None
    :type workers: int
    :return: list of results, in input order
    """
    extra_args = () if args is None else args
    extra_kwargs = {} if kwargs is None else kwargs

    # A Pool caps concurrency; a Group imposes no limit.
    runner = Pool(workers) if workers is not None else Group()

    greenlets = [runner.spawn(func, element, *extra_args, **extra_kwargs)
                 for element in iterable]
    runner.join(raise_error=True)

    results = []
    for greenlet in greenlets:
        outcome = greenlet.get()
        # A worker that *returned* an exception object has it raised here.
        if isinstance(outcome, BaseException):
            raise outcome
        results.append(outcome)
    return results
def postcommit_after_request(response, base_status_error_code=500):
    """Run the queued post-commit callbacks, then return ``response`` unchanged.

    When the response status is at or above ``base_status_error_code`` both
    queues are reset and nothing is run. Otherwise plain callbacks run in a
    greenlet pool and celery tasks are either dispatched asynchronously or
    called inline, depending on ``settings.USE_CELERY``.
    """
    if response.status_code >= base_status_error_code:
        # Error response: discard everything that was queued.
        _local.postcommit_queue = OrderedDict()
        _local.postcommit_celery_queue = OrderedDict()
        return response

    try:
        if postcommit_queue():
            # one db connection per greenlet, let's share
            greenlet_pool = Pool(30)
            for callback in postcommit_queue().values():
                greenlet_pool.spawn(callback)
            # 5 second timeout and reraise exceptions
            greenlet_pool.join(timeout=5.0, raise_error=True)

        if postcommit_celery_queue():
            queued = postcommit_celery_queue().values()
            if settings.USE_CELERY:
                for signature_dict in queued:
                    Signature.from_dict(signature_dict).apply_async()
            else:
                for callable_task in queued:
                    callable_task()
    except AttributeError as ex:
        if not settings.DEBUG_MODE:
            logger.error('Post commit task queue not initialized: {}'.format(ex))

    return response
def test_wait(self):
    """Run the waiter and the event firer together; the waiter must return truthy."""
    greenlets = Pool()
    waiter = greenlets.spawn(self.run_wait)
    greenlets.spawn(self.fire_event)
    greenlets.join()
    assert waiter.value, 'Event not fired in while it was waited.'
def test_cancel(self):
    """Run the canceller and the event firer together; the canceller must return truthy."""
    greenlets = Pool()
    canceller = greenlets.spawn(self.run_cancel)
    greenlets.spawn(self.fire_event)
    greenlets.join()
    assert canceller.value, 'Event fired while it was canceled.'
def validate_character(self, server_id, character_name, linkshell_names):
    """Look up linkshell URLs, then character URLs for the linkshells found.

    Linkshell URLs are resolved one after another; character URLs are resolved
    concurrently, only for linkshells that have a URL.

    :return: list of dicts with ``ls_name``, ``ls_url`` and ``char_url`` keys;
        ``ls_url`` is None when the lookup result was falsy, and ``char_url``
        stays None for those entries
    """
    results = []
    pool = Pool()

    # Find every linkshell URL.
    for name in linkshell_names:
        found_url = self.find_linkshell_url(server_id, name)
        results.append(dict(ls_name=name,
                            ls_url=found_url if found_url else None,
                            char_url=None))

    def attach_character_url(entry):
        # Fill in the character URL on the entry in place.
        entry['char_url'] = self.find_character_url(character_name, entry['ls_url'])

    for entry in results:
        if entry.get('ls_url'):
            pool.spawn(attach_character_url, entry)
    pool.join()

    return results
def getgome(cat):
    """Fetch every product id of a gome.com.cn category.

    The first result page is requested to learn the page count, then all pages
    are fetched concurrently. The whole procedure is attempted up to three
    times. Updates the global ``jishu`` progress counter and writes progress to
    stderr.

    :param cat: category id
    :return: newline-separated product ids with a trailing newline, or None if
        all three attempts failed
    """
    global jishu

    # '&paramJson=' had been mangled into a pilcrow ('&para' read as an HTML
    # entity), which broke the query string.
    prefix = 'http://www.gome.com.cn/p/json?module=async_search&paramJson={%22pageNumber%22%3A'
    middle = '%2C%22envReq%22%3A{%22catId%22%3A%22'
    suffix = ('%22%2C%22regionId%22%3A%2231010100%22%2C%22et%22%3A%22%22%2C'
              '%22XSearch%22%3Afalse%2C%22pageNumber%22%3A1%2C%22pageSize%22%3A48}}')

    def page_url(page):
        # Build the search URL for one result page of this category.
        return ''.join((prefix, str(page), middle, str(cat), suffix))

    def ff(url):
        # Retry one page until it parses.
        # NOTE(review): a page that never parses is retried forever, as in the
        # original; consider bounding the retries.
        while 1:
            try:
                r = requests.get(url, timeout=3)
                return '\n'.join([p['pId'] for p in r.json()['products']])
            except Exception:
                # Exception (not a bare except) so KeyboardInterrupt and
                # greenlet kills still propagate.
                continue

    for _attempt in range(3):
        try:
            r = requests.get(page_url(1))
            totalpage = int(r.json()['num']['totalPage'])
            urls = [page_url(page) for page in xrange(1, totalpage + 1)]
            pp = Pool(30)
            ss = pp.map(ff, urls)
            jishu += 1
            sys.stderr.write(str(jishu) + ' / ' + zongshu + '\r')
            return '\n'.join(ss) + '\n'
        except Exception:
            continue
    return None
def main():
    """Run ``process(n)`` for n in 1..UPTO-1 through a pool, then print ``cnt``."""
    workers = Pool(UPTO)
    n = 1
    while n < UPTO:
        workers.apply_async(process, args=(n,))
        n += 1
    workers.join()
    print(cnt)
class Zerg:
    """Run one command over SSH on many hosts through a greenlet pool."""

    def __init__(self, hosts, username, key, max_threads=2):
        """Load the RSA private key file ``key`` and create the worker pool."""
        self.hosts = hosts
        self.username = username
        self.key = paramiko.RSAKey.from_private_key_file(key)
        self.pool = Pool(max_threads)
        self.connections = []

    def _connect(self, host):
        """Open an SSH connection to ``host`` and remember the client."""
        ssh = paramiko.SSHClient()
        ssh.set_missing_host_key_policy(paramiko.WarningPolicy())
        ssh.connect(host, username=self.username, pkey=self.key)
        self.connections.append(ssh)

    def connect(self):
        """Connect to every configured host."""
        self.pool.map(self._connect, self.hosts)

    def _command(self, cmd, conn):
        """Run ``cmd`` on ``conn``; return ``(exit status, stdout lines)``."""
        _stdin, stdout, _stderr = conn.exec_command(cmd)
        exit_status = stdout.channel.recv_exit_status()
        return exit_status, stdout.read().splitlines()

    def command(self, cmd):
        """Run ``cmd`` on every open connection; return the list of results."""
        def run_on(connection):
            return self._command(cmd, connection)

        return self.pool.map(run_on, self.connections)
def tracks(self):
    """Return the de-duplicated tracks of every album in ``self.albums``.

    Each album is looked up concurrently through ``self.spotify``. Tracks are
    keyed by their ``href`` and the first occurrence wins.

    :return: list of dicts with ``key``, ``name`` and, when provided,
        ``length`` (an int)
    """
    tracks = {}

    def lookupTrack(key):
        result = self.spotify.lookup(key, 'trackdetail', priority='low', timeout=MERGE_TIMEOUT)
        for track in result['album']['tracks']:
            track_key = track['href']
            if track_key in tracks:
                continue
            data = {
                'key': track_key,
                'name': track['name'],
            }
            try:
                # (travis): as of 4/3/12, track length is only sometimes returned by spotify
                # The original had a trailing comma here, which stored a
                # 1-tuple instead of the integer.
                data['length'] = int(track['length'])
            except KeyError:
                pass
            tracks[track_key] = data

    size = min(1 + len(self.albums), 20)
    pool = Pool(size)
    for album in self.albums:
        pool.spawn(lookupTrack, album['key'])
    pool.join()

    return list(tracks.values())
def sc_process(pid, p_start, p_end):
    """Spawn ``sc_worker`` for every share id in ``[p_start, p_end)``.

    Five result files are opened for this ``pid`` (200, 302, max, others, log)
    and handed to every worker as a list in that order. Progress is written to
    the log file every ``step`` ids. The files are closed even when an error
    occurs, after waiting for workers that were already spawned.
    """
    prefixes = (list_200_pre, list_302_pre, list_max_pre, list_others_pre, list_log_pre)
    file_list = []
    try:
        for prefix in prefixes:
            file_list.append(open(res_folder + prefix + str(pid) + file_ext, 'w'))
        file_log = file_list[-1]

        s = requests.Session()
        pool = Pool(pool_size)
        try:
            for i in xrange(p_start, p_end):
                if i % step == 0:
                    file_log.write('%i/%i\n' % (i, p_end))
                    file_log.flush()
                shareid = i
                url = url_tpl % shareid
                pool.spawn(sc_worker, pid, s, shareid, url, file_list)
        finally:
            # Let spawned workers finish before their files are closed.
            pool.join()
    finally:
        for handle in file_list:
            handle.close()
def handle(self, *args, **options):
    """Fetch data for every station over the period named by ``args[0]``.

    Prints the usage line and exits when the period argument is missing or is
    not a key of ``self.ZEITRAUM``. Station options are read from the initial
    page and one ``fetchStation`` greenlet is spawned per station.
    """
    if len(args) < 1 or args[0] not in self.ZEITRAUM.keys():
        self.stdout.write("Usage: manage.py fetch {24H|48H|7D|1M|3M|6M|1Y}")
        sys.exit(0)
    period = args[0]

    gevent.monkey.patch_socket()
    params = {}
    self.s = requests.Session()
    self.tz = timezone.get_current_timezone()

    response, soup = self.post(params)
    for option in soup.find(id=self.STATION_ID).find_all('option'):
        self.STATIONEN[option.string] = option['value']
    params[self.TARGET_KEY] = self.STATION_KEY

    # One pool slot per station, so every station is fetched at once.
    stationPool = Pool(len(self.STATIONEN))
    self.inv_stations = self.invert_dict(self.STATIONEN)
    self.inv_schadstoff = self.invert_dict(self.SCHADSTOFFE)

    for station_value in self.STATIONEN.values():
        station_params = params.copy()
        station_params[self.STATION_KEY] = station_value
        stationPool.spawn(self.fetchStation, station_params, period)
    stationPool.join()
def gen():
    """Yield items from all sources, one offset at a time across sources.

    Every entry of ``source_functions`` is called concurrently, waiting at most
    ``initial_timeout``; sources that came back as None are dropped. Each
    remaining source is then queried as ``source(offset, 1)`` for increasing
    offsets until a full round over the sources yields nothing.
    """
    try:
        starters = Pool(len(source_functions))
        ready = []

        def _collect(factory):
            # Keep only the sources that were actually produced.
            produced = factory()
            if produced is not None:
                ready.append(produced)

        for factory in source_functions:
            starters.spawn(_collect, factory)
        starters.join(timeout=initial_timeout)

        offset = 0
        while True:
            yielded_any = False
            for source in ready:
                for item in source(offset, 1):
                    yielded_any = True
                    yield item
            offset += 1
            if not yielded_any:
                break
    except GeneratorExit:
        # The consumer closed the generator early; nothing to clean up.
        pass
def api_jobs():
    '''
    Main JSON view: returns the tasks of every non-archived project in the
    workspace.

    Project tasks are fetched in parallel through a gevent pool (31 at a
    time). On any exception the error text is returned instead of JSON.
    '''
    try:
        asana = SimpleAsana(app.config['API_KEY'])

        def tasks_for(project):
            return get_project_tasks(app.config['API_KEY'], project)

        projects = asana.workspace_projects(
            app.config['WORKSPACE'],
            cachetime=4000,
            as_type='dict',
            opt_fields='name,team,archived,notes')

        active_projects = [proj for proj in projects if not proj['archived']]
        task_lists = Pool(31).map(tasks_for, active_projects)

        all_tasks = []
        for task_list in task_lists:
            all_tasks.extend(task_list)
        return jsonify({"tasks": all_tasks})
    except Exception as e:  # pylint: disable=broad-except
        return str(e)
def test_close_writer():
    """A size-less ``read`` returns everything written once the writer closes."""
    read_end, write_end = pipe()
    payload = "x" * 1024 * 50
    print("big data size %d" % len(payload))

    def writer():
        print("writing, first round...")
        write_end.write(payload)
        print("writing, second round...")
        write_end.write(payload)
        print("writing, end tag...")
        write_end.write("END")
        print("writter close")
        write_end.close()
        print("writer bye bye")

    def reader():
        print("reading all...")
        received = read_end.read()
        assert len(received) == len(payload) * 2 + 3
        assert received[-3:] == "END"
        print("reader bye bye")

    greenlets = Pool()
    for job in (reader, writer):
        greenlets.spawn(job)
    greenlets.join(raise_error=True)
class GEvent2Worker(Worker):
    """Worker that serves requests through gevent's ``core.http`` in a greenlet pool."""

    # WSGI environ entries shared by every request.
    base_env = {
        'GATEWAY_INTERFACE': 'CGI/1.1',
        'SERVER_SOFTWARE': 'gevent/%s gunicorn/%s' % (gevent.__version__, gunicorn.__version__),
        'SCRIPT_NAME': '',
        'wsgi.version': (1, 0),
        'wsgi.url_scheme': 'http',
        'wsgi.multithread': False,
        'wsgi.multiprocess': True,
        'wsgi.run_once': False
    }

    def __init__(self, *args, **kwargs):
        super(GEvent2Worker, self).__init__(*args, **kwargs)
        self.worker_connections = self.cfg.worker_connections
        # The pool is created in run().
        self.pool = None

    @classmethod
    def setup(cls):
        """Monkey-patch the standard library (DNS excluded)."""
        from gevent import monkey
        monkey.patch_all(dns=False)

    def handle_request(self, req):
        """Callback for incoming requests: handle ``req`` in a pooled greenlet."""
        self.pool.spawn(self.handle, req)

    def handle(self, req):
        """Process one request through ``WSGIHandler``."""
        WSGIHandler(req).handle(self)

    def run(self):
        """Accept requests until the worker stops or its parent changes."""
        self.socket.setblocking(1)

        # Per-instance copy of the environ with the bound address added.
        self.base_env = dict(
            self.base_env,
            SERVER_NAME=self.address[0],
            SERVER_PORT=str(self.address[1]),
        )

        http = core.http()
        http.set_gencb(self.handle_request)
        self.pool = Pool(self.worker_connections)
        self.application = self.wsgi
        acceptor = gevent.spawn(http.accept, self.socket.fileno())

        try:
            while self.alive:
                self.notify()
                parent_changed = self.ppid != os.getppid()
                if parent_changed:
                    self.log.info("Parent changed, shutting down: %s" % self)
                    gevent.kill(acceptor)
                    break
                gevent.sleep(0.1)
            # Give in-flight requests up to self.timeout to finish.
            self.pool.join(timeout=self.timeout)
        except KeyboardInterrupt:
            pass
def init(self, inventory, config, initial_limit=None):
    """Validate ``config`` and initialise this state for ``inventory``.

    Checks the pyinfra version and package requirements declared on the
    config, works out ``config.PARALLEL`` when it is unset, creates the
    greenlet pools and builds the per-host bookkeeping dicts.

    :param inventory: iterable of hosts; ``len()`` is also taken of it
    :param config: config object, or None to use a default ``Config()``
    :param initial_limit: stored as ``self.limit_hosts``
    :raises PyinfraError: when the running pyinfra version or the required
        packages do not satisfy the config
    """
    # Config validation
    #

    # If no config, create one using the defaults
    if config is None:
        config = Config()

    # Error if our min version is not met
    if config.MIN_PYINFRA_VERSION is not None:
        # TODO: remove this
        if config.REQUIRE_PYINFRA_VERSION is None:
            # Translate the legacy setting into the new requirement format.
            config.REQUIRE_PYINFRA_VERSION = '>={0}'.format(
                config.MIN_PYINFRA_VERSION)
            logger.warning(
                '`MIN_PYINFRA_VERSION` is deprecated, please use `REQUIRE_PYINFRA_VERSION`.',
            )
        else:
            logger.warning(
                'Ignoring legacy `MIN_PYINFRA_VERSION` because '
                '`REQUIRE_PYINFRA_VERSION` also exists.',
            )

    if config.REQUIRE_PYINFRA_VERSION is not None:
        running_version = parse_version(__version__)
        required_versions = Requirement.parse(
            'pyinfra{0}'.format(config.REQUIRE_PYINFRA_VERSION),
        )

        if running_version not in required_versions:
            raise PyinfraError(('pyinfra version requirement not met '
                                '(requires {0}, running {1})').format(
                config.REQUIRE_PYINFRA_VERSION,
                __version__,
            ))

    if config.REQUIRE_PACKAGES is not None:
        # Either an inline list/tuple of requirements, or the name of a
        # requirements file relative to self.deploy_dir.
        if isinstance(config.REQUIRE_PACKAGES, (list, tuple)):
            requirements = config.REQUIRE_PACKAGES
        else:
            with open(path.join(self.deploy_dir, config.REQUIRE_PACKAGES)) as f:
                # For lines containing '#egg=', keep only the part after it.
                requirements = [
                    line.split('#egg=')[-1]
                    for line in f.read().splitlines()
                ]

        try:
            require(requirements)
        except ResolutionError as e:
            raise PyinfraError(
                'Deploy requirements ({0}) not met: {1}'.format(
                    config.REQUIRE_PACKAGES, e,
                ))

    if not config.PARALLEL:
        # TODO: benchmark this
        # In my own tests the optimum number of parallel SSH processes is
        # ~20 per CPU core - no science here yet, needs benchmarking!
        cpus = cpu_count()
        ideal_parallel = cpus * 20

        # Never exceed the inventory size, nor MAX_PARALLEL when it is set.
        config.PARALLEL = (min(ideal_parallel, len(inventory), MAX_PARALLEL)
                           if MAX_PARALLEL is not None
                           else min(ideal_parallel, len(inventory)))

    # If explicitly set, just issue a warning
    elif MAX_PARALLEL is not None and config.PARALLEL > MAX_PARALLEL:
        logger.warning((
            'Parallel set to {0}, but this may hit the open files limit of {1}.\n'
            '    Max recommended value: {2}').format(
            config.PARALLEL, nofile_limit, MAX_PARALLEL))

    # Actually initialise the state object
    #

    self.callback_handlers = []

    # Setup greenlet pools
    self.pool = Pool(config.PARALLEL)
    self.fact_pool = Pool(config.PARALLEL)

    # Connection storage
    self.ssh_connections = {}
    self.sftp_connections = {}

    # Private keys
    self.private_keys = {}

    # Assign inventory/config
    self.inventory = inventory
    self.config = config

    # Hosts we've activated at any time
    self.activated_hosts = set()
    # Active hosts that *haven't* failed yet
    self.active_hosts = set()
    # Hosts that have failed
    self.failed_hosts = set()

    # Limit hosts changes dynamically to limit operations to a subset of hosts
    self.limit_hosts = initial_limit

    # Op basics
    self.op_line_numbers_to_hash = {}
    self.op_meta = {}  # maps operation hash -> names/etc
    self.ops_run = set()  # list of ops which have been started/run

    # Op dict for each host
    self.ops = {host: {} for host in inventory}

    # Facts dict for each host
    self.facts = {host: {} for host in inventory}

    # Meta dict for each host
    self.meta = {
        host: {
            'ops': 0,  # one function call in a deploy file
            'commands': 0,  # actual # of commands to run
            'op_hashes': set(),
        }
        for host in inventory
    }

    # Results dict for each host
    self.results = {
        host: {
            'ops': 0,  # success_ops + failed ops w/ignore_errors
            'success_ops': 0,
            'error_ops': 0,
            'commands': 0,
        }
        for host in inventory
    }

    # Assign state back references to inventory & config
    inventory.state = config.state = self
    for host in inventory:
        host.state = self

    self.initialised = True

    # Flag to track added users (via `server.user` operation calls). This is
    # specifically to address users not existing during fact gathering phase
    # causing failures with su_user/sudo_user. If we expect to add the user
    # those facts should not fail but default.
    self.will_add_users = []
def ff(url): while 1: try: r = requests.get(url, timeout=3) return '\n'.join([i['pId'] for i in r.json()['products']]) except: continue pp = Pool(30) ss = pp.map(ff, urls) try: pp.close() pp.join() except: pass global jishu jishu += 1 sys.stderr.write(str(jishu) + ' / ' + zongshu + '\r') return '\n'.join(ss) + '\n' except: continue with open('allcategory.txt') as f: allcategory = [i.strip() for i in f.readlines()] zongshu = str(len(allcategory)) jishu = 0 with open('allids.txt', 'w') as f: # 这里又开了Pool,但一开始我是用for单线程做的,因为每个类目已经开了多线程,结果用了很久 p1 = Pool(50) ss = p1.map(getgome, allcategory) f.writelines(ss)
class DoubanSpider(DBMixin):
    """Douban group crawler.

    Listing pages of every group in ``GROUP_LIST`` are enqueued on a timer,
    crawled for new topics, and each new topic's detail page is then crawled.
    Results and the URL cache are stored in collections reached through
    ``self.db``.
    """

    def __init__(self, proxy_manager=None):
        """
        @proxy_manager, object or None, source of proxies; no proxy is used
            when None
        """
        self.result_page = self.db.result_page
        self.result_topic = self.db.result_topic
        self.cache = self.db.cache_page

        self.group_list = GROUP_LIST
        self.rules = RULES
        self.interval = WATCH_INTERVAL

        self.pool = Pool(size=POOL_SIZE)
        self.page_queue = Queue()
        self.topic_queue = Queue()
        self.proxy_manager = proxy_manager

    def fetch(self, url, timeout=10, retury_num=10):
        """Issue an HTTP GET with retries and return the decoded body.

        @url, str, URL
        @timeout, int, request timeout
        @retury_num, int, number of attempts

        Raises URLFetchError when no attempt returned status 200.
        """
        kwargs = {
            "headers": {
                "User-Agent": USER_AGENT,
                "Referer": "https://www.douban.com/"
            },
            "timeout": timeout,
        }
        for attempt in range(retury_num):
            proxy = None
            try:
                # Use a proxy only when a manager was configured.
                if self.proxy_manager is not None:
                    proxy = self.proxy_manager.get_proxy()
                    kwargs["proxies"] = {
                        "http": 'http://%s' % proxy,
                        "https": 'https://%s' % proxy
                    }
                resp = requests.get(url, **kwargs)
                if resp.status_code != 200:
                    raise HTTPError(resp.status_code, url)
            except Exception as exc:
                logger.warning("%s %d failed!\n%s", url, attempt, str(exc))
                # Drop the failing proxy. Guarded because the original crashed
                # here with AttributeError when no manager was configured.
                if self.proxy_manager is not None and proxy is not None:
                    self.proxy_manager.remove(proxy)
                time.sleep(2)
                continue
            # Only a 200 response reaches this point. The original returned
            # the body of the last response even when its status was not 200.
            return resp.content.decode('utf-8')
        raise URLFetchError(url)

    def extract(self, regx, body, multi=False):
        """Evaluate an XPath expression.

        @regx, str, XPath expression
        @body, str or element, page source or an already parsed element
        @multi, bool, return all matches instead of only the first

        Returns the list of matches when multi is true, otherwise the first
        match or None.
        """
        if isinstance(body, str):
            body = etree.HTML(body)
        res = body.xpath(regx)
        if multi:
            return res
        return res[0] if res else None

    def run(self):
        """Start all timers and loops, then block on them."""
        all_greenlet = []
        # Periodic crawling: one timer per group.
        for group_url in self.group_list:
            timer = Timer(random.randint(0, 2), self.interval)
            greenlet = gevent.spawn(
                timer.run, self._init_page_tasks, group_url)
            all_greenlet.append(greenlet)
        # Producer & consumer loops.
        all_greenlet.append(gevent.spawn(self._page_loop))
        all_greenlet.append(gevent.spawn(self._topic_loop))
        # Periodic proxy reload. The callable and its argument are passed to
        # spawn separately; the original called ``proxy_timer.run(...)`` inline
        # instead of running it in a greenlet.
        proxy_timer = Timer(PROXY_INTERVAL, PROXY_INTERVAL)
        all_greenlet.append(
            gevent.spawn(proxy_timer.run, self.reload_proxies))
        gevent.joinall(all_greenlet)

    def reload_proxies(self):
        """Reload the proxy list; does nothing when no manager is configured."""
        if self.proxy_manager is not None:
            self.proxy_manager.reload_proxies()

    def _init_page_tasks(self, group_url):
        """Enqueue the listing-page URLs of one group.

        @group_url, str, group URL
        """
        for page in range(MAX_PAGE):
            base_url = "%s%s" % (group_url, GROUP_SUFFIX)
            url = base_url % (page * 25)
            self.page_queue.put(url)

    def _page_loop(self):
        """Consume listing-page URLs forever, one per second."""
        while 1:
            page_url = self.page_queue.get(block=True)
            gevent.sleep(1)
            self.pool.spawn(self._crawl_page, page_url)

    def _topic_loop(self):
        """Consume topic URLs forever."""
        while 1:
            topic_url = self.topic_queue.get(block=True)
            self.pool.spawn(self._crawl_detail, topic_url)

    def _crawl_page(self, url):
        """Crawl one listing page.

        @url, str, listing page URL
        """
        logger.info("processing page: %s", url)
        html = self.fetch(url)
        topic_urls = self.extract(self.rules["url_list"], html, multi=True)
        # Find topic URLs that are not cached yet.
        diff_urls = self._diff_urls(topic_urls)
        if not diff_urls:
            logger.info("%s no update ...", url)
            return
        logger.info("%s new add : %d", url, len(diff_urls))
        topic_list = self.extract(self.rules["topic_item"], html, multi=True)
        # Basic information of every topic on the page.
        topics = self._get_page_info(topic_list)
        # Split into new and already known topics.
        new_topics, old_topics = self._filter_topics(topics, diff_urls)
        # Save the new ones. Skipped when every new topic was filtered out, so
        # an empty list is never passed to insert.
        if new_topics:
            self.result_page.insert(new_topics)
        # Refresh reply time and reply count of known topics.
        self._update_old_topics(old_topics)
        # Queue detail crawls for the new URLs.
        self._init_topic_tasks(diff_urls)
        # Remember the new URLs.
        self._update_cache(diff_urls)

    def _get_page_info(self, topic_list):
        """Collect the basic information of every topic row on a page.

        @topic_list, list, topic rows of the current page
        """
        topics = []
        # The first row is the table header; skip it.
        for topic_item in topic_list[1:]:
            topic = {}
            topic["title"] = self.extract(self.rules["title"], topic_item)
            topic["author"] = self.extract(self.rules["author"], topic_item)
            topic["reply"] = self.extract(self.rules["reply"], topic_item) or 0
            topic["last_reply_time"] = self.extract(
                self.rules["last_reply_time"], topic_item)
            topic["url"] = self.extract(self.rules["url"], topic_item)
            now = time.time()
            topic["got_time"] = now
            topic["last_update_time"] = now
            if not self._is_intermediary(topic['author'], topic['title'], None):
                topics.append(topic)
        return topics

    @staticmethod
    def _filter_topics(topics, diff_urls):
        """Split topics into new and already known ones.

        @topics, list, all topics of the current page
        @diff_urls, list, URLs of the new topics
        """
        new_topics, old_topics = [], []
        for topic in topics:
            if topic["url"] in diff_urls:
                new_topics.append(topic)
            else:
                old_topics.append(topic)
        return new_topics, old_topics

    def _diff_urls(self, topic_urls):
        """Return the topic URLs that are not in the cache yet.

        @topic_urls, list, all topic URLs of the current page
        """
        cached = set()
        for item in self.cache.find():
            cached.update(item["urls"])
        return list(set(topic_urls) - cached)

    def _update_old_topics(self, old_topics):
        """Refresh title, reply time and reply count of known topics.

        @old_topics, list, known topics
        """
        for topic in old_topics:
            new_info = {
                "title": topic["title"],
                "reply": topic["reply"],
                "last_reply_time": topic["last_reply_time"],
                "last_update_time": time.time()
            }
            self.result_page.update({"url": topic["url"]},
                                    {"$set": new_info})
            logger.info("%s updated ...", topic["url"])

    def _init_topic_tasks(self, topic_urls):
        """Enqueue topic detail tasks.

        @topic_urls, list, topic URLs of the current page
        """
        for url in topic_urls:
            self.topic_queue.put(url)

    def _update_cache(self, diff_urls):
        """Record newly seen topic URLs in the cache.

        @diff_urls, list, URLs of the new topics
        """
        self.cache.insert({"got_time": time.time(), "urls": diff_urls})

    def _crawl_detail(self, url):
        """Crawl the detail page of one topic.

        @url, str, topic URL
        """
        logger.info("processing topic: %s", url)
        html = self.fetch(url)
        topic = self._get_detail_info(html, url)
        if not topic:
            return
        topic["url"] = url
        topic["got_time"] = time.time()
        # Store the topic only if it is not stored already.
        if self.result_topic.find_one({"url": url}):
            return
        self.result_topic.insert(topic)

    def _get_detail_info(self, html, url):
        """Extract the topic details from a detail page.

        @html, str, page source
        @url, str, topic URL (used for logging only)

        Returns None when the page is a robot check, has no title, or is
        classified as an intermediary post.
        """
        if "机器人" in html:
            logger.warning("%s 403.html", url)
            return None
        topic = {}
        title = self.extract(self.rules["detail_title_sm"], html) \
            or self.extract(self.rules["detail_title_lg"], html)
        if title is None:
            return None
        topic["title"] = title.strip()
        topic["create_time"] = self.extract(self.rules["create_time"], html)
        topic["author"] = self.extract(self.rules["detail_author"], html)
        topic["content"] = '\n'.join(
            self.extract(self.rules["content"], html, multi=True))
        if self._is_intermediary(topic['author'], topic['title'],
                                 topic['content']):
            return None
        return topic

    def _is_intermediary(self, author, title, content=None):
        """Decide from keywords, content and user name whether a post comes
        from an intermediary.

        ``author`` and ``title`` may be None, because ``extract`` returns None
        when nothing matches; they are then treated as empty strings.
        """
        author = author or ''
        full_text = title or ''
        if content is not None:
            if len(content) < 20 or len(content) > 500 or content == title:
                return True
            full_text += content
        if author.startswith('豆友') or '直租' in author:
            return True
        # Count ASCII and full-width exclamation marks. The original counted
        # the ASCII one twice.
        exclamation_count = full_text.count('!') + full_text.count('！')
        if exclamation_count >= 3:
            return True
        return any(kw in full_text for kw in INTERMEDIARY_KEYWORDS)
from gtwisted.core.asyncresultfactory import AsyncResultFactory
from gtwisted.core.error import RPCDataTooLongError
from gevent.timeout import Timeout
from gevent.pool import Pool
from gfirefly.server.logobj import logger
import struct
import rpc_pb2
import marshal

ASK_SIGNAL = "ASK"  # signal that requests a result
NOTICE_SIGNAL = "NOTICE"  # notification only; no return value is expected
ANSWER_SIGNAL = "ANSWER"  # signal that carries a returned result
DEFAULT_TIMEOUT = 60  # default timeout for a result to come back
RPC_DATA_MAX_LENGTH = 1024 * 1024  # maximum allowed length of an rpc packet
GEVENT_POOL = Pool(500)


def _write_parameter(proto, arg):
    """Store ``arg`` in the ``proto`` field that matches its Python type.

    ``bool`` is tested before the integer types because ``bool`` is a subclass
    of ``int``. Values of any other type leave ``proto`` untouched.
    """
    if isinstance(arg, str):
        proto.proto_param = arg
        return
    if isinstance(arg, bool):
        proto.bool_param = arg
        return
    if isinstance(arg, unicode):
        proto.string_param = arg
        return
    if isinstance(arg, (int, long)):
        proto.int_param = arg
        return
    if isinstance(arg, float):
        proto.float_param = arg
        return
    if arg is None:
        proto.is_null = True
dest = utils.parse_mongo_url(args.dest) dest_client = utils.mongo_connect(dest['host'], dest['port'], max_pool_size=POOL_SIZE, document_class=FasterOrderedDict) dest_collection = dest_client[dest['db']][dest['collection']] if source == dest: raise ValueError("source and destination cannot be the same!") # periodically print stats stats = Stats() stats_greenlet = gevent.spawn(stats_worker, stats) # copy documents! pool = Pool(POOL_SIZE) with open(args.mismatches_file) as mismatches_file: lines = mismatches_file.readlines() # copy everything into memory -- hopefully that isn't huge stats.total = len(lines) for line in lines: query_doc = {'_id': MismatchLogger.decode_mismatch_id(line)} pool.spawn(copy_document_worker, query_doc=query_doc, source_collection=source_collection, dest_collection=dest_collection, stats=stats) # wait for everythng to finish gevent.sleep() pool.join() stats_greenlet.kill()
def handle_download(self, directory, container, threads, verbose):
    """Download every object in ``self.objects`` into *directory*.

    The object names are split into chunks and each chunk is downloaded by
    one greenlet of a pool of size *threads*.

    :param directory: target directory; object names are joined onto it.
    :param container: container name used to build each object URL.
    :param threads: pool size, also used when splitting work into chunks.
    :param verbose: truthy prints per-worker messages, ``> 1`` also prints
        every file name.
    :returns: list of error dicts (one per failed file); empty on success.
    """
    import errno  # local import: only needed for the EEXIST check below

    @self.requires_auth
    def _download(i, files, directory, errors):
        if verbose:
            print_('Starting thread %s' % i)
        s = requests.Session()
        directory = os.path.abspath(directory)
        for filename in files:
            if verbose > 1:
                print_('Downloading %s' % filename)
            try:
                path = os.path.join(directory, filename)
                try:
                    os.makedirs(os.path.dirname(path), 0o755)
                except OSError as e:
                    # an already-existing directory is fine
                    if e.errno != errno.EEXIST:
                        raise
                with open(path, 'wb+') as f:
                    r = s.get('%s/%s/%s' %
                              (self.endpoint, container, filename),
                              headers={'X-Auth-Token': self.token},
                              stream=True)
                    if r.status_code == 401:
                        raise AuthenticationError
                    for block in r.iter_content(4096):
                        if not block:
                            break
                        f.write(block)
            except Exception:
                # Record the failure and move on to the next file.  Only
                # Exception is caught so that greenlet kills and Ctrl-C
                # (BaseException subclasses) still stop the worker.
                e = sys.exc_info()[1]
                errors.append({
                    'name': filename,
                    'container': container,
                    'exception': str(e)
                })
            else:
                if r.status_code != 200:
                    errors.append({
                        'name': filename,
                        'container': container,
                        'status_code': r.status_code,
                        'headers': r.headers,
                        'response': json.loads(r.text)
                    })
        if verbose:
            print_('Completed thread %s' % i)

    files = collections.defaultdict(list)
    thread_mark = threads
    # Floor division keeps the chunk size an integer on Python 3 as well;
    # with true division the equality test below would compare a list
    # length against a float and behave differently from Python 2.
    files_per_thread = len(self.objects) // threads // 3
    i = 0
    for o in self.objects:
        files[i].append(o['name'])
        i += 1
        if len(files[thread_mark - 1]) == files_per_thread:
            # current group of chunks is full: open the next group with
            # half the chunk size
            thread_mark += threads
            files_per_thread = files_per_thread // 2
            i = 0
        if i == thread_mark:
            i = 0

    pool = Pool(size=threads)
    errors = []
    for i, file_chunk in iteritems(files):
        pool.spawn(_download, i, file_chunk, directory, errors)
    pool.join()
    return errors
def run(self):
    """Start one server per listening socket and run until told to stop.

    The main loop notifies the arbiter once a second and exits when
    ``self.alive`` turns false, when the parent process changes, or on
    ``KeyboardInterrupt``.  Shutdown then stops accepting, waits up to
    ``cfg.graceful_timeout`` seconds for the pools to drain and finally
    stops every server.  Errors during shutdown are deliberately ignored.
    """
    if self.cfg.is_ssl:
        ssl_args = dict(server_side=True,
                        do_handshake_on_connect=False,
                        **self.cfg.ssl_options)
    else:
        ssl_args = {}

    servers = []
    for listener in self.sockets:
        listener.setblocking(1)
        pool = Pool(self.worker_connections)
        if self.server_class is None:
            # plain stream server driven by this worker's own handler
            server = StreamServer(listener,
                                  handle=partial(self.handle, listener),
                                  spawn=pool,
                                  **ssl_args)
        else:
            server = self.server_class(listener,
                                       application=self.wsgi,
                                       spawn=pool,
                                       log=self.log,
                                       handler_class=self.wsgi_handler,
                                       **ssl_args)
        server.start()
        servers.append(server)

    own_pid = os.getpid()
    try:
        while self.alive:
            self.notify()
            parent_changed = self.ppid != os.getppid()
            if own_pid == os.getpid() and parent_changed:
                self.log.info("Parent changed, shutting down: %s", self)
                break
            gevent.sleep(1.0)
    except KeyboardInterrupt:
        pass

    try:
        # Stop accepting requests
        for server in servers:
            server.stop_accepting()

        # Handle current requests until graceful_timeout
        started = time.time()
        while time.time() - started <= self.cfg.graceful_timeout:
            busy = sum(1 for server in servers
                       if server.pool.free_count() != server.pool.size)

            # every pool is idle again: nothing left to wait for
            if not busy:
                return

            self.notify()
            gevent.sleep(1.0)

        # Force kill all active handlers
        self.log.warning("Worker graceful timeout (pid:%s)" % self.pid)
        for server in servers:
            server.stop(timeout=1)
    except:
        pass
def _run(self, *args, **kwargs):
    """Run every task in ``self._tasks`` on a pool and wait for all of them.

    The pool size comes from the ``"concurrency"`` config entry (``None``
    when absent).  ``join(raise_error=True)`` is used for the final wait.
    """
    concurrency = self._config.get("concurrency", None)
    workers = Pool(size=concurrency)
    for job in self._tasks:
        greenlet = Greenlet(job)
        workers.start(greenlet)
    workers.join(raise_error=True)
def query_activities(self,
                     activity_ids=None,
                     limit=None,
                     after=None,
                     before=None,
                     only_ids=False,
                     summaries=True,
                     streams=False,
                     owner_id=False,
                     build_index=True,
                     pool=None,
                     out_queue=None,
                     cache_timeout=CACHE_ACTIVITIES_TIMEOUT,
                     **kwargs):
    """Queue this user's activities (optionally with streams) on a queue.

    :param activity_ids: explicit ids to query; when empty the ids are taken
        from the activity index, filtered by *limit* or *before*/*after*.
    :param only_ids: put just the id list (then ``StopIteration``) on the
        queue instead of activity dicts.
    :param summaries: include summary fields from the index.
    :param streams: include stream data, importing it on *pool* when it is
        not already available from ``Activities.get``.
    :param owner_id: add ``owner`` and ``profile`` to each activity.
    :param build_index: build an index when none exists.
    :param pool: greenlet pool for imports; a new one is created if None.
    :param out_queue: queue to fill.  When omitted a new queue is created,
        the call waits for the pool and terminates the queue with
        ``StopIteration``.
    :returns: the output queue; or a one-element list holding an error dict
        (index being built, invalid dates); or ``["build"]`` when only ids
        were requested and no index exists yet.
    """
    if self.indexing():
        return [{
            "error":
            "Building activity index for {}".format(self.id) +
            "...<br>Please try again in a few seconds.<br>"
        }]

    # convert date strings to datetimes, if applicable
    if before or after:
        try:
            after = self.__class__.to_datetime(after)
            if before:
                before = self.__class__.to_datetime(before)
                assert (before > after)
        except AssertionError:
            return [{"error": "Invalid Dates"}]

    # app.logger.info("query_activities called with: {}".format({
    #     "activity_ids": activity_ids,
    #     "limit": limit,
    #     "after": after,
    #     "before": before,
    #     "only_ids": only_ids,
    #     "summaries": summaries,
    #     "streams": streams,
    #     "owner_id": owner_id,
    #     "build_index": build_index,
    #     "pool": pool,
    #     "out_queue": out_queue
    # }))

    # NOTE(review): this helper is not referenced anywhere else in this
    # function; the loop below spawns Activities.import_and_queue_streams.
    def import_streams(client, queue, activity):
        # app.logger.debug("importing {}".format(activity["id"]))
        stream_data = Activities.import_streams(client, activity["id"],
                                                STREAMS_TO_CACHE,
                                                cache_timeout)
        data = {
            s: stream_data[s]
            for s in STREAMS_OUT + ["error"] if s in stream_data
        }
        data.update(activity)
        queue.put(data)
        # app.logger.debug("importing {}...queued!".format(activity["id"]))
        gevent.sleep(0)

    # An empty gevent pool is falsy (its length is 0), so `pool or Pool(...)`
    # would silently replace a pool supplied by the caller.
    if pool is None:
        pool = Pool(CONCURRENCY)
    client = self.client()

    # If out_queue is not supplied then query_activities is blocking
    put_stopIteration = False
    if not out_queue:
        out_queue = Queue()
        put_stopIteration = True

    index_df = None
    if (summaries or limit or only_ids or after or before):
        activity_index = self.get_index()

        if activity_index:
            index_df = activity_index["index_df"]
            elapsed = (datetime.utcnow() -
                       activity_index["dt_last_indexed"]).total_seconds()

            # update the index if we need to
            if (not OFFLINE) and (elapsed > INDEX_UPDATE_TIMEOUT):
                index_df = self.update_index(index_df)

            if (not activity_ids):
                # only consider activities with a summary polyline
                ids_df = (index_df[index_df.summary_polyline.notnull(
                )].set_index("ts_local").sort_index(ascending=False).id)

                if limit:
                    ids_df = ids_df.head(int(limit))

                elif before or after:
                    # get ids of activities in date-range
                    if after:
                        ids_df = ids_df[:after]
                    if before:
                        ids_df = ids_df[before:]

                activity_ids = ids_df.tolist()

            index_df = index_df.astype(
                Users.index_df_out_dtypes).set_index("id")

            if only_ids:
                out_queue.put(activity_ids)
                out_queue.put(StopIteration)
                return out_queue

            def summary_gen():
                for aid in activity_ids:
                    A = {"id": int(aid)}
                    if summaries:
                        A.update(index_df.loc[int(aid)].to_dict())
                    # app.logger.debug(A)
                    yield A

            gen = summary_gen()

        elif build_index:
            # There is no activity index and we are to build one
            if only_ids:
                return ["build"]
            else:
                gen = Queue()
                gevent.spawn(self.build_index, gen, limit, after, before,
                             activity_ids)
        else:
            # Finally, if there is no index and rather than building one
            # we are requested to get the summary data directly from Strava
            # app.logger.info(
            #     "{}: getting summaries from Strava without build"
            #     .format(self))
            gen = (Activities.strava2dict(a)
                   for a in self.client().get_activities(
                       limit=limit, before=before, after=after))

    for A in gen:
        if "stop_rendering" in A:
            pool.join()

        # items without an id are passed straight through to the consumer
        if "id" not in A:
            out_queue.put(A)
            continue

        if summaries:
            if ("bounds" not in A):
                A["bounds"] = Activities.bounds(A["summary_polyline"])

            A["ts_local"] = str(A["ts_local"])

            # TODO: do this on the client
            A.update(Activities.atype_properties(A["type"]))

        if owner_id:
            A.update({"owner": self.id, "profile": self.profile})

        if not streams:
            out_queue.put(A)

        else:
            stream_data = Activities.get(A["id"])

            if stream_data:
                A.update(stream_data)
                if ("bounds" not in A):
                    A["bounds"] = Activities.bounds(A["polyline"])
                out_queue.put(A)

            elif not OFFLINE:
                # streams not available yet: import them in the background
                pool.spawn(Activities.import_and_queue_streams, client,
                           out_queue, A)
        gevent.sleep(0)

    # If we are using our own queue, we make sure to put a stopIteration
    # at the end of it so we have to wait for all import jobs to finish.
    # If the caller supplies a queue, can return immediately and let them
    # handle responsibility of adding the stopIteration.
    if put_stopIteration:
        pool.join()
        out_queue.put(StopIteration)

    return out_queue
class XiMaLaYaAllDataSpider():
    """Ximalaya spider: pages through one album and downloads its tracks."""

    def __init__(self):
        """Set up the URL template, request headers, pool and URL queue."""
        self.basic_url = 'https://m.ximalaya.com/m-revision/common/album/queryAlbumTrackRecordsByPage?albumId=203355&page={}&pageSize=7'
        self.headers = {
            "User-Agent":
            "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Mobile Safari/537.36"
        }
        self.pool = Pool(3)
        self.url_queue = Queue()
        # percentages that have already been printed by ``fun``
        self.list_data = []

    def fun(self, blocknum, bs, size):
        """Report hook for ``urlretrieve``: print progress in 10% steps.

        :param blocknum: number of blocks transferred so far.
        :param bs: block size in bytes.
        :param size: total size; the hook is a no-op when it is unknown
            (zero or negative), which previously caused a division by zero
            or a negative percentage.
        """
        if size <= 0:
            return
        # Integer arithmetic avoids float truncation (e.g. 28.99.. -> 28);
        # the last block may overshoot the total, so cap at 100.
        int_data = int(min(100, blocknum * bs * 100 // size))
        if int_data % 10 == 0 and int_data not in self.list_data:
            print("id:%s-----download: %d%%" % (blocknum, int_data))
            self.list_data.append(int_data)

    def get_url(self):
        """Build the page URLs and put them on the queue."""
        for page in range(10):
            url = self.basic_url.format(page)
            # store the generated URL in the queue
            self.url_queue.put(url)

    def exec_task(self):
        """Take one page URL from the queue and download all of its tracks."""
        # take a URL from the queue
        url = self.url_queue.get()
        try:
            resp = requests.get(url=url, headers=self.headers)
            # extract the list of track records
            result = json.loads(resp.content)['data']["trackDetailInfos"]
            for sigle_data in result:
                # fields of a single track
                item = {}
                item["url"] = sigle_data["trackInfo"]["playPath"]
                item['name'] = sigle_data["trackInfo"]["title"][6:]
                print(item)
                # download the audio file
                request.urlretrieve(url=item['url'],
                                    filename='./down_file/' + item["name"] +
                                    '.mp3',
                                    reporthook=self.fun)
                print('*' * 50)
                time.sleep(2)
        finally:
            # Always mark the URL as processed; otherwise a single failed
            # page would leave ``run`` blocked in ``url_queue.join()``.
            self.url_queue.task_done()

    def exec_task_finished(self, result):
        """Callback run after a task completes: schedule the next task."""
        self.pool.apply_async(self.exec_task,
                              callback=self.exec_task_finished)

    def run(self):
        """Queue all URLs, start three task chains and wait for the queue."""
        # put every URL on the queue
        self.get_url()
        # hand the tasks to the pool; ``callback`` is invoked when a task
        # has finished
        for i in range(3):
            self.pool.apply_async(self.exec_task,
                                  callback=self.exec_task_finished)
        self.url_queue.join()
from gevent.pool import Pool from opentracing_utils import trace, extract_span_from_kwargs from app.config import MAX_QUERY_TIME_SLICE, UPDATER_CONCURRENCY from app.extensions import db from app.libs.zmon import query_sli from .models import IndicatorValue, Indicator from .models import insert_indicator_value MIN_VAL = math.expm1(1e-10) logger = logging.getLogger(__name__) updater_pool = Pool(UPDATER_CONCURRENCY) def update_all_indicators(app: Flask): """ Update all indicators async! """ if os.environ.get('SLR_LOCAL_ENV'): warnings.warn( 'Running on local env while not setting up gevent properly!') for indicator in Indicator.query.all(): try: if indicator.is_deleted is True: continue updater_pool.spawn(update_indicator, app, indicator)
def related_activities(self,
                       activity_id,
                       streams=False,
                       pool=None,
                       out_queue=None):
    """Queue the given activity together with its related activities.

    :param activity_id: id of the activity to look up.
    :param streams: when true only activities whose owner is a known user
        are queued, with stream data (imported on *pool* if necessary);
        otherwise a summary dict is queued for every activity.
    :param pool: greenlet pool for stream imports; created if None.
    :param out_queue: queue to fill.  When omitted a new queue is created,
        the call waits for pending imports and terminates the queue with
        ``StopIteration``.
    :returns: the output queue, or a one-element list holding an error dict
        when the related activities cannot be fetched.
    """
    client = self.client()

    put_stopIteration = True if not out_queue else False

    out_queue = out_queue or Queue()
    # An empty gevent pool is falsy (its length is 0), so `pool or Pool(...)`
    # would silently replace a pool supplied by the caller.
    if pool is None:
        pool = Pool(CONCURRENCY)

    trivial_list = []

    # First we put this activity
    try:
        A = client.get_activity(int(activity_id))
    except Exception as e:
        app.logger.info("Error getting this activity: {}".format(e))
    else:
        trivial_list.append(A)

    try:
        related_activities = list(
            client.get_related_activities(int(activity_id)))
    except Exception as e:
        app.logger.info("Error getting related activities: {}".format(e))
        return [{"error": str(e)}]

    for obj in itertools.chain(related_activities, trivial_list):
        if streams:
            owner = self.__class__.get(obj.athlete.id)

            if owner:
                # the owner is a Heatflask user
                A = Activities.strava2dict(obj)
                A["ts_local"] = str(A["ts_local"])
                A["owner"] = owner.id
                A["profile"] = owner.profile
                A["bounds"] = Activities.bounds(A["summary_polyline"])
                A.update(Activities.atype_properties(A["type"]))

                stream_data = Activities.get(obj.id)
                if stream_data:
                    A.update(stream_data)
                    out_queue.put(A)
                else:
                    pool.spawn(Activities.import_and_queue_streams,
                               owner.client(), out_queue, A)
        else:
            # we don't care about activity streams
            A = Activities.strava2dict(obj)
            A["ts_local"] = str(A["ts_local"])
            A["profile"] = "/avatar/athlete/medium.png"
            A["owner"] = obj.athlete.id
            A["bounds"] = Activities.bounds(A["summary_polyline"])
            A.update(Activities.atype_properties(A["type"]))
            out_queue.put(A)

    if put_stopIteration:
        # Wait for spawned imports first so that nothing is queued after
        # the StopIteration terminator.
        pool.join()
        out_queue.put(StopIteration)

    return out_queue
def run_many(tests, expected=None, failfast=False):
    """Run *tests* concurrently, retry some failures sequentially, report.

    :param tests: iterable of ``(cmd, options)`` pairs; *options* may be
        None or a dict of keyword arguments for ``util.run``.
    :param expected: forwarded to ``util.report``.
    :param failfast: exit the process on the first failing test.

    Failures whose output contains ``AssertionError`` are flagged as
    retryable.  If more than one worker was used and at least one failure
    is retryable, the failed tests are run again one at a time and removed
    from the failure set when they pass.
    """
    global NWORKERS, pool
    start = time()
    total = 0
    failed = {}

    NWORKERS = min(len(tests), NWORKERS)
    pool = Pool(NWORKERS)
    util.BUFFER_OUTPUT = NWORKERS > 1

    def run_one(cmd, **kwargs):
        result = util.run(cmd, **kwargs)
        if result:
            if failfast:
                sys.exit(1)
            # the tests containing AssertionError might have failed because
            # we spawned more workers than CPUs
            # we therefore will retry them sequentially
            failed[result.name] = [
                cmd, kwargs, 'AssertionError' in (result.output or '')
            ]

    try:
        try:
            for cmd, options in tests:
                total += 1
                spawn(run_one, cmd, **(options or {}))
            gevent.wait()
        except KeyboardInterrupt:
            try:
                if pool:
                    util.log('Waiting for currently running to finish...')
                    pool.join()
            except KeyboardInterrupt:
                util.report(total,
                            failed,
                            exit=False,
                            took=time() - start,
                            expected=expected)
                util.log('(partial results)\n')
                raise
    except:
        traceback.print_exc()
        pool.kill()  # this needed to kill the processes
        raise

    toretry = [
        key for (key, (cmd, kwargs, can_retry)) in failed.items()
        if can_retry
    ]
    failed_then_succeeded = []

    if NWORKERS > 1 and toretry:
        util.log('\nWill retry %s failed tests sequentially:\n- %s\n',
                 len(toretry), '\n- '.join(toretry))
        # Iterate over a snapshot: entries are popped from ``failed`` inside
        # the loop, which raises RuntimeError on Python 3 when iterating the
        # live dict view.
        for name, (cmd, kwargs, _ignore) in list(failed.items()):
            if not util.run(cmd, buffer_output=False, **kwargs):
                failed.pop(name)
                failed_then_succeeded.append(name)

    if failed_then_succeeded:
        util.log(
            '\n%s tests failed during concurrent run but succeeded when ran sequentially:',
            len(failed_then_succeeded))
        util.log('- ' + '\n- '.join(failed_then_succeeded))

    util.log('gevent version %s from %s', gevent.__version__,
             gevent.__file__)
    util.report(total, failed, took=time() - start, expected=expected)
    # the pool must be empty once everything has been reported
    assert not pool, pool
def init(self, inventory, config, initial_limit=None):
    """Validate *config* and initialise this state object for *inventory*.

    :param inventory: iterable of hosts; also gets a ``state`` back
        reference.
    :param config: configuration object, or None to use the defaults.
    :param initial_limit: initial value of ``limit_hosts``.
    :raises PyinfraError: if ``config.MIN_PYINFRA_VERSION`` is newer than
        the running version.
    """
    # ---- Config validation ----

    # If no config, create one using the defaults
    if config is None:
        config = Config()

    # Error if our min version is not met
    if config.MIN_PYINFRA_VERSION is not None:
        running_version = parse_version(__version__)
        # Version must be a string
        required_text = six.text_type(config.MIN_PYINFRA_VERSION)
        needed_version = parse_version(required_text)

        if needed_version > running_version:
            message = (
                'Minimum pyinfra version not met (minimum={0}, running={1})'
            ).format(config.MIN_PYINFRA_VERSION, __version__)
            raise PyinfraError(message)

    if not config.PARALLEL:
        # TODO: benchmark this
        # In my own tests the optimum number of parallel SSH processes is
        # ~20 per CPU core - no science here yet, needs benchmarking!
        ceilings = [cpu_count() * 20, len(inventory)]
        if MAX_PARALLEL is not None:
            ceilings.append(MAX_PARALLEL)
        config.PARALLEL = min(ceilings)

    # If explicitly set, just issue a warning
    elif MAX_PARALLEL is not None and config.PARALLEL > MAX_PARALLEL:
        warning = (
            'Parallel set to {0}, but this may hit the open files limit of {1}.\n'
            ' Max recommended value: {2}')
        logger.warning(
            warning.format(config.PARALLEL, nofile_limit, MAX_PARALLEL))

    # ---- Actually initialise the state object ----

    # Setup greenlet pools
    self.pool = Pool(config.PARALLEL)
    self.fact_pool = Pool(config.PARALLEL)

    # Connection storage
    self.ssh_connections = {}
    self.sftp_connections = {}

    # Private keys
    self.private_keys = {}

    # Facts storage
    self.facts = {}
    self.fact_locks = {}

    # Assign inventory/config
    self.inventory = inventory
    self.config = config

    # Host tracking sets:
    #   activated - hosts we've activated at any time
    #   active    - hosts that *haven't* failed yet
    #   ready     - hosts that are ready to be deployed to
    #   failed    - hosts that have failed
    self.activated_hosts = set()
    self.active_hosts = set()
    self.ready_hosts = set()
    self.failed_hosts = set()

    # Limit hosts changes dynamically to limit operations to a subset of hosts
    self.limit_hosts = initial_limit

    # Op basics
    self.op_line_numbers_to_hash = {}
    self.op_meta = {}  # maps operation hash -> names/etc
    self.ops_run = set()  # list of ops which have been started/run

    # Op dict for each host
    self.ops = dict((host, {}) for host in inventory)

    # Facts dict for each host
    self.facts = dict((host, {}) for host in inventory)

    # Meta dict for each host
    self.meta = {}
    for host in inventory:
        self.meta[host] = {
            'ops': 0,  # one function call in a deploy file
            'commands': 0,  # actual # of commands to run
            'op_hashes': set(),
        }

    # Results dict for each host
    self.results = {}
    for host in inventory:
        self.results[host] = {
            'ops': 0,  # success_ops + failed ops w/ignore_errors
            'success_ops': 0,
            'error_ops': 0,
            'commands': 0,
        }

    # Assign state back references to inventory & config
    inventory.state = config.state = self

    self.initialised = True
def run(self):
    """Start one server per listening socket and run until ``self.alive``
    turns false.

    Shutdown closes the listeners, waits up to ``cfg.graceful_timeout``
    seconds for the pools to drain and finally stops every server.  Errors
    during shutdown are deliberately ignored.
    """
    if self.cfg.is_ssl:
        ssl_args = dict(server_side=True, **self.cfg.ssl_options)
    else:
        ssl_args = {}

    servers = []
    for listener in self.sockets:
        listener.setblocking(1)
        pool = Pool(self.worker_connections)
        if self.server_class is None:
            # plain stream server driven by this worker's own handler
            server = StreamServer(listener,
                                  handle=partial(self.handle, listener),
                                  spawn=pool,
                                  **ssl_args)
        else:
            environ = base_environ(self.cfg)
            environ.update({
                "wsgi.multithread": True,
                "SERVER_SOFTWARE": VERSION,
            })
            server = self.server_class(listener,
                                       application=self.wsgi,
                                       spawn=pool,
                                       log=self.log,
                                       handler_class=self.wsgi_handler,
                                       environ=environ,
                                       **ssl_args)
        server.start()
        servers.append(server)

    while self.alive:
        self.notify()
        gevent.sleep(1.0)

    try:
        # Stop accepting requests: 'close' exists on gevent 1.0,
        # 'kill' on gevent < 1.0; call whichever is present.
        for server in servers:
            for method_name in ('close', 'kill'):
                if hasattr(server, method_name):
                    getattr(server, method_name)()

        # Handle current requests until graceful_timeout
        started = time.time()
        while time.time() - started <= self.cfg.graceful_timeout:
            busy = sum(1 for server in servers
                       if server.pool.free_count() != server.pool.size)

            # every pool is idle again: nothing left to wait for
            if not busy:
                return

            self.notify()
            gevent.sleep(1.0)

        # Force kill all active handlers
        self.log.warning("Worker graceful timeout (pid:%s)" % self.pid)
        for server in servers:
            server.stop(timeout=1)
    except:
        pass
def __init__(self):
    """Create the collaborators used by this object."""
    # handle to the MongoDB-backed storage
    self.mongo_pool = MongoPool()
    # work queue
    self.queue = Queue()
    # pool used to run tasks concurrently (created with default arguments)
    self.coroutine_pool = Pool()
def main_loop(config):
    """
    Main application loop.

    :param config: configuration
    :type config: Config

    Algorithm:

     * Open a connection to tarantool.queue using the config.QUEUE_* settings.
     * Create the worker pool.
     * Create the queue into which workers put the tasks they have finished.
     * While there are free workers (pool size is config.WORKER_POOL_SIZE),
       take a task from tarantool.queue and start a greenlet to process it.
     * Send notifications to tarantool.queue that tasks are completed.
     * Sleep config.SLEEP seconds.
    """
    logger.info(
        'Connect to queue server on {host}:{port} space #{space}.'.format(
            host=config.QUEUE_HOST,
            port=config.QUEUE_PORT,
            space=config.QUEUE_SPACE))
    task_source = tarantool_queue.Queue(host=config.QUEUE_HOST,
                                        port=config.QUEUE_PORT,
                                        space=config.QUEUE_SPACE)

    logger.info('Use tube [{tube}], take timeout={take_timeout}.'.format(
        tube=config.QUEUE_TUBE, take_timeout=config.QUEUE_TAKE_TIMEOUT))
    tube = task_source.tube(config.QUEUE_TUBE)

    logger.info(
        'Create worker pool[{size}].'.format(size=config.WORKER_POOL_SIZE))
    worker_pool = Pool(config.WORKER_POOL_SIZE)

    finished_tasks = gevent_queue.Queue()

    logger.info(
        'Run main loop. Worker pool size={count}. Sleep time is {sleep}.'.
        format(count=config.WORKER_POOL_SIZE, sleep=config.SLEEP))

    while run_application:
        idle_slots = worker_pool.free_count()
        logger.debug(
            'Pool has {count} free workers.'.format(count=idle_slots))

        for slot in xrange(idle_slots):
            logger.debug('Get task from tube for worker#{number}.'.format(
                number=slot))

            task = tube.take(config.QUEUE_TAKE_TIMEOUT)
            if not task:
                continue

            logger.info('Start worker#{number} for task id={task_id}.'.format(
                task_id=task.task_id, number=slot))
            worker = Greenlet(notification_worker,
                              task,
                              finished_tasks,
                              timeout=config.HTTP_CONNECTION_TIMEOUT,
                              verify=False)
            worker_pool.add(worker)
            worker.start()

        done_with_processed_tasks(finished_tasks)
        sleep(config.SLEEP)
    else:
        # reached when ``run_application`` becomes false
        logger.info('Stop application loop.')
def __init__(self, inventory, config=None):
    """Create the deploy state for *inventory*.

    :param inventory: iterable of hosts (each exposing ``name``); also gets
        a ``state`` back reference.
    :param config: configuration object, or None to use the defaults.
    """
    # Connection storage
    self.ssh_connections = {}
    self.sftp_connections = {}

    # Private keys
    self.private_keys = {}

    # Facts storage
    self.facts = {}
    self.fact_locks = {}

    # If no config, create one using the defaults
    if config is None:
        config = Config()

    if not config.PARALLEL:
        # If possible run everything in parallel, otherwise the max if
        # defined above
        host_count = len(inventory)
        if MAX_PARALLEL is not None:
            config.PARALLEL = min(host_count, MAX_PARALLEL)
        else:
            config.PARALLEL = host_count

    # If explicitly set, just issue a warning
    elif MAX_PARALLEL is not None and config.PARALLEL > MAX_PARALLEL:
        warning = (
            'Parallel set to {0}, but this may hit the open files limit of {1}.\n'
            ' Max recommended value: {2}')
        logger.warning(
            warning.format(config.PARALLEL, nofile_limit, MAX_PARALLEL))

    # Setup greenlet pools
    self.pool = Pool(config.PARALLEL)
    self.fact_pool = Pool(config.PARALLEL)

    # Assign inventory/config
    self.inventory = inventory
    self.config = config

    # Assign self to inventory & config
    inventory.state = config.state = self

    # Host tracking
    self.active_hosts = set()
    self.ready_hosts = set()
    self.connected_hosts = set()

    hostnames = [host.name for host in inventory]

    # Op basics
    self.op_order = []  # list of operation hashes
    self.op_meta = {}  # maps operation hash -> names/etc
    self.ops_run = set()  # list of ops which have been started/run

    # Op dict for each host
    self.ops = dict((hostname, {}) for hostname in hostnames)

    # Meta dict for each host
    self.meta = {}
    for hostname in hostnames:
        self.meta[hostname] = {
            'ops': 0,  # one function call in a deploy file
            'commands': 0,  # actual # of commands to run
            'latest_op_hash': None
        }

    # Results dict for each host
    self.results = {}
    for hostname in hostnames:
        self.results[hostname] = {
            'ops': 0,  # success_ops + failed ops w/ignore_errors
            'success_ops': 0,
            'error_ops': 0,
            'commands': 0
        }

    # Pipeline facts context manager attached to self
    self.pipeline_facts = PipelineFacts(self)
# 使用StreamHandler输出到屏幕 ch = logging.StreamHandler() ch.setLevel(logging.INFO) ch.setFormatter(formatter) logger.addHandler(ch) # Squid的配置文件语法 # 将请求转发到父代理 # PEER_CONF = "cache_peer %s parent %s 0 no-query weighted-round-robin weight=1 connect-fail-limit=2 allow-miss max-conn=5\n" PEER_CONF = "cache_peer %s parent %s 0 proxy-only no-query no-digest round-robin connect-fail-limit=10 connect-timeout=15 max-conn=10 name=proxyip-%s\n" # 可用代理 GOOD_PROXIES = [] pool = Pool(50) def check_proxy(proxy): """验证代理是否可用 :param proxy list:[ip, port]""" global GOOD_PROXIES ip, port = proxy _proxies = {"http": "{}:{}".format(ip, port)} try: ip_url = "http://httpbin.org/ip" res = requests.get(ip_url, proxies=_proxies, timeout=10) assert ip in res.content logger.info("[GOOD] - {}:{}".format(ip, port)) GOOD_PROXIES.append(proxy) except Exception as e:
class BaseServer(object):
    """An abstract base class that implements some common functionality for the servers in gevent.

    *listener* can either be an address that the server should bind on or a :class:`gevent.socket.socket`
    instance that is already bound (and put into listening mode in case of TCP socket).

    *spawn*, if provided, is called to create a new greenlet to run the handler. By default, :func:`gevent.spawn` is used.

    Possible values for *spawn*:

    * a :class:`gevent.pool.Pool` instance -- *handle* will be executed
      using :meth:`Pool.spawn` method only if the pool is not full.
      While it is full, all the connection are dropped;
    * :func:`gevent.spawn_raw` -- *handle* will be executed in a raw
      greenlet which have a little less overhead then :class:`gevent.Greenlet` instances spawned by default;
    * ``None`` -- *handle* will be executed right away, in the :class:`Hub` greenlet.
      *handle* cannot use any blocking functions as it means switching to the :class:`Hub`.
    * an integer -- a shortcut for ``gevent.pool.Pool(integer)``
    """
    # the number of seconds to sleep in case there was an error in accept() call
    # for consecutive errors the delay will double until it reaches max_delay
    # when accept() finally succeeds the delay will be reset to min_delay again
    min_delay = 0.01
    max_delay = 1

    # Sets the maximum number of consecutive accepts that a process may perform on
    # a single wake up. High values give higher priority to high connection rates,
    # while lower values give higher priority to already established connections.
    # Default is 100. Note, that in case of multiple working processes on the same
    # listening value, it should be set to a lower value. (pywsgi.WSGIServer sets it
    # to 1 when environ["wsgi.multiprocess"] is true)
    max_accept = 100

    _spawn = Greenlet.spawn

    # the default timeout that we wait for the client connections to close in stop()
    stop_timeout = 1

    fatal_errors = (errno.EBADF, errno.EINVAL, errno.ENOTSOCK)

    def __init__(self, listener, handle=None, spawn='default'):
        """Configure listener, spawn strategy and handler.

        The server starts in the stopped state.  If any setup step fails the
        server is closed and the exception re-raised.
        """
        self._stop_event = Event()
        self._stop_event.set()
        self._watcher = None
        self._timer = None
        self.pool = None
        try:
            self.set_listener(listener)
            self.set_spawn(spawn)
            self.set_handle(handle)
            self.delay = self.min_delay
            self.loop = gevent.get_hub().loop
            if self.max_accept < 1:
                raise ValueError('max_accept must be positive int: %r' %
                                 (self.max_accept, ))
        except:
            self.close()
            raise

    def set_listener(self, listener):
        """Record family/address from a socket-like object or an address.

        :raises TypeError: if *listener* looks like an SSL socket.
        """
        if hasattr(listener, 'accept'):
            if hasattr(listener, 'do_handshake'):
                raise TypeError(
                    'Expected a regular socket, not SSLSocket: %r' %
                    (listener, ))
            self.family = listener.family
            self.address = listener.getsockname()
            self.socket = listener
        else:
            self.family, self.address = parse_address(listener)

    def set_spawn(self, spawn):
        """Select how handlers are spawned (see the class docstring)."""
        if spawn == 'default':
            self.pool = None
            # re-binds the class-level default onto the instance
            self._spawn = self._spawn
        elif hasattr(spawn, 'spawn'):
            self.pool = spawn
            self._spawn = spawn.spawn
        elif isinstance(spawn, (int, long)):
            from gevent.pool import Pool
            self.pool = Pool(spawn)
            self._spawn = self.pool.spawn
        else:
            self.pool = None
            self._spawn = spawn
        if hasattr(self.pool, 'full'):
            self.full = self.pool.full
        if self.pool is not None:
            # resume accepting when the pool's semaphore signals
            self.pool._semaphore.rawlink(self._start_accepting_if_started)

    def set_handle(self, handle):
        """Install *handle*, falling back to a ``handle`` attribute.

        :raises TypeError: if no handler is available.
        """
        if handle is not None:
            self.handle = handle
        if hasattr(self, 'handle'):
            self._handle = self.handle
        else:
            raise TypeError("'handle' must be provided")

    def _start_accepting_if_started(self, _event=None):
        if self.started:
            self.start_accepting()

    def start_accepting(self):
        """Start an I/O watcher on the listening socket, if none is active."""
        if self._watcher is None:
            # just stop watcher without creating a new one?
            self._watcher = self.loop.io(self.socket.fileno(), 1)
            self._watcher.start(self._do_read)

    def stop_accepting(self):
        """Stop and drop the I/O watcher and any pending back-off timer."""
        if self._watcher is not None:
            self._watcher.stop()
            self._watcher = None
        if self._timer is not None:
            self._timer.stop()
            self._timer = None

    def do_handle(self, *args):
        """Run the handler directly, or through the spawn callable if set."""
        spawn = self._spawn
        if spawn is None:
            self._handle(*args)
        else:
            spawn(self._handle, *args)

    def _do_read(self):
        """Accept and dispatch up to ``max_accept`` connections.

        On a non-fatal error accepting is paused for ``self.delay`` seconds
        and the delay is doubled (capped at ``max_delay``); a fatal error
        closes the server.
        """
        for _ in xrange(self.max_accept):
            if self.full():
                self.stop_accepting()
                return
            try:
                args = self.do_read()
                self.delay = self.min_delay
                if not args:
                    return
            except:
                self.loop.handle_error(self, *sys.exc_info())
                ex = sys.exc_info()[1]
                if self.is_fatal_error(ex):
                    self.close()
                    sys.stderr.write('ERROR: %s failed with %s\n' %
                                     (self, str(ex) or repr(ex)))
                    return
                if self.delay >= 0:
                    self.stop_accepting()
                    self._timer = self.loop.timer(self.delay)
                    self._timer.start(self._start_accepting_if_started)
                    self.delay = min(self.max_delay, self.delay * 2)
                break
            else:
                try:
                    self.do_handle(*args)
                except:
                    self.loop.handle_error((args[1:], self), *sys.exc_info())
                    if self.delay >= 0:
                        self.stop_accepting()
                        self._timer = self.loop.timer(self.delay)
                        self._timer.start(self._start_accepting_if_started)
                        self.delay = min(self.max_delay, self.delay * 2)
                    break

    def full(self):
        """Default capacity check; replaced by ``pool.full`` when available."""
        return False

    def __repr__(self):
        return '<%s at %s %s>' % (type(self).__name__, hex(
            id(self)), self._formatinfo())

    def __str__(self):
        return '<%s %s>' % (type(self).__name__, self._formatinfo())

    def _formatinfo(self):
        """Describe fileno, address and handler; never raises."""
        if hasattr(self, 'socket'):
            try:
                fileno = self.socket.fileno()
            except Exception:
                ex = sys.exc_info()[1]
                fileno = str(ex)
            result = 'fileno=%s ' % fileno
        else:
            result = ''
        try:
            if isinstance(self.address, tuple) and len(self.address) == 2:
                result += 'address=%s:%s' % self.address
            else:
                result += 'address=%s' % (self.address, )
        except Exception:
            ex = sys.exc_info()[1]
            result += str(ex) or '<error>'
        try:
            handle = getfuncname(self.__dict__['handle'])
        except Exception:
            handle = None
        if handle is not None:
            result += ' handle=' + handle
        return result

    @property
    def server_host(self):
        """IP address that the server is bound to (string)."""
        if isinstance(self.address, tuple):
            return self.address[0]

    @property
    def server_port(self):
        """Port that the server is bound to (an integer)."""
        if isinstance(self.address, tuple):
            return self.address[1]

    def init_socket(self):
        """If the user initialized the server with an address rather than socket,
        then this function will create a socket, bind it and put it into listening mode.

        It is not supposed to be called by the user, it is called by :meth:`start` before starting
        the accept loop."""
        pass

    @property
    def started(self):
        return not self._stop_event.is_set()

    def start(self):
        """Start accepting the connections.

        If an address was provided in the constructor, then also create a socket,
        bind it and put it into the listening mode.
        """
        self.init_socket()
        self._stop_event.clear()
        try:
            self.start_accepting()
        except:
            # This class defines close(), not kill(); calling kill() here
            # would mask the original error with an AttributeError.
            self.close()
            raise

    def close(self):
        """Close the listener socket and stop accepting."""
        self._stop_event.set()
        try:
            self.stop_accepting()
        finally:
            try:
                self.socket.close()
            except Exception:
                pass
            finally:
                # drop per-instance overrides so class defaults apply again
                self.__dict__.pop('socket', None)
                self.__dict__.pop('handle', None)
                self.__dict__.pop('_handle', None)
                self.__dict__.pop('_spawn', None)
                self.__dict__.pop('full', None)
                if self.pool is not None:
                    self.pool._semaphore.unlink(
                        self._start_accepting_if_started)

    def stop(self, timeout=None):
        """Stop accepting the connections and close the listening socket.

        If the server uses a pool to spawn the requests, then :meth:`stop` also waits
        for all the handlers to exit. If there are still handlers executing after *timeout*
        has expired (default 1 second), then the currently running handlers in the pool are killed."""
        self.close()
        if timeout is None:
            timeout = self.stop_timeout
        if self.pool:
            self.pool.join(timeout=timeout)
            self.pool.kill(block=True, timeout=1)

    def serve_forever(self, stop_timeout=None):
        """Start the server if it hasn't been already started and wait until it's stopped."""
        # add test that serve_forever exists on stop()
        if not self.started:
            self.start()
        try:
            self._stop_event.wait()
        finally:
            gevent.spawn(self.stop, timeout=stop_timeout).join()

    def is_fatal_error(self, ex):
        """Return true if *ex* is a socket error listed in ``fatal_errors``."""
        # ``ex.args[0]`` is what ``ex[0]`` meant on Python 2 and also works
        # on Python 3, where exceptions are not indexable.
        return isinstance(ex,
                          _socket.error) and ex.args[0] in self.fatal_errors
def __init__(self):
    """Create the MongoDB pool object and the coroutine pool."""
    # create the MongoDB object
    self.mongo_pool = MongoPool()
    # create the coroutine pool in __init__
    self.coroutine_pool = Pool()
def __init__(self, host='127.0.0.1', port=31337, max_clients=64):
    """Set up the stream server, protocol handler and key/value store.

    :param host: address to listen on.
    :param port: port to listen on.
    :param max_clients: size of the pool used to spawn connection handlers.
    """
    listen_address = (host, port)
    self._pool = Pool(max_clients)
    self._server = StreamServer(listen_address,
                                self.connection_handler,
                                spawn=self._pool)
    self._protocol = ProtocolHandler()
    self._kv = {}
    self._commands = self.get_commands()
"""
Coroutine pool demo.
"""
from gevent import monkey
# Apply the monkey patch so that blocking calls such as sleep and socket
# operations switch to another coroutine automatically.
monkey.patch_all()

from gevent.pool import Pool
import time

# the coroutine pool
p = Pool()


def func(msg):
    """Print *msg* and a counter ten times, sleeping one second in between."""
    step = 0
    while step < 10:
        print(msg)
        time.sleep(1)
        print(step)
        step += 1


# submit ten asynchronous tasks to the pool
for i in range(10):
    task_args = (f"协程{i}", )
    p.apply_async(func, task_args)

# make the main thread wait until every coroutine task has finished
p.join()
import argparse
import csv
import math
from operator import itemgetter

import gevent.monkey

# Patch the standard library before the API client is imported: gevent
# should be applied as early as possible, and a third-party HTTP client may
# import socket/ssl at import time.
gevent.monkey.patch_all()

from closeio_api import Client as CloseIO_API  # noqa: E402
from gevent.pool import Pool  # noqa: E402

# pool used for concurrent API requests
pool = Pool(7)

parser = argparse.ArgumentParser(
    description=
    'Find duplicate contacts on a lead in your Close org via contact_name, email address, or phone number'
)
parser.add_argument('--api-key', '-k', required=True, help='API Key')
parser.add_argument(
    '--field',
    '-f',
    default='all',
    choices=['contact_name', 'email', 'phone', 'all'],
    required=False,
    help="Specify a field to compare uniqueness",
)
args = parser.parse_args()

# Initialize Close API Wrapper
api = CloseIO_API(args.api_key)
# name of the first organization of the API key's user, with '/' removed
org_name = api.get('me')['organizations'][0]['name'].replace('/', '')
def concurrency(urls):
    """Run ``action`` on every URL through a pool of 100 and wait for all.

    Returns whatever ``joinall`` returns for the spawned greenlets.
    """
    workers = Pool(100)
    greenlets = []
    for target in urls:
        greenlets.append(workers.spawn(action, target))
    return joinall(greenlets)
class RunSpider(object):
    """Run every configured proxy spider and store the usable proxies."""

    def __init__(self):
        # create the MongoDB object
        self.mongo_pool = MongoPool()
        # create the coroutine pool in __init__
        self.coroutine_pool = Pool()

    def get_spider_from_setting(self):
        """Yield one spider instance per entry of ``PROXIES_SPIDERS``."""
        # each entry is the fully qualified class name of a spider
        for full_class_name in PROXIES_SPIDERS:
            # split into module name and class name
            module_name, class_name = full_class_name.rsplit('.', maxsplit=1)
            # import the module by name
            module = importlib.import_module(module_name)
            # fetch the class from the module
            cls = getattr(module, class_name)
            # create the spider object
            spider = cls()
            yield spider

    def run(self):
        """Run all spiders asynchronously on the pool and wait for them."""
        # get the spider objects described by the settings
        spiders = self.get_spider_from_setting()
        for spider in spiders:
            # each spider runs in its own task, so an error inside one
            # spider does not affect the others
            self.coroutine_pool.apply_async(self.__execute_one_spider_tack,
                                            args=(spider, ))
        # wait until every task in the pool has finished
        self.coroutine_pool.join()

    def __execute_one_spider_tack(self, spider):
        """Process one spider: check each proxy it yields and store good ones."""
        try:
            # iterate over the proxies produced by the spider
            for proxy in spider.get_proxies():
                # check the proxy
                proxy = check_proxy(proxy)
                # a speed of -1 marks an unusable proxy; store the others
                if proxy.speed != -1:
                    # write to the database
                    self.mongo_pool.insert_one(proxy)
        except Exception as ex:
            logger.exception(ex)

    @classmethod
    def start(cls):
        """Run the spiders once, then again every ``RUN_SPIDERS_INTERVAL`` hours.

        This call never returns: it loops forever running pending jobs.
        """
        rs = RunSpider()
        rs.run()
        # ``hours`` accepts any interval; ``hour`` is only valid when the
        # interval is exactly 1 and is rejected by schedule otherwise.
        schedule.every(RUN_SPIDERS_INTERVAL).hours.do(rs.run)
        while True:
            schedule.run_pending()
            time.sleep(1)
class AsyncFormProcessor(object):
    """Context manager that feeds forms to ``migrate_form`` through a worker pool.

    On entry a pool, a ``PartiallyLockingQueue`` and a ``RetryForms`` helper
    are created and the form ids saved in ``statedb`` as resume state are
    re-submitted. On exit the remaining work is either finished (clean exit)
    or the workers are killed (exit with an exception), and the ids that are
    still pending are written back to ``statedb`` as resume state.
    """

    def __init__(self, statedb, migrate_form):
        """
        :param statedb: state store; this class calls its
            ``pop_resume_state``, ``set_resume_state``, ``add_problem_form``
            and ``save_form_diffs`` methods.
        :param migrate_form: callable invoked as
            ``migrate_form(wrapped_form, case_ids)`` from a pool worker.
        """
        self.statedb = statedb
        self.migrate_form = migrate_form

    def __enter__(self):
        """Set up pool/queues, re-submit saved forms and start the status logger."""
        self.pool = Pool(POOL_SIZE)
        self.queues = PartiallyLockingQueue()
        # Forms whose case ids could not be determined are handed to
        # ``self.retry``, which is given ``_try_to_process_form`` as callback.
        self.retry = RetryForms(self._try_to_process_form)
        # Resume state is keyed by this class's name; [] is the default.
        with self.statedb.pop_resume_state(type(self).__name__, []) as form_ids:
            self._rebuild_queues(form_ids)
        # ``run_status_logger`` returns a callable that is invoked in
        # ``__exit__`` to stop the logger.
        self.stop_status_logger = run_status_logger(
            log_status,
            self.get_status,
            status_interval=1800,  # 30 minutes
        )
        try:
            self._try_to_empty_queues()
        except Exception as err:
            # ``__exit__`` is not called automatically when ``__enter__``
            # raises, so clean up (and save state) explicitly.
            self.__exit__(type(err), err, None)
            raise
        return self

    def __exit__(self, exc_type, exc, exc_tb):
        """Finish or abort outstanding work and persist the pending form ids."""
        # Snapshot of pending ids; replaced below on a clean exit.
        queue_ids = self.queues.queue_ids + self.retry.form_ids
        try:
            if exc_type is None:
                queue_ids = self._finish_processing_queues()
            else:
                # stop workers -> reduce chaos in logs
                self.pool.kill()
                self.retry.kill()
        finally:
            key = type(self).__name__
            self.statedb.set_resume_state(key, queue_ids)
            log.info("saved %s state (%s ids)", key, len(queue_ids))
            self.stop_status_logger()
            self.queues = self.pool = None

    def _rebuild_queues(self, form_ids):
        """Load the given form ids in chunks of 100 and submit each form."""
        for chunk in chunked(form_ids, 100, list):
            for form in FormAccessorCouch.get_forms(chunk):
                self._try_to_process_form(form)

    def process_xform(self, doc):
        """Process XFormInstance document asynchronously

        :param doc: form document (mapping) with at least an ``"_id"`` key.
        """
        form_id = doc["_id"]
        log.debug('Processing doc: XFormInstance(%s)', form_id)
        if doc.get('problem'):
            if str(doc['problem']).startswith(PROBLEM_TEMPLATE_START):
                doc = _fix_replacement_form_problem_in_couch(doc)
            else:
                # Any other problem: record the form id and skip the form.
                self.statedb.add_problem_form(form_id)
                return
        try:
            wrapped_form = XFormInstance.wrap(doc)
        except Exception:
            log.exception("Error migrating form %s", form_id)
            self.statedb.save_form_diffs(doc, {})
        else:
            self._try_to_process_form(wrapped_form)
            self._try_to_empty_queues()

    def _try_to_process_form(self, wrapped_form, retries=0):
        """Spawn a worker for the form, or leave it with the queue/retry helper.

        :param wrapped_form: form object passed on to ``migrate_form``.
        :param retries: number of previous attempts; incremented when the
            form is handed to ``self.retry``.
        """
        try:
            case_ids = get_case_ids(wrapped_form)
        except Exception as err:
            self.retry.later(wrapped_form, retries + 1, err)
            return
        # A worker is spawned only when ``try_obj`` returns a truthy value.
        # NOTE(review): presumably the queue keeps the form for a later
        # ``pop()`` otherwise -- confirm against PartiallyLockingQueue.
        if self.queues.try_obj(case_ids, wrapped_form):
            self.pool.spawn(self._async_migrate_form, wrapped_form, case_ids)

    @exit_on_error
    def _async_migrate_form(self, wrapped_form, case_ids):
        """Worker body: migrate the form, then release its lock in the queue."""
        self.migrate_form(wrapped_form, case_ids)
        self.queues.release_lock(wrapped_form)

    def _try_to_empty_queues(self):
        """Process forms waiting in the queue

        All items in the queue will be processed if the queue becomes full.
        This is done to ensure that no items become perpetually stuck in the
        queue. This may be masking a bug in this class or
        `PartiallyLockingQueue` since the theory of operation should prevent
        starvation. In any case draining the queue periodically is a good
        thing since there is a negative correlation between the number of
        items in the queue and `queue.pop()` performance.
        """
        queue = self.queues
        # Sampled once up front; decides whether the loop keeps draining.
        was_full = queue.full
        while True:
            form, case_ids = queue.pop()
            if form is not None:
                self.pool.spawn(self._async_migrate_form, form, case_ids)
            elif was_full and queue:
                # Nothing could be popped although the queue is non-empty:
                # something must still be processing, so wait for it.
                assert queue.processing, "deadlock!"
                wait_for_one_task_to_complete(self.pool)
            else:
                break
        if self.pool:
            gevent.sleep()  # swap greenlets

    def _finish_processing_queues(self):
        """Drain the queue, wait for retries and workers, return leftover ids.

        :return: ids of forms still in the queue or retry helper afterwards
            (logged as an error when non-empty).
        """
        update_interval = timedelta(seconds=10)
        next_check = datetime.now()
        pool = self.pool
        while self.queues:
            wrapped_form, case_ids = self.queues.pop()
            if wrapped_form:
                pool.spawn(self._async_migrate_form, wrapped_form, case_ids)
            else:
                gevent.sleep()  # swap greenlets

            # Progress is logged when ``next_check`` has passed; it then
            # advances by ``update_interval``.
            now = datetime.now()
            if now > next_check:
                log.info('Waiting on %s docs', len(self.queues) + len(pool))
                next_check += update_interval

        self.retry.join()
        # ``join`` is retried with a 10 second timeout until it returns a
        # truthy value, logging the pool size in between.
        while not pool.join(timeout=10):
            log.info('Waiting on {} docs'.format(len(pool)))

        unprocessed = self.queues.queue_ids + self.retry.form_ids
        if unprocessed:
            log.error("Unprocessed forms (unexpected): %s", unprocessed)
        return unprocessed

    def get_status(self):
        """Return the queue's status mapping with a ``"retry"`` count added."""
        status = self.queues.get_status()
        status["retry"] = len(self.retry)
        return status
class BaseServer(object):
    """
    An abstract base class that implements some common functionality for the servers in gevent.

    :param listener: Either an address that the server should bind
        on or a :class:`gevent.socket.socket` instance that is already
        bound (and put into listening mode in case of TCP socket).

    :keyword handle: If given, the request handler. The request
        handler can be defined in a few ways. Most commonly,
        subclasses will implement a ``handle`` method as an
        instance method. Alternatively, a function can be passed
        as the ``handle`` argument to the constructor. In either
        case, the handler can later be changed by calling
        :meth:`set_handle`.

        When the request handler returns, the socket used for the
        request will be closed.

    :keyword spawn: If provided, is called to create a new
        greenlet to run the handler. By default,
        :func:`gevent.spawn` is used (meaning there is no
        artificial limit on the number of concurrent requests).
        Possible values for *spawn*:

        - a :class:`gevent.pool.Pool` instance -- ``handle`` will be executed
          using :meth:`gevent.pool.Pool.spawn` only if the pool is not full.
          While it is full, no new connections are accepted;
        - :func:`gevent.spawn_raw` -- ``handle`` will be executed in a raw
          greenlet which has a little less overhead than :class:`gevent.Greenlet` instances spawned by default;
        - ``None`` -- ``handle`` will be executed right away, in the :class:`Hub` greenlet.
          ``handle`` cannot use any blocking functions as it would mean switching to the :class:`Hub`.
        - an integer -- a shortcut for ``gevent.pool.Pool(integer)``

    .. versionchanged:: 1.1a1
       When the *handle* function returns from processing a connection,
       the client socket will be closed. This resolves the non-deterministic
       closing of the socket, fixing ResourceWarnings under Python 3 and PyPy.

    """
    # pylint: disable=too-many-instance-attributes,bare-except,broad-except

    #: the number of seconds to sleep in case there was an error in accept() call
    #: for consecutive errors the delay will double until it reaches max_delay
    #: when accept() finally succeeds the delay will be reset to min_delay again
    min_delay = 0.01
    max_delay = 1

    #: Sets the maximum number of consecutive accepts that a process may perform on
    #: a single wake up. High values give higher priority to high connection rates,
    #: while lower values give higher priority to already established connections.
    #: Default is 100. Note, that in case of multiple working processes on the same
    #: listening value, it should be set to a lower value. (pywsgi.WSGIServer sets it
    #: to 1 when environ["wsgi.multiprocess"] is true)
    max_accept = 100

    #: class-level default spawn callable, used when ``spawn='default'``
    _spawn = Greenlet.spawn

    #: the default timeout that we wait for the client connections to close in stop()
    stop_timeout = 1

    #: errno values that :meth:`is_fatal_error` treats as fatal
    fatal_errors = (errno.EBADF, errno.EINVAL, errno.ENOTSOCK)

    def __init__(self, listener, handle=None, spawn='default'):
        """Configure listener, spawn strategy and handler; close on any failure.

        :raises ValueError: if ``max_accept`` is less than 1.
        :raises TypeError: propagated from :meth:`set_listener` /
            :meth:`set_handle`.
        """
        # The stop event being *set* means "not started" (see ``started``).
        self._stop_event = Event()
        self._stop_event.set()
        self._watcher = None
        self._timer = None
        self._handle = None
        # XXX: FIXME: Subclasses rely on the presence or absence of the
        # `socket` attribute to determine whether we are open/should be opened.
        # Instead, have it be None.
        self.pool = None
        try:
            self.set_listener(listener)
            self.set_spawn(spawn)
            self.set_handle(handle)
            self.delay = self.min_delay
            self.loop = get_hub().loop
            if self.max_accept < 1:
                raise ValueError('max_accept must be positive int: %r' % (self.max_accept, ))
        except:
            # Undo any partial setup before re-raising.
            self.close()
            raise

    def set_listener(self, listener):
        """Record the listener's family/address, and the socket if one was given.

        An object with an ``accept`` attribute is treated as an already
        created socket; anything else is passed to ``parse_address``.

        :raises TypeError: if the socket-like object also has
            ``do_handshake`` (i.e. looks like an SSL socket).
        """
        if hasattr(listener, 'accept'):
            if hasattr(listener, 'do_handshake'):
                raise TypeError('Expected a regular socket, not SSLSocket: %r' % (listener, ))
            self.family = listener.family
            self.address = listener.getsockname()
            self.socket = listener
        else:
            self.family, self.address = parse_address(listener)

    def set_spawn(self, spawn):
        """Select how handlers are spawned; see the class docstring for *spawn*."""
        if spawn == 'default':
            self.pool = None
            # Stores the class-level default on the instance; ``close()``
            # later pops ``_spawn`` from the instance ``__dict__``.
            self._spawn = self._spawn
        elif hasattr(spawn, 'spawn'):
            # Pool-like object.
            self.pool = spawn
            self._spawn = spawn.spawn
        elif isinstance(spawn, integer_types):
            from gevent.pool import Pool
            self.pool = Pool(spawn)
            self._spawn = self.pool.spawn
        else:
            # Arbitrary callable, or None for "run handler inline".
            self.pool = None
            self._spawn = spawn
        if hasattr(self.pool, 'full'):
            # Shadow the ``full`` method below with the pool's own.
            self.full = self.pool.full
        if self.pool is not None:
            # NOTE(review): presumably invoked when a pool slot is released,
            # so accepting can resume after the pool was full -- confirm.
            self.pool._semaphore.rawlink(self._start_accepting_if_started)

    def set_handle(self, handle):
        """Install the request handler.

        :raises TypeError: if no handler was passed and the instance has no
            ``handle`` attribute.
        """
        if handle is not None:
            self.handle = handle
        if hasattr(self, 'handle'):
            self._handle = self.handle
        else:
            raise TypeError("'handle' must be provided")

    def _start_accepting_if_started(self, _event=None):
        """Callback form of :meth:`start_accepting`; does nothing unless started."""
        if self.started:
            self.start_accepting()

    def start_accepting(self):
        """Create and start the I/O watcher on the listening socket, if absent."""
        if self._watcher is None:
            # just stop watcher without creating a new one?
            # NOTE(review): the literal 1 is presumably the "read" event
            # mask of the loop's io() API -- confirm.
            self._watcher = self.loop.io(self.socket.fileno(), 1)
            self._watcher.start(self._do_read)

    def stop_accepting(self):
        """Stop and drop the I/O watcher and the back-off timer, if present."""
        if self._watcher is not None:
            self._watcher.stop()
            self._watcher = None
        if self._timer is not None:
            self._timer.stop()
            self._timer = None

    def do_handle(self, *args):
        """Run the handler for one accepted connection, inline or via ``_spawn``.

        If starting the handler raises, ``do_close(*args)`` is called and
        the exception is re-raised.
        """
        spawn = self._spawn
        handle = self._handle
        close = self.do_close

        try:
            if spawn is None:
                _handle_and_close_when_done(handle, close, args)
            else:
                spawn(_handle_and_close_when_done, handle, close, args)
        except:
            close(*args)
            raise

    def do_close(self, *args):
        """Hook called with the connection args to close it; no-op here."""
        pass

    def do_read(self):
        """Accept one connection; must be implemented by subclasses.

        The return value is unpacked as the arguments of :meth:`do_handle`;
        a falsy value ends the current accept loop.
        """
        raise NotImplementedError()

    def _do_read(self):
        """Watcher callback: accept up to ``max_accept`` connections.

        Stops accepting while ``full()`` is true. On an accept or handler
        error the error is reported to the loop and, when ``delay >= 0``,
        accepting is paused for ``delay`` seconds with the delay doubled
        (capped at ``max_delay``). Fatal errors close the server.
        """
        for _ in xrange(self.max_accept):
            if self.full():
                self.stop_accepting()
                return
            try:
                args = self.do_read()
                # A successful read resets the back-off delay.
                self.delay = self.min_delay
                if not args:
                    return
            except:
                self.loop.handle_error(self, *sys.exc_info())
                ex = sys.exc_info()[1]
                if self.is_fatal_error(ex):
                    self.close()
                    sys.stderr.write('ERROR: %s failed with %s\n' % (self, str(ex) or repr(ex)))
                    return
                if self.delay >= 0:
                    # Pause accepting and retry after the current delay.
                    self.stop_accepting()
                    self._timer = self.loop.timer(self.delay)
                    self._timer.start(self._start_accepting_if_started)
                    self.delay = min(self.max_delay, self.delay * 2)
                break
            else:
                try:
                    self.do_handle(*args)
                except:
                    self.loop.handle_error((args[1:], self), *sys.exc_info())
                    if self.delay >= 0:
                        self.stop_accepting()
                        self._timer = self.loop.timer(self.delay)
                        self._timer.start(self._start_accepting_if_started)
                        self.delay = min(self.max_delay, self.delay * 2)
                    break

    def full(self):
        """Return False; replaced per instance by ``pool.full`` in :meth:`set_spawn`."""
        # copied from self.pool
        # pylint: disable=method-hidden
        return False

    def __repr__(self):
        return '<%s at %s %s>' % (type(self).__name__, hex(id(self)), self._formatinfo())

    def __str__(self):
        return '<%s %s>' % (type(self).__name__, self._formatinfo())

    def _formatinfo(self):
        """Build the ``fileno=... address=... handle=...`` part of the repr.

        Errors raised while formatting a part are rendered into the result
        instead of propagating.
        """
        if hasattr(self, 'socket'):
            try:
                fileno = self.socket.fileno()
            except Exception as ex:
                fileno = str(ex)
            result = 'fileno=%s ' % fileno
        else:
            result = ''
        try:
            if isinstance(self.address, tuple) and len(self.address) == 2:
                result += 'address=%s:%s' % self.address
            else:
                result += 'address=%s' % (self.address, )
        except Exception as ex:
            result += str(ex) or '<error>'

        # Only a handler stored on the instance is shown.
        handle = self.__dict__.get('handle')
        if handle is not None:
            fself = getattr(handle, '__self__', None)
            try:
                if fself is self:
                    # Checks the __self__ of the handle in case it is a bound
                    # method of self to prevent recursively defined reprs.
                    handle_repr = '<bound method %s.%s of self>' % (
                        self.__class__.__name__,
                        handle.__name__,
                    )
                else:
                    handle_repr = repr(handle)

                result += ' handle=' + handle_repr
            except Exception as ex:
                result += str(ex) or '<error>'

        return result

    @property
    def server_host(self):
        """IP address that the server is bound to (string)."""
        # Implicitly returns None when the address is not a tuple.
        if isinstance(self.address, tuple):
            return self.address[0]

    @property
    def server_port(self):
        """Port that the server is bound to (an integer)."""
        # Implicitly returns None when the address is not a tuple.
        if isinstance(self.address, tuple):
            return self.address[1]

    def init_socket(self):
        """If the user initialized the server with an address rather than socket,
        then this function will create a socket, bind it and put it into listening mode.

        It is not supposed to be called by the user, it is called by :meth:`start` before starting
        the accept loop."""
        pass

    @property
    def started(self):
        """True while the internal stop event is not set."""
        return not self._stop_event.is_set()

    def start(self):
        """Start accepting the connections.

        If an address was provided in the constructor, then also create a socket,
        bind it and put it into the listening mode.
        """
        self.init_socket()
        self._stop_event.clear()
        try:
            self.start_accepting()
        except:
            self.close()
            raise

    def close(self):
        """Close the listener socket and stop accepting."""
        self._stop_event.set()
        try:
            self.stop_accepting()
        finally:
            try:
                self.socket.close()
            except Exception:
                # Best effort; this also covers the case where no ``socket``
                # attribute exists.
                pass
            finally:
                # Drop the per-instance attributes so that class-level
                # defaults (where they exist) apply again.
                self.__dict__.pop('socket', None)
                self.__dict__.pop('handle', None)
                self.__dict__.pop('_handle', None)
                self.__dict__.pop('_spawn', None)
                self.__dict__.pop('full', None)
                if self.pool is not None:
                    self.pool._semaphore.unlink(self._start_accepting_if_started)

    @property
    def closed(self):
        """True when the instance has no ``socket`` attribute."""
        return not hasattr(self, 'socket')

    def stop(self, timeout=None):
        """
        Stop accepting the connections and close the listening socket.

        If the server uses a pool to spawn the requests, then
        :meth:`stop` also waits for all the handlers to exit. If there
        are still handlers executing after *timeout* has expired
        (default 1 second, :attr:`stop_timeout`), then the currently
        running handlers in the pool are killed.

        If the server does not use a pool, then this merely stops accepting connections;
        any spawned greenlets that are handling requests continue running until
        they naturally complete.
        """
        self.close()
        if timeout is None:
            timeout = self.stop_timeout
        if self.pool:
            self.pool.join(timeout=timeout)
            self.pool.kill(block=True, timeout=1)

    def serve_forever(self, stop_timeout=None):
        """Start the server if it hasn't been already started and wait until it's stopped.

        :param stop_timeout: passed to :meth:`stop` as ``timeout``.
        """
        # add test that serve_forever exists on stop()
        if not self.started:
            self.start()
        try:
            self._stop_event.wait()
        finally:
            # ``stop`` is run in a fresh greenlet, which is then joined.
            Greenlet.spawn(self.stop, timeout=stop_timeout).join()

    def is_fatal_error(self, ex):
        """Return True if *ex* is a ``_socket.error`` whose errno is in ``fatal_errors``."""
        return isinstance(ex, _socket.error) and ex.args[0] in self.fatal_errors
def test_proxies(proxies, timeout=10, single_url=None, many_urls=None, call_back=None):
    """
    Test proxies, or process html source using callback in the meantime.

    Every proxy is tested concurrently (pool of 100 greenlets) by issuing
    one GET request through it. A proxy is considered failed when the
    request raises, times out, or returns a status code other than 200.
    The outcome of every test is passed to ``store_in_db``.

    :type proxies: list
    :param proxies: proxies
    :param timeout: response timeout in seconds
    :param single_url: The URL for testing
    :param many_urls: The list of URLs for testing. One of them is picked at
        random for each request. ``None`` or an empty list falls back to
        ``single_url``.
    :param call_back: Process the html source if status code is 200. callback(url, source)
    :return: list of the proxies that did not fail
    """
    proxies = set(proxies)
    errors = set()
    pool = Pool(100)

    def test(proxy):
        code = None
        # ``random.choice`` raises IndexError on an empty sequence, which
        # would kill this greenlet before the proxy is recorded; treat an
        # empty list like "not given".
        url = random.choice(many_urls) if many_urls else single_url
        start_time = time.time()
        try:
            with gevent.Timeout(seconds=timeout, exception=Exception('[Connection Timeout]')):
                _headers['User-Agent'] = random.choice(_user_agents)
                address = proxy.strip()
                res = requests.get(url,
                                   proxies={
                                       'http': 'http://{}'.format(address),
                                       'https': 'https://{}'.format(address)
                                   },
                                   headers=_headers)
                code = res.status_code
                source = res.text

            _log('[Proxy: {:d} {:s}]'.format(code, proxy))

            # Callback
            if source is not None and call_back is not None and code == 200:
                call_back(url, source)

            if code != 200:
                errors.add(proxy)
        except Exception:
            # Any failure (timeout, connection error, callback error) marks
            # the proxy as unusable.
            errors.add(proxy)

        end_time = time.time()
        # Elapsed time is recorded only when a response was received.
        escaped = end_time - start_time if code else None

        store_in_db(proxy, escaped=escaped, status_code=code)  # store in db

    for proxy in proxies:
        pool.spawn(test, proxy)
    pool.join()

    proxies = proxies - errors
    _log('[HTTP Proxies] Available:{:d} Deprecated:{:d}'.format(len(proxies), len(errors)))

    return list(proxies)
def __init__(self):
    # Create a pool constructed with the argument 1000 and start it.
    # NOTE(review): the enclosing class and the origin of ``Pool`` are not
    # visible here. If this is ``gevent.pool.Pool``, ``start()`` expects a
    # greenlet argument -- confirm which Pool type is intended.
    self.pool = Pool(1000)
    self.pool.start()