def start_pool(size):
    t1 = datetime.now()
    pool = Pool(size)
    while (datetime.now() - t1).seconds <= SECONDS:
        print 'pool.free_count():', pool.free_count()
        if pool.free_count() == 0:
            pool.wait_available()
            print '<free 1>'
        pool.apply_async(test_get)
    print 'Joining............................................'
    pool.join()
    t2 = datetime.now()
    print COUNT, TIMEOUT_CNT
    print COUNT / (t2-t1).seconds
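Most of the examples in this listing follow the same pattern: check free_count() to decide whether the pool can take more work, and block (via wait_available(), join(), or a short sleep) when it cannot. The following is a minimal sketch of that pattern, not taken from any of the projects above; fetch() and urls are hypothetical stand-ins for the real workers and task sources.

import gevent
from gevent.pool import Pool

def fetch(url):
    # hypothetical worker; stands in for test_get / download jobs above
    gevent.sleep(0.1)  # simulate I/O

def crawl(urls, size=10):
    pool = Pool(size)
    for url in urls:
        if pool.free_count() == 0:   # every slot is busy
            pool.wait_available()    # block until a greenlet finishes
        pool.spawn(fetch, url)
    pool.join()                      # wait for the remaining tasks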
def main(psize, filename=None):
    if filename:
        urls = Queue()
        results = Queue()
        pool = Pool(int(psize))
        reader = gevent.spawn(readfile, filename, urls)
        request = gevent.spawn(work_input_file, urls, results, reader)
        pool.add(reader)
        pool.add(request)
        pool.join()
        pool.free_count()
        print results.qsize(), 3333333333333333333
        print urls.qsize(), 3333333333333333333
        return results
class WorkerPool:
    def __init__(self, queue, func=None, pool_size=100, worker_type='page'):
        self.queue = queue
        self.worker = func
        self.exit_signal = False
        self.pool_size = pool_size
        ## The Pool class is based on gevent.pool.Group
        self.pool = Pool(pool_size)
        self.worker_type = worker_type

    def start(self, page_task=None):
        if self.worker_type == 'asset':
            msg = 'asset worker pool started, owning page: {:s}'
            logger.debug(msg.format(page_task['refer']))
        while True:
            if self.exit_signal:
                break
            if not self.queue.empty():
                task = self.queue.get()
                msg = 'took an item from the queue, calling worker. task: {task:s}'
                logger.debug(msg.format(task=str(task)))
                self.pool.spawn(self.worker, task)
            elif self.pool.free_count() != self.pool.size:
                ## The queue is empty but the pool is not fully idle,
                ## so tasks are still running: wait.
                free = self.pool.free_count()
                total = self.pool.size
                working = total - free
                if self.worker_type == 'asset':
                    msg = 'worker pool usage: {working:d}/{total:d}, page_task: {page_task:s}'
                    logger.debug(
                        msg.format(working=working, total=total, page_task=str(page_task)))
                sleep(1)
            elif self.exit_signal:
                ## If the queue is empty and every greenlet is idle, or stop()
                ## has been triggered, leave the while loop.
                break
            else:
                break
        if self.worker_type == 'asset':
            msg = 'asset worker pool finished, owning page: {:s}'
            logger.debug(msg.format(page_task['refer']))

    def stop(self):
        self.exit_signal = True
        # Only allow items into the queue, never out of it: links from the pages
        # currently being processed are enqueued, but no more tasks are popped.
        ## Take the tasks still in the pool, re-enqueue them and persist them
        ## to a local file so they are not lost.
        for item in self.pool:
            self.queue.put(item.args)
class Task:
    def __init__(self, queue, pool_max=100):
        self.work = None
        self.pool_max = pool_max
        self.pool = Pool(pool_max)
        self.queue = queue

    def initTaskWork(self, func):
        self.work = func

    def start(self):
        while True:
            if not self.queue.empty():
                t = self.queue.pop()
                self.pool.spawn(self.work, *t)
            elif self.pool.free_count() == self.pool.size or self.queue.isLock:
                # print 'queue is empty'
                # print self.pool.free_count(), self.pool.size
                break
            else:
                # print 'queue is empty but...'
                sleep(0)

    def stop(self):
        # Only allow items into the queue, not out of it
        self.queue.lock(True)
        for item in self.pool:
            self.queue.push(list(item.args))
            # print item
            # self.pool.killone(item)
        # self.pool.kill()
        # print 'starting the save triggered by stop'
        self.queue.save()
        self.queue.clear()
def main():
    """spawn"""
    val = rclient.get('f1')
    print(val)
    pool = Pool(20)
    start('f1')
    # loop forever
    while True:
        # print(time.time())
        pool.spawn(func1)
        # print pool.wait_available()
        print(pool.free_count())
        # sleep
        gevent.sleep(2)
class WorkerPool(object):
    """Docstring for WorkerPool """

    def __init__(self, input, output, func, nthreads=800):
        """@todo: to be defined

        :param input: @todo
        :param output: @todo
        :param func: @todo
        :param qname: @todo
        """
        self._func = func
        self._input = input
        self._output = output
        self._lock = BoundedSemaphore(1)
        self._pool = Pool(nthreads)
        self._nthreads = nthreads
        self._true = 0
        self._false = 0
        self._nogeo = 0
        self._notruth = 0

    def run_one(self, msg):
        result = self._func(msg)
        if result is not None:
            with self._lock:
                self._output.write(
                    (json.dumps(result, ensure_ascii=False)).encode("utf-8") + "\n")
        #if not result['true_geo']:
        #    self._notruth += 1
        #elif ('country' not in result['embersGeoCode']):
        #    self._nogeo += 1
        #elif result['true_geo']['country'].lower() == result['embersGeoCode']['country'].lower():
        #    self._true += 1
        #else:
        #    self._false += 1

    def run(self):
        last = time.time()
        for msg in self._input:
            self._pool.spawn(self.run_one, msg)
            if time.time() - last > 10:
                log.info("Workers running={}".format(self._nthreads - self._pool.free_count()))
                last = time.time()
        self._pool.join()

    # def cleanup_workers(self):
    #     dones = [w.done for w in self._workers]
    #     for done, w in zip(dones, self._workers):
    #         if done:
    #             fin_job = w.ret
    #             self._output.write(fin_job)
    #     self._workers = [w for done, w in zip(dones, self._workers) if not done]

    def stop(self):
        self._pool.join()
def download_images():
    images_to_download = Item.objects.filter(
        Q(image__isnull=True) | Q(image=''), image_url__isnull=False).values_list('pk', 'image_url')
    for obj in images_to_download:
        queue.put(obj)

    # create greenlet pool and spawn workers
    pool = Pool(size=POOL_SIZE)
    pool.spawn(download_crawler)

    # eventlet uses free(), gevent uses free_count()
    while not pool.free_count() == POOL_SIZE:
        gevent.sleep(0.1)  # eventlet.sleep
        for x in xrange(0, min(queue.qsize(), pool.free_count())):
            pool.spawn(download_crawler)

    # Wait for everything to complete - eventlet uses waitall
    pool.join()
    pool.kill()
    time.sleep(2)
def use_gevent_with_queue():
    queue = Queue()
    pool = Pool(5)
    for p in range(1, 7):
        put_new_page(p, queue)
    while pool.free_count():
        sleep(0.1)
        pool.spawn(save_search_result_with_queue, queue)
    pool.join()
def save_html_with_gevent(items, gov):
    pool = Pool(10)
    queue = Gqueue()
    for item in items:
        queue.put(item)
    while pool.free_count():
        pool.spawn(save_html_for_gevent, queue, gov['gov_name'])
    pool.join()
def main():
    pool = Pool(results.threads)
    while 1:
        try:
            if manager.gamertags.empty():
                print 'Finished'
                break
            for i in xrange(min(pool.free_count(), 50)):
                pool.spawn(manager.spawn_connect)
            gevent.sleep(1)
        except KeyboardInterrupt:
            print '[KYBRD_NTRPT] Finishing active threads'
            pool.join()
            break
def main():
    pool = Pool(results.threads)
    while 1:
        try:
            if manager.gamertags.empty():
                print('Finished')
                break
            for i in xrange(min(pool.free_count(), 50)):
                pool.spawn(manager.spawn_connect)
            gevent.sleep(1)
        except KeyboardInterrupt:
            print('[KYBRD_NTRPT] Finishing active threads')
            pool.join()
            break
class SocketPool(object):
    def __init__(self):
        self.pool = Pool(1)
        self.pool.add(self.server())

    # Suited to a chat-room style client that sends a line of text on each Enter
    def listen(self, socket, address):
        f = socket.makefile()
        print "listen"
        while True:
            name = f.readline().strip()
            print name

    def listen2(self, socket, address):
        print "listen2"
        print self.pool.free_count()
        while True:
            name = socket.recv(1010).strip()
            print name

    def add_handler(self, socket, address):
        if self.pool.full():
            raise Exception("At maximum pool size")
        else:
            print("pool insert")
            # fixed: pass the handler and its arguments to spawn() instead of
            # calling it inline, which would block here and spawn its return value
            s = self.pool.spawn(self.listen2, socket, address)
            # self.pool.spawn(self.listen, socket, address)

    def shutdown(self):
        self.pool.kill()

    def server(self):
        print "server"
        server = StreamServer(('0.0.0.0', 8000), self.add_handler)
        server.serve_forever()
class BGTaskManager(object):
    def __init__(self, max_workers):
        self.max_workers = max_workers
        self._pool = Pool(size=max_workers)

    def run(self):
        while True:
            task_id, func, args, kw = bgtasks_queue.get()
            # Create a separate execution context for each task so database
            # connections can be reclaimed properly.
            # http://docs.peewee-orm.com/en/latest/peewee/database.html#advanced-connection-management
            func = db.execution_context(with_transaction=False)(func)
            self._pool.spawn(func, *args, **kw)

    def active_worker_count(self):
        return self._pool.size - self._pool.free_count()
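The active_worker_count() idiom above works because Pool.size is the configured capacity and free_count() is the number of unused slots, so the busy greenlets are simply size - free_count(). A minimal standalone sketch of that gauge (not from any project above):

import gevent
from gevent.pool import Pool

pool = Pool(size=4)
for _ in range(3):
    pool.spawn(gevent.sleep, 1)   # occupy three of the four slots

print(pool.size - pool.free_count())  # -> 3 greenlets currently active
pool.join()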
class RssPool(object):
    def __init__(self):
        self.pool = Pool(RSS_MAX_POOL_NUM)
        self.start = False
        self.times = 0
        self.beginTime = int(time.time())

    def run(self):
        while True:
            if (not self.start) and (not self.pool.full()):
                self.addRssSpider()
                # self.syncDagrame()
                continue
            self.start = False
            if self.pool.free_count() < RSS_MAX_POOL_NUM:
                logging.info("---------------join run ")
                self.pool.join()
            else:
                logging.info("---------------not data ,sleep %s senconds " % MAIN_LOOP_SLEEP_TIME)
                time.sleep(MAIN_LOOP_SLEEP_TIME)

    def syncDagrame(self):
        """Sync the crawled data to production."""
        self.times += 1
        if self.times > RUN_SYNC_INTERVAL_TIMES or int(time.time()) - self.beginTime > RUN_SYNC_INTERVAL_TIME:
            logging.info("**********sync crawl infos ************")
            sync = SyncCrawlInfos()
            sync.index()
            self.times = 0
            self.beginTime = int(time.time())

    def addRssSpider(self):
        configList = getCrawlRssRequest()
        if not configList:
            self.start = True
            return True
        try:
            spider = CommonFeedRss()
            self.pool.spawn(spider.run, configList)
        except Exception, e:
            logging.info("------------------add spider exception : %s " % e)
class GEventTaskRunner(TaskRunner):
    timeout = 5

    def __init__(self, pool_size=200, *args, **kw):
        super(GEventTaskRunner, self).__init__(*args, **kw)
        self._pool = Pool(pool_size)

    def run_task(self, func, *args, **kw):
        self.logger.debug("Adding task %s to pool of size %s", func, self._pool.free_count())
        self._pool.start(Greenlet(func, *args, **kw))
        self.logger.debug("Task added")

    def stop(self):
        self.logger.debug("Waiting for background queue to finish")
        self._pool.join(self.timeout)
        self.logger.debug("background queue finished")
        super(GEventTaskRunner, self).stop()
class Downloader(object):
    def __init__(self, concurrent=64):
        self.proxy_conf = OnlineConfig().proxy
        self.pool = Pool(concurrent)
        self.pool.join()

    def add_task(self, task, proxy):
        self.pool.add(gevent.spawn(self._download, task, proxy))

    def free_count(self):
        return self.pool.free_count()

    @staticmethod
    def _before_download(task, proxy):
        module = ExternManager().get_model(task.s_platform, task.s_feature + '.request')
        request = module(task, proxy) if module else RequestExtra(task, proxy)
        return request

    @staticmethod
    def _after_download(task, request, response, proxy):
        module = ExternManager().get_model(task.s_platform, task.s_feature + '.response')
        response = module(task, request, response, proxy) \
            if module else ResponseExtra(task, request, response, proxy)
        return response

    def _download(self, task, proxy):
        request = None
        req_response = None
        try:
            request = self._before_download(task, proxy)
            req_response = requests.request(**request())
            response = self._after_download(task, request, req_response, proxy)
            del response
            del req_response
            del request
        except Exception as e:
            if req_response:
                del req_response
            if request:
                del request
        finally:
            del task
            del proxy
class GreenletExecutor(AbstractExecutor):
    """
    GreenletExecutor is an AbstractExecutor subclass that uses a pool of
    greenlets to execute calls asynchronously.

    NOTE: Use this executor for I/O-bound tasks. Since all greenlets are
    multiplexed on a single pthread, do NOT use this for compute-bound
    callables. Try using the GIPCExecutor instead.
    """
    def __init__(self, num_greenlets=50, **kwargs):
        super(GreenletExecutor, self).__init__(**kwargs)
        self.pool = Pool(size=num_greenlets)
        self.task_queue = Queue()
        self.num_ready = 0

    def _shutdown(self):
        for _ in xrange(len(self.pool)):
            self.task_queue.put(None)
        if self.force_kill_on_shutdown:
            self.pool.kill()
        else:
            self.pool.join()

    def _worker_loop(self):
        try:
            self.num_ready += 1
            while True:
                self.num_ready -= 1
                task = self.task_queue.get()
                if task is None:
                    return
                task.execute()
                self.num_ready += 1
        except:
            pass

    def _submit(self, task):
        self.task_queue.put(task)
        if not self.num_ready and self.pool.free_count():
            self.pool.spawn(self._worker_loop)
def run_gevent():
    worker = _config.get('count_worker', 4)
    pool = Pool(worker)
    funcs = run()
    while True:
        if pool.full():
            time.sleep(1)
            continue
        # getting func delete
        try:
            funcnya = next(funcs)
            pool.spawn(funcnya['func'], *funcnya['param'])
        except StopIteration as e:
            if pool.free_count() == worker:
                break
        time.sleep(0.01)
class worker:
    def __init__(self, seeds):
        self.showpercounts = 50
        self.timeout = 10
        self.starttime = time.time()
        self.quit = 0
        #self.run_queue = Queue()
        self.run_queue = daemon.run_que
        self.done_queue = daemon.done_que
        self.tasks = []
        self.done = 0
        self.httpget = self.httpget_requests  # down method: self.httpget_requests | httpget_curl
        self.poolsize = 300
        self.freecount = 0
        #self.maxfreecnt = 4
        self.down_pool = Pool(size=self.poolsize)
        #self.mutex = gevent.coros.RLock()
        self.totalnettime = 0
        self.cbcputime = 0
        self.totaldownsize = 0
        self.curspeed = 0
        self.test = 0
        self.errcnt = 0
        self.bfdone = daemon.bfdone
        self.size = 0

        if self.run_queue.qsize() == 0:
            for seed in seeds:
                self.run_queue.put(seed.split("http://")[-1])

        self.urlpatern = re.compile('href=[\"\']http://([^/?#\"\']+)')

    def cb_httpget(self, data):
        st = time.time()
        seed, err, headers, content = data
        #sself.test += 1
        if err or len(content) == 0:
            self.errcnt += 1
            return
        data = {'url': seed, 'headers': headers, 'content': content}
        dat = cPickle.dumps(data)
        self.size = len(content)
        self.done_queue.put(dat)
        self.done += 1
        #seed.split('http://')[-1]
        self.bfdone.add(seed)
        et = time.time()
        self.cbcputime += (et-st)
        if self.done % self.showpercounts == 0:
            t = self.cbcputime/self.done
            self.out(seed, (et-st))

    def out(self, cururl, cbtime=0):
        spendtime = time.time() - self.starttime
        spendtime = 1 if spendtime == 0 else spendtime
        nowh = str(int(spendtime)/3600)+":" if spendtime > 3600 else ""
        now = "%s%02d:%02d" % (nowh, spendtime%3600/60, spendtime%60)
        print "%s D:%-4d R:%-7d SpeedT:%.2f/s SpeedC:%.2f/s Test:%0.2f CB:%0.4f Active:%d Err:%d %s" % (now, (self.done), self.run_queue.qsize(), \
            self.done/spendtime, self.curspeed, self.test, cbtime, self.poolsize-self.freecount, self.errcnt, cururl)

    def work(self):
        while self.quit == 0:
            curstime = time.time()
            self.freecount = self.down_pool.free_count()
            self.tasks = []
            if self.freecount == 0:
                gevent.sleep(0.1)
                continue
            st = time.time()
            xlen = self.freecount
            lasturl = ""
            while xlen > 0:
                xlen -= 1
                url = self.run_queue.get()
                if url == lasturl:
                    continue
                else:
                    lasturl = url
                url = "http://"+url
                if url in self.bfdone:
                    xlen += 1
                    continue
                #print xlen, url, self.down_pool.free_count()
                self.tasks.append(url)
                self.down_pool.apply_async(self.httpget, (url,), callback=self.cb_httpget)
            et = time.time()
            curetime = time.time()
            #self.curspeed = (self.done - curdone) / (curetime-curstime)

        self.down_pool.join()
        print "All OVER"

    # requests is better than pycurl ?
    def httpget_requests(self, url):
        st = time.time()
        con = ""
        e = None
        #'Connection':'close',
        headers = {
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.6',
            'Accept-Encoding': 'gzip,deflate',
            'Connection': 'close',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
        }
        try:
            # query the ip of the website
            req = requests
            #r = requests
            req.max_redirects = 1
            #with gevent.Timeout(5, False) as timeout:
            res = req.get(url, timeout=self.timeout)
            if res.url.startswith('https'):
                raise
            con = res.content
            headers = res.headers
            res.close()
        except KeyboardInterrupt:
            raise
        except Exception as e:
            et = time.time()
            return url, e, None, None
        et = time.time()
        self.totalnettime += (et-st)
        self.curspeed = self.totalnettime/(self.done+1)
        return url, e, headers, con
class BreakpadSubmitterResource(RequiredConfigMixin): """Handles incoming breakpad crash reports and saves to crashstorage This handles incoming HTTP POST requests containing breakpad-style crash reports in multipart/form-data format. It can handle compressed or uncompressed POST payloads. It parses the payload from the HTTP POST request, runs it through the throttler with the specified rules, generates a crash_id, returns the crash_id to the HTTP client and then saves the crash using the configured crashstorage class. .. Note:: From when a crash comes in to when it's saved by the crashstorage class, the crash is entirely in memory. Keep that in mind when figuring out how to scale your Antenna nodes. The most important configuration bit here is choosing the crashstorage class. For example:: CRASHSTORAGE_CLASS=antenna.ext.s3.crashstorage.S3CrashStorage """ required_config = ConfigOptions() required_config.add_option( 'dump_field', default='upload_file_minidump', doc='the name of the field in the POST data for dumps') required_config.add_option('dump_id_prefix', default='bp-', doc='the crash type prefix') required_config.add_option( 'crashstorage_class', default='antenna.ext.crashstorage_base.NoOpCrashStorage', parser=parse_class, doc='the class in charge of storing crashes') # Maximum number of concurrent crashmover workers; each process gets this # many concurrent crashmovers, so if you're running 5 processes on the node # then it's (5 * concurrent_crashmovers) fighting for upload bandwidth required_config.add_option( 'concurrent_crashmovers', default='2', parser=int, doc='the number of crashes concurrently being saved to s3') def __init__(self, config): self.config = config.with_options(self) self.crashstorage = self.config('crashstorage_class')( config.with_namespace('crashstorage')) self.throttler = Throttler(config) # Gevent pool for crashmover workers self.crashmover_pool = Pool(size=self.config('concurrent_crashmovers')) # Queue for crashmover of crashes to save self.crashmover_save_queue = deque() # Register hb functions with heartbeat manager register_for_heartbeat(self.hb_report_health_stats) register_for_heartbeat(self.hb_run_crashmover) # Register life function with heartbeat manager register_for_life(self.has_work_to_do) def get_runtime_config(self, namespace=None): for item in super().get_runtime_config(): yield item for item in self.throttler.get_runtime_config(): yield item for item in self.crashstorage.get_runtime_config(['crashstorage']): yield item def check_health(self, state): if hasattr(self.crashstorage, 'check_health'): self.crashstorage.check_health(state) def hb_report_health_stats(self): # The number of crash reports sitting in the queue; this is a direct # measure of the health of this process--a number that's going up means # impending doom mymetrics.gauge('save_queue_size', value=len(self.crashmover_save_queue)) def has_work_to_do(self): work_to_do = len(self.crashmover_save_queue) + len( self.crashmover_pool) logger.info('work left to do: %s' % work_to_do) # Indicates whether or not we're sitting on crashes to save--this helps # keep Antenna alive until we're done saving crashes return bool(work_to_do) def extract_payload(self, req): """Parses the HTTP POST payload Decompresses the payload if necessary and then walks through the FieldStorage converting from multipart/form-data to Python datatypes. NOTE(willkg): The FieldStorage is poorly documented (in my opinion). It has a list attribute that is a list of FieldStorage items--one for each key/val in the form. 
For attached files, the FieldStorage will have a name, value and filename and the type should be application/octet-stream. Thus we parse it looking for things of type text/plain and application/octet-stream. :arg falcon.request.Request req: a Falcon Request instance :returns: (raw_crash dict, dumps dict) """ # If we don't have a content type, return an empty crash if not req.content_type: return {}, {} # If it's the wrong content type or there's no boundary section, return # an empty crash content_type = [ part.strip() for part in req.content_type.split(';', 1) ] if ((len(content_type) != 2 or content_type[0] != 'multipart/form-data' or not content_type[1].startswith('boundary='))): return {}, {} content_length = req.content_length or 0 # If there's no content, return an empty crash if content_length == 0: return {}, {} # Decompress payload if it's compressed if req.env.get('HTTP_CONTENT_ENCODING') == 'gzip': mymetrics.incr('gzipped_crash') # If the content is gzipped, we pull it out and decompress it. We # have to do that here because nginx doesn't have a good way to do # that in nginx-land. gzip_header = 16 + zlib.MAX_WBITS try: data = zlib.decompress(req.stream.read(content_length), gzip_header) except zlib.error: # This indicates this isn't a valid compressed stream. Given # that the HTTP request insists it is, we're just going to # assume it's junk and not try to process any further. mymetrics.incr('bad_gzipped_crash') return {}, {} # Stomp on the content length to correct it because we've changed # the payload size by decompressing it. We save the original value # in case we need to debug something later on. req.env['ORIG_CONTENT_LENGTH'] = content_length content_length = len(data) req.env['CONTENT_LENGTH'] = str(content_length) data = io.BytesIO(data) mymetrics.histogram('crash_size', value=content_length, tags=['payload:compressed']) else: # NOTE(willkg): At this point, req.stream is either a # falcon.request_helper.BoundedStream (in tests) or a # gunicorn.http.body.Body (in production). # # FieldStorage doesn't work with BoundedStream so we pluck out the # internal stream from that which works fine. # # FIXME(willkg): why don't tests work with BoundedStream? if isinstance(req.stream, BoundedStream): data = req.stream.stream else: data = req.stream mymetrics.histogram('crash_size', value=content_length, tags=['payload:uncompressed']) fs = cgi.FieldStorage(fp=data, environ=req.env, keep_blank_values=1) # NOTE(willkg): In the original collector, this returned request # querystring data as well as request body data, but we're not doing # that because the query string just duplicates data in the payload. raw_crash = {} dumps = {} for fs_item in fs.list: # NOTE(willkg): We saw some crashes come in where the raw crash ends up with # a None as a key. Make sure we can't end up with non-strings as keys. item_name = de_null(fs_item.name or '') if item_name == 'dump_checksums': # We don't want to pick up the dump_checksums from a raw # crash that was re-submitted. continue elif fs_item.type and ( fs_item.type.startswith('application/octet-stream') or isinstance(fs_item.value, bytes)): # This is a dump, so add it to dumps using a sanitized dump # name. dump_name = sanitize_dump_name(item_name) dumps[dump_name] = fs_item.value else: # This isn't a dump, so it's a key/val pair, so we add that. 
raw_crash[item_name] = de_null(fs_item.value) return raw_crash, dumps def get_throttle_result(self, raw_crash): """Given a raw_crash, figures out the throttling If the raw_crash contains throttling information already, it returns that. If it doesn't, then this will apply throttling and return the results of that. A rule name of ``ALREADY_THROTTLED`` indicates that the raw_crash was previously throttled and we're re-using that data. A rule name of ``THROTTLEABLE_0`` indicates that the raw_crash was marked to not be throttled. :arg dict raw_crash: the raw crash to throttle :returns tuple: ``(result, rule_name, percentage)`` """ # If the raw_crash has a uuid, then that implies throttling, so return # that. if 'uuid' in raw_crash: crash_id = raw_crash['uuid'] if crash_id[-7] in (str(ACCEPT), str(DEFER)): result = int(crash_id[-7]) throttle_rate = 100 # Save the results in the raw_crash itself raw_crash['legacy_processing'] = result raw_crash['throttle_rate'] = throttle_rate return result, 'FROM_CRASHID', throttle_rate # If we have throttle results for this crash, return those. if 'legacy_processing' in raw_crash and 'throttle_rate' in raw_crash: try: result = int(raw_crash['legacy_processing']) if result not in (ACCEPT, DEFER): raise ValueError('Result is not a valid value: %r', result) throttle_rate = int(raw_crash['throttle_rate']) if not (0 <= throttle_rate <= 100): raise ValueError('Throttle rate is not a valid value: %r', result) return result, 'ALREADY_THROTTLED', throttle_rate except ValueError: # If we've gotten a ValueError, it means one or both of the # values is bad and we should ignore it and move forward. mymetrics.incr('throttle.bad_throttle_values') # If we have a Throttleable=0, then return that. if raw_crash.get('Throttleable', None) == '0': # If the raw crash has ``Throttleable=0``, then we accept the # crash. mymetrics.incr('throttleable_0') result = ACCEPT rule_name = 'THROTTLEABLE_0' throttle_rate = 100 else: # At this stage, nothing has given us a throttle answer, so we # throttle the crash. result, rule_name, throttle_rate = self.throttler.throttle( raw_crash) # Save the results in the raw_crash itself raw_crash['legacy_processing'] = result raw_crash['throttle_rate'] = throttle_rate return result, rule_name, throttle_rate @mymetrics.timer_decorator('on_post.time') def on_post(self, req, resp): """Handles incoming HTTP POSTs Note: This is executed by the WSGI app, so it and anything it does is covered by the Sentry middleware. """ resp.status = falcon.HTTP_200 start_time = time.time() # NOTE(willkg): This has to return text/plain since that's what the # breakpad clients expect. resp.content_type = 'text/plain' raw_crash, dumps = self.extract_payload(req) # If we didn't get any crash data, then just drop it and move on--don't # count this as an incoming crash and don't do any more work on it if not raw_crash: resp.body = 'Discarded=1' return mymetrics.incr('incoming_crash') # Add timestamps current_timestamp = utc_now() raw_crash['submitted_timestamp'] = current_timestamp.isoformat() raw_crash['timestamp'] = start_time # Add checksums and MinidumpSha256Hash raw_crash['dump_checksums'] = { dump_name: hashlib.sha256(dump).hexdigest() for dump_name, dump in dumps.items() } raw_crash['MinidumpSha256Hash'] = raw_crash['dump_checksums'].get( 'upload_file_minidump', '') # First throttle the crash which gives us the information we need # to generate a crash id. 
throttle_result, rule_name, percentage = self.get_throttle_result( raw_crash) # Use a uuid if they gave us one and it's valid--otherwise create a new # one. if 'uuid' in raw_crash and validate_crash_id(raw_crash['uuid']): crash_id = raw_crash['uuid'] logger.info('%s has existing crash_id', crash_id) else: crash_id = create_crash_id(timestamp=current_timestamp, throttle_result=throttle_result) raw_crash['uuid'] = crash_id raw_crash['type_tag'] = self.config('dump_id_prefix').strip('-') # Log the throttle result logger.info('%s: matched by %s; returned %s', crash_id, rule_name, RESULT_TO_TEXT[throttle_result]) mymetrics.incr('throttle_rule', tags=['rule:%s' % rule_name]) mymetrics.incr( 'throttle', tags=['result:%s' % RESULT_TO_TEXT[throttle_result].lower()]) if throttle_result is REJECT: # If the result is REJECT, then discard it resp.body = 'Discarded=1' else: # If the result is not REJECT, then save it and return the CrashID to # the client self.crashmover_save_queue.append( CrashReport(raw_crash, dumps, crash_id)) self.hb_run_crashmover() resp.body = 'CrashID=%s%s\n' % (self.config('dump_id_prefix'), crash_id) def hb_run_crashmover(self): """Checks to see if it should spawn a crashmover and does if appropriate""" # Spawn a new crashmover if there's stuff in the queue and there isn't # one currently running if self.crashmover_save_queue and self.crashmover_pool.free_count( ) > 0: self.crashmover_pool.spawn(self.crashmover_process_queue) def crashmover_process_queue(self): """Processes the queue of crashes to save until it's empty Note: This has to be super careful not to lose crash reports. If there's any kind of problem, this must return the crash to the queue. """ # Process crashes until the queue is empty while self.crashmover_save_queue: crash_report = self.crashmover_save_queue.popleft() try: self.crashmover_save(crash_report) except Exception: mymetrics.incr('save_crash_exception.count') crash_report.errors += 1 logger.exception( 'Exception when processing save queue (%s); error %d/%d', crash_report.crash_id, crash_report.errors, MAX_ATTEMPTS) # After MAX_ATTEMPTS, we give up on this crash and move on if crash_report.errors < MAX_ATTEMPTS: self.crashmover_save_queue.append(crash_report) else: logger.error('%s: too many errors trying to save; dropped', crash_report.crash_id) mymetrics.incr('save_crash_dropped.count') def crashmover_save(self, crash_report): """Saves a crash to storage If this raises an error, then that bubbles up and the caller can figure out what to do with it and retry again later. """ crash_id = crash_report.crash_id dumps = crash_report.dumps raw_crash = crash_report.raw_crash # Capture total time it takes to save the crash with mymetrics.timer('crash_save.time'): # Save dumps to crashstorage self.crashstorage.save_dumps(crash_id, dumps) # Save the raw crash metadata to crashstorage self.crashstorage.save_raw_crash(crash_id, raw_crash) # Capture the total time it took for this crash to be handled from # being received from breakpad client to saving to s3. # # NOTE(willkg): time.time returns seconds, but .timing() wants # milliseconds, so we multiply! delta = (time.time() - raw_crash['timestamp']) * 1000 mymetrics.timing('crash_handling.time', value=delta) mymetrics.incr('save_crash.count') logger.info('%s saved', crash_id) def join_pool(self): """Joins the pool--use only in tests! This is helpful for forcing all the coroutines in the pool to complete so that we can verify outcomes in the test suite for work that might cross coroutines. 
""" self.crashmover_pool.join()
class _DownloadAgent(object):
    """Exclusively manages downloading files from Drive within another
    process.
    """
    # TODO(dustin): We'll have to use multiprocessing's logging wrappers.

    def __init__(self, request_q, stop_ev):
        self.__request_q = request_q
        self.__stop_ev = stop_ev
        self.__kill_ev = gevent.event.Event()
        self.__worker_pool = Pool(size=download_agent.NUM_WORKERS)
        self.__http_pool = HttpPool(download_agent.HTTP_POOL_SIZE)
        self.__http = GdriveAuth().get_authed_http()

    def download_worker(self, download_request, request_ev, download_stop_ev, ns):
        # TODO(dustin): We're just assuming that we can signal a multiprocessing event
        #               from a green thread (the event still has value switching
        #               through green threads.
        file_path = ('/tmp/gdrivefs/downloaded/%s' %
                     (download_request.typed_entry.entry_id))

        with open(file_path, 'wb') as f:
            downloader = ChunkedDownload(f, self.__http, download_request.url,
                                         chunksize=download_agent.CHUNK_SIZE)
            try:
                while 1:
                    # Stop downloading because the process is coming down.
                    if self.__kill_ev.is_set() is True:
                        raise DownloadAgentDownloadWorkerError(
                            "Download worker terminated.")

                    # Stop downloading this file, probably because all handles were
                    # closed.
                    if download_stop_ev.is_set() is True:
                        raise DownloadAgentDownloadWorkerError(
                            "Download worker was told to stop downloading.")

                    # TODO(dustin): We'll have to provide an option for "revision assurance" to ensure that we download the same revision of a file from chunk to chunk. Otherwise, we won't have the guarantee.
                    # TODO(dustin): Support reauthing, when necessary.
                    # TODO(dustin): Support resumability.
                    status, done = downloader.next_chunk()
                    ns.bytes_written = status.resumable_progress

                    if done is True:
                        break

                # TODO(dustin): Finish this, and make sure the timezone matches the current system.
                mtime_epoch = 0  # download_request.current_mtime_dt
                utime(file_path, (mtime_epoch, mtime_epoch))
            except Exception as e:
                error = ("[%s] %s" % (e.__class__.__name__, str(e)))
            else:
                error = None

        ns.error = error
        if error is None:
            ns.file_path = file_path

        request_ev.set()

    def loop(self):
        while self.__stop_ev.is_set() is False:
            try:
                request_info = self.__request_q.get(
                    timeout=download_agent.REQUEST_QUEUE_TIMEOUT_S)
            except Empty:
                continue

            if self.__worker_pool.free_count() == 0:
                logging.warn("It looks like we'll have to wait for a download "
                             "worker to free up.")

            self.__worker_pool.spawn(self.download_worker, *request_info)

        # The download loop has exited (we were told to stop).

        # Signal the workers to stop what they're doing.
        self.__kill_ev.set()

        start_epoch = time()
        all_exited = False
        while (time() - start_epoch) < download_agent.GRACEFUL_WORKER_EXIT_WAIT_S:
            if self.__worker_pool.size <= self.__worker_pool.free_count():
                all_exited = True
                break

        if all_exited is False:
            logging.error("Not all download workers exited in time: %d != %d" %
                          (self.__worker_pool.size,
                           self.__worker_pool.free_count()))

        # Kill and join the unassigned (and stubborn, still-assigned) workers.
        # TODO(dustin): We're assuming this is a hard kill that will always kill all workers.
        self.__worker_pool.kill()

        logging.info("Download agent is terminating. (%d) requested files "
                     "will be abandoned." % (self.__request_q.qsize()))
class Worker: def __init__(self, seeds, done_que, run_que): self.showpercounts = 10 self.timeout = 5 self.starttime = time.time() self.oldtime = 0 self.quit = 0 self.https_enable = 0 self.run_que = run_que self.done_que = done_que self.tasks = [] self.done = 1 self.errdone = set() self.err = Error() self.loadstate() self.blacklist = set (( '.blog.','.taobao.com','.baidu.com','.edu','.gov','.mil','mail','.google', 'weibo.com','t.cn','wikipedia','facebook','twitter','dropbox' )) self.allowdDomain = set(('com','net','org','cn','info','biz','me','name','cc','tv')) self.httpget = self.httpget_requests # down method self.httpget_requests | httpget_curl self.poolsize = 60 self.poolmaxfree = 20 self.freecount = 0 self.down_pool = Pool(size=self.poolsize) self.totalnettime = 0 self.cbcputime = 0 self.totaldownsize = 0 self.curspeed = 0 self.debugnosave = 1 self.tt = 1 self.done_sites_fname='done_sites.bin' try: self.bfdone = BloomFilter.open(self.done_sites_fname) except: self.bfdone = BloomFilter(2**23, 10**(-5), self.done_sites_fname) #8M if self.run_que.qsize() == 0: for seed in seeds: self.run_que.put( seed.split("http://")[1] ) if self.https_enable == 0: self.urlpatern = re.compile(r'href=["\']http://([^/?#\"\']+)',re.I) else: self.urlpatern = re.compile(r'href=["\']http[s]?://([^/?#\"\'"]+)',re.I) def cb_httpget(self, data = None): if not data: return seed, err, headers, content = data st = time.time() if err: self.handle_error(err,seed) return if self.https_enable == 0: seed = seed[7:] self.bfdone.add(seed) self.done += 1 data={'seed':seed,'headers':headers,'content':content} dat = cPickle.dumps(data) self.done_que.put(dat) et = time.time() self.cbcputime += (et-st) #self.tt=(et-st) if self.done % self.showpercounts == 0: self.out(seed) pass def out(self, seed): spendtime = time.time() - self.starttime spendtime = 1 if spendtime == 0 else spendtime nowh = str(int(spendtime)/3600)+":" if spendtime>3600 else "" now = "%s%02d:%02d" % (nowh, spendtime%3600/60, spendtime%60 ) print "%s D:%-4d R:%-7d [Speed: T%.2f/s C%3d/s A%.2f] CB:%0.4f Active:%d %s %s" % (now, (self.done), self.run_que.qsize(), \ (self.done)/(spendtime+self.oldtime), self.curspeed, self.tt, self.totalnettime / self.done ,self.poolsize-self.freecount, str(self.err), seed ) def work(self): while self.quit == 0: st = time.time() curdone = self.done self.freecount = self.down_pool.free_count() if self.freecount > self.poolmaxfree: self.tasks = [] minlen = min(self.freecount+1,self.run_que.qsize()) #if minlen <=0:break for i in range( minlen): stt = time.time() url = self.run_que.get() ett = time.time() if url in self.bfdone:# 5%-10% continue url = "http://"+url self.tasks.append(url) for url in self.tasks: self.down_pool.apply_async(self.httpget, (url,), callback=self.cb_httpget) time.sleep(0.1) et = time.time() self.curspeed = (self.done - curdone) / (et-st) #self.tt = (et-st) self.down_pool.join() print "All OVER" def handle_error(self,e,url): if e.find('DNSError') > 0 : self.err.dns += 1 self.err.rdns.append(url) elif e.find('reset') > 0 :#Connection reset self.err.reset += 1 self.err.rreset.append(url) elif e.find('Max retries') > 0 or e.find('Connection aborted'): # self.err.conntimeout += 1 self.err.rconntimeout.append(url) elif e.find('refused') > 0: #Connection refused self.err.refuse += 1 self.err.rrefuse.append(url) else: self.err.others +=1 self.err.rothers.append(url) print "Error", url, e # requests is better through test def httpget_requests(self, url): st = time.time() con = "" e = "" res_headers = "" headers = { 
'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.6', 'Accept-Encoding':'gzip,deflate', 'Connection':'close', 'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36' } res = None try: # todo: query the ip of the website before get through dns req = requests req.max_redirects = 1 res = req.get(url, timeout = (3,2), headers = headers ) if self.https_enable == 0 and res.url.lower().startswith('http:'): if 'content-type' not in res.headers.keys() or 'html' not in res.headers['content-type']: return None con = res.content res.close() except KeyboardInterrupt: raise except Exception as e: e = str(e) if res: res.close() return url,e,None,None et = time.time() self.totalnettime += (et-st) self.tt = (et-st) return url, e, res.headers, con def savestate(self): self.quit = 1 now = time.time() self.oldtime += (now - self.starttime) #should hold on the singal for procdata done with open('state.txt','wb') as f: f.write(str(self.oldtime) + '\n') # tasks run_queue done f.write(str(len(self.tasks)) + '\n') for t in self.tasks: f.write(t + '\n') l = self.run_que.qsize() f.write(str(l)+ '\n') while l > 0: f.write( self.run_que.pop() + '\n') l-=1 f.write(str((self.done)) + '\n') with open('err_records.pack','wb') as f: cPickle.dump(self.err,f,2) print time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), " Save state successfully." f.close() exit(0) def loadstate(self): try: with open('state.txt') as f: self.oldtime = float(f.readline()) tasks = int(f.readline()) for i in xrange(tasks): self.run_que.add(f.readline().rstrip('\n')) runnings = int(f.readline()) for i in xrange(runnings): self.run_que.add(f.readline().rstrip('\n')) self.done = int(f.readline()) with open('err_records.pack','rb') as f: self.err = cPickle.load(f) print time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), " Load state successfuly." except Exception as e: print e
class RequestEngine: class ProcessorManager(object): def __init__(self): self._processor_map = {'default': None} def set(self, processor_name, value): self._processor_map[processor_name] = value def route(self, processor_name, **kwargs): if processor_name is None: processor_name_indeed = 'default' else: processor_name_indeed = processor_name processor = self._processor_map[processor_name_indeed] if processor is None: pass elif hasattr(processor, '__call__'): return processor.__call__(**kwargs) def __init__(self, pool_size = 20, pop_interval = 1, request_interval = 0, max_empty_retry = 2, request_timeout = 10, each_size_from_queue = 10, max_failure_allowed = -1): from gevent import monkey monkey.patch_all() self.pop_interval = pop_interval self.request_interval = request_interval self.pool = Pool(pool_size) self.quit_event = Event() self.max_empty_retry = max_empty_retry self.request_timeout = request_timeout self.each_size_from_queue = each_size_from_queue self.user_agent_provider = UserAgentProvider() self.max_failure_allowed = max_failure_allowed self._request_failure = 0 self.proxy_provider = None self.processor_manager = RequestEngine.ProcessorManager() self.before_each = [] self.after_each = [] gevent.signal(signal.SIGINT, self.quit) gevent.signal(signal.SIGQUIT, self.quit) gevent.signal(signal.SIGTERM, self.quit) def setup_request_queue(self, request_queue_ins): self.request_queue = request_queue_ins @property def active(self): if not hasattr(self, '_active'): self._active = False return self._active @active.setter def active(self, value): self._active = value def before_each(self, *processors): self.before_each += processors def after_each(self, *processors): self.after_each += processors def worker_count(self): return self.pool.size - self.pool.free_count() def quit(self): self.quit_event.set() def request(self, override_req_args= {}): self.active = True empty_count = 0 while True: if self.quit_event.is_set(): logger.warning("Quiting Engine") if self.pool.size != self.pool.free_count(): time.sleep(1) continue self.active = False logger.warning("Engine Gracefully Quit") break if (self.max_failure_allowed != -1 and self._request_failure >= self.max_failure_allowed): logger.warning( "Exceed Max Failures Count. Engine Stopping ..." ) self.quit() continue if self.pool.free_count() > self.each_size_from_queue: this_time_size = self.each_size_from_queue else: this_time_size = self.pool.free_count() if this_time_size > 0: reqs = self.request_queue.pop(this_time_size) logger.info('Current free workers: '+str(self.pool.free_count())) if (reqs is not None) and (len(reqs) > 0): for i in reqs: self.pool.spawn(self._make_requests, request=i, override = override_req_args) time.sleep(self.request_interval) else: empty_count +=1 if (self.max_empty_retry != -1 and empty_count >= self.max_empty_retry): logger.warning( "Exceed Max Empty. Engine Stopping ..." 
) self.quit() continue #while self.pool.free_count() == 0: time.sleep(self.pop_interval) def setup_user_agent_provider(self, provider): self.user_agent_provider = provider def setup_proxy_provider(self, provider): self.proxy_provider = provider def register_processor(self, processor, name='default'): self.processor_manager.set(name, processor) def _make_requests(self, request, override): empty_count = 0 data= {} # Data flow is_failure_set = False request.kwargs.update(override) # Setting user agent if self.user_agent_provider: if 'headers' in request.kwargs: request.kwargs['headers'].update({'User-Agent': self.user_agent_provider.provide()}) else: request.kwargs['headers'] = {'User-Agent': self.user_agent_provider.provide()} # Setting proxy provider if self.proxy_provider: proxy = self.proxy_provider.provide() if proxy is not None: # If Provider return None, not use proxy _proxy = {'http':proxy.proxy, 'https':proxy.proxy} if 'proxies' in request.kwargs: request.kwargs['proxies'].update(_proxy) else: request.kwargs['proxies'] = _proxy logger.warning("Using Proxy: %s" % str(_proxy)) else: logger.warning("No Using Proxy") else: proxy = None ar = None result = False processors = {'before':None, 'after':None} if request.processors is not None: processors.update(request.processors) before_each_hook_result = None # Execute hook before every item try: logger.info("Executing before hook") before_each_hook_result = self.processor_manager.route( processor_name=processors['before'], request = request, extra = request.raw_info, data= data) for p in self.before_each: self.processor_manager.route(processor_name=p, request = request ,extra = request.raw_info, data= data) except: if not is_failure_set: self._request_failure += 1 is_failure_set = True logger.error("Exception while before hook execution: "+ traceback.format_exc()) # Execute request if before_each_hook_result != False: # Only if before hook return non-false try: logger.debug("Making request... (%s)" % str(request.kwargs)) _timeout = getattr(request.raw_info,'_timeout',self.request_timeout) logger.debug("Timeout setting: %s" % _timeout) with gevent.Timeout(_timeout): ar = requests.request(**request.kwargs) ar.raw_info = request.raw_info result = True # if result is False: # raise Exception("Request timeout (%s)" % self.request_timeout) except: if not is_failure_set: self._request_failure += 1 is_failure_set = True logger.error("Exception while requests execution: "+ traceback.format_exc()) try: # Execute hook after every request logger.info("Executing after hook") self.processor_manager.route( processor_name=processors['after'], response = ar, request = request, extra = request.raw_info, result = result, data=data) for p in self.after_each: self.processor_manager.route(processor_name=p,response = ar, request = request,extra = request.raw_info, result = result, data= data) # process proxy provider if proxy: self.proxy_provider.callback(proxy, result=result, response = ar, request=request) except: if not is_failure_set: self._request_failure += 1 is_failure_set = True logger.error("Exception while after hook execution", exc_info=True)
class BreakpadSubmitterResource(RequiredConfigMixin): """Handles incoming breakpad-style crash reports. This handles incoming HTTP POST requests containing breakpad-style crash reports in multipart/form-data format. It can handle compressed or uncompressed POST payloads. It parses the payload from the HTTP POST request, runs it through the throttler with the specified rules, generates a crash_id, returns the crash_id to the HTTP client, saves the crash using the configured crashstorage class, and publishes it using the configured crashpublish class. .. Note:: From when a crash comes in to when it's saved by the crashstorage class, the crash is entirely in memory. Keep that in mind when figuring out how to scale your Antenna nodes. The most important configuration bit here is choosing the crashstorage class. For example:: CRASHSTORAGE_CLASS=antenna.ext.s3.crashstorage.S3CrashStorage """ required_config = ConfigOptions() required_config.add_option( "dump_field", default="upload_file_minidump", doc="The name of the field in the POST data for dumps.", ) required_config.add_option( "dump_id_prefix", default="bp-", doc="The crash type prefix." ) required_config.add_option( "concurrent_crashmovers", default="2", parser=positive_int, doc=( "The number of crashes concurrently being saved and published. " "Each process gets this many concurrent crashmovers, so if you're " "running 5 processes on the node, then it's " "(5 * concurrent_crashmovers) sharing upload bandwidth." ), ) # crashstorage things required_config.add_option( "crashstorage_class", default="antenna.ext.crashstorage_base.NoOpCrashStorage", parser=parse_class, doc="The class in charge of storing crashes.", ) # crashpublish things required_config.add_option( "crashpublish_class", default="antenna.ext.crashpublish_base.NoOpCrashPublish", parser=parse_class, doc="The class in charge of publishing crashes.", ) def __init__(self, config): self.config = config.with_options(self) self.crashstorage = self.config("crashstorage_class")( config.with_namespace("crashstorage") ) self.crashpublish = self.config("crashpublish_class")( config.with_namespace("crashpublish") ) self.throttler = Throttler(config) # Gevent pool for crashmover workers self.crashmover_pool = Pool(size=self.config("concurrent_crashmovers")) # Queue for crashmover work self.crashmover_queue = deque() # Register hb functions with heartbeat manager register_for_heartbeat(self.hb_report_health_stats) register_for_heartbeat(self.hb_run_crashmover) # Register life function with heartbeat manager register_for_life(self.has_work_to_do) def get_runtime_config(self, namespace=None): """Return generator of runtime configuration.""" for item in super().get_runtime_config(): yield item for item in self.throttler.get_runtime_config(): yield item for item in self.crashstorage.get_runtime_config(["crashstorage"]): yield item for item in self.crashpublish.get_runtime_config(["crashpublish"]): yield item def check_health(self, state): """Return health state.""" if hasattr(self.crashstorage, "check_health"): self.crashstorage.check_health(state) if hasattr(self.crashpublish, "check_health"): self.crashpublish.check_health(state) def hb_report_health_stats(self): """Heartbeat function to report health stats.""" # The number of crash reports sitting in the work queue; this is a # direct measure of the health of this process--a number that's going # up means impending doom mymetrics.gauge("work_queue_size", value=len(self.crashmover_queue)) def has_work_to_do(self): """Return whether this still 
has work to do.""" work_to_do = len(self.crashmover_pool) + len(self.crashmover_queue) logger.info("work left to do: %s" % work_to_do) # Indicates whether or not we're sitting on crashes to save--this helps # keep Antenna alive until we're done saving crashes return bool(work_to_do) def extract_payload(self, req): """Parse HTTP POST payload. Decompresses the payload if necessary and then walks through the FieldStorage converting from multipart/form-data to Python datatypes. NOTE(willkg): The FieldStorage is poorly documented (in my opinion). It has a list attribute that is a list of FieldStorage items--one for each key/val in the form. For attached files, the FieldStorage will have a name, value and filename and the type should be ``application/octet-stream``. Thus we parse it looking for things of type ``text/plain``, ``application/json``, and application/octet-stream. :arg falcon.request.Request req: a Falcon Request instance :returns: (raw_crash dict, dumps dict) :raises MalformedCrashReport: """ # If we don't have a content type, raise MalformedCrashReport if not req.content_type: raise MalformedCrashReport("no_content_type") # If it's the wrong content type or there's no boundary section, raise # MalformedCrashReport content_type = [part.strip() for part in req.content_type.split(";", 1)] if ( len(content_type) != 2 or content_type[0] != "multipart/form-data" or not content_type[1].startswith("boundary=") ): if content_type[0] != "multipart/form-data": raise MalformedCrashReport("wrong_content_type") else: raise MalformedCrashReport("no_boundary") content_length = req.content_length or 0 # If there's no content, raise MalformedCrashReport if content_length == 0: raise MalformedCrashReport("no_content_length") # Decompress payload if it's compressed if req.env.get("HTTP_CONTENT_ENCODING") == "gzip": mymetrics.incr("gzipped_crash") # If the content is gzipped, we pull it out and decompress it. We # have to do that here because nginx doesn't have a good way to do # that in nginx-land. gzip_header = 16 + zlib.MAX_WBITS try: data = zlib.decompress(req.stream.read(content_length), gzip_header) except zlib.error: # This indicates this isn't a valid compressed stream. Given # that the HTTP request insists it is, we're just going to # assume it's junk and not try to process any further. raise MalformedCrashReport("bad_gzip") # Stomp on the content length to correct it because we've changed # the payload size by decompressing it. We save the original value # in case we need to debug something later on. req.env["ORIG_CONTENT_LENGTH"] = content_length content_length = len(data) req.env["CONTENT_LENGTH"] = str(content_length) data = io.BytesIO(data) mymetrics.histogram( "crash_size", value=content_length, tags=["payload:compressed"] ) else: # NOTE(willkg): At this point, req.stream is either a # falcon.request_helper.BoundedStream (in tests) or a # gunicorn.http.body.Body (in production). # # FieldStorage doesn't work with BoundedStream so we pluck out the # internal stream from that which works fine. # # FIXME(willkg): why don't tests work with BoundedStream? 
if isinstance(req.stream, BoundedStream): data = req.stream.stream else: data = req.stream mymetrics.histogram( "crash_size", value=content_length, tags=["payload:uncompressed"] ) # Stomp on querystring so we don't pull it in request_env = dict(req.env) request_env["QUERY_STRING"] = "" fs = cgi.FieldStorage(fp=data, environ=request_env, keep_blank_values=1) raw_crash = {} dumps = {} has_json = False has_kvpairs = False for fs_item in fs.list: # If the field has no name, then it's probably junk, so let's drop it. if not fs_item.name: continue if fs_item.name == "dump_checksums": # We don't want to pick up the dump_checksums from a raw # crash that was re-submitted. continue elif fs_item.type and fs_item.type.startswith("application/json"): # This is a JSON blob, so load it and override raw_crash with # it. has_json = True try: raw_crash = json.loads(fs_item.value) except json.decoder.JSONDecodeError: raise MalformedCrashReport("bad_json") elif fs_item.type and ( fs_item.type.startswith("application/octet-stream") or isinstance(fs_item.value, bytes) ): # This is a dump, so add it to dumps using a sanitized dump # name. dump_name = sanitize_dump_name(fs_item.name) dumps[dump_name] = fs_item.value else: # This isn't a dump, so it's a key/val pair, so we add that. has_kvpairs = True raw_crash[fs_item.name] = fs_item.value if not raw_crash: raise MalformedCrashReport("no_annotations") if has_json and has_kvpairs: # If the crash payload has both kvpairs and a JSON blob, then it's # malformed and we should dump it. raise MalformedCrashReport("has_json_and_kv") # Add a note about how the annotations were encoded in the crash report. # For now, there are two options: json and multipart. if has_json: raw_crash["payload"] = "json" else: raw_crash["payload"] = "multipart" return raw_crash, dumps def get_throttle_result(self, raw_crash): """Run raw_crash through throttler for a throttling result. :arg dict raw_crash: the raw crash to throttle :returns tuple: ``(result, rule_name, percentage)`` """ # At this stage, nothing has given us a throttle answer, so we # throttle the crash. result, rule_name, throttle_rate = self.throttler.throttle(raw_crash) # Save the results in the raw_crash itself raw_crash["legacy_processing"] = result raw_crash["throttle_rate"] = throttle_rate return result, rule_name, throttle_rate def cleanup_crash_report(self, raw_crash): """Remove anything from the crash report that shouldn't be there. This operates on the raw_crash in-place. This adds notes to ``collector_notes``. """ collector_notes = [] # Remove bad fields for bad_field in BAD_FIELDS: if bad_field in raw_crash: del raw_crash[bad_field] collector_notes.append("Removed %s from raw crash." % bad_field) raw_crash["collector_notes"] = collector_notes @mymetrics.timer_decorator("on_post.time") def on_post(self, req, resp): """Handle incoming HTTP POSTs. Note: This is executed by the WSGI app, so it and anything it does is covered by the Sentry middleware. """ resp.status = falcon.HTTP_200 start_time = time.time() # NOTE(willkg): This has to return text/plain since that's what the # breakpad clients expect. resp.content_type = "text/plain" try: raw_crash, dumps = self.extract_payload(req) except MalformedCrashReport as exc: # If this is malformed, then reject it with malformed error code. 
msg = str(exc) mymetrics.incr("malformed", tags=["reason:%s" % msg]) resp.status = falcon.HTTP_400 resp.body = "Discarded=malformed_%s" % msg return mymetrics.incr("incoming_crash") # Add timestamps current_timestamp = utc_now() raw_crash["submitted_timestamp"] = current_timestamp.isoformat() raw_crash["timestamp"] = start_time # Add checksums and MinidumpSha256Hash raw_crash["dump_checksums"] = { dump_name: hashlib.sha256(dump).hexdigest() for dump_name, dump in dumps.items() } raw_crash["MinidumpSha256Hash"] = raw_crash["dump_checksums"].get( "upload_file_minidump", "" ) # First throttle the crash which gives us the information we need # to generate a crash id. throttle_result, rule_name, percentage = self.get_throttle_result(raw_crash) # Use a uuid if they gave us one and it's valid--otherwise create a new # one. if "uuid" in raw_crash and validate_crash_id(raw_crash["uuid"]): crash_id = raw_crash["uuid"] logger.info("%s has existing crash_id", crash_id) else: crash_id = create_crash_id( timestamp=current_timestamp, throttle_result=throttle_result ) raw_crash["uuid"] = crash_id raw_crash["type_tag"] = self.config("dump_id_prefix").strip("-") # Log the throttle result logger.info( "%s: matched by %s; returned %s", crash_id, rule_name, RESULT_TO_TEXT[throttle_result], ) mymetrics.incr("throttle_rule", tags=["rule:%s" % rule_name]) mymetrics.incr( "throttle", tags=["result:%s" % RESULT_TO_TEXT[throttle_result].lower()] ) # If the result is REJECT, then discard it if throttle_result is REJECT: resp.body = "Discarded=rule_%s" % rule_name return # If the result is a FAKEACCEPT, then we return a crash id, but throw the crash # away if throttle_result is FAKEACCEPT: resp.body = "CrashID=%s%s\n" % (self.config("dump_id_prefix"), crash_id) return # If we're accepting the cash report, then clean it up, save it and return the # CrashID to the client self.cleanup_crash_report(raw_crash) crash_report = CrashReport(raw_crash, dumps, crash_id) crash_report.set_state(STATE_SAVE) self.crashmover_queue.append(crash_report) self.hb_run_crashmover() resp.body = "CrashID=%s%s\n" % (self.config("dump_id_prefix"), crash_id) def hb_run_crashmover(self): """Spawn a crashmover if there's work to do.""" # Spawn a new crashmover if there's stuff in the queue and we haven't # hit the limit of how many we can run if self.crashmover_queue and self.crashmover_pool.free_count() > 0: self.crashmover_pool.spawn(self.crashmover_process_queue) def crashmover_process_queue(self): """Process crashmover work. NOTE(willkg): This has to be super careful not to lose crash reports. If there's any kind of problem, this must return the crash report to the relevant queue. 
""" while self.crashmover_queue: crash_report = self.crashmover_queue.popleft() try: if crash_report.state == STATE_SAVE: # Save crash and then toss crash_id in the publish queue self.crashmover_save(crash_report) crash_report.set_state(STATE_PUBLISH) self.crashmover_queue.append(crash_report) elif crash_report.state == STATE_PUBLISH: # Publish crash and we're done self.crashmover_publish(crash_report) self.crashmover_finish(crash_report) except Exception: mymetrics.incr("%s_crash_exception.count" % crash_report.state) crash_report.errors += 1 logger.exception( "Exception when processing queue (%s), state: %s; error %d/%d", crash_report.crash_id, crash_report.state, crash_report.errors, MAX_ATTEMPTS, ) # After MAX_ATTEMPTS, we give up on this crash and move on if crash_report.errors < MAX_ATTEMPTS: self.crashmover_queue.append(crash_report) else: logger.error( "%s: too many errors trying to %s; dropped", crash_report.crash_id, crash_report.state, ) mymetrics.incr("%s_crash_dropped.count" % crash_report.state) def crashmover_finish(self, crash_report): """Finish bookkeeping on crash report.""" # Capture the total time it took for this crash to be handled from # being received from breakpad client to saving to s3. # # NOTE(willkg): time.time returns seconds, but .timing() wants # milliseconds, so we multiply! delta = (time.time() - crash_report.raw_crash["timestamp"]) * 1000 mymetrics.timing("crash_handling.time", value=delta) mymetrics.incr("save_crash.count") @mymetrics.timer("crash_save.time") def crashmover_save(self, crash_report): """Save crash report to storage.""" self.crashstorage.save_crash(crash_report) logger.info("%s saved", crash_report.crash_id) @mymetrics.timer("crash_publish.time") def crashmover_publish(self, crash_report): """Publish crash_id in publish queue.""" self.crashpublish.publish_crash(crash_report) logger.info("%s published", crash_report.crash_id) def join_pool(self): """Join the pool. NOTE(willkg): Only use this in tests! This is helpful for forcing all the coroutines in the pool to complete so that we can verify outcomes in the test suite for work that might cross coroutines. """ self.crashmover_pool.join()
class BreakpadSubmitterResource(RequiredConfigMixin): """Handles incoming breakpad crash reports and saves to crashstorage. This handles incoming HTTP POST requests containing breakpad-style crash reports in multipart/form-data format. It can handle compressed or uncompressed POST payloads. It parses the payload from the HTTP POST request, runs it through the throttler with the specified rules, generates a crash_id, returns the crash_id to the HTTP client and then saves the crash using the configured crashstorage class. .. Note:: From when a crash comes in to when it's saved by the crashstorage class, the crash is entirely in memory. Keep that in mind when figuring out how to scale your Antenna nodes. The most important configuration bit here is choosing the crashstorage class. For example:: CRASHSTORAGE_CLASS=antenna.ext.s3.crashstorage.S3CrashStorage """ required_config = ConfigOptions() required_config.add_option( 'dump_field', default='upload_file_minidump', doc='The name of the field in the POST data for dumps.' ) required_config.add_option( 'dump_id_prefix', default='bp-', doc='The crash type prefix.' ) required_config.add_option( 'concurrent_crashmovers', default='2', parser=positive_int, doc=( 'The number of crashes concurrently being saved and published. ' 'Each process gets this many concurrent crashmovers, so if you\'re ' 'running 5 processes on the node, then it\'s ' '(5 * concurrent_crashmovers) sharing upload bandwidth.' ) ) # crashstorage things required_config.add_option( 'crashstorage_class', default='antenna.ext.crashstorage_base.NoOpCrashStorage', parser=parse_class, doc='The class in charge of storing crashes.' ) # crashpublish things required_config.add_option( 'crashpublish_class', default='antenna.ext.crashpublish_base.NoOpCrashPublish', parser=parse_class, doc='The class in charge of publishing crashes.' 
) def __init__(self, config): self.config = config.with_options(self) self.crashstorage = self.config('crashstorage_class')(config.with_namespace('crashstorage')) self.crashpublish = self.config('crashpublish_class')(config.with_namespace('crashpublish')) self.throttler = Throttler(config) # Gevent pool for crashmover workers self.crashmover_pool = Pool(size=self.config('concurrent_crashmovers')) # Queue for crashmover work self.crashmover_queue = deque() # Register hb functions with heartbeat manager register_for_heartbeat(self.hb_report_health_stats) register_for_heartbeat(self.hb_run_crashmover) # Register life function with heartbeat manager register_for_life(self.has_work_to_do) def get_runtime_config(self, namespace=None): """Return generator of runtime configuration.""" for item in super().get_runtime_config(): yield item for item in self.throttler.get_runtime_config(): yield item for item in self.crashstorage.get_runtime_config(['crashstorage']): yield item for item in self.crashpublish.get_runtime_config(['crashpublish']): yield item def check_health(self, state): """Return health state.""" if hasattr(self.crashstorage, 'check_health'): self.crashstorage.check_health(state) if hasattr(self.crashpublish, 'check_health'): self.crashpublish.check_health(state) def hb_report_health_stats(self): """Heartbeat function to report health stats.""" # The number of crash reports sitting in the work queue; this is a # direct measure of the health of this process--a number that's going # up means impending doom mymetrics.gauge('work_queue_size', value=len(self.crashmover_queue)) def has_work_to_do(self): """Return whether this still has work to do.""" work_to_do = ( len(self.crashmover_pool) + len(self.crashmover_queue) ) logger.info('work left to do: %s' % work_to_do) # Indicates whether or not we're sitting on crashes to save--this helps # keep Antenna alive until we're done saving crashes return bool(work_to_do) def extract_payload(self, req): """Parse HTTP POST payload. Decompresses the payload if necessary and then walks through the FieldStorage converting from multipart/form-data to Python datatypes. NOTE(willkg): The FieldStorage is poorly documented (in my opinion). It has a list attribute that is a list of FieldStorage items--one for each key/val in the form. For attached files, the FieldStorage will have a name, value and filename and the type should be application/octet-stream. Thus we parse it looking for things of type text/plain and application/octet-stream. 
:arg falcon.request.Request req: a Falcon Request instance :returns: (raw_crash dict, dumps dict) """ # If we don't have a content type, return an empty crash if not req.content_type: mymetrics.incr('malformed', tags=['reason:no_content_type']) return {}, {} # If it's the wrong content type or there's no boundary section, return # an empty crash content_type = [part.strip() for part in req.content_type.split(';', 1)] if ((len(content_type) != 2 or content_type[0] != 'multipart/form-data' or not content_type[1].startswith('boundary='))): if content_type[0] != 'multipart/form-data': mymetrics.incr('malformed', tags=['reason:wrong_content_type']) else: mymetrics.incr('malformed', tags=['reason:no_boundary']) return {}, {} content_length = req.content_length or 0 # If there's no content, return an empty crash if content_length == 0: mymetrics.incr('malformed', tags=['reason:no_content_length']) return {}, {} # Decompress payload if it's compressed if req.env.get('HTTP_CONTENT_ENCODING') == 'gzip': mymetrics.incr('gzipped_crash') # If the content is gzipped, we pull it out and decompress it. We # have to do that here because nginx doesn't have a good way to do # that in nginx-land. gzip_header = 16 + zlib.MAX_WBITS try: data = zlib.decompress(req.stream.read(content_length), gzip_header) except zlib.error: # This indicates this isn't a valid compressed stream. Given # that the HTTP request insists it is, we're just going to # assume it's junk and not try to process any further. mymetrics.incr('malformed', tags=['reason:bad_gzip']) return {}, {} # Stomp on the content length to correct it because we've changed # the payload size by decompressing it. We save the original value # in case we need to debug something later on. req.env['ORIG_CONTENT_LENGTH'] = content_length content_length = len(data) req.env['CONTENT_LENGTH'] = str(content_length) data = io.BytesIO(data) mymetrics.histogram('crash_size', value=content_length, tags=['payload:compressed']) else: # NOTE(willkg): At this point, req.stream is either a # falcon.request_helper.BoundedStream (in tests) or a # gunicorn.http.body.Body (in production). # # FieldStorage doesn't work with BoundedStream so we pluck out the # internal stream from that which works fine. # # FIXME(willkg): why don't tests work with BoundedStream? if isinstance(req.stream, BoundedStream): data = req.stream.stream else: data = req.stream mymetrics.histogram('crash_size', value=content_length, tags=['payload:uncompressed']) fs = cgi.FieldStorage(fp=data, environ=req.env, keep_blank_values=1) # NOTE(willkg): In the original collector, this returned request # querystring data as well as request body data, but we're not doing # that because the query string just duplicates data in the payload. raw_crash = {} dumps = {} has_json = False has_kvpairs = False for fs_item in fs.list: # NOTE(willkg): We saw some crashes come in where the raw crash ends up with # a None as a key. Make sure we can't end up with non-strings as keys. item_name = fs_item.name or '' if item_name == 'dump_checksums': # We don't want to pick up the dump_checksums from a raw # crash that was re-submitted. continue elif fs_item.type and fs_item.type.startswith('application/json'): # This is a JSON blob, so load it and override raw_crash with # it. has_json = True raw_crash = json.loads(fs_item.value) elif fs_item.type and (fs_item.type.startswith('application/octet-stream') or isinstance(fs_item.value, bytes)): # This is a dump, so add it to dumps using a sanitized dump # name. 
dump_name = sanitize_dump_name(item_name) dumps[dump_name] = fs_item.value else: # This isn't a dump, so it's a key/val pair, so we add that. has_kvpairs = True raw_crash[item_name] = fs_item.value if has_json and has_kvpairs: # If the crash payload has both kvpairs and a JSON blob, then it's # malformed and we should dump it. mymetrics.incr('malformed', tags=['reason:has_json_and_kv']) return {}, {} return raw_crash, dumps def get_throttle_result(self, raw_crash): """Run raw_crash through throttler for a throttling result. :arg dict raw_crash: the raw crash to throttle :returns tuple: ``(result, rule_name, percentage)`` """ # At this stage, nothing has given us a throttle answer, so we # throttle the crash. result, rule_name, throttle_rate = self.throttler.throttle(raw_crash) # Save the results in the raw_crash itself raw_crash['legacy_processing'] = result raw_crash['throttle_rate'] = throttle_rate return result, rule_name, throttle_rate @mymetrics.timer_decorator('on_post.time') def on_post(self, req, resp): """Handle incoming HTTP POSTs. Note: This is executed by the WSGI app, so it and anything it does is covered by the Sentry middleware. """ resp.status = falcon.HTTP_200 start_time = time.time() # NOTE(willkg): This has to return text/plain since that's what the # breakpad clients expect. resp.content_type = 'text/plain' raw_crash, dumps = self.extract_payload(req) # If we didn't get any crash data, then just drop it and move on--don't # count this as an incoming crash and don't do any more work on it if not raw_crash: resp.body = 'Discarded=1' return mymetrics.incr('incoming_crash') # Add timestamps current_timestamp = utc_now() raw_crash['submitted_timestamp'] = current_timestamp.isoformat() raw_crash['timestamp'] = start_time # Add checksums and MinidumpSha256Hash raw_crash['dump_checksums'] = { dump_name: hashlib.sha256(dump).hexdigest() for dump_name, dump in dumps.items() } raw_crash['MinidumpSha256Hash'] = raw_crash['dump_checksums'].get('upload_file_minidump', '') # First throttle the crash which gives us the information we need # to generate a crash id. throttle_result, rule_name, percentage = self.get_throttle_result(raw_crash) # Use a uuid if they gave us one and it's valid--otherwise create a new # one. 
if 'uuid' in raw_crash and validate_crash_id(raw_crash['uuid']): crash_id = raw_crash['uuid'] logger.info('%s has existing crash_id', crash_id) else: crash_id = create_crash_id( timestamp=current_timestamp, throttle_result=throttle_result ) raw_crash['uuid'] = crash_id raw_crash['type_tag'] = self.config('dump_id_prefix').strip('-') # Log the throttle result logger.info('%s: matched by %s; returned %s', crash_id, rule_name, RESULT_TO_TEXT[throttle_result]) mymetrics.incr('throttle_rule', tags=['rule:%s' % rule_name]) mymetrics.incr('throttle', tags=['result:%s' % RESULT_TO_TEXT[throttle_result].lower()]) if throttle_result is REJECT: # If the result is REJECT, then discard it resp.body = 'Discarded=1' elif throttle_result is FAKEACCEPT: # If the result is a FAKEACCEPT, then we return a crash id, but throw # the crash away resp.body = 'CrashID=%s%s\n' % (self.config('dump_id_prefix'), crash_id) else: # If the result is not REJECT, then save it and return the CrashID to # the client crash_report = CrashReport(raw_crash, dumps, crash_id) crash_report.set_state(STATE_SAVE) self.crashmover_queue.append(crash_report) self.hb_run_crashmover() resp.body = 'CrashID=%s%s\n' % (self.config('dump_id_prefix'), crash_id) def hb_run_crashmover(self): """Spawn a crashmover if there's work to do.""" # Spawn a new crashmover if there's stuff in the queue and we haven't # hit the limit of how many we can run if self.crashmover_queue and self.crashmover_pool.free_count() > 0: self.crashmover_pool.spawn(self.crashmover_process_queue) def crashmover_process_queue(self): """Process crashmover work. NOTE(willkg): This has to be super careful not to lose crash reports. If there's any kind of problem, this must return the crash report to the relevant queue. """ while self.crashmover_queue: crash_report = self.crashmover_queue.popleft() try: if crash_report.state == STATE_SAVE: # Save crash and then toss crash_id in the publish queue self.crashmover_save(crash_report) crash_report.set_state(STATE_PUBLISH) self.crashmover_queue.append(crash_report) elif crash_report.state == STATE_PUBLISH: # Publish crash and we're done self.crashmover_publish(crash_report) self.crashmover_finish(crash_report) except Exception: mymetrics.incr('%s_crash_exception.count' % crash_report.state) crash_report.errors += 1 logger.exception( 'Exception when processing queue (%s), state: %s; error %d/%d', crash_report.crash_id, crash_report.state, crash_report.errors, MAX_ATTEMPTS ) # After MAX_ATTEMPTS, we give up on this crash and move on if crash_report.errors < MAX_ATTEMPTS: self.crashmover_queue.append(crash_report) else: logger.error( '%s: too many errors trying to %s; dropped', crash_report.crash_id, crash_report.state ) mymetrics.incr('%s_crash_dropped.count' % crash_report.state) def crashmover_finish(self, crash_report): """Finish bookkeeping on crash report.""" # Capture the total time it took for this crash to be handled from # being received from breakpad client to saving to s3. # # NOTE(willkg): time.time returns seconds, but .timing() wants # milliseconds, so we multiply! 
delta = (time.time() - crash_report.raw_crash['timestamp']) * 1000 mymetrics.timing('crash_handling.time', value=delta) mymetrics.incr('save_crash.count') @mymetrics.timer('crash_save.time') def crashmover_save(self, crash_report): """Save crash report to storage.""" self.crashstorage.save_crash(crash_report) logger.info('%s saved', crash_report.crash_id) @mymetrics.timer('crash_publish.time') def crashmover_publish(self, crash_report): """Publish crash_id in publish queue.""" self.crashpublish.publish_crash(crash_report) logger.info('%s published', crash_report.crash_id) def join_pool(self): """Join the pool. NOTE(willkg): Only use this in tests! This is helpful for forcing all the coroutines in the pool to complete so that we can verify outcomes in the test suite for work that might cross coroutines. """ self.crashmover_pool.join()
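The crashmover pattern above reduces to a deque of pending work plus a bounded gevent Pool, where hb_run_crashmover() spawns a drain worker only while free_count() is positive. A minimal, self-contained sketch of that gate (hypothetical names, not the Antenna code itself):

from collections import deque

import gevent
from gevent.pool import Pool

work_queue = deque()        # hypothetical queue of pending crash-like items
worker_pool = Pool(size=2)  # hypothetical concurrent_crashmovers limit


def process_queue():
    # Each worker keeps draining the shared queue until it is empty.
    while work_queue:
        item = work_queue.popleft()
        gevent.sleep(0)     # stand-in for the real save/publish work
        print("processed %s" % item)


def run_workers_if_needed():
    # Same gate as hb_run_crashmover(): spawn only while there is both
    # queued work and a free slot in the pool.
    if work_queue and worker_pool.free_count() > 0:
        worker_pool.spawn(process_queue)


for i in range(10):
    work_queue.append(i)
run_workers_if_needed()
worker_pool.join()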
class Worker(threading.Thread): """ 工作线程 """ def __init__(self, workers, thread_name, greents_num, func, workload): self.__workers = workers self.__busy = False self.__pool = Pool(greents_num + 1) self.greents_num = greents_num self.thread_name = thread_name self.__func = func self.workload = workload threading.Thread.__init__(self, None, None, self.thread_name, (), {}) logger.info("%s init complete" % self.thread_name) def task_entrance(self, task): try: with gevent.Timeout(self.workload.timeout): self.__func(task) except gevent.Timeout: self.workload.complete_workload(task, '52', 'NULL') logger.info('>>>>>>>>>>>>>> task timeout!' + str(task)) def dojudge(self): r = os.popen('free -am').readlines()[1].split(' ')[-1].strip() if int(r) < 500: gc.collect() return False return True def run(self): self.__busy = True while self.__busy: # 没任务会阻塞的,不用自己线程自己sleep ... task = self.workload.assign_workload() logger.info( 'workload assign task pool size: {0} free count: {1}'.format( self.__pool.size, self.__pool.free_count())) if self.__pool.free_count() < 2: logger.warn( '[Exception MJOPObserver,type=ex78000,uid=,csuid=,qid={ts},ts={ts},ip={ip},' 'refer_id=,cur_id=spider_slave,debug=任务堆积-空闲池:{free}/{size}-等待任务:{count}]' .format(ts=int(time.time() * 1000), ip=local_ip, size=self.__pool.size, free=self.__pool.free_count(), count=self.workload.tasks.qsize())) self.__pool.spawn(self.task_entrance, task) self.__busy = False logger.info("%s stop" % self.thread_name) def is_busy(self): return self.__busy def stop(self): self.__busy = False time.sleep(0.5)
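The saturation check in Worker.run(), which logs a warning when free_count() drops below a low-water mark before spawning the next task, reduces to the following sketch (the logger name and task payloads are hypothetical):

import logging

import gevent
from gevent.pool import Pool

logging.basicConfig(level=logging.INFO)
log = logging.getLogger("worker")


def handle(task):
    gevent.sleep(0.1)   # stand-in for the real task function


pool = Pool(5)
for task in range(20):
    if pool.free_count() < 2:
        # Same idea as the ex78000 warning above: the pool is nearly full,
        # so tasks are starting to queue up behind it.
        log.warning("task backlog: %d/%d slots free", pool.free_count(), pool.size)
    pool.spawn(handle, task)
pool.join()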
#!/usr/bin/python
import time
import random

import gevent
from gevent import Greenlet
from gevent.pool import Pool


def thrFunc(n):
    print "sleep %d seconds start.\n" % n
    gevent.sleep(n)
    print "sleep %d seconds end.\n" % n


threadPool = Pool(size=3)

while True:
    sec = random.randint(3, 6)
    #gThr = Greenlet(thrFunc, sec)
    #gThr.start()
    #gThr.join()
    print "+++free:", threadPool.free_count()
    threadPool.spawn(thrFunc, sec)
    #threadPool.apply_async(thrFunc, sec)
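Because spawn() blocks once the pool of three greenlets is full, the loop above never over-subscribes. A small variant (gevent only, same assumptions) makes that wait explicit with free_count() and wait_available():

import random

import gevent
from gevent.pool import Pool


def thr_func(n):
    gevent.sleep(n)


pool = Pool(size=3)
for _ in range(10):
    if pool.free_count() == 0:
        # Block until at least one of the three slots frees up.
        pool.wait_available()
    pool.spawn(thr_func, random.randint(1, 3))
pool.join()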
class Worker: def __init__(self, seeds, connque): self.showpercounts = 10 self.timeout = 5 self.starttime = time.time() self.oldtime = 0 self.quit = 0 self.https_enable = 0 self.run_queue = multiprocessing.Queue() self.connque = connque self.tasks = [] self.done = 1 self.errdone = set() self.err = Error() self.loadstate() #self.whitelist = ['html','htm','php','shtml','asp','jsp','do','action','aspx'] self.blacklist = set( ('.blog.', '.taobao.com', '.baidu.com', '.edu', '.gov', '.mil', 'mail', '.google', 'weibo.com', 't.cn', 'wikipedia', 'facebook', 'twitter', 'dropbox')) self.allowdDomain = set(('com', 'net', 'org', 'cn', 'info', 'biz', 'me', 'name', 'cc', 'tv')) self.httpget = self.httpget_requests # down method self.httpget_requests | httpget_curl self.poolsize = 200 self.poolmaxfree = 40 self.freecount = 0 self.down_pool = Pool(size=self.poolsize) self.totalnettime = 0 self.cbcputime = 0 self.totaldownsize = 0 self.curspeed = 0 self.debugnosave = 1 self.tt = 1 try: self.bfdone = BloomFilter.open('done_sites.bin') except: self.bfdone = BloomFilter(2**23, 10**(-5), 'done_sites.bin') if self.run_queue.qsize() == 0: for seed in seeds: self.run_queue.put(seed.split("http://")[1]) if self.https_enable == 0: self.urlpatern = re.compile(r'href=["\']http://([^/?#\"\']+)', re.I) else: self.urlpatern = re.compile(r'href=["\']http[s]?://([^/?#\"\'"]+)', re.I) def debug_filter(self, urls): #return filter(lambda url: ".fang.com" in url , urls) return urls def cb_httpget(self, data=None): if not data: return seed, err, headers, html = data st = time.time() if err: self.handle_error(err, seed) return #http:// if self.https_enable == 0: seed = seed[7:] self.bfdone.add(seed) self.done += 1 self.connque.put((seed, headers, html)) et = time.time() self.cbcputime += (et - st) if self.done % self.showpercounts == 0: self.out(seed) def out(self, seed): spendtime = time.time() - self.starttime spendtime = 1 if spendtime == 0 else spendtime nowh = str(int(spendtime) / 3600) + ":" if spendtime > 3600 else "" now = "%s%02d:%02d" % (nowh, spendtime % 3600 / 60, spendtime % 60) print "%s D:%-4d R:%-7d [Speed: T%.2f/s C%.2f/s A%.3f] CB:%0.4f Active:%d %s %s" % (now, (self.done), self.run_queue.qsize(), \ (self.done)/(spendtime+self.oldtime), self.curspeed, self.tt, self.totalnettime / spendtime ,self.poolsize-self.freecount, str(self.err), seed ) def work(self): while self.quit == 0: st = time.time() curdone = self.done self.freecount = self.down_pool.free_count() if self.freecount > self.poolmaxfree: self.tasks = [] minlen = min(self.freecount, self.run_queue.qsize()) #if minlen <=0:break stt = time.time() for i in range(minlen): url = self.run_queue.get() if url in self.bfdone: # 5%-10% continue #self.tt = time.time() - stt # may need add a byte to the url to figure out the https url = "http://" + url self.tasks.append(url) self.down_pool.apply_async(self.httpget, (url, ), callback=self.cb_httpget) time.sleep(0.5) et = time.time() self.curspeed = (self.done - curdone) / (et - st) self.down_pool.join() print "All OVER" def handle_error(self, e, url): if e.find('DNSError') > 0: self.err.dns += 1 self.err.rdns.append(url) elif e.find('reset') > 0: #Connection reset self.err.reset += 1 self.err.rreset.append(url) elif e.find('Max retries') > 0: # self.err.conntimeout += 1 self.err.rconntimeout.append(url) elif e.find('refused') > 0: #Connection refused self.err.refuse += 1 self.err.rrefuse.append(url) else: self.err.others += 1 self.err.rothers.append(url) print "Error", url, e # requests is better through test def 
httpget_requests(self, url): st = time.time() con = "" e = "" res_headers = "" headers = { 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.6', #'Accept':'text/html' 'Connection': 'close' } res = None try: # todo: query the ip of the website before get through dns req = requests req.max_redirects = 1 res = req.get(url, timeout=(3, 3), headers=headers) if self.https_enable == 0 and "https" not in res.url: if 'html' not in res.headers['content-type']: return None con = res.content #res.close() except KeyboardInterrupt: raise except Exception as e: e = str(e) if res: res.close() return None et = time.time() self.totalnettime += (et - st) return url, e, res.headers, con def httpget_curl(self, url): con = "" buffer = StringIO() c = pycurl.Curl() c.setopt(pycurl.URL, url) c.setopt(pycurl.MAXCONNECTS, 2) c.setopt(pycurl.CONNECTTIMEOUT, 3) c.setopt(pycurl.TIMEOUT, 5) c.setopt(pycurl.WRITEFUNCTION, buffer.write) c.perform() c.close() con = buffer.getvalue() return con def filter_urls(self, seed, urls): nurls = [] seeditem = seed.lower().split('.') seedlen = len(seeditem) maindomain = 1 if seeditem[0] == 'www' else 0 urls = {}.fromkeys(urls).keys() for url in urls: #url = url.split('/',1)[0].split('#',1)[0].split('?',1)[0].lower() url = url.lower() #filter Domain , only allowd for china suf = 0 urlitem = url.split('.') nlen = len(urlitem) if nlen < 2: continue tld = urlitem[-1] if tld in self.allowdDomain: if urlitem[-2] in self.allowdDomain: if nlen <= 4: suf = 2 else: if nlen <= 3: suf = 1 if suf >= 1: # blacklist verify block = 0 for b in self.blacklist: if url.find(b) >= 0: block = 1 continue if block == 0: if nlen != seedlen: nurls.append(url) else: if maindomain or urlitem[-(suf + 1)] != seeditem[-(suf + 1)]: nurls.append(url) #print seed, nurls return {}.fromkeys(nurls).keys() def geturls(self, seed, html): if not html or len(html) == 0: return [] urls = re.findall(self.urlpatern, html) st = time.time() urls = self.filter_urls(seed, urls) et = time.time() return urls def savestate(self): self.quit = 1 now = time.time() self.oldtime += (now - self.starttime) #should hold on the singal for procdata done with open('state.txt', 'wb') as f: f.write(str(self.oldtime) + '\n') # tasks run_queue done f.write(str(len(self.tasks)) + '\n') for t in self.tasks: f.write(t + '\n') l = self.run_queue.qsize() f.write(str(l) + '\n') while l > 0: f.write(self.run_queue.pop() + '\n') l -= 1 f.write(str((self.done)) + '\n') with open('err_records.pack', 'wb') as f: cPickle.dump(self.err, f, 2) print time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), " Save state successfully." f.close() exit(0) def loadstate(self): try: with open('state.txt') as f: self.oldtime = float(f.readline()) tasks = int(f.readline()) for i in xrange(tasks): self.run_queue.add(f.readline().rstrip('\n')) runnings = int(f.readline()) for i in xrange(runnings): self.run_queue.add(f.readline().rstrip('\n')) self.done = int(f.readline()) with open('err_records.pack', 'rb') as f: self.err = cPickle.load(f) print time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), " Load state successfuly." except Exception as e: print e
class SphinxService: def __init__(self, redis_server, sphinx_server, part, workers): ''' Inicializa el servidor, creando el pool de conexiones a Sphinx y las conexiones a Redis ''' # configuraciones self.redis_server = redis_server self.sphinx_server = sphinx_server self.part = chr(part) self.version = WORKER_VERSION self.workers_pool_size = self.sphinx_pool_size = self.redis_pool_size = workers self.lock_expiration = LOCK_EXPIRATION self.index_name = INDEX_NAME+str(part) self.default_order = DEFAULT_ORDER self.default_order_key = DEFAULT_ORDER_KEY self.default_group_order = DEFAULT_GROUP_ORDER self.default_weight = DEFAULT_WEIGHT self.default_ranking = DEFAULT_RANKING self.default_field_weights = DEFAULT_FIELD_WEIGHTS self.default_max_query_time = DEFAULT_MAX_QUERY_TIME self.max_max_query_time = MAX_MAX_QUERY_TIME # pool de gevent self.gevent_pool = Pool(self.workers_pool_size) # pool conexiones sphinx self.sphinx_conns = SphinxPool(self.sphinx_pool_size, self.sphinx_server, self.max_max_query_time, SPHINX_SOCKET_TIMEOUT) # conexion a redis normal self.redis_conns = RedisPool(self.redis_pool_size, self.redis_server, self.version, REDIS_TIMEOUT) # inicializa variables de control self.last_reindex = -1. self.stop = False self.pubsub_used = True def update_last_reindex(self): ''' Averigua cuando se realizó la última reindexación de este servidor. ''' with self.redis_conns.get() as redisc: previous = self.last_reindex self.last_reindex = float(redisc.get(CONTROL_KEY+"lr_%d"%ord(self.part)) or -1) redisc.used = True print "["+datetime.now().isoformat(" ")+"]", "Last reindex date updated: %.2f (%.2f)."%(self.last_reindex, previous) def update_blocked_sources(self): ''' Obtiene lista de origenes bloqueados. ''' with self.redis_conns.get() as redisc: self.blocked_sources = parse_data(redisc.get(CONTROL_KEY+"bs") or "\x90") redisc.used = True print "["+datetime.now().isoformat(" ")+"]", "Blocked sources updated." def keepalive_pubsub(self, timeout): ''' Mantiene viva la conexion pubsub si no llegan mensajes. ''' while not self.stop: # espera un rato sleep(timeout) # comprueba que la conexion se haya utilizado o hace un ping if self.pubsub_used: self.pubsub_used = False else: with self.redis_conns.get() as redisc: redisc.publish(RESULTS_CHANNEL, "pn") redisc.publish(CONTROL_CHANNEL+self.part, "pn") redisc.used = True def stop_server(self): print "["+datetime.now().isoformat(" ")+"]", "Stop command received." # deja de atender peticiones self.stop = True self.redis_pubsub.close() self.redis_pubsub.connection_pool.disconnect() def serve_forever(self): ''' Recibe y procesa peticiones de busqueda. 
''' print "\n\n["+datetime.now().isoformat(" ")+"]", "Server started: %s, %d, %s, %d, %d"%(repr(self.redis_server), self.version, repr(self.sphinx_server), ord(self.part), self.workers_pool_size) # Inicializa intervalo de reintento en la conexion retry = 1 while not self.stop: try: # actualiza variables globales self.update_last_reindex() self.update_blocked_sources() # conexion a redis para pubsub self.redis_pubsub = redis.StrictRedis(host=self.redis_server[0], port=self.redis_server[1], db=self.version).pubsub() self.redis_pubsub.subscribe(EXECUTE_CHANNEL) self.redis_pubsub.subscribe(EXECUTE_CHANNEL+self.part) self.redis_pubsub.subscribe(CONTROL_CHANNEL+self.part) # Reinicia intervalo de reintento en la conexion retry = 1 # inicia el proceso de keepalive de la conexion pubsub self.gevent_pool.spawn(self.keepalive_pubsub, REDIS_TIMEOUT/5) # espera mensajes for msg in self.redis_pubsub.listen(): # marca que se ha usado la conexion self.pubsub_used = True # ignora los mensajes que no son mensajes if msg["type"]!="message": continue # extrae informacion del mensaje channel, part = msg["channel"][0], msg["channel"][1:] data = msg["data"] if channel==EXECUTE_CHANNEL: # busqueda # comprueba si es una busqueda general o es para este servidor request_id, info = parse_data(data) # procesa la peticion if request_id[0]==QUERY_KEY: self.gevent_pool.spawn(self.process_search_request, request_id, info) elif request_id[0]==LOCATION_KEY: self.gevent_pool.spawn(self.process_get_id_server_request, request_id, info) elif channel==CONTROL_CHANNEL: # control if data == "lr": # actualiza fecha de reindexado self.gevent_pool.spawn(self.update_last_reindex) elif data == "bs": # actualiza lista de origenes bloqueados self.gevent_pool.spawn(self.update_blocked_sources) elif data == "pn": # ping del keepalive pass elif channel==UPDATE_CHANNEL: # actualizaciones pass except redis.ConnectionError as e: if self.stop: break else: # Espera y elimina procesos pendientes self.gevent_pool.join(timeout=2) self.gevent_pool.kill(timeout=1) print "["+datetime.now().isoformat(" ")+"]", "Server connection error %s:'%s'. Will reconnect in %d seconds." % (repr(e), e.message, retry) # Espera tiempo de reintento e incrementa tiempo de reintento para la próxima vez (hasta 64 segundos) sleep(retry) if retry < 64: retry *= 2 except BaseException as e: if self.stop: break else: print "["+datetime.now().isoformat(" ")+"]", "Server stopped with error %s:'%s'."%(repr(e), e.message) logging.exception("Error on main loop on service %d."%ord(self.part)) return # espera los procesos que esten respondiendo self.gevent_pool.join(2) # si alguno no acabado en 2 segundos, lo mata self.gevent_pool.kill(timeout=1) print "["+datetime.now().isoformat(" ")+"]", "Server stopped normally." 
def process_get_id_server_request(self, request_id, info): try: # extrae parametros de la llamada bin_file_id = request_id[1:] query = info.decode("utf-8") # obtiene el cliente de redis with self.redis_conns.get() as redisc: # bloquea acceso si hace falta procesar esta peticion (nadie la esta haciendo o ha hecho ya) start_time = time() if redisc.hsetnx(request_id, self.part, "P"): try: block_time = time() with self.sphinx_conns.get() as sphinx: # busca el registro con el id pedido uri1, uri2, uri3 = FULL_ID_STRUCT.unpack(bin_file_id) sphinx.SetMaxQueryTime(MAX_MAX_QUERY_TIME) sphinx.SetFilter('uri1', [uri1]) sphinx.SetFilter('uri2', [uri2]) sphinx.SetFilter('uri3', [uri3]) sphinx.SetLimits(0,1,1,1) sphinx.SetIDRange(PART_ID_STRUCT.unpack(bin_file_id[:5]+"\x00\x00\x00")[0], PART_ID_STRUCT.unpack(bin_file_id[:5]+"\xFF\xFF\xFF")[0]) results = sphinx.Query(query, self.index_name, "d_id "+str(bin_file_id[:3].encode("hex"))) search_time = time() # comprueba resultados obtenidos has_it = results and "matches" in results and results["matches"] if has_it: redisc.pipeline().hset(request_id, self.part, "H").publish(RESULTS_CHANNEL, format_data((request_id, self.part, self.part))).execute() else: redisc.pipeline().hset(request_id, self.part, "N").publish(RESULTS_CHANNEL, format_data((request_id, self.part, None))).execute() end_time = time() print "["+datetime.fromtimestamp(start_time).isoformat(" ")+"]", self.gevent_pool.free_count(), ("*" if has_it else " ")+bin_file_id.encode("hex"), " %.2f (%.4f %.4f %.4f)"%(end_time-start_time, block_time-start_time, search_time-block_time, end_time-search_time), repr(query) except BaseException as e: redisc.hdel(request_id, self.part) print "["+datetime.now().isoformat(" ")+"] ERROR", self.gevent_pool.free_count(), "process_get_id_server_request inner", repr(e), e.message logging.exception("Error on searching for id %s on service %d."%(bin_file_id.encode("hex"), ord(self.part))) redisc.used = True except BaseException as e: print "["+datetime.now().isoformat(" ")+"] ERROR", "process_get_id_server_request outer", repr(e), e.message logging.exception("Error on process_get_id_server_request on service %d."%ord(self.part)) def process_search_request(self, request_id, info): # extrae parametros de la llamada query = info[0] subgroups = info[1] try: # analiza la peticion para ver qué hay que buscar with self.redis_conns.get() as redisc: start_time = prep_time = search_time = time() query_key = QUERY_KEY+hash_dict(query) # genera informacion de la peticion search_info = {"query_key":query_key, "query":query, "subgroups":subgroups, "generate_info":False, "version":0, "tries":0} # intenta bloquear o ignora la peticion porque ya hay alguien trabajando en ellau lock = redisc.lock(query_key+self.part+ACTIVE_KEY, LOCK_EXPIRATION) if lock.acquire(False): try: must_search = self.prepare_search(redisc, search_info) prep_time = search_time = time() if must_search: # realiza la busqueda results = self.search(search_info) search_time = time() # almacena los resultados y avisa a quien ha hecho la peticion self.store_results(redisc, search_info, results) except BaseException as e: print "["+datetime.now().isoformat(" ")+"] ERROR", self.gevent_pool.free_count(), "process_search_request inner", repr(e), e.message finally: lock.release() else: must_search = None prep_time = search_time = time() redisc.used = True # prepara info de la consulta para loguear end_time = time() query_sum = query["t"] if subgroups: subgroups_sum = sorted(subgroups.iteritems()) query_sum += " %d/%d 
%s"%(len(search_info["subgroups"]), len(subgroups_sum), repr(subgroups_sum[:4])) # imprime información de la busqueda print "["+datetime.fromtimestamp(start_time).isoformat(" ")+"]", self.gevent_pool.free_count() ,"".join(name if flag else " " for name, flag in izip("BSEDW", (must_search==None, must_search, "early_response" in search_info, "delete_subgroups" in search_info, search_info["tries"]>0))), search_info["tries"], " %.2f (%.4f %.4f %.4f) "%(end_time-start_time, prep_time-start_time, search_time-prep_time, end_time-search_time), query_key.encode("hex")[-10:], query_sum except BaseException as e: print "["+datetime.now().isoformat(" ")+"] ERROR", "process_search_request outer", repr(e), e.message logging.exception("Error on process_search_request on service %d."%ord(self.part)) def prepare_search(self, redisc, search_info): ''' Averigua si debe realizar esta busqueda. ''' # por defecto no va a buscar, pero no avisa pronto early_response = must_search = False query_key = search_info["query_key"] subgroups = search_info["subgroups"] # decide que informacion necesita if subgroups: keys = [PART_KEY+self.part, VERSION_KEY+self.part] keys.extend(PART_SG_KEY+self.part+str(subgroup) for subgroup, start in subgroups.iteritems()) else: keys = [PART_KEY+self.part, VERSION_KEY+self.part, INFO_KEY] # obtiene informacion de la busqueda del servidor search_cache = redisc.hmget(query_key, *keys) part_info, version, rest = search_cache[0], search_cache[1], search_cache[2:] # almacena la version actual search_info["version"] = int(version) if version else -1 if part_info: # si esta parte ya se ha buscado, mira razones por que tenga que buscarse de nuevo o busca los subgrupos part_info = parse_data(part_info) # obtiene el numero de intentos necesitados para esta busqueda hasta ahora search_info["tries"] = part_info[2] # hay datos aunque puedan no ser validos, avisa que se pueden usar early_response = True # comprueba la fecha de la busqueda con respecto al ultimo indexado if part_info[0]<self.last_reindex: search_info["delete_subgroups"] = part_info[4].keys() must_search = True # comprueba warnings en respuesta (usualmente por falta de tiempo) elif part_info[1]: search_info["tries"] += 1 must_search = True # busca en subgrupos solo si hay info valida de esta parte (must_search=False) y no hay info de algun subgrupo if subgroups: if must_search: # los datos principales son invalidos, no puede dar el subgrupo must_search = False else: # no piden los subgrupos que ya se tienen new_subgroups = search_info["subgroups"] = {subgroup: (current_subgroup or [1]) for (subgroup, start), current_subgroup in izip(subgroups.iteritems(), (parse_data(asubgroup) if asubgroup else None for asubgroup in rest)) if not current_subgroup or current_subgroup[0]<=start} must_search = bool(new_subgroups) else: # busca la info de esta parte, pero no un subgrupo if not subgroups: # genera información de la consulta si no la ha generado nadie aun if not rest[0]: search_info["generate_info"] = True must_search = True # avisa, si hay datos disponibles aunque haya que buscar if not subgroups and early_response: search_info["early_response"] = True redisc.publish(RESULTS_CHANNEL, format_data((query_key, self.part, None))) # si no tiene que buscar, libera el bloqueo if not must_search: return False # debe buscar return True @retry def search(self, search_info): query = search_info["query"] subgroups = search_info["subgroups"] if not "t" in query: raise Exception("Empty query search received.") # parametros de busqueda text = query["t"] 
filters = query["f"] if "f" in query else {} order = query["o"] if "o" in query else self.default_order order_key = query["ok"] if "ok" in query else self.default_order_key group_order = query["go"] if "go" in query else self.default_group_order weight = query["w"] if "w" in query else self.default_weight range_ids = query["i"] if "i" in query else None field_weights = query["fw"] if "fw" in query else self.default_field_weights ranking = query["r"] if "r" in query else self.default_ranking # parametros que no varian la busqueda offset, limit, max_matches, cutoff = query["l"] grouping = query["g"] if not subgroups and "g" in query else (GROUPING_GROUP|GROUPING_NO_GROUP) # por defecto pide informacion sin y con agrupacion (solo para principal)? max_query_time = min(self.default_max_query_time+QUERY_TIME_STEP*search_info["tries"] if "tries" in search_info else query["mt"] if "mt" in query else self.default_max_query_time, self.max_max_query_time) # obtiene cliente de sphinx with self.sphinx_conns.get() as sphinx: sphinx.ResetFilters() sphinx.ResetGroupBy() # configura cliente sphinx.SetFieldWeights(field_weights) sphinx.SetSortMode(sphinxapi.SPH_SORT_EXTENDED, order) sphinx.SetMatchMode(sphinxapi.SPH_MATCH_EXTENDED) sphinx.SetRankingMode(sphinxapi.SPH_RANK_EXPR, ranking) sphinx.SetSelect("*, if(g>0xFFFFFFFF,1,0) as e, "+order_key+" as ok, "+weight+" as w") sphinx.SetMaxQueryTime(max_query_time) if range_ids: sphinx.SetIDRange(range_ids[0], range_ids[1]) else: sphinx.SetIDRange(0, 0) # realiza la peticion if subgroups: for sg, current in subgroups.iteritems(): sphinx.SetFilter('bl', [0]) sphinx.SetFilter("g", [long(sg)]) sphinx.SetLimits(current[0], limit, max_matches, cutoff) if filters: self._apply_filters(sphinx, filters) sphinx.AddQuery(text, self.index_name, "d_s "+sg+" "+str(max_query_time)) sphinx.ResetFilters() else: # traer resumen principal de todos los grupos sphinx.SetFilter('bl', [0]) sphinx.SetFilter("s", self.blocked_sources, True) sphinx.SetLimits(offset, limit, max_matches, cutoff) if filters: self._apply_filters(sphinx, filters) if grouping&GROUPING_NO_GROUP: sphinx.AddQuery(text, self.index_name, "d_ng "+str(max_query_time)) if grouping&GROUPING_GROUP: sphinx.SetGroupBy("g", sphinxapi.SPH_GROUPBY_ATTR, group_order) sphinx.AddQuery(text, self.index_name, "d_m "+str(max_query_time)) results = sphinx.RunQueries() error = sphinx.GetLastError() if error: raise SphinxError(error) sphinx.used = True return results def _apply_filters(self, sphinx, filters): if "z" in filters: sphinx.SetFilterFloatRange('z', float(filters["z"][0]), float(filters["z"][1])) if "e" in filters: sphinx.SetFilterRange('e', filters["e"]) if "ct" in filters: sphinx.SetFilter('ct', filters["ct"]) if "src" in filters: sphinx.SetFilter('s', set(filters["src"]).difference(self.blocked_sources)) def store_results(self, redisc, search_info, results): # recorre resultados y los pone en el orden deseado subgroups = search_info["subgroups"] query = search_info["query"] query_key = search_info["query_key"] tries = search_info["tries"] # nueva version de los datos version = search_info["version"]+1 save_info = {VERSION_KEY+self.part: version} now = time() if subgroups: ''' Va a guardar: - [part][sg] = con los resultados de los subgrupos de los que se han obtenido resultados ''' for result, (sg, current) in izip(results, subgroups.iteritems()): current.extend((FULL_ID_STRUCT.pack(r["attrs"]["uri1"],r["attrs"]["uri2"],r["attrs"]["uri3"]), r["id"], version, r["attrs"]["r"], r["attrs"]["w"]) for r in result["matches"]) 
current[0] = len(current) # el numero de resultados compensa el primer resultado if current[0]>1: # no guarda el subgrupo si no añade resultados save_info[PART_SG_KEY+self.part+str(sg)] = format_data(current) else: # Tipo de agrupación grouping = query["g"] ''' Va a guardar: - INFO: si corresponde - [part]: con los resultados de la busqueda agrupada - [part][sg] = con los resultados de la busqueda no agrupada, para los subgrupos de los que se han obtenido resultados ''' # Información de la busqueda agrupada if grouping&GROUPING_GROUP: result = results[-1] # es el ultimo resultado, puede ser el 0 o el 1 segun se haya pedido la busqueda sin agrupar save_info[PART_KEY+self.part] = format_data((now, bool(result["warning"]), tries, result["time"], {r["attrs"]["g"]:(r["attrs"]["@count"], (FULL_ID_STRUCT.pack(r["attrs"]["uri1"],r["attrs"]["uri2"],r["attrs"]["uri3"]), r["id"], version, r["attrs"]["r"], r["attrs"]["w"])) for r in result["matches"]})) # Almacena información de la búsqueda sin agrupar, si se ha pedido if grouping&GROUPING_NO_GROUP: result = results[0] # Agrupa resultados por subgrupos subgroups_extra = {} for r in result["matches"]: sg = r["attrs"]["g"] if sg in subgroups_extra: subgroups_extra[sg].append((FULL_ID_STRUCT.pack(r["attrs"]["uri1"],r["attrs"]["uri2"],r["attrs"]["uri3"]), r["id"], version, r["attrs"]["r"], r["attrs"]["w"])) else: subgroups_extra[sg] = [] # no incluye el primer resultado, que ya está en el resumen # Genera listas a guardar for sg, files in subgroups_extra.iteritems(): if not files: continue # no crea grupos sin ficheros extra files.insert(0,len(files)+1) if files[0]>1: # no guarda el subgrupo si no añade resultados save_info[PART_SG_KEY+self.part+str(sg)] = format_data(files) # genera información principal si hace falta if search_info["generate_info"]: save_info[INFO_KEY] = format_data([fix_sphinx_result(word["word"]).encode("utf-8") for word in results[0]["words"]]) # almacena datos en redis if "delete_subgroups" in search_info: redisc.pipeline().hdel(query_key, search_info["delete_subgroups"]).hmset(query_key, save_info).execute() else: redisc.hmset(query_key, save_info) # avisa que estan disponibles los resultados principales if not subgroups: redisc.publish(RESULTS_CHANNEL, format_data((query_key, self.part, None)))
def main_loop(config):
    """
    The main loop of the application.

    :param config: configuration
    :type config: Config

    Algorithm:

    * Open a connection to tarantool.queue using the config.QUEUE_* settings.
    * Create a pool of workers.
    * Create a queue where the workers will put completed tasks.
    * While the number of workers <= config.WORKER_POOL_SIZE, take a task from
      tarantool.queue and start a greenlet to process it.
    * Notify tarantool.queue that the tasks have been completed.
    * Sleep for config.SLEEP seconds.
    """
    logger.info('Connect to queue server on {host}:{port} space #{space}.'.format(
        host=config.QUEUE_HOST, port=config.QUEUE_PORT, space=config.QUEUE_SPACE
    ))
    queue = tarantool_queue.Queue(
        host=config.QUEUE_HOST, port=config.QUEUE_PORT, space=config.QUEUE_SPACE
    )

    logger.info('Use tube [{tube}], take timeout={take_timeout}.'.format(
        tube=config.QUEUE_TUBE, take_timeout=config.QUEUE_TAKE_TIMEOUT
    ))
    tube = queue.tube(config.QUEUE_TUBE)

    logger.info('Create worker pool[{size}].'.format(size=config.WORKER_POOL_SIZE))
    worker_pool = Pool(config.WORKER_POOL_SIZE)
    processed_task_queue = gevent_queue.Queue()

    logger.info('Run main loop. Worker pool size={count}. Sleep time is {sleep}.'.format(
        count=config.WORKER_POOL_SIZE, sleep=config.SLEEP
    ))
    while run_application:
        free_workers_count = worker_pool.free_count()
        logger.debug('Pool has {count} free workers.'.format(count=free_workers_count))

        for number in xrange(free_workers_count):
            logger.debug('Get task from tube for worker#{number}.'.format(number=number))
            task = tube.take(config.QUEUE_TAKE_TIMEOUT)
            if task:
                logger.info('Start worker#{number} for task id={task_id}.'.format(
                    task_id=task.task_id, number=number
                ))
                worker = Greenlet(
                    notification_worker,
                    task,
                    processed_task_queue,
                    timeout=config.HTTP_CONNECTION_TIMEOUT,
                    verify=False
                )
                worker_pool.add(worker)
                worker.start()

        done_with_processed_tasks(processed_task_queue)
        sleep(config.SLEEP)

        if break_func_for_test():
            break
    else:
        logger.info('Stop application loop.')
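The dispatch step above, taking at most free_count() tasks per cycle, looks like this in isolation (a plain list stands in for tarantool_queue):

import gevent
from gevent.pool import Pool


def handle(task):
    gevent.sleep(0.1)   # stand-in for notification_worker()


pool = Pool(4)           # stands in for config.WORKER_POOL_SIZE
tasks = list(range(10))  # hypothetical backlog

while tasks:
    for _ in range(pool.free_count()):
        if not tasks:
            break
        pool.spawn(handle, tasks.pop(0))
    gevent.sleep(0.05)   # stands in for sleep(config.SLEEP)
pool.join()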
class Worker: def __init__(self, seeds, done_que, run_que): self.showpercounts = 10 self.timeout = 5 self.starttime = time.time() self.oldtime = 0 self.quit = 0 self.https_enable = 0 self.run_que = run_que self.done_que = done_que self.tasks = [] self.done = 1 self.errdone = set() self.err = Error() self.loadstate() self.blacklist = set( ('.blog.', '.taobao.com', '.baidu.com', '.edu', '.gov', '.mil', 'mail', '.google', 'weibo.com', 't.cn', 'wikipedia', 'facebook', 'twitter', 'dropbox')) self.allowdDomain = set(('com', 'net', 'org', 'cn', 'info', 'biz', 'me', 'name', 'cc', 'tv')) self.httpget = self.httpget_requests # down method self.httpget_requests | httpget_curl self.poolsize = 60 self.poolmaxfree = 20 self.freecount = 0 self.down_pool = Pool(size=self.poolsize) self.totalnettime = 0 self.cbcputime = 0 self.totaldownsize = 0 self.curspeed = 0 self.debugnosave = 1 self.tt = 1 self.done_sites_fname = 'done_sites.bin' try: self.bfdone = BloomFilter.open(self.done_sites_fname) except: self.bfdone = BloomFilter(2**23, 10**(-5), self.done_sites_fname) #8M if self.run_que.qsize() == 0: for seed in seeds: self.run_que.put(seed.split("http://")[1]) if self.https_enable == 0: self.urlpatern = re.compile(r'href=["\']http://([^/?#\"\']+)', re.I) else: self.urlpatern = re.compile(r'href=["\']http[s]?://([^/?#\"\'"]+)', re.I) def cb_httpget(self, data=None): if not data: return seed, err, headers, content = data st = time.time() if err: self.handle_error(err, seed) return if self.https_enable == 0: seed = seed[7:] self.bfdone.add(seed) self.done += 1 data = {'seed': seed, 'headers': headers, 'content': content} dat = cPickle.dumps(data) self.done_que.put(dat) et = time.time() self.cbcputime += (et - st) #self.tt=(et-st) if self.done % self.showpercounts == 0: self.out(seed) pass def out(self, seed): spendtime = time.time() - self.starttime spendtime = 1 if spendtime == 0 else spendtime nowh = str(int(spendtime) / 3600) + ":" if spendtime > 3600 else "" now = "%s%02d:%02d" % (nowh, spendtime % 3600 / 60, spendtime % 60) print "%s D:%-4d R:%-7d [Speed: T%.2f/s C%3d/s A%.2f] CB:%0.4f Active:%d %s %s" % (now, (self.done), self.run_que.qsize(), \ (self.done)/(spendtime+self.oldtime), self.curspeed, self.tt, self.totalnettime / self.done ,self.poolsize-self.freecount, str(self.err), seed ) def work(self): while self.quit == 0: st = time.time() curdone = self.done self.freecount = self.down_pool.free_count() if self.freecount > self.poolmaxfree: self.tasks = [] minlen = min(self.freecount + 1, self.run_que.qsize()) #if minlen <=0:break for i in range(minlen): stt = time.time() url = self.run_que.get() ett = time.time() if url in self.bfdone: # 5%-10% continue url = "http://" + url self.tasks.append(url) for url in self.tasks: self.down_pool.apply_async(self.httpget, (url, ), callback=self.cb_httpget) time.sleep(0.1) et = time.time() self.curspeed = (self.done - curdone) / (et - st) #self.tt = (et-st) self.down_pool.join() print "All OVER" def handle_error(self, e, url): if e.find('DNSError') > 0: self.err.dns += 1 self.err.rdns.append(url) elif e.find('reset') > 0: #Connection reset self.err.reset += 1 self.err.rreset.append(url) elif e.find('Max retries') > 0 or e.find('Connection aborted'): # self.err.conntimeout += 1 self.err.rconntimeout.append(url) elif e.find('refused') > 0: #Connection refused self.err.refuse += 1 self.err.rrefuse.append(url) else: self.err.others += 1 self.err.rothers.append(url) print "Error", url, e # requests is better through test def httpget_requests(self, url): st = 
time.time() con = "" e = "" res_headers = "" headers = { 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.6', 'Accept-Encoding': 'gzip,deflate', 'Connection': 'close', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36' } res = None try: # todo: query the ip of the website before get through dns req = requests req.max_redirects = 1 res = req.get(url, timeout=(3, 2), headers=headers) if self.https_enable == 0 and res.url.lower().startswith('http:'): if 'content-type' not in res.headers.keys( ) or 'html' not in res.headers['content-type']: return None con = res.content res.close() except KeyboardInterrupt: raise except Exception as e: e = str(e) if res: res.close() return url, e, None, None et = time.time() self.totalnettime += (et - st) self.tt = (et - st) return url, e, res.headers, con def savestate(self): self.quit = 1 now = time.time() self.oldtime += (now - self.starttime) #should hold on the singal for procdata done with open('state.txt', 'wb') as f: f.write(str(self.oldtime) + '\n') # tasks run_queue done f.write(str(len(self.tasks)) + '\n') for t in self.tasks: f.write(t + '\n') l = self.run_que.qsize() f.write(str(l) + '\n') while l > 0: f.write(self.run_que.pop() + '\n') l -= 1 f.write(str((self.done)) + '\n') with open('err_records.pack', 'wb') as f: cPickle.dump(self.err, f, 2) print time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), " Save state successfully." f.close() exit(0) def loadstate(self): try: with open('state.txt') as f: self.oldtime = float(f.readline()) tasks = int(f.readline()) for i in xrange(tasks): self.run_que.add(f.readline().rstrip('\n')) runnings = int(f.readline()) for i in xrange(runnings): self.run_que.add(f.readline().rstrip('\n')) self.done = int(f.readline()) with open('err_records.pack', 'rb') as f: self.err = cPickle.load(f) print time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), " Load state successfuly." except Exception as e: print e
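The refill loop in Worker.work() only tops the pool up when the number of idle greenlets rises above poolmaxfree, then hands each URL to apply_async() with a callback. A stripped-down sketch of that pattern (hypothetical URLs and thresholds):

import gevent
from gevent.pool import Pool
from gevent.queue import Queue

pool = Pool(size=10)
run_que = Queue()
for i in range(50):
    run_que.put("http://example.com/%d" % i)   # hypothetical URLs


def fetch(url):
    gevent.sleep(0.01)      # stand-in for the real HTTP GET
    return url


def on_done(result):
    print(result)


while not run_que.empty():
    free = pool.free_count()
    if free > 4:            # stands in for poolmaxfree
        for _ in range(min(free, run_que.qsize())):
            pool.apply_async(fetch, (run_que.get(),), callback=on_done)
    gevent.sleep(0.1)
pool.join()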
class FlowController: def __init__(self, base_data, plugin_data): # self.flow_init_result = True # 将测试任务基础数据转换为多个变量 self.base_data = base_data self.base_task_id = base_data['task_id'] self.base_exc_times = base_data['exc_times'] self.base_vuser_num = base_data['v_user'] self.plugin_data = plugin_data self.worker_info_id = app_config.getint("worker", "id") self.worker_info = {"id": self.worker_info_id} self.gevent_pool = None http_tell_test_task_status(task_id=self.base_task_id, status=2) self.parameters_storage = ParametersStorage() # 实例化日志控制器 self.init_log_controller = SyncLogController('tasklog', self.base_task_id, '_init') if self.init_log_controller.log_pool_make_result: app_logger.debug('测试任务ID:%d基础日志控制器初始化成功' % self.base_task_id) self.trans_init_log('基础日志控制器初始化成功') else: app_logger.error('测试任务ID:%d基础日志控制器初始化失败') self.flow_init_result = False self.run_log_controller = AsyncLogController('tasklog', self.base_task_id, '_run') if self.run_log_controller.log_pool_make_result: app_logger.debug('测试任务ID:%d运行日志控制器初始化成功' % self.base_task_id) self.trans_init_log('运行日志控制器初始化成功') else: app_logger.error('测试任务ID:%d运行日志控制器初始化失败') self.flow_init_result = False if self.flow_init_result: # 写一些环境信息 self.trans_init_log("启动测试任务") # 递归原始数据 self.trans_init_log("准备初始化各虚拟用户的插件树") # self.recurse_plugin_tree(plugin_data[0]) # self.trans_init_log("插件及流程控制器初始化结束") else: http_tell_test_task_status(task_id=self.base_task_id, status=-2) def trans_init_log(self, msg, level=None): log = "%s %s Worker:%d " % ( datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f'), 'INFO' if level is None else 'ERROR', self.worker_info_id) + msg self.init_log_controller.trans(log) def init_plugin_tree(self, tree_data, vuser_index=None): # 只在首次初始化插件树用以数据检查的时候才会用 not vuser_index and self.trans_init_log("准备初始化插件树") def recurse_plugin_tree(_data, parent_node=None): """ 递归初始化插件树 :param _data: 插件原始数据 :param parent_node: 父级节点实例 :return: 无返回 """ # 对于暂不支持的插件,忽略其初始化 if _data['originalId'] in all_plugins: if _data['status'] is True: self_plugin = all_plugins[_data['originalId']]( base_data=self.base_data, plugin_data=_data, worker_info=self.worker_info, vuser_index=vuser_index if vuser_index else 0, parent_node=parent_node, init_log_ctrl=self.init_log_controller, run_log_ctrl=self.run_log_controller, parameter_ctrl=self.parameters_storage) if not self_plugin.plugin_check_result: self.flow_init_result = False not vuser_index and self.trans_init_log( "插件'%s'初始化结果:%s" % (self_plugin.plugin_title, '成功' if self_plugin.plugin_check_result else ('失败,%s' % self_plugin.plugin_check_log))) if "children" in _data: for child in _data["children"]: recurse_plugin_tree(child, self_plugin) if parent_node is None: return self_plugin else: if self_plugin.__class__.__bases__[0] in [ ConfigurationPlugin, ParameterPlugin ]: parent_node.plugins_configuration.append( self_plugin) elif self_plugin.__class__.__bases__[0] in [ PreprocessorPlugin ]: parent_node.plugins_preprocessor.append( self_plugin) elif self_plugin.__class__.__bases__[0] in [ ControllerPlugin, RequestPlugin, TimerPlugin ]: parent_node.plugins_common.append(self_plugin) elif self_plugin.__class__.__bases__[0] in [ AssertionPlugin ]: parent_node.plugins_assertion.append(self_plugin) elif self_plugin.__class__.__bases__[0] in [ PostprocessorPlugin ]: parent_node.plugins_postprocessor.append( self_plugin) else: not vuser_index and self.trans_init_log( "插件'%s'初始化结果:%s" % (_data['title'], '失败,插件暂不支持')) self.flow_init_result = False plugin_tree = recurse_plugin_tree(tree_data) not vuser_index and 
self.trans_init_log("插件树初始化完毕") return plugin_tree def vuser_excute(self, tree): # 不同的线程之间共用self.base_exc_times会导致执行时间误减 base_exc_times = self.base_exc_times # 执行次数 while base_exc_times > 0: tree.run_test() base_exc_times -= 1 def init_vusers(self): # 首先初始化出来一颗原始的插件树用以基本检查 self.init_plugin_tree(self.plugin_data[0]) # 如果基本初始化失败则不操作协程池 if self.flow_init_result: # 初始化协程池 try: self.gevent_pool = GeventPool(self.base_vuser_num) except Exception as e: msg = '测试任务虚拟用户并发池创建失败:%s' % repr(e) self.flow_init_result = False app_logger.error(msg) self.trans_init_log(msg) else: msg = '测试任务虚拟用户并发池创建成功' app_logger.debug(msg) self.trans_init_log(msg) vuser_index = 1 free_count = self.gevent_pool.free_count() while free_count > 0: # 每个虚拟用户拥有属于自己的插件树,互不干扰 plugin_tree = self.init_plugin_tree( self.plugin_data[0], vuser_index) self.gevent_pool.spawn(self.vuser_excute, plugin_tree) self.trans_init_log("虚拟用户%d准备完毕" % vuser_index) vuser_index += 1 free_count -= 1 def run(self): # 调测阶段直接回写结束 http_tell_test_task_status(task_id=self.base_task_id, status=3) self.gevent_pool.join() self.run_log_controller.cancel() self.trans_init_log("测试结束") http_tell_test_task_status(task_id=self.base_task_id, status=10)
break
#print "User: {user}; playtime: {playtime}; WaitTime: {wait}; Action:{action}".format(user=uid, playtime=playing_time,
#                                                                                     wait=sleep_time, action=action)
gevent.sleep(sleep_time)
if action == "stop":
    inst.stop_instance()
elif action == "noinput":
    inst.notify_instance('20')
elif action == "crash":
    inst.notify_instance('11')
else:
    pass
print getcurrent()

pool = Pool(parallen)
pool.imap(cloud_play, range(1, 400))
weight['overtime'] = 0

while now_time < end_time - 30000:
    time.sleep(2)
    free_num = pool.free_count()
    print "==========", free_num
    if free_num > 0:
        pool.imap(cloud_play, range(end_num, end_num + free_num))
        end_num += free_num
    now_time = int(time.time() * 1000)
class ArticalSpider(object): """Coroutine-based crawler: collects URLs, parses the HTML and stores the results in the database. maxsize: maximum number of URLs the queue may hold (default 1000) poolSize: maximum number of simultaneously active greenlets in the pool (default 5) """ def __init__(self): self.evt = Event() # wait for initialization self.initConfig() # load the configuration file self.initModules() # load the modules self.q = Queue(maxsize=self.maxsize) # bounded queue self.initQueue() # fill the queue self.crawlUrlsCount = 0 # number of links found so far self.crawlerID = 0 # greenlet ID counter self.pool = Pool(self.poolSize) # coroutine pool self.isInitializeCompletely = False # initialization finished? self.startTime = None # crawler start time
def initModules(self): """Initialize the modules.""" logger.info('Initializing modules...') self.htmlParser = HtmlParser() # smart HTML parsing module self.sqlManager = SQLManager() # database module logger.info('Reading url md5 from mysql...') self.urlDict = self.sqlManager.getAllMd5() # dict of already parsed URLs
def initConfig(self): """Read the configuration file.""" logger.info('Initializing config...') with open('data.conf') as json_file: data = json.load(json_file) self.maxsize = data['maxUrlQueueSize'] # maximum size of the URL queue self.poolSize = data['poolSize'] # maximum number of simultaneously active greenlets self.fileName = data['urlQueueFileName'] # file the queued URLs are saved to self.startUrls = data['startUrls'] # seed URLs for the queue self.filterUrlsRegular = data['filterUrlsRegular'] # URL filter patterns self.saveTime = data['saveTime'] # interval for persisting the queue to a local file
def initQueue(self): """Initialize the queue from the seed URL list. :param urls: list of URLs :return: """ self.loadLastUrlQueue() for url in self.startUrls[:self.maxsize]: self.q.put(url) self.isInitializeCompletely = True self.evt.set()
def loadLastUrlQueue(self): """Load the queue URLs saved by the previous run.""" logger.info('Initializing queue...') hasLastUrls = False if not os.path.exists(self.fileName): return hasLastUrls with open(self.fileName, 'rb') as f: for url in pickle.load(f)[:self.maxsize - 100]: hasLastUrls = True self.q.put(url.strip()) # note: strip the whitespace return hasLastUrls
def getCrawlUrlsCount(self): """Return the number of URLs found so far.""" return self.crawlUrlsCount def getQueueSize(self): """Return the number of URLs currently in the queue.""" return self.q.qsize() def saveQueueUrls(self): """Copy the queue contents to a file.""" # copy the queue before iterating over it logger.info('Save queue urls') with open(self.fileName, 'wb') as f: urls = list(self.q.queue) pickle.dump(urls, f)
def crawlURL(self, crawlerID): """Each worker keeps searching for new URLs.""" # To reduce greenlet switching, every worker keeps fetching URLs until the queue is empty or full. # In practice the bounded queue still makes the greenlets switch regularly. while True: if not self.isInitializeCompletely: # wait until initialization has finished self.evt.wait() # periodically persist the queue so it can be restored next time if time.time() - self.startTime > self.saveTime: self.saveQueueUrls() self.startTime = time.time() gevent.sleep(random.uniform(0, 1)) # keep the crawl rate down try: url = self.q.get(timeout=0.1) # yields this greenlet when the queue is empty md5_url = MD5(url) if md5_url in self.urlDict: continue # skip URLs we have already seen self.urlDict[md5_url] = True # remember this URL headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36', } r = requests.get(url, timeout=5, headers=headers) if r.status_code == 200: if r.encoding == 'ISO-8859-1': charset = self.detCharset(r.text) if charset != "" and charset.lower() in [ 'utf-8', 'gb2312', 'gbk' ]: r.encoding = charset else: r.encoding = chardet.detect( r.content)['encoding'] # determine the page encoding # insert into the database self.insertMysql(r.text, url, MD5(url)) # look for the next URLs for link in re.findall('<a[^>]+href="(http.*?)"', r.text): if len(self.filterUrlsRegular) != 0: for filterUrl in self.filterUrlsRegular: if filterUrl in link: # only add the URL while the queue holds fewer than its maximum number of items self.q.put( link.strip(), timeout=0.1) # yields this greenlet when the queue is full self.crawlUrlsCount += 1 break else: if len(link.strip()) != 0: self.q.put(link.strip(), timeout=0.1) self.crawlUrlsCount += 1 else: logger.warning('Request error status: ' + str(r.status_code) + ': ' + url) # a retry could be added here (omitted)
except Empty: # raised by q.get() when the queue is empty logger.info('URL Queue is Empty! URLSpider-' + str(crawlerID) + ': stopping crawler...') break except Full: # raised by q.put() when the queue is full logger.info('URL Queue is Full! URLSpider-' + str(crawlerID) + ': stopping crawler...') break except requests.exceptions.ConnectionError: # too many connections, sleep for a while logger.warning('Connection refused') time.sleep(3) except requests.exceptions.ReadTimeout: # timed out logger.warning('Request readTimeout') # a reconnect could be attempted here (omitted)
def insertMysql(self, html, url, md5): """Insert the parsed result into the database.""" parseDict = self.htmlParser.extract_offline(html) content = parseDict['content'] description = parseDict['description'] keyword = parseDict['keyword'] title = parseDict['title'] # insert into the database if content != "": self.sqlManager.insert( Artical(content=content, title=title, keyword=keyword, description=description, url=url, md5=md5)) logger.info('Insert Mysql: ' + url)
def detCharset(self, html): """Detect the page encoding from the meta tag.""" charsetPattern = re.compile( '<\s*meta[^>]*?charset=["]?(.*?)"?\s*[/]>?', re.I | re.S) charset = charsetPattern.search(html) if charset: charset = charset.groups()[0] else: charset = "" return charset
def run(self): """Start the coroutine pool and run the crawler; stop once the queue holds no more URLs.""" if self.q.qsize() == 0: logger.error('Please init Queue first (Check your .conf file)') return logger.info('Starting crawler...') self.startTime = time.time() while True: # stop when no greenlet is working and the queue holds no URLs if self.q.empty() and self.pool.free_count() == self.poolSize: break # spawn as many greenlets as there are URLs in the queue, capped by the number of free slots in the pool, # so the pool always runs with the maximum number of active greenlets for _ in range(min(self.pool.free_count(), self.q.qsize())): self.crawlerID += 1 self.pool.spawn(self.crawlURL, self.crawlerID) # yield to the other greenlets (switches otherwise only happen on I/O) gevent.sleep(0.1) logger.warning('All crawlers stopping...')
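The run() method above keeps the pool saturated by spawning min(free_count(), qsize()) crawlers per pass and exits once the queue is empty and every slot is free again. A condensed sketch of that saturation-and-exit pattern, with a hypothetical crawl() worker standing in for crawlURL():

import gevent
from gevent.pool import Pool
from gevent.queue import Queue

def crawl(q):                              # hypothetical stand-in for crawlURL()
    while not q.empty():
        q.get()
        gevent.sleep(0.1)                  # stand-in for the HTTP request

def run(q, pool_size=5):
    pool = Pool(pool_size)
    while True:
        # stop when nothing is queued and no greenlet is working
        if q.empty() and pool.free_count() == pool.size:
            break
        # top the pool up, but never spawn more workers than queued URLs
        for _ in range(min(pool.free_count(), q.qsize())):
            pool.spawn(crawl, q)
        gevent.sleep(0.1)                  # yield so the workers can run

q = Queue()
for url in ('http://example.com/a', 'http://example.com/b'):
    q.put(url)
run(q)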
# coding=gbk import gevent from gevent.queue import Queue from gevent.pool import Pool from gevent import getcurrent def DoSomething(): print "thread %s " % id(getcurrent()) gevent.sleep(3) # Observation from this test: a greenlet add()-ed once the pool has hit its size limit still starts running, so the effective capacity is pool size + 1 # greenlet objects can be reused in this sliding-window pattern pool = Pool(2) # can run n + 1 tasks in parallel print pool.free_count() pool.add(gevent.spawn(DoSomething)) pool.join() raw_input("waiting...") # print "stage" # for i in range(10): # pool.add(gevent.spawn(DoSomething)) #pool.join()
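The comment above reflects how Pool.add() differs from Pool.spawn(): gevent.spawn() has already started the greenlet before add() is called, so when the pool is full the extra greenlet still runs while add() blocks, which looks like a capacity of size + 1. A small sketch of that difference (job() is a made-up task):

import gevent
from gevent.pool import Pool

def job(seconds):             # made-up task
    gevent.sleep(seconds)

pool = Pool(2)

# spawn(): the pool creates and tracks the greenlet, and blocks when it is full.
pool.spawn(job, 1)
pool.spawn(job, 1)
print(pool.free_count())      # 0

# add(): gevent.spawn() has already started this third greenlet; add() only
# registers it and blocks here until a slot frees up, so for a moment the
# pool effectively runs size + 1 tasks.
extra = gevent.spawn(job, 1)
pool.add(extra)               # returns once one of the earlier jobs finishes
pool.join()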
def main_loop(config): """ Main application loop. :param config: configuration :type config: Config Algorithm: * Open a connection to tarantool.queue using the config.QUEUE_* settings. * Create the worker pool. * Create the queue into which the workers put completed tasks. * While the number of workers is <= config.WORKER_POOL_SIZE, take a task from tarantool.queue and start a greenlet to process it. * Send notifications to tarantool.queue that the tasks are finished. * Sleep for config.SLEEP seconds. """ logger.info( 'Connect to queue server on {host}:{port} space #{space}.'.format( host=config.QUEUE_HOST, port=config.QUEUE_PORT, space=config.QUEUE_SPACE)) queue = tarantool_queue.Queue(host=config.QUEUE_HOST, port=config.QUEUE_PORT, space=config.QUEUE_SPACE) logger.info('Use tube [{tube}], take timeout={take_timeout}.'.format( tube=config.QUEUE_TUBE, take_timeout=config.QUEUE_TAKE_TIMEOUT)) tube = queue.tube(config.QUEUE_TUBE) logger.info( 'Create worker pool[{size}].'.format(size=config.WORKER_POOL_SIZE)) worker_pool = Pool(config.WORKER_POOL_SIZE) processed_task_queue = gevent_queue.Queue() logger.info( 'Run main loop. Worker pool size={count}. Sleep time is {sleep}.'. format(count=config.WORKER_POOL_SIZE, sleep=config.SLEEP)) while run_application: free_workers_count = worker_pool.free_count() logger.debug( 'Pool has {count} free workers.'.format(count=free_workers_count)) for number in xrange(free_workers_count): logger.debug('Get task from tube for worker#{number}.'.format( number=number)) task = tube.take(config.QUEUE_TAKE_TIMEOUT) if task: logger.info( 'Start worker#{number} for task id={task_id}.'.format( task_id=task.task_id, number=number)) worker = Greenlet(notification_worker, task, processed_task_queue, timeout=config.HTTP_CONNECTION_TIMEOUT, verify=False) worker_pool.add(worker) worker.start() done_with_processed_tasks(processed_task_queue) sleep(config.SLEEP) if break_func_for_test(): break else: logger.info('Stop application loop.')
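The loop above takes at most free_count() tasks from the tube on each pass, so worker_pool.add() never blocks. A condensed sketch of that one-task-per-free-slot pattern, with a hypothetical take() callable standing in for tube.take() and a stub handler instead of notification_worker():

import gevent
from gevent.pool import Pool

def handle(task):                          # stub for the real notification worker
    gevent.sleep(0.5)

def main_loop(take, pool_size=4, sleep_seconds=0.1):
    pool = Pool(pool_size)
    while True:
        drained = False
        for _ in range(pool.free_count()):
            task = take()                  # hypothetical: returns None when the tube is empty
            if task is None:
                drained = True
                break
            pool.add(gevent.spawn(handle, task))
        if drained:                        # the real loop keeps polling instead of stopping
            pool.join()
            break
        gevent.sleep(sleep_seconds)

tasks = iter(range(10))
main_loop(lambda: next(tasks, None))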
class ArchivariusBridge(object): """Archivarius Bridge""" def __init__(self, config): self.config = config self.workers_config = {} self.log_dict = {} self.bridge_id = uuid.uuid4().hex self.api_host = self.config_get('resources_api_server') self.api_version = self.config_get('resources_api_version') # Workers settings for key in WORKER_CONFIG: self.workers_config[key] = (self.config_get(key) or WORKER_CONFIG[key]) # Init config for key in DEFAULTS: value = self.config_get(key) setattr(self, key, type(DEFAULTS[key])(value) if value else DEFAULTS[key]) # Pools self.workers_pool = Pool(self.workers_max) self.retry_workers_pool = Pool(self.retry_workers_max) self.filter_workers_pool = Pool() # Queues self.api_clients_queue = Queue() if self.resource_items_queue_size == -1: self.resource_items_queue = Queue() else: self.resource_items_queue = Queue(self.resource_items_queue_size) if self.retry_resource_items_queue_size == -1: self.retry_resource_items_queue = Queue() else: self.retry_resource_items_queue = Queue( self.retry_resource_items_queue_size) # Default values for statistic variables for key in ( 'droped', 'add_to_resource_items_queue', 'add_to_retry', 'exceptions_count', 'not_found_count', 'archived', 'moved_to_public_archive', 'dumped_to_secret_archive', ): self.log_dict[key] = 0 if self.api_host != '' and self.api_host is not None: api_host = urlparse(self.api_host) if api_host.scheme == '' and api_host.netloc == '': raise ConfigError('Invalid \'resources_api_server\' url.') else: raise ConfigError('In config dictionary empty or missing' ' \'resources_api_server\'') self.db = prepare_couchdb(self.couch_url, self.db_name, logger) self.archive_db = prepare_couchdb(self.couch_url, self.db_archive_name, logger) # TODO self.archive_db2 = prepare_couchdb(self.couch_url, self.db_archive_name + '_secret', logger) self.resources = {} for entry_point in iter_entry_points( 'openprocurement.archivarius.resources'): self.resources[entry_point.name] = { 'filter': entry_point.load(), 'view_path': '_design/{}/_view/by_dateModified'.format(entry_point.name) } def create_api_client(self): client_user_agent = self.user_agent + '/' + self.bridge_id + '/' + uuid.uuid4( ).hex timeout = 0.1 while True: try: api_client = APIClient(host_url=self.api_host, user_agent=client_user_agent, api_version=self.api_version, resource='RESOURCE', key=self.api_key) self.api_clients_queue.put({ 'client': api_client, 'request_interval': 0 }) logger.info('Started api_client {}'.format( api_client.session.headers['User-Agent'])) break except RequestFailed as e: self.log_dict['exceptions_count'] += 1 logger.error( 'Failed start api_client with status code {}'.format( e.status_code)) timeout = timeout * 2 sleep(timeout) def fill_api_clients_queue(self): while self.api_clients_queue.qsize() == 0: self.create_api_client() def fill_resource_items_queue(self, resource): start_time = datetime.now(TZ) rows = self.db.iterview(self.resources[resource]['view_path'], 10**3, include_docs=True) filter_func = partial(self.resources[resource]['filter'], time=start_time) for row in ifilter(filter_func, rows): self.resource_items_queue.put({'id': row.id, 'resource': resource}) self.log_dict['add_to_resource_items_queue'] += 1 def queues_controller(self): while True: self.fill_api_clients_queue() #if self.workers_pool.free_count() > 0 and (self.resource_items_queue.qsize() > int((self.resource_items_queue_size / 100) * self.workers_inc_threshold)): if self.resource_items_queue.qsize( ) > 0 and self.workers_pool.free_count() > 0: w = 
ArchiveWorker.spawn(self.api_clients_queue, self.resource_items_queue, self.db, self.archive_db, self.archive_db2, self.workers_config, self.retry_resource_items_queue, self.log_dict) self.workers_pool.add(w) logger.info('Queue controller: Create main queue worker.') #elif self.resource_items_queue.qsize() < int((self.resource_items_queue_size / 100) * self.workers_dec_threshold): elif self.resource_items_queue.qsize() == 0: if len(self.workers_pool) > self.workers_min: wi = self.workers_pool.greenlets.pop() wi.shutdown() logger.info('Queue controller: Kill main queue worker.') logger.info('Main resource items queue contains {} items'.format( self.resource_items_queue.qsize())) logger.info('Retry resource items queue contains {} items'.format( self.retry_resource_items_queue.qsize())) logger.info( 'Status: add to queue - {add_to_resource_items_queue}, add to retry - {add_to_retry}, moved to public archive - {moved_to_public_archive}, dumped to secret archive - {dumped_to_secret_archive}, archived - {archived}, exceptions - {exceptions_count}, not found - {not_found_count}' .format(**self.log_dict)) sleep(self.queues_controller_timeout) def gevent_watcher(self): self.fill_api_clients_queue() if not self.resource_items_queue.empty() and len( self.workers_pool) < self.workers_min: w = ArchiveWorker.spawn(self.api_clients_queue, self.resource_items_queue, self.db, self.archive_db, self.archive_db2, self.workers_config, self.retry_resource_items_queue, self.log_dict) self.workers_pool.add(w) logger.info('Watcher: Create main queue worker.') if not self.retry_resource_items_queue.empty() and len( self.retry_workers_pool) < self.retry_workers_min: w = ArchiveWorker.spawn(self.api_clients_queue, self.retry_resource_items_queue, self.db, self.archive_db, self.archive_db2, self.workers_config, self.retry_resource_items_queue, self.log_dict) self.retry_workers_pool.add(w) logger.info('Watcher: Create retry queue worker.') def run(self): logger.info('Start Archivarius Bridge', extra={'MESSAGE_ID': 'edge_bridge_start_bridge'}) for resource in self.resources: self.filter_workers_pool.spawn(self.fill_resource_items_queue, resource=resource) spawn(self.queues_controller) while True: self.gevent_watcher() if len(self.filter_workers_pool) == 0 and len( self.workers_pool) == 0 and len( self.retry_workers_pool) == 0: break sleep(self.watch_interval) def config_get(self, name): try: return self.config.get('main', name) except NoOptionError: return
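queues_controller() above grows the pool while there is both queued work and a free slot, and shrinks it back toward workers_min once the queue drains. A stripped-down sketch of that elastic-scaling idea; the worker and the scale-down call are simplified stand-ins for ArchiveWorker and its shutdown() method:

import gevent
from gevent.pool import Pool
from gevent.queue import Queue

def worker(queue):                         # simplified stand-in for ArchiveWorker
    while not queue.empty():
        queue.get()
        gevent.sleep(0.05)                 # stand-in for archiving one item

def controller(queue, pool, workers_min=1, interval=0.1):
    while True:
        if queue.qsize() > 0 and pool.free_count() > 0:
            pool.spawn(worker, queue)      # work queued and a slot free: grow
        elif queue.qsize() == 0 and len(pool) > workers_min:
            pool.greenlets.pop().kill()    # queue drained: shrink (the bridge calls shutdown())
        if queue.qsize() == 0 and len(pool) == 0:
            break                          # all done (the real controller loops forever)
        gevent.sleep(interval)

queue = Queue()
for i in range(20):
    queue.put(i)
controller(queue, Pool(5))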
class CoroutineWorker(Worker): DEFAULT_GREENLET_SIZE = 10 # control the pool size def __init__(self, cfg, file_logger=None, ppid=None, sockets=None): super(CoroutineWorker, self).__init__(cfg, file_logger, ppid, sockets) self.max_greenlets = int(self.cfg.max_greenlets or self.DEFAULT_GREENLET_SIZE) def patch(self): from gevent import monkey monkey.noisy = False # if the new version is used make sure to patch subprocess if gevent.version_info[0] == 0: monkey.patch_all() else: monkey.patch_all(subprocess=True) def init_process(self): super(CoroutineWorker, self).init_process() self.patch() self.pool = Pool(self.max_greenlets) self.mutex = threading.Semaphore() self._stop_event = threading.Event() def run(self): super(CoroutineWorker, self).run() while self.alive: if not self.pool.full(): self.pool.spawn(self._run) self.file_logger.debug("pool greenlet size %d" % (self.pool.size - self.pool.free_count())) gevent.sleep(1.0) self._stop_event.wait() gevent.spawn(self.stop).join() def _run(self): if self.LISTENERS: while self.alive: self.mutex.acquire() ret = select.select(self.rd_fds, [], [], 1.0) self.file_logger.debug("Before: socket fd length: %d, greenlet:%d, listen in:%s" % (len(self.rd_fds), id(getcurrent()), self.LISTENERS[0] in self.rd_fds)) if ret[0]: sock = ret[0][0] self.rd_fds.remove(sock) else: sock = None self.mutex.release() if sock: #for sock in ret[0]: if sock in self.LISTENERS: try: client, addr = sock.accept() client.setblocking(0) close_on_exec(client) self.rd_fds.append(client) except socket.error as e: if e.args[0] not in (errno.EAGAIN, errno.EWOULDBLOCK, errno.ECONNABORTED): self.file_logger.error(traceback.format_exc()) finally: self.rd_fds.append(sock) else: r = self.handle_request(client=sock) if r == -1: sock.close() else: self.rd_fds.append(sock) if self.ppid and self.ppid != os.getppid(): self.file_logger.info("Parent changed, shutting down: %s", self) return else: while self.alive: try: self.handle_request() except: self.file_logger.error(traceback.format_exc()) def stop(self): Worker.stop(self) self.pool.join(timeout=1) def handle_quit(self, sig, frame): self.alive = False self._stop_event.set()
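run() above gates spawning on pool.full() and reports utilisation as pool.size - pool.free_count(). A short sketch of those two idioms in isolation (echo() is a made-up handler, and items are simply skipped when the pool is full):

import gevent
from gevent.pool import Pool

def echo(n):                               # made-up handler
    gevent.sleep(0.2)

pool = Pool(10)
for n in range(25):
    if not pool.full():                    # the same gate CoroutineWorker.run() uses
        pool.spawn(echo, n)
    active = pool.size - pool.free_count() # utilisation metric logged above
    print("pool greenlet size %d" % active)
    gevent.sleep(0.05)
pool.join()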
class KittenServer(object): halting_signals = ( signal.SIGINT, signal.SIGTERM, ) def __init__(self, ns): self.ns = ns # Workers and queues self.pool = Pool(5) self.queue = Queue() # States self.working = None self.torn = False # Greenlets; to be populated when started self.listener = None self.worker = None self.log = logbook.Logger('Server-{0}'.format(self.ns.port))
def start(self): self.setup() self.listener = gevent.spawn(self.listen_forever) self.worker = gevent.spawn(self.work_forever) return self.listener def stop(self, exit=True): self.log.warning('Stopping server') self.teardown(exit)
def listen(self, socket): request = socket.recv_json() # Send the request for processing and handle any errors response = self.handle_request(request) socket.send_json(response) return True def listen_forever(self): try: socket = self.get_socket() while self.listen(socket): pass except Exception: self.log.exception('Server died.') finally: self.teardown() def teardown_listener(self): self.log.info('Stopping socket listener.') self.listener.kill(timeout=5) # TODO: Configurable
def handle_request(self, request): request = KittenRequest(request) self.queue.put(request) return request.ack()
def work(self): if self.queue.empty(): gevent.sleep(0.1) # TODO: Configurable self.log.debug('Slept') return True request = self.queue.get() socket = self.get_socket(zmq.REQ, request.host) self.pool.spawn(request.process, socket) return True def work_forever(self): self.working = True while self.work(): pass # pragma: nocover self.working = False self.log.warning('Worker pool stopped.')
def teardown_workers(self): free = self.pool.free_count() if free == self.pool.size: self.log.info('Workers idle. Killing without timeout.') self.pool.kill() return True timeout = 5 # TODO: Configurable count = self.pool.size - free self.log.info('Giving {1} requests {0}s to finish', timeout, count) self.pool.kill(timeout=timeout) self.log.info('Requests finished or timed out.')
def get_socket(self, kind=zmq.REP, host=None): context = zmq.Context() socket = context.socket(kind) if not host: host = 'tcp://*:{0}'.format(self.ns.port) self.log.info( 'Binding {1} on {0}', host, {zmq.REP: 'REP', zmq.REQ: 'REQ'}.get(kind, kind) ) if kind == zmq.REP: socket.bind(host) else: socket.connect(host) return socket
def setup(self): self.log.info('Setting up server') self.setup_signals() self.setup_pidfile() def teardown(self, exit=True): if self.torn: # The greenlets will also try to exit when signalled, so we keep this state to make sure we do not keep killing everything in a loop. return False self.torn = True self.log.info('Tearing down server') self.teardown_workers() self.teardown_pidfile() self.teardown_listener() self.log.info('Server teardown complete.') if exit: self.log.info('Exiting.') sys.exit(0)
def setup_signals(self): for sig in self.halting_signals: gevent.signal(sig, self.signal_handler) def signal_handler(self): self.log.warning('Received halting signal') self.stop(True)
@property def pidfile(self): return conf.pidfile(self.ns.port) def setup_pidfile(self): pid = str(os.getpid()) self.log.debug('Pid: {0}', pid) with open(self.pidfile, 'w') as pidfile: pidfile.write(pid) def teardown_pidfile(self): self.log.debug('Removing pidfile') os.remove(self.pidfile)
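teardown_workers() above uses free_count() to decide between an immediate kill and a timed one. A minimal sketch of that shutdown logic, assuming nothing beyond the gevent pool API:

import gevent
from gevent.pool import Pool

def drain(pool, timeout=5):
    free = pool.free_count()
    if free == pool.size:
        pool.kill()                        # nothing is running, no need to wait
        return
    busy = pool.size - free
    print("giving %d running greenlets %ds to finish" % (busy, timeout))
    pool.kill(timeout=timeout)             # raise GreenletExit, then give up waiting

pool = Pool(4)
pool.spawn(gevent.sleep, 1)
drain(pool, timeout=2)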