Example 1
def start_pool(size):
    t1 = datetime.now()
    pool = Pool(size)
    while (datetime.now() - t1).seconds <= SECONDS:
        print 'pool.free_count():', pool.free_count()
        if pool.free_count() == 0:
            pool.wait_available()
            print '<free 1>'
        pool.apply_async(test_get)
    print 'Joining............................................'
    pool.join()
    t2 = datetime.now()
    print COUNT, TIMEOUT_CNT
    print COUNT / (t2-t1).seconds
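The snippet above relies on module-level names that are not shown (SECONDS, COUNT, TIMEOUT_CNT, test_get). A minimal self-contained sketch of the same pattern, with a dummy job standing in for test_get and an illustrative RUN_SECONDS constant, might look like this:

from datetime import datetime

import gevent
from gevent.pool import Pool

RUN_SECONDS = 2          # stand-in for the original SECONDS constant

def fake_job():
    gevent.sleep(0.1)    # simulate an I/O-bound request

def start_pool(size=10):
    t1 = datetime.now()
    pool = Pool(size)
    while (datetime.now() - t1).seconds <= RUN_SECONDS:
        if pool.free_count() == 0:
            pool.wait_available()      # block until a slot frees up
        pool.apply_async(fake_job)
    pool.join()                        # wait for the remaining greenlets

if __name__ == '__main__':
    start_pool()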
Example 2
def main(psize, filename=None):
    if filename:
        urls = Queue()
        results = Queue()
        pool = Pool(int(psize))
        reader = gevent.spawn(readfile, filename, urls)
        request = gevent.spawn(work_input_file, urls, results, reader)
        pool.add(reader)
        pool.add(request)
        pool.join()
        pool.free_count()
        print results.qsize(), 3333333333333333333
        print urls.qsize(), 3333333333333333333
        return results
Example 3
class WorkerPool:
    def __init__(self, queue, func=None, pool_size=100, worker_type='page'):
        self.queue = queue
        self.worker = func
        self.exit_signal = False
        self.pool_size = pool_size
        ## The Pool class is based on gevent.pool.Group
        self.pool = Pool(pool_size)
        self.worker_type = worker_type

    def start(self, page_task=None):
        if self.worker_type == 'asset':
            msg = 'Asset worker pool started, parent page: {:s}'
            logger.debug(msg.format(page_task['refer']))

        while True:
            if self.exit_signal: break
            if not self.queue.empty():
                task = self.queue.get()
                msg = 'Got an item from the queue, calling worker. task: {task:s}'
                logger.debug(msg.format(task=str(task)))
                self.pool.spawn(self.worker, task)
            elif self.pool.free_count() != self.pool.size:
                ## The queue is empty but the pool is not fully idle, so tasks are still running; wait.
                free = self.pool.free_count()
                total = self.pool.size
                working = total - free
                if self.worker_type == 'asset':
                    msg = 'Pool usage: {working:d}/{total:d}, page_task: {page_task:s}'
                    logger.debug(
                        msg.format(working=working,
                                   total=total,
                                   page_task=str(page_task)))
                sleep(1)
            elif self.exit_signal:
                ## If the queue is empty and every greenlet is idle, or stop() was triggered, leave the while loop
                break
            else:
                break
        if self.worker_type == 'asset':
            msg = 'Asset worker pool finished, parent page: {:s}'
            logger.debug(msg.format(page_task['refer']))

    def stop(self):
        self.exit_signal = True
        # Only let items into the queue, not out: links from pages currently being processed are still enqueued, but no new tasks are popped
        ## Put the tasks still held by the pool back on the queue and persist them to a local file so they are not lost.
        for item in self.pool:
            self.queue.put(item.args)
Example 4
class Task:
    def __init__(self, queue, pool_max=100):
        self.work = None
        self.pool_max = pool_max
        self.pool = Pool(pool_max)
        self.queue = queue

    def initTaskWork(self, func):
        self.work = func

    def start(self):
        while True:
            if not self.queue.empty():
                t = self.queue.pop()
                self.pool.spawn(self.work, *t)
            elif self.pool.free_count() == self.pool.size or self.queue.isLock:
                # print 'queue is empty'
                # print self.pool.free_count(), self.pool.size
                break
            else:
                # print 'queue is empty but...'
                sleep(0)

    def stop(self):
        # Only let items into the queue, not out of it
        self.queue.lock(True)
        for item in self.pool:
            self.queue.push(list(item.args))
            # print item
            # self.pool.killone(item)

        # self.pool.kill()
        # print 'starting the save in stop()'
        self.queue.save()
        self.queue.clear()
Example 5
def main():
    """spawn"""

    val = rclient.get('f1')
    print(val)

    pool = Pool(20)
    start('f1')

    # loop forever
    while True:
        # print(time.time())
        pool.spawn(func1)
        # print pool.wait_available()
        print(pool.free_count())

        # sleep
        gevent.sleep(2)
Example 6
class WorkerPool(object):
    """Docstring for WorkerPool """
    def __init__(self, input, output, func, nthreads=800):
        """@todo: to be defined

        :param input: @todo
        :param output: @todo
        :param func: @todo
        :param qname: @todo

        """
        self._func = func
        self._input = input
        self._output = output
        self._lock = BoundedSemaphore(1)
        self._pool = Pool(nthreads)
        self._nthreads = nthreads
        self._true = 0
        self._false = 0
        self._nogeo = 0
        self._notruth = 0

    def run_one(self, msg):
        result = self._func(msg)
        if result is not None:
            with self._lock:
                self._output.write(
                    (json.dumps(result, ensure_ascii=False)).encode("utf-8") +
                    "\n")
                #if not result['true_geo']:
                #    self._notruth += 1
                #elif ('country' not in result['embersGeoCode']):
                #    self._nogeo += 1
                #elif result['true_geo']['country'].lower() == result['embersGeoCode']['country'].lower():
                #    self._true += 1
                #else:
                #    self._false += 1

    def run(self):
        last = time.time()
        for msg in self._input:
            self._pool.spawn(self.run_one, msg)
            if time.time() - last > 10:
                log.info("Workers running={}".format(self._nthreads -
                                                     self._pool.free_count()))
                last = time.time()
        self._pool.join()

#    def cleanup_workers(self):
#        dones = [w.done for w in self._workers]
#        for done, w in zip(dones, self._workers):
#            if done:
#                fin_job = w.ret
#                self._output.write(fin_job)
#        self._workers = [w for done, w in zip(dones, self._workers) if not done]
#

    def stop(self):
        self._pool.join()
Example 7
def download_images():
	images_to_download = Item.objects.filter(Q(image__isnull=True) | Q(image=''),image_url__isnull=False).values_list('pk','image_url')
	for obj in images_to_download:
		queue.put(obj)
	# create greenlet pool and spawn workers
	pool = Pool(size=POOL_SIZE)
	pool.spawn(download_crawler)
	# eventlet uses free(), gevent uses free_count()
	while not pool.free_count() == POOL_SIZE:
		gevent.sleep(0.1)
		#eventlet.sleep
		for x in xrange(0, min(queue.qsize(), pool.free_count())):
			pool.spawn(download_crawler)
	# Wait for everything to complete - eventlet uses waitall
	pool.join()
	pool.kill()
	time.sleep(2)
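The loop above tops the pool up from a queue in batches of min(queue.qsize(), pool.free_count()). A rough self-contained sketch of that idiom, using gevent's own Queue and a dummy download job in place of the Django query and download_crawler:

import gevent
from gevent.pool import Pool
from gevent.queue import Empty, Queue

POOL_SIZE = 10
queue = Queue()

def fake_download():
    try:
        queue.get_nowait()        # take one pending item
    except Empty:
        return
    gevent.sleep(0.05)            # pretend to download it

def drain_queue():
    for i in range(50):
        queue.put(i)
    pool = Pool(size=POOL_SIZE)
    pool.spawn(fake_download)
    while pool.free_count() != POOL_SIZE:
        gevent.sleep(0.1)
        for _ in range(min(queue.qsize(), pool.free_count())):
            pool.spawn(fake_download)
    pool.join()

if __name__ == '__main__':
    drain_queue()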
Example 8
def use_gevent_with_queue():
    queue = Queue()
    pool = Pool(5)

    for p in range(1, 7):
        put_new_page(p, queue)

    while pool.free_count():
        sleep(0.1)
        pool.spawn(save_search_result_with_queue, queue)

    pool.join()
Example 9
def save_html_with_gevent(items, gov):
    pool = Pool(10)
    queue = Gqueue()

    for item in items:
        queue.put(item)

    while pool.free_count():
        pool.spawn(save_html_for_gevent, queue, gov['gov_name'])

    pool.join()
Example 10
def use_gevent_with_queue():
    queue = Queue()
    pool = Pool(5)

    for p in range(1, 7):
        put_new_page(p, queue)

    while pool.free_count():
        sleep(0.1)
        pool.spawn(save_search_result_with_queue, queue)

    pool.join()
Example 11
def main():
    pool = Pool(results.threads)
    while 1:
        try:
            if manager.gamertags.empty():
                print 'Finished'
                break
            for i in xrange(min(pool.free_count(), 50)):
                pool.spawn(manager.spawn_connect)
            gevent.sleep(1)
        except KeyboardInterrupt:
            print '[KYBRD_NTRPT] Finishing active threads'
            pool.join()
            break
Example 12
def main():
    pool = Pool(results.threads)
    while 1:
        try:
            if manager.gamertags.empty():
                print ('Finished')
                break
            for i in xrange(min(pool.free_count(), 50)):
                pool.spawn(manager.spawn_connect)
            gevent.sleep(1)
        except KeyboardInterrupt:
            print ('[KYBRD_NTRPT] Finishing active threads')
            pool.join()
            break
Example 13
class SocketPool(object):
	def __init__(self):
		self.pool = Pool(1)
		self.pool.add(self.server())

	# Suited to a chat-room style of sending a line of text on Enter
	def listen( self, socket,address):
		f = socket.makefile()
		print "listen"

		while True:
			name = f.readline().strip()
			print name

	def listen2( self, socket,address):
		print "listen2"
		print self.pool.free_count()
		while True:
			name =socket.recv(1010).strip()
			print name

	def add_handler( self, socket,address):
		if self.pool.full():
			raise Exception( "At maximum pool size")
		else:
			print (" pool insert")
			s = self.pool.spawn(self.listen2, socket, address)

			# self.pool.spawn(self.listen, socket, address)

	def shutdown(self):
		self.pool.kill()

	def server(self):
		print "server"
		server = StreamServer(('0.0.0.0', 8000), self.add_handler)
		server.serve_forever()
Example 14
class BGTaskManager(object):
    def __init__(self, max_workers):
        self.max_workers = max_workers
        self._pool = Pool(size=max_workers)

    def run(self):
        while True:
            task_id, func, args, kw = bgtasks_queue.get()
            # Create a separate execution context for each task so database connections can be reclaimed properly
            # http://docs.peewee-orm.com/en/latest/peewee/database.html#advanced-connection-management
            func = db.execution_context(with_transaction=False)(func)
            self._pool.spawn(func, *args, **kw)

    def active_worker_count(self):
        return self._pool.size - self._pool.free_count()
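The active_worker_count() arithmetic above (pool.size minus free_count()) can be seen in isolation with a plain pool and a dummy task; bgtasks_queue and the peewee execution context belong to the original project and are left out of this sketch:

import gevent
from gevent.pool import Pool

def dummy_task():
    gevent.sleep(0.5)

pool = Pool(size=4)
for _ in range(3):
    pool.spawn(dummy_task)

# Each spawned greenlet occupies a slot until it finishes,
# so size - free_count() reports the busy workers.
print(pool.size - pool.free_count())   # 3
pool.join()
print(pool.size - pool.free_count())   # 0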
Example 15
class RssPool(object):

    def __init__(self):

        self.pool = Pool(RSS_MAX_POOL_NUM)
        self.start = False
        self.times = 0
        self.beginTime = int(time.time())

    def run(self):

        while True:

            if (not self.start) and (not self.pool.full()):
                self.addRssSpider()
                # self.syncDagrame()
                continue

            self.start = False
            if self.pool.free_count() < RSS_MAX_POOL_NUM:
                logging.info("---------------join run ")
                self.pool.join()
            else:
                logging.info("---------------not data ,sleep %s senconds " % MAIN_LOOP_SLEEP_TIME)
                time.sleep(MAIN_LOOP_SLEEP_TIME)

    def syncDagrame(self):
        """同步数据到线上"""
        self.times += 1
        if self.times > RUN_SYNC_INTERVAL_TIMES or int(time.time()) - self.beginTime > RUN_SYNC_INTERVAL_TIME:
            logging.info("**********sync crawl infos ************")
            sync = SyncCrawlInfos()
            sync.index()
            self.times = 0
            self.beginTime = int(time.time())

    def addRssSpider(self):

        configList = getCrawlRssRequest()
        if not configList:
            self.start = True
            return True

        try:
            spider = CommonFeedRss()
            self.pool.spawn(spider.run, configList)
        except Exception, e:
            logging.info("------------------add spider exception : %s " % e)
Example 16
class BGTaskManager(object):

    def __init__(self, max_workers):
        self.max_workers = max_workers
        self._pool = Pool(size=max_workers)

    def run(self):
        while True:
            task_id, func, args, kw = bgtasks_queue.get()
            # Create a separate execution context for each task so database connections can be reclaimed properly
            # http://docs.peewee-orm.com/en/latest/peewee/database.html#advanced-connection-management
            func = db.execution_context(with_transaction=False)(func)
            self._pool.spawn(func, *args, **kw)

    def active_worker_count(self):
        return self._pool.size - self._pool.free_count()
Example 17
class GEventTaskRunner(TaskRunner):
    timeout = 5

    def __init__(self, pool_size=200, *args, **kw):
        super(GEventTaskRunner, self).__init__(*args, **kw)
        self._pool = Pool(pool_size)

    def run_task(self, func, *args, **kw):
        self.logger.debug("Adding task %s to pool of size %s", func,
                          self._pool.free_count())
        self._pool.start(Greenlet(func, *args, **kw))
        self.logger.debug("Task added")

    def stop(self):
        self.logger.debug("Waiting for background queue to finish")
        self._pool.join(self.timeout)
        self.logger.debug("background queue finished")
        super(GEventTaskRunner, self).stop()
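GEventTaskRunner hands the pool a pre-built Greenlet via Pool.start() rather than calling spawn(); start() adds the greenlet to the pool (blocking while the pool is full) and then starts it. A minimal sketch of that idiom outside the TaskRunner framework, which is not shown here:

import gevent
from gevent import Greenlet
from gevent.pool import Pool

def work(n):
    gevent.sleep(0.1)
    print('finished task', n)

pool = Pool(2)
for i in range(5):
    pool.start(Greenlet(work, i))   # blocks here whenever the pool is full

pool.join()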
Example 18
class Downloader(object):
    def __init__(self, concurrent=64):
        self.proxy_conf = OnlineConfig().proxy
        self.pool = Pool(concurrent)
        self.pool.join()

    def add_task(self, task, proxy):
        self.pool.add(gevent.spawn(self._download, task, proxy))

    def free_count(self):
        return self.pool.free_count()

    @staticmethod
    def _before_download(task, proxy):
        module = ExternManager().get_model(task.s_platform, task.s_feature + '.request')
        request = module(task, proxy) if module else RequestExtra(task, proxy)
        return request

    @staticmethod
    def _after_download(task, request, response, proxy):
        module = ExternManager().get_model(task.s_platform, task.s_feature + '.response')
        response = module(task, request, response, proxy) \
            if module else ResponseExtra(task, request, response, proxy)
        return response

    def _download(self, task, proxy):
        request = None
        req_response = None
        try:
            request = self._before_download(task, proxy)
            req_response = requests.request(**request())
            response = self._after_download(task, request, req_response, proxy)
            del response
            del req_response
            del request
        except Exception as e:
            if req_response:
                del req_response
            if request:
                del request
        finally:
            del task
            del proxy
Example 19
class GreenletExecutor(AbstractExecutor):
    """
  GreenletExecutor is an AbstractExecutor subclass that uses a pool of
  greenlets to execute calls asynchronously.

  NOTE: Use this executor for I/O-bound tasks. Since all greenlets are
  multiplexed on a single pthread, do NOT use this for compute-bound
  callables. Try using the GIPCExecutor instead.
  """
    def __init__(self, num_greenlets=50, **kwargs):
        super(GreenletExecutor, self).__init__(**kwargs)
        self.pool = Pool(size=num_greenlets)
        self.task_queue = Queue()
        self.num_ready = 0

    def _shutdown(self):
        for _ in xrange(len(self.pool)):
            self.task_queue.put(None)
        if self.force_kill_on_shutdown:
            self.pool.kill()
        else:
            self.pool.join()

    def _worker_loop(self):
        try:
            self.num_ready += 1
            while True:
                self.num_ready -= 1
                task = self.task_queue.get()
                if task is None:
                    return
                task.execute()
                self.num_ready += 1
        except:
            pass

    def _submit(self, task):
        self.task_queue.put(task)
        if not self.num_ready and self.pool.free_count():
            self.pool.spawn(self._worker_loop)
Example 20
class GreenletExecutor(AbstractExecutor):
  """
  GreenletExecutor is an AbstractExecutor subclass that uses a pool of
  greenlets to execute calls asynchronously.

  NOTE: Use this executor for I/O-bound tasks. Since all greenlets are
  multiplexed on a single pthread, do NOT use this for compute-bound
  callables. Try using the GIPCExecutor instead.
  """
  def __init__(self, num_greenlets=50, **kwargs):
    super(GreenletExecutor, self).__init__(**kwargs)
    self.pool = Pool(size=num_greenlets)
    self.task_queue = Queue()
    self.num_ready = 0

  def _shutdown(self):
    for _ in xrange(len(self.pool)):
      self.task_queue.put(None)
    if self.force_kill_on_shutdown:
      self.pool.kill()
    else:
      self.pool.join()

  def _worker_loop(self):
    try:
      self.num_ready += 1
      while True:
        self.num_ready -= 1
        task = self.task_queue.get()
        if task is None:
          return
        task.execute()
        self.num_ready += 1
    except:
      pass

  def _submit(self, task):
    self.task_queue.put(task)
    if not self.num_ready and self.pool.free_count():
      self.pool.spawn(self._worker_loop)
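As the docstring says, the executor keeps greenlets for I/O-bound work and only spawns another worker loop when nothing is idle and the pool still has room. A stripped-down sketch of that lazy-spawn pattern with plain callables as tasks (the AbstractExecutor plumbing and the num_ready bookkeeping are assumed away):

import gevent
from gevent.pool import Pool
from gevent.queue import Queue

pool = Pool(size=5)
task_queue = Queue()

def worker_loop():
    while True:
        task = task_queue.get()
        if task is None:            # shutdown sentinel, as in _shutdown()
            return
        task()

def submit(task):
    task_queue.put(task)
    if pool.free_count():           # grow the worker pool only while there is room
        pool.spawn(worker_loop)

def shutdown():
    for _ in range(len(pool)):      # one sentinel per live worker
        task_queue.put(None)
    pool.join()

if __name__ == '__main__':
    for i in range(10):
        submit(lambda i=i: gevent.sleep(0.05))
    shutdown()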
Example 21
def run_gevent():

    worker = _config.get('count_worker', 4)
    pool = Pool(worker)

    funcs = run()

    while True:

        if pool.full():
            time.sleep(1)
            continue

        # get the next func
        try:
            funcnya = next(funcs)
            pool.spawn(funcnya['func'], *funcnya['param'])

        except StopIteration as e:

            if pool.free_count() == worker:
                break

        time.sleep(0.01)
Example 22
class worker:

	def __init__(self,seeds):

		self.showpercounts = 50
		self.timeout = 10
		self.starttime = time.time()
		self.quit = 0

		#self.run_queue = Queue()
		self.run_queue = daemon.run_que
		self.done_queue = daemon.done_que
		self.tasks = []
		self.done = 0
		
		self.httpget = self.httpget_requests # down method self.httpget_requests | httpget_curl

		self.poolsize = 300
		self.freecount = 0
		#self.maxfreecnt = 4
		self.down_pool = Pool(size=self.poolsize)

		#self.mutex = gevent.coros.RLock()

		self.totalnettime = 0
		self.cbcputime = 0
		self.totaldownsize = 0
		
		self.curspeed = 0
		self.test = 0
		self.errcnt  = 0
		self.bfdone = daemon.bfdone
		self.size = 0
		
		if self.run_queue.qsize() == 0:
			for seed in seeds:
				self.run_queue.put( seed.split("http://")[-1] )

		self.urlpatern = re.compile('href=[\"\']http://([^/?#\"\']+)')



	def cb_httpget(self, data):

		st = time.time()
		seed, err, headers, content = data

		#sself.test += 1
		if err or len(content) == 0:
			self.errcnt += 1
			return
			
		data={'url':seed,'headers':headers,'content':content}
		dat = cPickle.dumps(data)
		
		self.size = len(content)

		self.done_queue.put(dat)
		self.done += 1
		#seed.split('http://')[-1]
		self.bfdone.add(seed)

		et = time.time()
		
		self.cbcputime += (et-st)

		if self.done % self.showpercounts == 0:
			t = self.cbcputime/self.done
			self.out(seed ,(et-st))

		

	def out(self, cururl, cbtime=0 ):
		spendtime = time.time() - self.starttime
		spendtime = 1 if spendtime == 0 else spendtime
		nowh = str(int(spendtime)/3600)+":" if spendtime>3600 else ""
		now = "%s%02d:%02d" % (nowh, spendtime%3600/60, spendtime%60 )

		print "%s D:%-4d R:%-7d SpeedT:%.2f/s SpeedC:%.2f/s Test:%0.2f CB:%0.4f Active:%d Err:%d %s" % (now, (self.done), self.run_queue.qsize(), \
			self.done/spendtime,self.curspeed, self.test, cbtime ,self.poolsize-self.freecount, self.errcnt, cururl )
	
	

	def work(self):

		while self.quit == 0:
			curstime = time.time()

			self.freecount = self.down_pool.free_count()

			self.tasks = []
			if self.freecount == 0:
				gevent.sleep(0.1)
				continue

			st = time.time()
			xlen = self.freecount

			lasturl = ""
			while xlen > 0:
				xlen -= 1

				url = self.run_queue.get()
				if url == lasturl:
					continue
				else:
					lasturl = url
				url = "http://"+url
				if url in self.bfdone:
					xlen += 1
					continue
				#print xlen, url, self.down_pool.free_count()

				self.tasks.append(url)
				self.down_pool.apply_async(self.httpget, (url,), callback=self.cb_httpget)
			
			et = time.time()

			curetime = time.time()
			#self.curspeed = (self.done - curdone) / (curetime-curstime)
	
		self.down_pool.join()
		print "All OVER"

	
	# requests is better than pycurl ?
	def httpget_requests(self, url):

		st = time.time()
		con = ""
		e = None
		#'Connection':'close',
		headers = {
					'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.6',
					'Accept-Encoding':'gzip,deflate',
					'Connection':'close',
					'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
				}

		try:
			# query the ip of the website
			req = requests
			#r = requests
			req.max_redirects = 1
			#with gevent.Timeout(5, False) as timeout:
			res = req.get(url, timeout = self.timeout)
			if res.url.startswith('https'):
				raise
			con = res.content
			headers = res.headers
			res.close()


		except KeyboardInterrupt:
				raise
		except Exception as e:

			et = time.time()
			return url,e,None,None

		et = time.time()
		self.totalnettime += (et-st)
		self.curspeed = self.totalnettime/(self.done+1)
		return url, e, headers, con
Example 23
class BreakpadSubmitterResource(RequiredConfigMixin):
    """Handles incoming breakpad crash reports and saves to crashstorage

    This handles incoming HTTP POST requests containing breakpad-style crash
    reports in multipart/form-data format.

    It can handle compressed or uncompressed POST payloads.

    It parses the payload from the HTTP POST request, runs it through the
    throttler with the specified rules, generates a crash_id, returns the
    crash_id to the HTTP client and then saves the crash using the configured
    crashstorage class.

    .. Note::

       From when a crash comes in to when it's saved by the crashstorage class,
       the crash is entirely in memory. Keep that in mind when figuring out
       how to scale your Antenna nodes.


    The most important configuration bit here is choosing the crashstorage
    class.

    For example::

        CRASHSTORAGE_CLASS=antenna.ext.s3.crashstorage.S3CrashStorage

    """
    required_config = ConfigOptions()
    required_config.add_option(
        'dump_field',
        default='upload_file_minidump',
        doc='the name of the field in the POST data for dumps')
    required_config.add_option('dump_id_prefix',
                               default='bp-',
                               doc='the crash type prefix')
    required_config.add_option(
        'crashstorage_class',
        default='antenna.ext.crashstorage_base.NoOpCrashStorage',
        parser=parse_class,
        doc='the class in charge of storing crashes')

    # Maximum number of concurrent crashmover workers; each process gets this
    # many concurrent crashmovers, so if you're running 5 processes on the node
    # then it's (5 * concurrent_crashmovers) fighting for upload bandwidth
    required_config.add_option(
        'concurrent_crashmovers',
        default='2',
        parser=int,
        doc='the number of crashes concurrently being saved to s3')

    def __init__(self, config):
        self.config = config.with_options(self)
        self.crashstorage = self.config('crashstorage_class')(
            config.with_namespace('crashstorage'))
        self.throttler = Throttler(config)

        # Gevent pool for crashmover workers
        self.crashmover_pool = Pool(size=self.config('concurrent_crashmovers'))

        # Queue for crashmover of crashes to save
        self.crashmover_save_queue = deque()

        # Register hb functions with heartbeat manager
        register_for_heartbeat(self.hb_report_health_stats)
        register_for_heartbeat(self.hb_run_crashmover)

        # Register life function with heartbeat manager
        register_for_life(self.has_work_to_do)

    def get_runtime_config(self, namespace=None):
        for item in super().get_runtime_config():
            yield item

        for item in self.throttler.get_runtime_config():
            yield item

        for item in self.crashstorage.get_runtime_config(['crashstorage']):
            yield item

    def check_health(self, state):
        if hasattr(self.crashstorage, 'check_health'):
            self.crashstorage.check_health(state)

    def hb_report_health_stats(self):
        # The number of crash reports sitting in the queue; this is a direct
        # measure of the health of this process--a number that's going up means
        # impending doom
        mymetrics.gauge('save_queue_size',
                        value=len(self.crashmover_save_queue))

    def has_work_to_do(self):
        work_to_do = len(self.crashmover_save_queue) + len(
            self.crashmover_pool)
        logger.info('work left to do: %s' % work_to_do)
        # Indicates whether or not we're sitting on crashes to save--this helps
        # keep Antenna alive until we're done saving crashes
        return bool(work_to_do)

    def extract_payload(self, req):
        """Parses the HTTP POST payload

        Decompresses the payload if necessary and then walks through the
        FieldStorage converting from multipart/form-data to Python datatypes.

        NOTE(willkg): The FieldStorage is poorly documented (in my opinion). It
        has a list attribute that is a list of FieldStorage items--one for each
        key/val in the form. For attached files, the FieldStorage will have a
        name, value and filename and the type should be
        application/octet-stream. Thus we parse it looking for things of type
        text/plain and application/octet-stream.

        :arg falcon.request.Request req: a Falcon Request instance

        :returns: (raw_crash dict, dumps dict)

        """
        # If we don't have a content type, return an empty crash
        if not req.content_type:
            return {}, {}

        # If it's the wrong content type or there's no boundary section, return
        # an empty crash
        content_type = [
            part.strip() for part in req.content_type.split(';', 1)
        ]
        if ((len(content_type) != 2 or content_type[0] != 'multipart/form-data'
             or not content_type[1].startswith('boundary='))):
            return {}, {}

        content_length = req.content_length or 0

        # If there's no content, return an empty crash
        if content_length == 0:
            return {}, {}

        # Decompress payload if it's compressed
        if req.env.get('HTTP_CONTENT_ENCODING') == 'gzip':
            mymetrics.incr('gzipped_crash')

            # If the content is gzipped, we pull it out and decompress it. We
            # have to do that here because nginx doesn't have a good way to do
            # that in nginx-land.
            gzip_header = 16 + zlib.MAX_WBITS
            try:
                data = zlib.decompress(req.stream.read(content_length),
                                       gzip_header)
            except zlib.error:
                # This indicates this isn't a valid compressed stream. Given
                # that the HTTP request insists it is, we're just going to
                # assume it's junk and not try to process any further.
                mymetrics.incr('bad_gzipped_crash')
                return {}, {}

            # Stomp on the content length to correct it because we've changed
            # the payload size by decompressing it. We save the original value
            # in case we need to debug something later on.
            req.env['ORIG_CONTENT_LENGTH'] = content_length
            content_length = len(data)
            req.env['CONTENT_LENGTH'] = str(content_length)

            data = io.BytesIO(data)
            mymetrics.histogram('crash_size',
                                value=content_length,
                                tags=['payload:compressed'])
        else:
            # NOTE(willkg): At this point, req.stream is either a
            # falcon.request_helper.BoundedStream (in tests) or a
            # gunicorn.http.body.Body (in production).
            #
            # FieldStorage doesn't work with BoundedStream so we pluck out the
            # internal stream from that which works fine.
            #
            # FIXME(willkg): why don't tests work with BoundedStream?
            if isinstance(req.stream, BoundedStream):
                data = req.stream.stream
            else:
                data = req.stream

            mymetrics.histogram('crash_size',
                                value=content_length,
                                tags=['payload:uncompressed'])

        fs = cgi.FieldStorage(fp=data, environ=req.env, keep_blank_values=1)

        # NOTE(willkg): In the original collector, this returned request
        # querystring data as well as request body data, but we're not doing
        # that because the query string just duplicates data in the payload.

        raw_crash = {}
        dumps = {}

        for fs_item in fs.list:
            # NOTE(willkg): We saw some crashes come in where the raw crash ends up with
            # a None as a key. Make sure we can't end up with non-strings as keys.
            item_name = de_null(fs_item.name or '')

            if item_name == 'dump_checksums':
                # We don't want to pick up the dump_checksums from a raw
                # crash that was re-submitted.
                continue

            elif fs_item.type and (
                    fs_item.type.startswith('application/octet-stream')
                    or isinstance(fs_item.value, bytes)):
                # This is a dump, so add it to dumps using a sanitized dump
                # name.
                dump_name = sanitize_dump_name(item_name)
                dumps[dump_name] = fs_item.value

            else:
                # This isn't a dump, so it's a key/val pair, so we add that.
                raw_crash[item_name] = de_null(fs_item.value)

        return raw_crash, dumps

    def get_throttle_result(self, raw_crash):
        """Given a raw_crash, figures out the throttling

        If the raw_crash contains throttling information already, it returns
        that. If it doesn't, then this will apply throttling and return the
        results of that.

        A rule name of ``ALREADY_THROTTLED`` indicates that the raw_crash was
        previously throttled and we're re-using that data.

        A rule name of ``THROTTLEABLE_0`` indicates that the raw_crash was
        marked to not be throttled.

        :arg dict raw_crash: the raw crash to throttle

        :returns tuple: ``(result, rule_name, percentage)``

        """
        # If the raw_crash has a uuid, then that implies throttling, so return
        # that.
        if 'uuid' in raw_crash:
            crash_id = raw_crash['uuid']
            if crash_id[-7] in (str(ACCEPT), str(DEFER)):
                result = int(crash_id[-7])
                throttle_rate = 100

                # Save the results in the raw_crash itself
                raw_crash['legacy_processing'] = result
                raw_crash['throttle_rate'] = throttle_rate

                return result, 'FROM_CRASHID', throttle_rate

        # If we have throttle results for this crash, return those.
        if 'legacy_processing' in raw_crash and 'throttle_rate' in raw_crash:
            try:
                result = int(raw_crash['legacy_processing'])
                if result not in (ACCEPT, DEFER):
                    raise ValueError('Result is not a valid value: %r', result)

                throttle_rate = int(raw_crash['throttle_rate'])
                if not (0 <= throttle_rate <= 100):
                    raise ValueError('Throttle rate is not a valid value: %r',
                                     result)
                return result, 'ALREADY_THROTTLED', throttle_rate

            except ValueError:
                # If we've gotten a ValueError, it means one or both of the
                # values is bad and we should ignore it and move forward.
                mymetrics.incr('throttle.bad_throttle_values')

        # If we have a Throttleable=0, then return that.
        if raw_crash.get('Throttleable', None) == '0':
            # If the raw crash has ``Throttleable=0``, then we accept the
            # crash.
            mymetrics.incr('throttleable_0')
            result = ACCEPT
            rule_name = 'THROTTLEABLE_0'
            throttle_rate = 100

        else:
            # At this stage, nothing has given us a throttle answer, so we
            # throttle the crash.
            result, rule_name, throttle_rate = self.throttler.throttle(
                raw_crash)

        # Save the results in the raw_crash itself
        raw_crash['legacy_processing'] = result
        raw_crash['throttle_rate'] = throttle_rate

        return result, rule_name, throttle_rate

    @mymetrics.timer_decorator('on_post.time')
    def on_post(self, req, resp):
        """Handles incoming HTTP POSTs

        Note: This is executed by the WSGI app, so it and anything it does is
        covered by the Sentry middleware.

        """
        resp.status = falcon.HTTP_200

        start_time = time.time()
        # NOTE(willkg): This has to return text/plain since that's what the
        # breakpad clients expect.
        resp.content_type = 'text/plain'

        raw_crash, dumps = self.extract_payload(req)

        # If we didn't get any crash data, then just drop it and move on--don't
        # count this as an incoming crash and don't do any more work on it
        if not raw_crash:
            resp.body = 'Discarded=1'
            return

        mymetrics.incr('incoming_crash')

        # Add timestamps
        current_timestamp = utc_now()
        raw_crash['submitted_timestamp'] = current_timestamp.isoformat()
        raw_crash['timestamp'] = start_time

        # Add checksums and MinidumpSha256Hash
        raw_crash['dump_checksums'] = {
            dump_name: hashlib.sha256(dump).hexdigest()
            for dump_name, dump in dumps.items()
        }
        raw_crash['MinidumpSha256Hash'] = raw_crash['dump_checksums'].get(
            'upload_file_minidump', '')

        # First throttle the crash which gives us the information we need
        # to generate a crash id.
        throttle_result, rule_name, percentage = self.get_throttle_result(
            raw_crash)

        # Use a uuid if they gave us one and it's valid--otherwise create a new
        # one.
        if 'uuid' in raw_crash and validate_crash_id(raw_crash['uuid']):
            crash_id = raw_crash['uuid']
            logger.info('%s has existing crash_id', crash_id)

        else:
            crash_id = create_crash_id(timestamp=current_timestamp,
                                       throttle_result=throttle_result)
            raw_crash['uuid'] = crash_id

        raw_crash['type_tag'] = self.config('dump_id_prefix').strip('-')

        # Log the throttle result
        logger.info('%s: matched by %s; returned %s', crash_id, rule_name,
                    RESULT_TO_TEXT[throttle_result])
        mymetrics.incr('throttle_rule', tags=['rule:%s' % rule_name])
        mymetrics.incr(
            'throttle',
            tags=['result:%s' % RESULT_TO_TEXT[throttle_result].lower()])

        if throttle_result is REJECT:
            # If the result is REJECT, then discard it
            resp.body = 'Discarded=1'

        else:
            # If the result is not REJECT, then save it and return the CrashID to
            # the client
            self.crashmover_save_queue.append(
                CrashReport(raw_crash, dumps, crash_id))
            self.hb_run_crashmover()
            resp.body = 'CrashID=%s%s\n' % (self.config('dump_id_prefix'),
                                            crash_id)

    def hb_run_crashmover(self):
        """Checks to see if it should spawn a crashmover and does if appropriate"""
        # Spawn a new crashmover if there's stuff in the queue and there isn't
        # one currently running
        if self.crashmover_save_queue and self.crashmover_pool.free_count() > 0:
            self.crashmover_pool.spawn(self.crashmover_process_queue)

    def crashmover_process_queue(self):
        """Processes the queue of crashes to save until it's empty

        Note: This has to be super careful not to lose crash reports. If
        there's any kind of problem, this must return the crash to the queue.

        """
        # Process crashes until the queue is empty
        while self.crashmover_save_queue:
            crash_report = self.crashmover_save_queue.popleft()

            try:
                self.crashmover_save(crash_report)

            except Exception:
                mymetrics.incr('save_crash_exception.count')
                crash_report.errors += 1
                logger.exception(
                    'Exception when processing save queue (%s); error %d/%d',
                    crash_report.crash_id, crash_report.errors, MAX_ATTEMPTS)

                # After MAX_ATTEMPTS, we give up on this crash and move on
                if crash_report.errors < MAX_ATTEMPTS:
                    self.crashmover_save_queue.append(crash_report)
                else:
                    logger.error('%s: too many errors trying to save; dropped',
                                 crash_report.crash_id)
                    mymetrics.incr('save_crash_dropped.count')

    def crashmover_save(self, crash_report):
        """Saves a crash to storage

        If this raises an error, then that bubbles up and the caller can figure
        out what to do with it and retry again later.

        """
        crash_id = crash_report.crash_id
        dumps = crash_report.dumps
        raw_crash = crash_report.raw_crash

        # Capture total time it takes to save the crash
        with mymetrics.timer('crash_save.time'):
            # Save dumps to crashstorage
            self.crashstorage.save_dumps(crash_id, dumps)

            # Save the raw crash metadata to crashstorage
            self.crashstorage.save_raw_crash(crash_id, raw_crash)

        # Capture the total time it took for this crash to be handled from
        # being received from breakpad client to saving to s3.
        #
        # NOTE(willkg): time.time returns seconds, but .timing() wants
        # milliseconds, so we multiply!
        delta = (time.time() - raw_crash['timestamp']) * 1000
        mymetrics.timing('crash_handling.time', value=delta)

        mymetrics.incr('save_crash.count')
        logger.info('%s saved', crash_id)

    def join_pool(self):
        """Joins the pool--use only in tests!

        This is helpful for forcing all the coroutines in the pool to complete
        so that we can verify outcomes in the test suite for work that might
        cross coroutines.

        """
        self.crashmover_pool.join()
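hb_run_crashmover() above only spawns a crashmover when the save queue is non-empty and the pool reports a free slot, and crashmover_process_queue() then drains the queue. The skeleton of that coordination, reduced to a deque and a pool with illustrative names (not Antenna's API):

from collections import deque

import gevent
from gevent.pool import Pool

save_queue = deque()
pool = Pool(size=2)

def process_queue():
    # Drain the queue until it is empty, then let the greenlet exit.
    while save_queue:
        save_queue.popleft()
        gevent.sleep(0.01)          # stand-in for saving to crash storage

def enqueue(item):
    save_queue.append(item)
    # Spawn a mover only when there is work and a free slot in the pool.
    if save_queue and pool.free_count() > 0:
        pool.spawn(process_queue)

for i in range(10):
    enqueue(i)
pool.join()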
Example 24
class _DownloadAgent(object):
    """Exclusively manages downloading files from Drive within another process.
    """
# TODO(dustin): We'll have to use multiprocessing's logging wrappers.
    def __init__(self, request_q, stop_ev):
        self.__request_q = request_q
        self.__stop_ev = stop_ev
        self.__kill_ev = gevent.event.Event()
        self.__worker_pool = Pool(size=download_agent.NUM_WORKERS)
        self.__http_pool = HttpPool(download_agent.HTTP_POOL_SIZE)
        self.__http = GdriveAuth().get_authed_http()

    def download_worker(self, download_request, request_ev, download_stop_ev, 
                        ns):
# TODO(dustin): We're just assuming that we can signal a multiprocessing event
#               from a green thread (the event still has value switching
#               through green threads).

        file_path = ('/tmp/gdrivefs/downloaded/%s' % 
                     (download_request.typed_entry.entry_id))

        with open(file_path, 'wb') as f:
            downloader = ChunkedDownload(f, 
                                         self.__http, 
                                         download_request.url, 
                                         chunksize=download_agent.CHUNK_SIZE)

        try:
            while 1:
                # Stop downloading because the process is coming down.
                if self.__kill_ev.is_set() is True:
                    raise DownloadAgentDownloadWorkerError(
                        "Download worker terminated.")

                # Stop downloading this file, probably because all handles were 
                # closed.
                if download_stop_ev.is_set() is True:
                    raise DownloadAgentDownloadWorkerError(
                        "Download worker was told to stop downloading.")

# TODO(dustin): We'll have to provide an option for "revision assurance" to ensure that we download the same revision of a file from chunk to chunk. Otherwise, we won't have the guarantee.

# TODO(dustin): Support reauthing, when necessary.
# TODO(dustin): Support resumability.

                status, done = downloader.next_chunk()
                ns.bytes_written = status.resumable_progress

                if done is True:
                    break

# TODO(dustin): Finish this, and make sure the timezone matches the current system.
            mtime_epoch = 0#download_request.current_mtime_dt
            utime(file_path, (mtime_epoch, mtime_epoch))

        except Exception as e:
            error = ("[%s] %s" % (e.__class__.__name__, str(e)))
        else:
            error = None

        ns.error = error
        if error is None:
            ns.file_path = file_path

        request_ev.set()

    def loop(self):
        while self.__stop_ev.is_set() is False:
            try:
                request_info = self.__request_q.get(
                    timeout=download_agent.REQUEST_QUEUE_TIMEOUT_S)
            except Empty:
                continue

            if self.__worker_pool.free_count() == 0:
                logging.warn("It looks like we'll have to wait for a download "
                             "worker to free up.")

            self.__worker_pool.spawn(self.download_worker, *request_info)

        # The download loop has exited (we were told to stop).

        # Signal the workers to stop what they're doing.

        self.__kill_ev.set()
        start_epoch = time()
        all_exited = False
        while (time() - start_epoch) < \
                download_agent.GRACEFUL_WORKER_EXIT_WAIT_S:
            if self.__worker_pool.size <= self.__worker_pool.free_count():
                all_exited = True
                break

        if all_exited is False:
            logging.error("Not all download workers exited in time: %d != %d" % 
                          (self.__worker_pool.size,
                           self.__worker_pool.free_count()))

        # Kill and join the unassigned (and stubborn, still-assigned) workers.
# TODO(dustin): We're assuming this is a hard kill that will always kill all workers.
        self.__worker_pool.kill()

        logging.info("Download agent is terminating. (%d) requested files "
                     "will be abandoned." % (self.__request_q.qsize()))
Example 25
class Worker:

	def __init__(self, seeds, done_que, run_que):

		self.showpercounts = 10
		self.timeout = 5
		self.starttime = time.time()
		self.oldtime = 0

		self.quit = 0
		self.https_enable = 0


		self.run_que = run_que
		self.done_que = done_que
		self.tasks = []
		self.done = 1

		self.errdone = set()
		self.err = Error()

		self.loadstate()

		self.blacklist = set (( '.blog.','.taobao.com','.baidu.com','.edu','.gov','.mil','mail','.google',
	'weibo.com','t.cn','wikipedia','facebook','twitter','dropbox' ))
		self.allowdDomain = set(('com','net','org','cn','info','biz','me','name','cc','tv'))

		self.httpget = self.httpget_requests # down method self.httpget_requests | httpget_curl

		self.poolsize = 60
		self.poolmaxfree = 20
		self.freecount = 0
		self.down_pool = Pool(size=self.poolsize)

		self.totalnettime = 0
		self.cbcputime = 0
		self.totaldownsize = 0
		
		self.curspeed = 0

		self.debugnosave = 1
		self.tt = 1

		self.done_sites_fname='done_sites.bin'
		try:
			self.bfdone = BloomFilter.open(self.done_sites_fname)
		except:
			self.bfdone = BloomFilter(2**23, 10**(-5), self.done_sites_fname) #8M 

		if self.run_que.qsize() == 0:
			for seed in seeds:
				self.run_que.put( seed.split("http://")[1] )

		if self.https_enable == 0:
			self.urlpatern = re.compile(r'href=["\']http://([^/?#\"\']+)',re.I)
		else:
			self.urlpatern = re.compile(r'href=["\']http[s]?://([^/?#\"\'"]+)',re.I)


	def cb_httpget(self, data = None):

		if not data:
			return
		seed, err, headers, content = data
		st = time.time()

		if err:
			self.handle_error(err,seed)
			return

		if self.https_enable == 0:
			seed = seed[7:]

		self.bfdone.add(seed)
		self.done += 1

		data={'seed':seed,'headers':headers,'content':content}

		dat = cPickle.dumps(data)
		self.done_que.put(dat)

		et = time.time()
		self.cbcputime += (et-st)
		#self.tt=(et-st)

		if self.done % self.showpercounts == 0:
			self.out(seed)
			pass

	def out(self, seed):

		spendtime = time.time() - self.starttime
		spendtime = 1 if spendtime == 0 else spendtime
		nowh = str(int(spendtime)/3600)+":" if spendtime>3600 else ""
		now = "%s%02d:%02d" % (nowh, spendtime%3600/60, spendtime%60 )
		print "%s D:%-4d R:%-7d [Speed: T%.2f/s C%3d/s A%.2f] CB:%0.4f Active:%d %s %s" % (now, (self.done), self.run_que.qsize(), \
			(self.done)/(spendtime+self.oldtime), self.curspeed, self.tt, self.totalnettime / self.done ,self.poolsize-self.freecount, str(self.err), seed )
	
	
	def work(self):

		while self.quit == 0:

			st = time.time()
			curdone = self.done

			self.freecount = self.down_pool.free_count()
			

			if self.freecount > self.poolmaxfree:
				self.tasks = []
				minlen = min(self.freecount+1,self.run_que.qsize())
				#if minlen <=0:break
				
				for i in range( minlen):
					stt = time.time()
					url = self.run_que.get()
					ett = time.time()
					if url in self.bfdone:# 5%-10%
							continue

					url = "http://"+url
					self.tasks.append(url)

				for url in self.tasks:
					self.down_pool.apply_async(self.httpget, (url,), callback=self.cb_httpget)

			
			time.sleep(0.1)
			et = time.time()	
			self.curspeed = (self.done - curdone) / (et-st)
			#self.tt = (et-st)

	
		self.down_pool.join()
		print "All OVER"

	def handle_error(self,e,url):

		if e.find('DNSError') > 0 :
			self.err.dns += 1
			self.err.rdns.append(url)
		elif e.find('reset') > 0 :#Connection reset
			self.err.reset += 1
			self.err.rreset.append(url)
		elif e.find('Max retries') > 0 or e.find('Connection aborted') > 0:
			self.err.conntimeout += 1
			self.err.rconntimeout.append(url)
		elif e.find('refused') > 0: #Connection refused
			self.err.refuse += 1
			self.err.rrefuse.append(url)

		else:
			self.err.others +=1
			self.err.rothers.append(url)
			print "Error", url, e

	# requests proved better in testing
	def httpget_requests(self, url):

		st = time.time()
		con = ""
		e = ""
		res_headers = ""
		headers = {
					'Accept-Language':'zh-CN,zh;q=0.8,zh-TW;q=0.6',
					'Accept-Encoding':'gzip,deflate',
					'Connection':'close',
					'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
				}


		res = None
		try:
			# todo: query the ip of the website before get through dns
			req = requests
			req.max_redirects = 1
			res = req.get(url, timeout = (3,2), headers = headers )
			if self.https_enable == 0 and res.url.lower().startswith('http:'):
				if 'content-type' not in res.headers.keys() or 'html' not in res.headers['content-type']:
					return None
				con = res.content
				
			res.close()

		except KeyboardInterrupt:
				raise
		except Exception as e:
			e = str(e)
			if res:
				res.close()

			return url,e,None,None

		et = time.time()
		self.totalnettime += (et-st)
		self.tt = (et-st)
		return url, e, res.headers, con

	def savestate(self):

		self.quit = 1
		now = time.time()
		self.oldtime += (now - self.starttime)

		# should hold the signal until procdata is done


		with open('state.txt','wb') as f:
			f.write(str(self.oldtime) + '\n')
			# tasks run_queue done
			f.write(str(len(self.tasks)) + '\n')
			for t in self.tasks:
				f.write(t + '\n')
			l = self.run_que.qsize()
			f.write(str(l)+ '\n')
			while l > 0:
				f.write( self.run_que.pop() + '\n')
				l-=1
			f.write(str((self.done)) + '\n')
 
		with open('err_records.pack','wb') as f:
			cPickle.dump(self.err,f,2)

		print time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), " Save state successfully."
		f.close()
		exit(0)

	def loadstate(self):

		try:
			with open('state.txt') as f:
				self.oldtime = float(f.readline())
				tasks = int(f.readline())
				for i in xrange(tasks):
					self.run_que.add(f.readline().rstrip('\n'))

				runnings = int(f.readline())
				for i in xrange(runnings):
					self.run_que.add(f.readline().rstrip('\n'))

				self.done = int(f.readline())

			with open('err_records.pack','rb') as f:
				self.err = cPickle.load(f)

			print time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), " Load state successfully."
		except Exception as e:
				print e
Example 26
class RequestEngine:

    class ProcessorManager(object):
        def __init__(self):
            self._processor_map = {'default': None}
        def set(self, processor_name,  value):
            self._processor_map[processor_name] = value

        def route(self, processor_name, **kwargs):
            if processor_name is None:
                processor_name_indeed = 'default'
            else:
                processor_name_indeed = processor_name

            processor = self._processor_map[processor_name_indeed]
            if processor is None:
                pass
            elif hasattr(processor, '__call__'):
                return processor.__call__(**kwargs)


    def __init__(self,
                 pool_size = 20,
                 pop_interval = 1,
                 request_interval = 0,
                 max_empty_retry = 2,
                 request_timeout = 10,
                 each_size_from_queue = 10,
                 max_failure_allowed = -1):
        from gevent import monkey
        monkey.patch_all()
        self.pop_interval = pop_interval
        self.request_interval = request_interval
        self.pool = Pool(pool_size)
        self.quit_event = Event()
        self.max_empty_retry = max_empty_retry
        self.request_timeout = request_timeout
        self.each_size_from_queue = each_size_from_queue
        self.user_agent_provider = UserAgentProvider()
        self.max_failure_allowed = max_failure_allowed
        self._request_failure = 0
        self.proxy_provider = None
        self.processor_manager = RequestEngine.ProcessorManager()
        self.before_each = []
        self.after_each = []

        gevent.signal(signal.SIGINT, self.quit)
        gevent.signal(signal.SIGQUIT, self.quit)
        gevent.signal(signal.SIGTERM, self.quit)

    def setup_request_queue(self, request_queue_ins):
        self.request_queue = request_queue_ins

    @property
    def active(self):
        if not hasattr(self, '_active'):
            self._active = False
        return self._active

    @active.setter
    def active(self, value):
        self._active = value

    def before_each(self, *processors):
        self.before_each += processors

    def after_each(self, *processors):
        self.after_each += processors

    def worker_count(self):
        return self.pool.size - self.pool.free_count()

    def quit(self):
        self.quit_event.set()

    def request(self, override_req_args= {}):
        self.active = True
        empty_count = 0
        while True:
            if self.quit_event.is_set():

                logger.warning("Quiting Engine")
                if self.pool.size != self.pool.free_count():
                    time.sleep(1)
                    continue

                self.active = False
                logger.warning("Engine Gracefully Quit")
                break

            if (self.max_failure_allowed != -1 and self._request_failure >= self.max_failure_allowed):
                logger.warning( "Exceed Max Failures Count. Engine Stopping ..." )
                self.quit()
                continue

            if self.pool.free_count() > self.each_size_from_queue:
                this_time_size = self.each_size_from_queue
            else:
                this_time_size = self.pool.free_count()

            if this_time_size > 0:
                reqs = self.request_queue.pop(this_time_size)
                logger.info('Current free workers: '+str(self.pool.free_count()))
                if (reqs is not None) and (len(reqs) > 0):

                    for i in reqs:
                        self.pool.spawn(self._make_requests, request=i, override = override_req_args)
                        time.sleep(self.request_interval)
                else:
                    empty_count +=1
                    if (self.max_empty_retry != -1 and empty_count >= self.max_empty_retry):
                        logger.warning( "Exceed Max Empty. Engine Stopping ..." )
                        self.quit()
                        continue

            #while self.pool.free_count() == 0:
            time.sleep(self.pop_interval)

    def setup_user_agent_provider(self, provider):
        self.user_agent_provider = provider

    def setup_proxy_provider(self, provider):
        self.proxy_provider = provider

    def register_processor(self, processor, name='default'):
        self.processor_manager.set(name, processor)

    def _make_requests(self, request, override):
        empty_count = 0
        data= {} # Data flow

        is_failure_set = False
        request.kwargs.update(override)
        # Setting user agent
        if self.user_agent_provider:
            if 'headers' in request.kwargs:
                request.kwargs['headers'].update({'User-Agent': self.user_agent_provider.provide()})
            else:
                request.kwargs['headers'] = {'User-Agent': self.user_agent_provider.provide()}

        # Setting proxy provider
        if self.proxy_provider:
            proxy = self.proxy_provider.provide()
            if proxy is not None:
                # If the provider returned None, don't use a proxy
                _proxy = {'http':proxy.proxy, 'https':proxy.proxy}
                if 'proxies' in request.kwargs:
                    request.kwargs['proxies'].update(_proxy)
                else:
                    request.kwargs['proxies'] = _proxy

                logger.warning("Using Proxy: %s" % str(_proxy))
            else:
                logger.warning("No Using Proxy")
        else:
            proxy = None


        ar = None
        result = False
        processors = {'before':None, 'after':None}
        if request.processors is not None:
            processors.update(request.processors)
        before_each_hook_result = None
        # Execute hook before every item
        try:
            logger.info("Executing before hook")
            before_each_hook_result = self.processor_manager.route(
                                                                   processor_name=processors['before'],
                                                                   request = request,
                                                                   extra = request.raw_info,
                                                                   data= data)

            for p in self.before_each:
                self.processor_manager.route(processor_name=p, request = request ,extra = request.raw_info, data= data)
        except:
            if not is_failure_set:
                self._request_failure += 1
                is_failure_set = True
            logger.error("Exception while before hook execution: "+ traceback.format_exc())
        # Execute request

        if before_each_hook_result != False:
            # Only if the before hook returned non-False
            try:
                logger.debug("Making request... (%s)" % str(request.kwargs))
                _timeout =  getattr(request.raw_info,'_timeout',self.request_timeout)
                logger.debug("Timeout setting: %s" % _timeout)
                with gevent.Timeout(_timeout):
                    ar = requests.request(**request.kwargs)
                    ar.raw_info = request.raw_info
                    result = True
                # if result is False:
                #     raise Exception("Request timeout (%s)" % self.request_timeout)
            except:

                if not is_failure_set:
                    self._request_failure += 1
                    is_failure_set = True
                logger.error("Exception while requests execution: "+ traceback.format_exc())


            try:

                # Execute hook after every request
                logger.info("Executing after hook")
                self.processor_manager.route(
                                             processor_name=processors['after'],
                                             response = ar,
                                             request = request,
                                             extra = request.raw_info,
                                             result = result, data=data)

                for p in self.after_each:
                    self.processor_manager.route(processor_name=p,response = ar, request = request,extra = request.raw_info, result = result, data= data)

                # process proxy provider
                if proxy:
                    self.proxy_provider.callback(proxy, result=result, response = ar, request=request)
            except:
                if not is_failure_set:
                    self._request_failure += 1
                    is_failure_set = True
                logger.error("Exception while after hook execution", exc_info=True)
Esempio n. 27
0
class BreakpadSubmitterResource(RequiredConfigMixin):
    """Handles incoming breakpad-style crash reports.

    This handles incoming HTTP POST requests containing breakpad-style crash reports in
    multipart/form-data format.

    It can handle compressed or uncompressed POST payloads.

    It parses the payload from the HTTP POST request, runs it through the throttler with
    the specified rules, generates a crash_id, returns the crash_id to the HTTP client,
    saves the crash using the configured crashstorage class, and publishes it using
    the configured crashpublish class.

    .. Note::

       From when a crash comes in to when it's saved by the crashstorage class, the
       crash is entirely in memory. Keep that in mind when figuring out how to scale
       your Antenna nodes.


    The most important configuration bit here is choosing the crashstorage class.

    For example::

        CRASHSTORAGE_CLASS=antenna.ext.s3.crashstorage.S3CrashStorage

    """

    required_config = ConfigOptions()
    required_config.add_option(
        "dump_field",
        default="upload_file_minidump",
        doc="The name of the field in the POST data for dumps.",
    )
    required_config.add_option(
        "dump_id_prefix", default="bp-", doc="The crash type prefix."
    )
    required_config.add_option(
        "concurrent_crashmovers",
        default="2",
        parser=positive_int,
        doc=(
            "The number of crashes concurrently being saved and published. "
            "Each process gets this many concurrent crashmovers, so if you're "
            "running 5 processes on the node, then it's "
            "(5 * concurrent_crashmovers) sharing upload bandwidth."
        ),
    )

    # crashstorage things
    required_config.add_option(
        "crashstorage_class",
        default="antenna.ext.crashstorage_base.NoOpCrashStorage",
        parser=parse_class,
        doc="The class in charge of storing crashes.",
    )

    # crashpublish things
    required_config.add_option(
        "crashpublish_class",
        default="antenna.ext.crashpublish_base.NoOpCrashPublish",
        parser=parse_class,
        doc="The class in charge of publishing crashes.",
    )

    def __init__(self, config):
        self.config = config.with_options(self)
        self.crashstorage = self.config("crashstorage_class")(
            config.with_namespace("crashstorage")
        )
        self.crashpublish = self.config("crashpublish_class")(
            config.with_namespace("crashpublish")
        )
        self.throttler = Throttler(config)

        # Gevent pool for crashmover workers
        self.crashmover_pool = Pool(size=self.config("concurrent_crashmovers"))

        # Queue for crashmover work
        self.crashmover_queue = deque()

        # Register hb functions with heartbeat manager
        register_for_heartbeat(self.hb_report_health_stats)
        register_for_heartbeat(self.hb_run_crashmover)

        # Register life function with heartbeat manager
        register_for_life(self.has_work_to_do)

    def get_runtime_config(self, namespace=None):
        """Return generator of runtime configuration."""
        for item in super().get_runtime_config():
            yield item

        for item in self.throttler.get_runtime_config():
            yield item

        for item in self.crashstorage.get_runtime_config(["crashstorage"]):
            yield item

        for item in self.crashpublish.get_runtime_config(["crashpublish"]):
            yield item

    def check_health(self, state):
        """Return health state."""
        if hasattr(self.crashstorage, "check_health"):
            self.crashstorage.check_health(state)
        if hasattr(self.crashpublish, "check_health"):
            self.crashpublish.check_health(state)

    def hb_report_health_stats(self):
        """Heartbeat function to report health stats."""
        # The number of crash reports sitting in the work queue; this is a
        # direct measure of the health of this process--a number that's going
        # up means impending doom
        mymetrics.gauge("work_queue_size", value=len(self.crashmover_queue))

    def has_work_to_do(self):
        """Return whether this still has work to do."""
        work_to_do = len(self.crashmover_pool) + len(self.crashmover_queue)
        logger.info("work left to do: %s" % work_to_do)
        # Indicates whether or not we're sitting on crashes to save--this helps
        # keep Antenna alive until we're done saving crashes
        return bool(work_to_do)

    def extract_payload(self, req):
        """Parse HTTP POST payload.

        Decompresses the payload if necessary and then walks through the
        FieldStorage converting from multipart/form-data to Python datatypes.

        NOTE(willkg): The FieldStorage is poorly documented (in my opinion). It
        has a list attribute that is a list of FieldStorage items--one for each
        key/val in the form. For attached files, the FieldStorage will have a
        name, value and filename and the type should be
        ``application/octet-stream``. Thus we parse it looking for things of type
        ``text/plain``, ``application/json``, and ``application/octet-stream``.

        :arg falcon.request.Request req: a Falcon Request instance

        :returns: (raw_crash dict, dumps dict)

        :raises MalformedCrashReport:

        """
        # If we don't have a content type, raise MalformedCrashReport
        if not req.content_type:
            raise MalformedCrashReport("no_content_type")

        # If it's the wrong content type or there's no boundary section, raise
        # MalformedCrashReport
        content_type = [part.strip() for part in req.content_type.split(";", 1)]
        if (
            len(content_type) != 2
            or content_type[0] != "multipart/form-data"
            or not content_type[1].startswith("boundary=")
        ):
            if content_type[0] != "multipart/form-data":
                raise MalformedCrashReport("wrong_content_type")
            else:
                raise MalformedCrashReport("no_boundary")

        content_length = req.content_length or 0

        # If there's no content, raise MalformedCrashReport
        if content_length == 0:
            raise MalformedCrashReport("no_content_length")

        # Decompress payload if it's compressed
        if req.env.get("HTTP_CONTENT_ENCODING") == "gzip":
            mymetrics.incr("gzipped_crash")

            # If the content is gzipped, we pull it out and decompress it. We
            # have to do that here because nginx doesn't have a good way to do
            # that in nginx-land.
            gzip_header = 16 + zlib.MAX_WBITS
            try:
                data = zlib.decompress(req.stream.read(content_length), gzip_header)
            except zlib.error:
                # This indicates this isn't a valid compressed stream. Given
                # that the HTTP request insists it is, we're just going to
                # assume it's junk and not try to process any further.
                raise MalformedCrashReport("bad_gzip")

            # Stomp on the content length to correct it because we've changed
            # the payload size by decompressing it. We save the original value
            # in case we need to debug something later on.
            req.env["ORIG_CONTENT_LENGTH"] = content_length
            content_length = len(data)
            req.env["CONTENT_LENGTH"] = str(content_length)

            data = io.BytesIO(data)
            mymetrics.histogram(
                "crash_size", value=content_length, tags=["payload:compressed"]
            )
        else:
            # NOTE(willkg): At this point, req.stream is either a
            # falcon.request_helper.BoundedStream (in tests) or a
            # gunicorn.http.body.Body (in production).
            #
            # FieldStorage doesn't work with BoundedStream so we pluck out the
            # internal stream from that which works fine.
            #
            # FIXME(willkg): why don't tests work with BoundedStream?
            if isinstance(req.stream, BoundedStream):
                data = req.stream.stream
            else:
                data = req.stream

            mymetrics.histogram(
                "crash_size", value=content_length, tags=["payload:uncompressed"]
            )

        # Stomp on querystring so we don't pull it in
        request_env = dict(req.env)
        request_env["QUERY_STRING"] = ""

        fs = cgi.FieldStorage(fp=data, environ=request_env, keep_blank_values=1)

        raw_crash = {}
        dumps = {}

        has_json = False
        has_kvpairs = False

        for fs_item in fs.list:
            # If the field has no name, then it's probably junk, so let's drop it.
            if not fs_item.name:
                continue

            if fs_item.name == "dump_checksums":
                # We don't want to pick up the dump_checksums from a raw
                # crash that was re-submitted.
                continue

            elif fs_item.type and fs_item.type.startswith("application/json"):
                # This is a JSON blob, so load it and override raw_crash with
                # it.
                has_json = True
                try:
                    raw_crash = json.loads(fs_item.value)
                except json.decoder.JSONDecodeError:
                    raise MalformedCrashReport("bad_json")

            elif fs_item.type and (
                fs_item.type.startswith("application/octet-stream")
                or isinstance(fs_item.value, bytes)
            ):
                # This is a dump, so add it to dumps using a sanitized dump
                # name.
                dump_name = sanitize_dump_name(fs_item.name)
                dumps[dump_name] = fs_item.value

            else:
                # This isn't a dump, so it's a key/val pair, so we add that.
                has_kvpairs = True
                raw_crash[fs_item.name] = fs_item.value

        if not raw_crash:
            raise MalformedCrashReport("no_annotations")

        if has_json and has_kvpairs:
            # If the crash payload has both kvpairs and a JSON blob, then it's
            # malformed and we should dump it.
            raise MalformedCrashReport("has_json_and_kv")

        # Add a note about how the annotations were encoded in the crash report.
        # For now, there are two options: json and multipart.
        if has_json:
            raw_crash["payload"] = "json"
        else:
            raw_crash["payload"] = "multipart"

        return raw_crash, dumps

    def get_throttle_result(self, raw_crash):
        """Run raw_crash through throttler for a throttling result.

        :arg dict raw_crash: the raw crash to throttle

        :returns tuple: ``(result, rule_name, percentage)``

        """
        # At this stage, nothing has given us a throttle answer, so we
        # throttle the crash.
        result, rule_name, throttle_rate = self.throttler.throttle(raw_crash)

        # Save the results in the raw_crash itself
        raw_crash["legacy_processing"] = result
        raw_crash["throttle_rate"] = throttle_rate

        return result, rule_name, throttle_rate

    def cleanup_crash_report(self, raw_crash):
        """Remove anything from the crash report that shouldn't be there.

        This operates on the raw_crash in-place. This adds notes to ``collector_notes``.

        """
        collector_notes = []

        # Remove bad fields
        for bad_field in BAD_FIELDS:
            if bad_field in raw_crash:
                del raw_crash[bad_field]
                collector_notes.append("Removed %s from raw crash." % bad_field)

        raw_crash["collector_notes"] = collector_notes

    @mymetrics.timer_decorator("on_post.time")
    def on_post(self, req, resp):
        """Handle incoming HTTP POSTs.

        Note: This is executed by the WSGI app, so it and anything it does is
        covered by the Sentry middleware.

        """
        resp.status = falcon.HTTP_200

        start_time = time.time()
        # NOTE(willkg): This has to return text/plain since that's what the
        # breakpad clients expect.
        resp.content_type = "text/plain"

        try:
            raw_crash, dumps = self.extract_payload(req)

        except MalformedCrashReport as exc:
            # If this is malformed, then reject it with malformed error code.
            msg = str(exc)
            mymetrics.incr("malformed", tags=["reason:%s" % msg])
            resp.status = falcon.HTTP_400
            resp.body = "Discarded=malformed_%s" % msg
            return

        mymetrics.incr("incoming_crash")

        # Add timestamps
        current_timestamp = utc_now()
        raw_crash["submitted_timestamp"] = current_timestamp.isoformat()
        raw_crash["timestamp"] = start_time

        # Add checksums and MinidumpSha256Hash
        raw_crash["dump_checksums"] = {
            dump_name: hashlib.sha256(dump).hexdigest()
            for dump_name, dump in dumps.items()
        }
        raw_crash["MinidumpSha256Hash"] = raw_crash["dump_checksums"].get(
            "upload_file_minidump", ""
        )

        # First throttle the crash which gives us the information we need
        # to generate a crash id.
        throttle_result, rule_name, percentage = self.get_throttle_result(raw_crash)

        # Use a uuid if they gave us one and it's valid--otherwise create a new
        # one.
        if "uuid" in raw_crash and validate_crash_id(raw_crash["uuid"]):
            crash_id = raw_crash["uuid"]
            logger.info("%s has existing crash_id", crash_id)

        else:
            crash_id = create_crash_id(
                timestamp=current_timestamp, throttle_result=throttle_result
            )
            raw_crash["uuid"] = crash_id

        raw_crash["type_tag"] = self.config("dump_id_prefix").strip("-")

        # Log the throttle result
        logger.info(
            "%s: matched by %s; returned %s",
            crash_id,
            rule_name,
            RESULT_TO_TEXT[throttle_result],
        )
        mymetrics.incr("throttle_rule", tags=["rule:%s" % rule_name])
        mymetrics.incr(
            "throttle", tags=["result:%s" % RESULT_TO_TEXT[throttle_result].lower()]
        )

        # If the result is REJECT, then discard it
        if throttle_result is REJECT:
            resp.body = "Discarded=rule_%s" % rule_name
            return

        # If the result is a FAKEACCEPT, then we return a crash id, but throw the crash
        # away
        if throttle_result is FAKEACCEPT:
            resp.body = "CrashID=%s%s\n" % (self.config("dump_id_prefix"), crash_id)
            return

        # If we're accepting the crash report, then clean it up, save it and return the
        # CrashID to the client
        self.cleanup_crash_report(raw_crash)
        crash_report = CrashReport(raw_crash, dumps, crash_id)
        crash_report.set_state(STATE_SAVE)
        self.crashmover_queue.append(crash_report)
        self.hb_run_crashmover()
        resp.body = "CrashID=%s%s\n" % (self.config("dump_id_prefix"), crash_id)

    def hb_run_crashmover(self):
        """Spawn a crashmover if there's work to do."""
        # Spawn a new crashmover if there's stuff in the queue and we haven't
        # hit the limit of how many we can run
        if self.crashmover_queue and self.crashmover_pool.free_count() > 0:
            self.crashmover_pool.spawn(self.crashmover_process_queue)

    def crashmover_process_queue(self):
        """Process crashmover work.

        NOTE(willkg): This has to be super careful not to lose crash reports.
        If there's any kind of problem, this must return the crash report to
        the relevant queue.

        """
        while self.crashmover_queue:
            crash_report = self.crashmover_queue.popleft()

            try:
                if crash_report.state == STATE_SAVE:
                    # Save crash and then toss crash_id in the publish queue
                    self.crashmover_save(crash_report)
                    crash_report.set_state(STATE_PUBLISH)
                    self.crashmover_queue.append(crash_report)

                elif crash_report.state == STATE_PUBLISH:
                    # Publish crash and we're done
                    self.crashmover_publish(crash_report)
                    self.crashmover_finish(crash_report)

            except Exception:
                mymetrics.incr("%s_crash_exception.count" % crash_report.state)
                crash_report.errors += 1
                logger.exception(
                    "Exception when processing queue (%s), state: %s; error %d/%d",
                    crash_report.crash_id,
                    crash_report.state,
                    crash_report.errors,
                    MAX_ATTEMPTS,
                )

                # After MAX_ATTEMPTS, we give up on this crash and move on
                if crash_report.errors < MAX_ATTEMPTS:
                    self.crashmover_queue.append(crash_report)
                else:
                    logger.error(
                        "%s: too many errors trying to %s; dropped",
                        crash_report.crash_id,
                        crash_report.state,
                    )
                    mymetrics.incr("%s_crash_dropped.count" % crash_report.state)

    def crashmover_finish(self, crash_report):
        """Finish bookkeeping on crash report."""
        # Capture the total time it took for this crash to be handled from
        # being received from breakpad client to saving to s3.
        #
        # NOTE(willkg): time.time returns seconds, but .timing() wants
        # milliseconds, so we multiply!
        delta = (time.time() - crash_report.raw_crash["timestamp"]) * 1000

        mymetrics.timing("crash_handling.time", value=delta)
        mymetrics.incr("save_crash.count")

    @mymetrics.timer("crash_save.time")
    def crashmover_save(self, crash_report):
        """Save crash report to storage."""
        self.crashstorage.save_crash(crash_report)
        logger.info("%s saved", crash_report.crash_id)

    @mymetrics.timer("crash_publish.time")
    def crashmover_publish(self, crash_report):
        """Publish crash_id in publish queue."""
        self.crashpublish.publish_crash(crash_report)
        logger.info("%s published", crash_report.crash_id)

    def join_pool(self):
        """Join the pool.

        NOTE(willkg): Only use this in tests!

        This is helpful for forcing all the coroutines in the pool to complete
        so that we can verify outcomes in the test suite for work that might
        cross coroutines.

        """
        self.crashmover_pool.join()
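
The hb_run_crashmover / crashmover_process_queue pair above combines a plain deque with a
bounded gevent Pool: incoming work is appended to the queue, and a new worker greenlet is
spawned only while free_count() is greater than zero, so at most concurrent_crashmovers
greenlets ever drain the queue at once. A stripped-down sketch of that pattern; the names and
the sleep stand-in below are illustrative, not taken from Antenna:

from collections import deque

import gevent
from gevent.pool import Pool

queue = deque()
pool = Pool(size=2)  # at most 2 workers drain the queue concurrently


def process_queue():
    # Each worker keeps popping until the queue is empty, then exits.
    while queue:
        item = queue.popleft()
        gevent.sleep(0.1)  # stand-in for the save/publish work
        print("processed", item)


def run_workers():
    # Spawn a worker only if there is queued work and a free slot in the pool.
    if queue and pool.free_count() > 0:
        pool.spawn(process_queue)


queue.extend(range(5))
run_workers()
pool.join()  # wait for the workers to finish (tests use join_pool the same way)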
Esempio n. 28
0
class BreakpadSubmitterResource(RequiredConfigMixin):
    """Handles incoming breakpad crash reports and saves to crashstorage.

    This handles incoming HTTP POST requests containing breakpad-style crash
    reports in multipart/form-data format.

    It can handle compressed or uncompressed POST payloads.

    It parses the payload from the HTTP POST request, runs it through the
    throttler with the specified rules, generates a crash_id, returns the
    crash_id to the HTTP client and then saves the crash using the configured
    crashstorage class.

    .. Note::

       From when a crash comes in to when it's saved by the crashstorage class,
       the crash is entirely in memory. Keep that in mind when figuring out
       how to scale your Antenna nodes.


    The most important configuration bit here is choosing the crashstorage
    class.

    For example::

        CRASHSTORAGE_CLASS=antenna.ext.s3.crashstorage.S3CrashStorage

    """

    required_config = ConfigOptions()
    required_config.add_option(
        'dump_field', default='upload_file_minidump',
        doc='The name of the field in the POST data for dumps.'
    )
    required_config.add_option(
        'dump_id_prefix', default='bp-',
        doc='The crash type prefix.'
    )
    required_config.add_option(
        'concurrent_crashmovers',
        default='2',
        parser=positive_int,
        doc=(
            'The number of crashes concurrently being saved and published. '
            'Each process gets this many concurrent crashmovers, so if you\'re '
            'running 5 processes on the node, then it\'s '
            '(5 * concurrent_crashmovers) sharing upload bandwidth.'
        )
    )

    # crashstorage things
    required_config.add_option(
        'crashstorage_class',
        default='antenna.ext.crashstorage_base.NoOpCrashStorage',
        parser=parse_class,
        doc='The class in charge of storing crashes.'
    )

    # crashpublish things
    required_config.add_option(
        'crashpublish_class',
        default='antenna.ext.crashpublish_base.NoOpCrashPublish',
        parser=parse_class,
        doc='The class in charge of publishing crashes.'
    )

    def __init__(self, config):
        self.config = config.with_options(self)
        self.crashstorage = self.config('crashstorage_class')(config.with_namespace('crashstorage'))
        self.crashpublish = self.config('crashpublish_class')(config.with_namespace('crashpublish'))
        self.throttler = Throttler(config)

        # Gevent pool for crashmover workers
        self.crashmover_pool = Pool(size=self.config('concurrent_crashmovers'))

        # Queue for crashmover work
        self.crashmover_queue = deque()

        # Register hb functions with heartbeat manager
        register_for_heartbeat(self.hb_report_health_stats)
        register_for_heartbeat(self.hb_run_crashmover)

        # Register life function with heartbeat manager
        register_for_life(self.has_work_to_do)

    def get_runtime_config(self, namespace=None):
        """Return generator of runtime configuration."""
        for item in super().get_runtime_config():
            yield item

        for item in self.throttler.get_runtime_config():
            yield item

        for item in self.crashstorage.get_runtime_config(['crashstorage']):
            yield item

        for item in self.crashpublish.get_runtime_config(['crashpublish']):
            yield item

    def check_health(self, state):
        """Return health state."""
        if hasattr(self.crashstorage, 'check_health'):
            self.crashstorage.check_health(state)
        if hasattr(self.crashpublish, 'check_health'):
            self.crashpublish.check_health(state)

    def hb_report_health_stats(self):
        """Heartbeat function to report health stats."""
        # The number of crash reports sitting in the work queue; this is a
        # direct measure of the health of this process--a number that's going
        # up means impending doom
        mymetrics.gauge('work_queue_size', value=len(self.crashmover_queue))

    def has_work_to_do(self):
        """Return whether this still has work to do."""
        work_to_do = (
            len(self.crashmover_pool) +
            len(self.crashmover_queue)
        )
        logger.info('work left to do: %s' % work_to_do)
        # Indicates whether or not we're sitting on crashes to save--this helps
        # keep Antenna alive until we're done saving crashes
        return bool(work_to_do)

    def extract_payload(self, req):
        """Parse HTTP POST payload.

        Decompresses the payload if necessary and then walks through the
        FieldStorage converting from multipart/form-data to Python datatypes.

        NOTE(willkg): The FieldStorage is poorly documented (in my opinion). It
        has a list attribute that is a list of FieldStorage items--one for each
        key/val in the form. For attached files, the FieldStorage will have a
        name, value and filename and the type should be
        application/octet-stream. Thus we parse it looking for things of type
        text/plain and application/octet-stream.

        :arg falcon.request.Request req: a Falcon Request instance

        :returns: (raw_crash dict, dumps dict)

        """
        # If we don't have a content type, return an empty crash
        if not req.content_type:
            mymetrics.incr('malformed', tags=['reason:no_content_type'])
            return {}, {}

        # If it's the wrong content type or there's no boundary section, return
        # an empty crash
        content_type = [part.strip() for part in req.content_type.split(';', 1)]
        if ((len(content_type) != 2 or
             content_type[0] != 'multipart/form-data' or
             not content_type[1].startswith('boundary='))):
            if content_type[0] != 'multipart/form-data':
                mymetrics.incr('malformed', tags=['reason:wrong_content_type'])
            else:
                mymetrics.incr('malformed', tags=['reason:no_boundary'])
            return {}, {}

        content_length = req.content_length or 0

        # If there's no content, return an empty crash
        if content_length == 0:
            mymetrics.incr('malformed', tags=['reason:no_content_length'])
            return {}, {}

        # Decompress payload if it's compressed
        if req.env.get('HTTP_CONTENT_ENCODING') == 'gzip':
            mymetrics.incr('gzipped_crash')

            # If the content is gzipped, we pull it out and decompress it. We
            # have to do that here because nginx doesn't have a good way to do
            # that in nginx-land.
            gzip_header = 16 + zlib.MAX_WBITS
            try:
                data = zlib.decompress(req.stream.read(content_length), gzip_header)
            except zlib.error:
                # This indicates this isn't a valid compressed stream. Given
                # that the HTTP request insists it is, we're just going to
                # assume it's junk and not try to process any further.
                mymetrics.incr('malformed', tags=['reason:bad_gzip'])
                return {}, {}

            # Stomp on the content length to correct it because we've changed
            # the payload size by decompressing it. We save the original value
            # in case we need to debug something later on.
            req.env['ORIG_CONTENT_LENGTH'] = content_length
            content_length = len(data)
            req.env['CONTENT_LENGTH'] = str(content_length)

            data = io.BytesIO(data)
            mymetrics.histogram('crash_size', value=content_length, tags=['payload:compressed'])
        else:
            # NOTE(willkg): At this point, req.stream is either a
            # falcon.request_helper.BoundedStream (in tests) or a
            # gunicorn.http.body.Body (in production).
            #
            # FieldStorage doesn't work with BoundedStream so we pluck out the
            # internal stream from that which works fine.
            #
            # FIXME(willkg): why don't tests work with BoundedStream?
            if isinstance(req.stream, BoundedStream):
                data = req.stream.stream
            else:
                data = req.stream

            mymetrics.histogram('crash_size', value=content_length, tags=['payload:uncompressed'])

        fs = cgi.FieldStorage(fp=data, environ=req.env, keep_blank_values=1)

        # NOTE(willkg): In the original collector, this returned request
        # querystring data as well as request body data, but we're not doing
        # that because the query string just duplicates data in the payload.

        raw_crash = {}
        dumps = {}

        has_json = False
        has_kvpairs = False

        for fs_item in fs.list:
            # NOTE(willkg): We saw some crashes come in where the raw crash ends up with
            # a None as a key. Make sure we can't end up with non-strings as keys.
            item_name = fs_item.name or ''

            if item_name == 'dump_checksums':
                # We don't want to pick up the dump_checksums from a raw
                # crash that was re-submitted.
                continue

            elif fs_item.type and fs_item.type.startswith('application/json'):
                # This is a JSON blob, so load it and override raw_crash with
                # it.
                has_json = True
                raw_crash = json.loads(fs_item.value)

            elif fs_item.type and (fs_item.type.startswith('application/octet-stream') or isinstance(fs_item.value, bytes)):
                # This is a dump, so add it to dumps using a sanitized dump
                # name.
                dump_name = sanitize_dump_name(item_name)
                dumps[dump_name] = fs_item.value

            else:
                # This isn't a dump, so it's a key/val pair, so we add that.
                has_kvpairs = True
                raw_crash[item_name] = fs_item.value

        if has_json and has_kvpairs:
            # If the crash payload has both kvpairs and a JSON blob, then it's
            # malformed and we should dump it.
            mymetrics.incr('malformed', tags=['reason:has_json_and_kv'])
            return {}, {}

        return raw_crash, dumps

    def get_throttle_result(self, raw_crash):
        """Run raw_crash through throttler for a throttling result.

        :arg dict raw_crash: the raw crash to throttle

        :returns tuple: ``(result, rule_name, percentage)``

        """
        # At this stage, nothing has given us a throttle answer, so we
        # throttle the crash.
        result, rule_name, throttle_rate = self.throttler.throttle(raw_crash)

        # Save the results in the raw_crash itself
        raw_crash['legacy_processing'] = result
        raw_crash['throttle_rate'] = throttle_rate

        return result, rule_name, throttle_rate

    @mymetrics.timer_decorator('on_post.time')
    def on_post(self, req, resp):
        """Handle incoming HTTP POSTs.

        Note: This is executed by the WSGI app, so it and anything it does is
        covered by the Sentry middleware.

        """
        resp.status = falcon.HTTP_200

        start_time = time.time()
        # NOTE(willkg): This has to return text/plain since that's what the
        # breakpad clients expect.
        resp.content_type = 'text/plain'

        raw_crash, dumps = self.extract_payload(req)

        # If we didn't get any crash data, then just drop it and move on--don't
        # count this as an incoming crash and don't do any more work on it
        if not raw_crash:
            resp.body = 'Discarded=1'
            return

        mymetrics.incr('incoming_crash')

        # Add timestamps
        current_timestamp = utc_now()
        raw_crash['submitted_timestamp'] = current_timestamp.isoformat()
        raw_crash['timestamp'] = start_time

        # Add checksums and MinidumpSha256Hash
        raw_crash['dump_checksums'] = {
            dump_name: hashlib.sha256(dump).hexdigest()
            for dump_name, dump in dumps.items()
        }
        raw_crash['MinidumpSha256Hash'] = raw_crash['dump_checksums'].get('upload_file_minidump', '')

        # First throttle the crash which gives us the information we need
        # to generate a crash id.
        throttle_result, rule_name, percentage = self.get_throttle_result(raw_crash)

        # Use a uuid if they gave us one and it's valid--otherwise create a new
        # one.
        if 'uuid' in raw_crash and validate_crash_id(raw_crash['uuid']):
            crash_id = raw_crash['uuid']
            logger.info('%s has existing crash_id', crash_id)

        else:
            crash_id = create_crash_id(
                timestamp=current_timestamp,
                throttle_result=throttle_result
            )
            raw_crash['uuid'] = crash_id

        raw_crash['type_tag'] = self.config('dump_id_prefix').strip('-')

        # Log the throttle result
        logger.info('%s: matched by %s; returned %s', crash_id, rule_name,
                    RESULT_TO_TEXT[throttle_result])
        mymetrics.incr('throttle_rule', tags=['rule:%s' % rule_name])
        mymetrics.incr('throttle', tags=['result:%s' % RESULT_TO_TEXT[throttle_result].lower()])

        if throttle_result is REJECT:
            # If the result is REJECT, then discard it
            resp.body = 'Discarded=1'

        elif throttle_result is FAKEACCEPT:
            # If the result is a FAKEACCEPT, then we return a crash id, but throw
            # the crash away
            resp.body = 'CrashID=%s%s\n' % (self.config('dump_id_prefix'), crash_id)

        else:
            # If the result is not REJECT, then save it and return the CrashID to
            # the client
            crash_report = CrashReport(raw_crash, dumps, crash_id)
            crash_report.set_state(STATE_SAVE)
            self.crashmover_queue.append(crash_report)
            self.hb_run_crashmover()
            resp.body = 'CrashID=%s%s\n' % (self.config('dump_id_prefix'), crash_id)

    def hb_run_crashmover(self):
        """Spawn a crashmover if there's work to do."""
        # Spawn a new crashmover if there's stuff in the queue and we haven't
        # hit the limit of how many we can run
        if self.crashmover_queue and self.crashmover_pool.free_count() > 0:
            self.crashmover_pool.spawn(self.crashmover_process_queue)

    def crashmover_process_queue(self):
        """Process crashmover work.

        NOTE(willkg): This has to be super careful not to lose crash reports.
        If there's any kind of problem, this must return the crash report to
        the relevant queue.

        """
        while self.crashmover_queue:
            crash_report = self.crashmover_queue.popleft()

            try:
                if crash_report.state == STATE_SAVE:
                    # Save crash and then toss crash_id in the publish queue
                    self.crashmover_save(crash_report)
                    crash_report.set_state(STATE_PUBLISH)
                    self.crashmover_queue.append(crash_report)

                elif crash_report.state == STATE_PUBLISH:
                    # Publish crash and we're done
                    self.crashmover_publish(crash_report)
                    self.crashmover_finish(crash_report)

            except Exception:
                mymetrics.incr('%s_crash_exception.count' % crash_report.state)
                crash_report.errors += 1
                logger.exception(
                    'Exception when processing queue (%s), state: %s; error %d/%d',
                    crash_report.crash_id,
                    crash_report.state,
                    crash_report.errors,
                    MAX_ATTEMPTS
                )

                # After MAX_ATTEMPTS, we give up on this crash and move on
                if crash_report.errors < MAX_ATTEMPTS:
                    self.crashmover_queue.append(crash_report)
                else:
                    logger.error(
                        '%s: too many errors trying to %s; dropped',
                        crash_report.crash_id,
                        crash_report.state
                    )
                    mymetrics.incr('%s_crash_dropped.count' % crash_report.state)

    def crashmover_finish(self, crash_report):
        """Finish bookkeeping on crash report."""
        # Capture the total time it took for this crash to be handled from
        # being received from breakpad client to saving to s3.
        #
        # NOTE(willkg): time.time returns seconds, but .timing() wants
        # milliseconds, so we multiply!
        delta = (time.time() - crash_report.raw_crash['timestamp']) * 1000

        mymetrics.timing('crash_handling.time', value=delta)
        mymetrics.incr('save_crash.count')

    @mymetrics.timer('crash_save.time')
    def crashmover_save(self, crash_report):
        """Save crash report to storage."""
        self.crashstorage.save_crash(crash_report)
        logger.info('%s saved', crash_report.crash_id)

    @mymetrics.timer('crash_publish.time')
    def crashmover_publish(self, crash_report):
        """Publish crash_id in publish queue."""
        self.crashpublish.publish_crash(crash_report)
        logger.info('%s published', crash_report.crash_id)

    def join_pool(self):
        """Join the pool.

        NOTE(willkg): Only use this in tests!

        This is helpful for forcing all the coroutines in the pool to complete
        so that we can verify outcomes in the test suite for work that might
        cross coroutines.

        """
        self.crashmover_pool.join()
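
Both versions of extract_payload decode gzipped POST bodies with
zlib.decompress(..., 16 + zlib.MAX_WBITS), where adding 16 to MAX_WBITS tells zlib to expect a
gzip wrapper rather than a raw zlib stream. A small sketch of just that decoding step; the sample
payload below is fabricated for illustration:

import gzip
import io
import zlib

# Build a fake gzipped request body for the demonstration.
body = gzip.compress(b"Product=Firefox&Version=99.0")

gzip_header = 16 + zlib.MAX_WBITS  # accept a gzip header instead of raw zlib
try:
    data = zlib.decompress(body, gzip_header)
except zlib.error:
    data = None  # the collector treats this case as a malformed ("bad_gzip") crash

stream = io.BytesIO(data)  # FieldStorage expects a file-like object
print(len(data), stream.read())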
Esempio n. 29
0
class Worker(threading.Thread):
    """
        工作线程
    """
    def __init__(self, workers, thread_name, greents_num, func, workload):
        self.__workers = workers
        self.__busy = False
        self.__pool = Pool(greents_num + 1)
        self.greents_num = greents_num
        self.thread_name = thread_name
        self.__func = func
        self.workload = workload
        threading.Thread.__init__(self, None, None, self.thread_name, (), {})
        logger.info("%s init complete" % self.thread_name)

    def task_entrance(self, task):
        try:
            with gevent.Timeout(self.workload.timeout):
                self.__func(task)
        except gevent.Timeout:
            self.workload.complete_workload(task, '52', 'NULL')
            logger.info('>>>>>>>>>>>>>> task timeout!' + str(task))

    def dojudge(self):

        r = os.popen('free -am').readlines()[1].split(' ')[-1].strip()

        if int(r) < 500:
            gc.collect()
            return False

        return True

    def run(self):

        self.__busy = True
        while self.__busy:
            # assign_workload() blocks when there is no task, so this thread does not need to sleep itself ...
            task = self.workload.assign_workload()
            logger.info(
                'workload assign task pool size: {0} free count: {1}'.format(
                    self.__pool.size, self.__pool.free_count()))
            if self.__pool.free_count() < 2:
                logger.warn(
                    '[Exception MJOPObserver,type=ex78000,uid=,csuid=,qid={ts},ts={ts},ip={ip},'
                    'refer_id=,cur_id=spider_slave,debug=task backlog - free pool:{free}/{size} - waiting tasks:{count}]'
                    .format(ts=int(time.time() * 1000),
                            ip=local_ip,
                            size=self.__pool.size,
                            free=self.__pool.free_count(),
                            count=self.workload.tasks.qsize()))
            self.__pool.spawn(self.task_entrance, task)

        self.__busy = False

        logger.info("%s stop" % self.thread_name)

    def is_busy(self):
        return self.__busy

    def stop(self):
        self.__busy = False
        time.sleep(0.5)
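
Worker.run above only logs a warning when free_count() drops below 2 and then calls spawn
anyway; with gevent.pool.Pool that is safe, because spawning on a full pool blocks the caller
until a slot frees up, which is what throttles the loop. A tiny sketch showing that blocking
behaviour; the pool size and sleep times are illustrative:

import time

import gevent
from gevent.pool import Pool

pool = Pool(2)  # only two greenlets may run at once


def task(n):
    gevent.sleep(1)
    print("task", n, "done")


start = time.time()
for i in range(4):
    # The third spawn blocks for roughly one second until a slot is free.
    pool.spawn(task, i)
    print("spawned %d at %.1fs, free slots: %d"
          % (i, time.time() - start, pool.free_count()))

pool.join()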
Esempio n. 30
0
#!/usr/bin/python

import time
import random

import gevent
from gevent import Greenlet
from gevent.pool import Pool


def thrFunc(n):
    print "sleep %d seconds start.\n" % n
    gevent.sleep(n)
    print "sleep %d seconds end.\n" % n


threadPool = Pool(size=3)
while True:
    sec = random.randint(3, 6)
    #gThr = Greenlet(thrFunc,sec)
    #gThr.start()
    #gThr.join()
    print "+++free:", threadPool.free_count()
    threadPool.spawn(thrFunc, sec)
    #threadPool.apply_async(thrFunc,sec)
Esempio n. 31
0
class Worker:
    def __init__(self, seeds, connque):

        self.showpercounts = 10
        self.timeout = 5
        self.starttime = time.time()
        self.oldtime = 0

        self.quit = 0
        self.https_enable = 0

        self.run_queue = multiprocessing.Queue()
        self.connque = connque
        self.tasks = []
        self.done = 1

        self.errdone = set()
        self.err = Error()

        self.loadstate()

        #self.whitelist = ['html','htm','php','shtml','asp','jsp','do','action','aspx']
        self.blacklist = set(
            ('.blog.', '.taobao.com', '.baidu.com', '.edu', '.gov', '.mil',
             'mail', '.google', 'weibo.com', 't.cn', 'wikipedia', 'facebook',
             'twitter', 'dropbox'))
        self.allowdDomain = set(('com', 'net', 'org', 'cn', 'info', 'biz',
                                 'me', 'name', 'cc', 'tv'))

        self.httpget = self.httpget_requests  # down method self.httpget_requests | httpget_curl

        self.poolsize = 200
        self.poolmaxfree = 40
        self.freecount = 0
        self.down_pool = Pool(size=self.poolsize)

        self.totalnettime = 0
        self.cbcputime = 0
        self.totaldownsize = 0

        self.curspeed = 0

        self.debugnosave = 1
        self.tt = 1

        try:
            self.bfdone = BloomFilter.open('done_sites.bin')
        except:
            self.bfdone = BloomFilter(2**23, 10**(-5), 'done_sites.bin')

        if self.run_queue.qsize() == 0:
            for seed in seeds:
                self.run_queue.put(seed.split("http://")[1])

        if self.https_enable == 0:
            self.urlpatern = re.compile(r'href=["\']http://([^/?#\"\']+)',
                                        re.I)
        else:
            self.urlpatern = re.compile(r'href=["\']http[s]?://([^/?#\"\'"]+)',
                                        re.I)

    def debug_filter(self, urls):
        #return filter(lambda url: ".fang.com" in url , urls)
        return urls

    def cb_httpget(self, data=None):
        if not data:
            return
        seed, err, headers, html = data
        st = time.time()

        if err:
            self.handle_error(err, seed)
            return

        #http://
        if self.https_enable == 0:
            seed = seed[7:]
        self.bfdone.add(seed)
        self.done += 1

        self.connque.put((seed, headers, html))

        et = time.time()
        self.cbcputime += (et - st)

        if self.done % self.showpercounts == 0:
            self.out(seed)

    def out(self, seed):

        spendtime = time.time() - self.starttime
        spendtime = 1 if spendtime == 0 else spendtime
        nowh = str(int(spendtime) / 3600) + ":" if spendtime > 3600 else ""
        now = "%s%02d:%02d" % (nowh, spendtime % 3600 / 60, spendtime % 60)
        print "%s D:%-4d R:%-7d [Speed: T%.2f/s C%.2f/s A%.3f] CB:%0.4f Active:%d %s %s" % (now, (self.done), self.run_queue.qsize(), \
         (self.done)/(spendtime+self.oldtime), self.curspeed, self.tt, self.totalnettime / spendtime ,self.poolsize-self.freecount, str(self.err), seed )

    def work(self):

        while self.quit == 0:

            st = time.time()
            curdone = self.done
            self.freecount = self.down_pool.free_count()

            if self.freecount > self.poolmaxfree:
                self.tasks = []
                minlen = min(self.freecount, self.run_queue.qsize())
                #if minlen <=0:break
                stt = time.time()
                for i in range(minlen):
                    url = self.run_queue.get()
                    if url in self.bfdone:  # 5%-10%
                        continue
                    #self.tt = time.time() - stt
                    # may need to add a byte to the url to figure out the https
                    url = "http://" + url

                    self.tasks.append(url)
                    self.down_pool.apply_async(self.httpget, (url, ),
                                               callback=self.cb_httpget)

            time.sleep(0.5)
            et = time.time()
            self.curspeed = (self.done - curdone) / (et - st)

        self.down_pool.join()
        print "All OVER"

    def handle_error(self, e, url):

        if e.find('DNSError') > 0:
            self.err.dns += 1
            self.err.rdns.append(url)
        elif e.find('reset') > 0:  #Connection reset
            self.err.reset += 1
            self.err.rreset.append(url)
        elif e.find('Max retries') > 0:  #
            self.err.conntimeout += 1
            self.err.rconntimeout.append(url)
        elif e.find('refused') > 0:  #Connection refused
            self.err.refuse += 1
            self.err.rrefuse.append(url)

        else:
            self.err.others += 1
            self.err.rothers.append(url)
            print "Error", url, e

    # requests performed better in testing
    def httpget_requests(self, url):

        st = time.time()
        con = ""
        e = ""
        res_headers = ""
        headers = {
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.6',
            #'Accept':'text/html'
            'Connection': 'close'
        }

        res = None
        try:
            # todo: query the ip of the website before get through dns
            req = requests
            req.max_redirects = 1
            res = req.get(url, timeout=(3, 3), headers=headers)
            if self.https_enable == 0 and "https" not in res.url:

                if 'html' not in res.headers['content-type']:
                    return None
                con = res.content

            #res.close()

        except KeyboardInterrupt:
            raise
        except Exception as e:
            e = str(e)
            if res:
                res.close()

            return None

        et = time.time()
        self.totalnettime += (et - st)
        return url, e, res.headers, con

    def httpget_curl(self, url):

        con = ""
        buffer = StringIO()
        c = pycurl.Curl()
        c.setopt(pycurl.URL, url)
        c.setopt(pycurl.MAXCONNECTS, 2)
        c.setopt(pycurl.CONNECTTIMEOUT, 3)
        c.setopt(pycurl.TIMEOUT, 5)
        c.setopt(pycurl.WRITEFUNCTION, buffer.write)

        c.perform()
        c.close()
        con = buffer.getvalue()

        return con

    def filter_urls(self, seed, urls):

        nurls = []
        seeditem = seed.lower().split('.')
        seedlen = len(seeditem)
        maindomain = 1 if seeditem[0] == 'www' else 0
        urls = {}.fromkeys(urls).keys()

        for url in urls:
            #url = url.split('/',1)[0].split('#',1)[0].split('?',1)[0].lower()
            url = url.lower()

            #filter Domain , only allowd for china
            suf = 0
            urlitem = url.split('.')
            nlen = len(urlitem)
            if nlen < 2:
                continue
            tld = urlitem[-1]
            if tld in self.allowdDomain:
                if urlitem[-2] in self.allowdDomain:
                    if nlen <= 4:
                        suf = 2

                else:
                    if nlen <= 3:
                        suf = 1

            if suf >= 1:
                # blacklist verify
                block = 0
                for b in self.blacklist:
                    if url.find(b) >= 0:
                        block = 1
                        continue

                if block == 0:
                    if nlen != seedlen:
                        nurls.append(url)
                    else:
                        if maindomain or urlitem[-(suf +
                                                   1)] != seeditem[-(suf + 1)]:
                            nurls.append(url)

        #print seed, nurls
        return {}.fromkeys(nurls).keys()

    def geturls(self, seed, html):
        if not html or len(html) == 0:
            return []

        urls = re.findall(self.urlpatern, html)

        st = time.time()
        urls = self.filter_urls(seed, urls)
        et = time.time()
        return urls

    def savestate(self):

        self.quit = 1
        now = time.time()
        self.oldtime += (now - self.starttime)

        # should hold the signal until data processing is done

        with open('state.txt', 'wb') as f:
            f.write(str(self.oldtime) + '\n')
            # tasks run_queue done
            f.write(str(len(self.tasks)) + '\n')
            for t in self.tasks:
                f.write(t + '\n')
            l = self.run_queue.qsize()
            f.write(str(l) + '\n')
            while l > 0:
                f.write(self.run_queue.get() + '\n')
                l -= 1
            f.write(str((self.done)) + '\n')

        with open('err_records.pack', 'wb') as f:
            cPickle.dump(self.err, f, 2)

        print time.strftime("%Y-%m-%d %H:%M:%S",
                            time.localtime()), " Save state successfully."
        f.close()
        exit(0)

    def loadstate(self):

        try:
            with open('state.txt') as f:
                self.oldtime = float(f.readline())
                tasks = int(f.readline())
                for i in xrange(tasks):
                    self.run_queue.put(f.readline().rstrip('\n'))

                runnings = int(f.readline())
                for i in xrange(runnings):
                    self.run_queue.put(f.readline().rstrip('\n'))

                self.done = int(f.readline())

            with open('err_records.pack', 'rb') as f:
                self.err = cPickle.load(f)

            print time.strftime("%Y-%m-%d %H:%M:%S",
                                time.localtime()), " Load state successfuly."
        except Exception as e:
            print e
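
The crawler above hands URLs to its download pool with
self.down_pool.apply_async(self.httpget, (url,), callback=self.cb_httpget), so each result is
delivered to a callback instead of being collected by the caller. A condensed sketch of that
callback pattern; fake_fetch is a stand-in for the crawler's httpget_requests, not the real
download code:

import gevent
from gevent.pool import Pool

pool = Pool(size=10)
results = []


def fake_fetch(url):
    gevent.sleep(0.1)  # stand-in for the real HTTP download
    return url, "<html>...</html>"


def on_done(data):
    # Called with fake_fetch's return value once the greenlet finishes.
    url, html = data
    results.append(url)


for url in ["http://a.example", "http://b.example", "http://c.example"]:
    pool.apply_async(fake_fetch, (url,), callback=on_done)

pool.join()
print(results)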
Esempio n. 32
0
class SphinxService:
    def __init__(self, redis_server, sphinx_server, part, workers):
        '''
        Initializes the server, creating the Sphinx connection pool and the Redis connections
        '''

        # configuration
        self.redis_server = redis_server
        self.sphinx_server = sphinx_server
        self.part = chr(part)

        self.version = WORKER_VERSION
        self.workers_pool_size = self.sphinx_pool_size = self.redis_pool_size = workers
        self.lock_expiration = LOCK_EXPIRATION
        self.index_name = INDEX_NAME+str(part)
        self.default_order = DEFAULT_ORDER
        self.default_order_key = DEFAULT_ORDER_KEY
        self.default_group_order = DEFAULT_GROUP_ORDER
        self.default_weight = DEFAULT_WEIGHT
        self.default_ranking = DEFAULT_RANKING
        self.default_field_weights = DEFAULT_FIELD_WEIGHTS
        self.default_max_query_time = DEFAULT_MAX_QUERY_TIME
        self.max_max_query_time = MAX_MAX_QUERY_TIME

        # gevent pool
        self.gevent_pool = Pool(self.workers_pool_size)

        # sphinx connection pool
        self.sphinx_conns = SphinxPool(self.sphinx_pool_size, self.sphinx_server, self.max_max_query_time, SPHINX_SOCKET_TIMEOUT)

        # regular redis connection
        self.redis_conns = RedisPool(self.redis_pool_size, self.redis_server, self.version, REDIS_TIMEOUT)

        # initialize control variables
        self.last_reindex = -1.
        self.stop = False
        self.pubsub_used = True

    def update_last_reindex(self):
        ''' Finds out when the last reindex of this server took place. '''
        with self.redis_conns.get() as redisc:
            previous = self.last_reindex
            self.last_reindex = float(redisc.get(CONTROL_KEY+"lr_%d"%ord(self.part)) or -1)
            redisc.used = True
            print "["+datetime.now().isoformat(" ")+"]", "Last reindex date updated: %.2f (%.2f)."%(self.last_reindex, previous)

    def update_blocked_sources(self):
        ''' Gets the list of blocked sources. '''
        with self.redis_conns.get() as redisc:
            self.blocked_sources = parse_data(redisc.get(CONTROL_KEY+"bs") or "\x90")
            redisc.used = True
            print "["+datetime.now().isoformat(" ")+"]", "Blocked sources updated."

    def keepalive_pubsub(self, timeout):
        '''
        Keeps the pubsub connection alive if no messages arrive.
        '''
        while not self.stop:
            # wait a while
            sleep(timeout)

            # check whether the connection has been used, otherwise send a ping
            if self.pubsub_used:
                self.pubsub_used = False
            else:
                with self.redis_conns.get() as redisc:
                    redisc.publish(RESULTS_CHANNEL, "pn")
                    redisc.publish(CONTROL_CHANNEL+self.part, "pn")
                    redisc.used = True

    def stop_server(self):
        print "["+datetime.now().isoformat(" ")+"]", "Stop command received."

        # stop serving requests
        self.stop = True
        self.redis_pubsub.close()
        self.redis_pubsub.connection_pool.disconnect()

    def serve_forever(self):
        '''
        Receives and processes search requests.
        '''

        print "\n\n["+datetime.now().isoformat(" ")+"]", "Server started: %s, %d, %s, %d, %d"%(repr(self.redis_server), self.version, repr(self.sphinx_server), ord(self.part), self.workers_pool_size)

        # Initialize the connection retry interval
        retry = 1

        while not self.stop:
            try:
                # update global variables
                self.update_last_reindex()
                self.update_blocked_sources()

                # redis connection for pubsub
                self.redis_pubsub = redis.StrictRedis(host=self.redis_server[0], port=self.redis_server[1], db=self.version).pubsub()
                self.redis_pubsub.subscribe(EXECUTE_CHANNEL)
                self.redis_pubsub.subscribe(EXECUTE_CHANNEL+self.part)
                self.redis_pubsub.subscribe(CONTROL_CHANNEL+self.part)

                # Reset the connection retry interval
                retry = 1

                # start the keepalive process for the pubsub connection
                self.gevent_pool.spawn(self.keepalive_pubsub, REDIS_TIMEOUT/5)

                # wait for messages
                for msg in self.redis_pubsub.listen():
                    # mark that the connection has been used
                    self.pubsub_used = True

                    # ignore entries that are not actual messages
                    if msg["type"]!="message":
                        continue

                    # extract information from the message
                    channel, part = msg["channel"][0], msg["channel"][1:]
                    data = msg["data"]

                    if channel==EXECUTE_CHANNEL:    # search
                        # check whether it is a general search or one meant for this server
                        request_id, info = parse_data(data)

                        # process the request
                        if request_id[0]==QUERY_KEY:
                            self.gevent_pool.spawn(self.process_search_request, request_id, info)
                        elif request_id[0]==LOCATION_KEY:
                            self.gevent_pool.spawn(self.process_get_id_server_request, request_id, info)

                    elif channel==CONTROL_CHANNEL:  # control
                        if data == "lr":    # actualiza fecha de reindexado
                            self.gevent_pool.spawn(self.update_last_reindex)
                        elif data == "bs":  # actualiza lista de origenes bloqueados
                            self.gevent_pool.spawn(self.update_blocked_sources)
                        elif data == "pn":  # ping del keepalive
                            pass

                    elif channel==UPDATE_CHANNEL:  # updates
                        pass
            except redis.ConnectionError as e:
                if self.stop:
                    break
                else:
                    # Wait for and then kill pending greenlets
                    self.gevent_pool.join(timeout=2)
                    self.gevent_pool.kill(timeout=1)

                    print "["+datetime.now().isoformat(" ")+"]", "Server connection error %s:'%s'. Will reconnect in %d seconds." % (repr(e), e.message, retry)

                    # Wait for the retry interval and increase it for next time (up to 64 seconds)
                    sleep(retry)
                    if retry < 64: retry *= 2

            except BaseException as e:
                if self.stop:
                    break
                else:
                    print "["+datetime.now().isoformat(" ")+"]", "Server stopped with error %s:'%s'."%(repr(e), e.message)
                    logging.exception("Error on main loop on service %d."%ord(self.part))
                    return

        # wait for the greenlets that are still responding
        self.gevent_pool.join(2)

        # si alguno no acabado en 2 segundos, lo mata
        self.gevent_pool.kill(timeout=1)

        print "["+datetime.now().isoformat(" ")+"]", "Server stopped normally."

    def process_get_id_server_request(self, request_id, info):
        try:
            # extract the call parameters
            bin_file_id = request_id[1:]
            query = info.decode("utf-8")

            # get the redis client
            with self.redis_conns.get() as redisc:

                # take the lock only if this request still needs processing (nobody is working on it or has already done it)
                start_time = time()
                if redisc.hsetnx(request_id, self.part, "P"):
                    try:
                        block_time = time()
                        with self.sphinx_conns.get() as sphinx:
                            # look up the record with the requested id
                            uri1, uri2, uri3 = FULL_ID_STRUCT.unpack(bin_file_id)
                            sphinx.SetMaxQueryTime(MAX_MAX_QUERY_TIME)
                            sphinx.SetFilter('uri1', [uri1])
                            sphinx.SetFilter('uri2', [uri2])
                            sphinx.SetFilter('uri3', [uri3])
                            sphinx.SetLimits(0,1,1,1)
                            sphinx.SetIDRange(PART_ID_STRUCT.unpack(bin_file_id[:5]+"\x00\x00\x00")[0], PART_ID_STRUCT.unpack(bin_file_id[:5]+"\xFF\xFF\xFF")[0])
                            results = sphinx.Query(query, self.index_name, "d_id "+str(bin_file_id[:3].encode("hex")))
                            search_time = time()

                            # check the results obtained
                            has_it = results and "matches" in results and results["matches"]
                            if has_it:
                                redisc.pipeline().hset(request_id, self.part, "H").publish(RESULTS_CHANNEL, format_data((request_id, self.part, self.part))).execute()
                            else:
                                redisc.pipeline().hset(request_id, self.part, "N").publish(RESULTS_CHANNEL, format_data((request_id, self.part, None))).execute()
                            end_time = time()

                            print "["+datetime.fromtimestamp(start_time).isoformat(" ")+"]", self.gevent_pool.free_count(), ("*" if has_it else " ")+bin_file_id.encode("hex"), " %.2f (%.4f %.4f %.4f)"%(end_time-start_time, block_time-start_time, search_time-block_time, end_time-search_time), repr(query)


                    except BaseException as e:
                        redisc.hdel(request_id, self.part)
                        print "["+datetime.now().isoformat(" ")+"] ERROR", self.gevent_pool.free_count(), "process_get_id_server_request inner", repr(e), e.message
                        logging.exception("Error on searching for id %s on service %d."%(bin_file_id.encode("hex"), ord(self.part)))

                redisc.used = True
        except BaseException as e:
            print "["+datetime.now().isoformat(" ")+"] ERROR", "process_get_id_server_request outer", repr(e), e.message
            logging.exception("Error on process_get_id_server_request on service %d."%ord(self.part))

    def process_search_request(self, request_id, info):
        # extract the call parameters
        query = info[0]
        subgroups = info[1]

        try:
            # analyze the request to see what has to be searched
            with self.redis_conns.get() as redisc:
                start_time = prep_time = search_time = time()

                query_key = QUERY_KEY+hash_dict(query)
                # build the request information
                search_info = {"query_key":query_key, "query":query, "subgroups":subgroups, "generate_info":False, "version":0, "tries":0}

                # try to take the lock, or skip the request because someone else is already working on it
                lock = redisc.lock(query_key+self.part+ACTIVE_KEY, LOCK_EXPIRATION)
                if lock.acquire(False):
                    try:
                        must_search = self.prepare_search(redisc, search_info)
                        prep_time = search_time = time()

                        if must_search:
                            # run the search
                            results = self.search(search_info)
                            search_time = time()

                            # store the results and notify the requester
                            self.store_results(redisc, search_info, results)
                    except BaseException as e:
                        print "["+datetime.now().isoformat(" ")+"] ERROR", self.gevent_pool.free_count(), "process_search_request inner", repr(e), e.message
                    finally:
                        lock.release()
                else:
                    must_search = None
                    prep_time = search_time = time()

                redisc.used = True

            # prepare query info for logging
            end_time = time()
            query_sum = query["t"]
            if subgroups:
                subgroups_sum = sorted(subgroups.iteritems())
                query_sum += " %d/%d %s"%(len(search_info["subgroups"]), len(subgroups_sum), repr(subgroups_sum[:4]))

            # print search information
            print "["+datetime.fromtimestamp(start_time).isoformat(" ")+"]", self.gevent_pool.free_count() ,"".join(name if flag else " " for name, flag in izip("BSEDW", (must_search==None, must_search, "early_response" in search_info, "delete_subgroups" in search_info, search_info["tries"]>0))), search_info["tries"], " %.2f (%.4f %.4f %.4f) "%(end_time-start_time, prep_time-start_time, search_time-prep_time, end_time-search_time), query_key.encode("hex")[-10:], query_sum
        except BaseException as e:
            print  "["+datetime.now().isoformat(" ")+"] ERROR", "process_search_request outer", repr(e), e.message
            logging.exception("Error on process_search_request on service %d."%ord(self.part))

    def prepare_search(self, redisc, search_info):
        '''
        Decides whether this search has to be performed.
        '''
        # by default it will not search and will not respond early
        early_response = must_search = False

        query_key = search_info["query_key"]
        subgroups = search_info["subgroups"]

        # decide which information is needed
        if subgroups:
            keys = [PART_KEY+self.part, VERSION_KEY+self.part]
            keys.extend(PART_SG_KEY+self.part+str(subgroup) for subgroup, start in subgroups.iteritems())
        else:
            keys = [PART_KEY+self.part, VERSION_KEY+self.part, INFO_KEY]

        # fetch this server's cached information for the search
        search_cache = redisc.hmget(query_key, *keys)
        part_info, version, rest = search_cache[0], search_cache[1], search_cache[2:]

        # store the current version
        search_info["version"] = int(version) if version else -1

        if part_info: # this part has already been searched; check whether it must be searched again, or search the subgroups
            part_info = parse_data(part_info)

            # number of tries this search has needed so far
            search_info["tries"] = part_info[2]

            # there is data, even if it may not be valid; announce that it can be used
            early_response = True

            # compare the search date against the last reindex
            if part_info[0]<self.last_reindex:
                search_info["delete_subgroups"] = part_info[4].keys()
                must_search = True

            # check for warnings in the response (usually caused by running out of time)
            elif part_info[1]:
                search_info["tries"] += 1
                must_search = True

            # search subgroups only if this part has valid info (must_search=False) and some subgroup has no info
            if subgroups:
                if must_search: # the main data is invalid, the subgroup cannot be served
                    must_search = False
                else:
                    # do not request subgroups that are already available
                    new_subgroups = search_info["subgroups"] = {subgroup: (current_subgroup or [1]) for (subgroup, start), current_subgroup in izip(subgroups.iteritems(), (parse_data(asubgroup) if asubgroup else None for asubgroup in rest)) if not current_subgroup or current_subgroup[0]<=start}
                    must_search = bool(new_subgroups)
        else:
            # search this part's info, but not a subgroup
            if not subgroups:
                # generate the query information if nobody has generated it yet
                if not rest[0]:
                    search_info["generate_info"] = True
                must_search = True

        # announce that data is available, even if a new search is still needed
        if not subgroups and early_response:
            search_info["early_response"] = True
            redisc.publish(RESULTS_CHANNEL, format_data((query_key, self.part, None)))

        # if there is no need to search, release the lock
        if not must_search:
            return False

        # it must search
        return True

    @retry
    def search(self, search_info):
        query = search_info["query"]
        subgroups = search_info["subgroups"]

        if not "t" in query:
            raise Exception("Empty query search received.")

        # search parameters
        text = query["t"]
        filters = query["f"] if "f" in query else {}
        order = query["o"] if "o" in query else self.default_order
        order_key = query["ok"] if "ok" in query else self.default_order_key
        group_order = query["go"] if "go" in query else self.default_group_order
        weight = query["w"] if "w" in query else self.default_weight
        range_ids = query["i"] if "i" in query else None
        field_weights = query["fw"] if "fw" in query else self.default_field_weights
        ranking = query["r"] if "r" in query else self.default_ranking

        # parameters that do not change the search itself
        offset, limit, max_matches, cutoff = query["l"]
        grouping = query["g"] if not subgroups and "g" in query else (GROUPING_GROUP|GROUPING_NO_GROUP) # by default request information both without and with grouping (main search only)?
        max_query_time = min(self.default_max_query_time+QUERY_TIME_STEP*search_info["tries"] if "tries" in search_info else query["mt"] if "mt" in query else self.default_max_query_time, self.max_max_query_time)

        # get a sphinx client
        with self.sphinx_conns.get() as sphinx:
            sphinx.ResetFilters()
            sphinx.ResetGroupBy()

            # configure the client
            sphinx.SetFieldWeights(field_weights)
            sphinx.SetSortMode(sphinxapi.SPH_SORT_EXTENDED, order)
            sphinx.SetMatchMode(sphinxapi.SPH_MATCH_EXTENDED)
            sphinx.SetRankingMode(sphinxapi.SPH_RANK_EXPR, ranking)
            sphinx.SetSelect("*, if(g>0xFFFFFFFF,1,0) as e, "+order_key+" as ok, "+weight+" as w")
            sphinx.SetMaxQueryTime(max_query_time)

            if range_ids:
                sphinx.SetIDRange(range_ids[0], range_ids[1])
            else:
                sphinx.SetIDRange(0, 0)

            # run the query
            if subgroups:
                for sg, current in subgroups.iteritems():
                    sphinx.SetFilter('bl', [0])
                    sphinx.SetFilter("g", [long(sg)])
                    sphinx.SetLimits(current[0], limit, max_matches, cutoff)
                    if filters: self._apply_filters(sphinx, filters)
                    sphinx.AddQuery(text, self.index_name, "d_s "+sg+" "+str(max_query_time))
                    sphinx.ResetFilters()
            else:  # fetch the main summary for all groups
                sphinx.SetFilter('bl', [0])
                sphinx.SetFilter("s", self.blocked_sources, True)
                sphinx.SetLimits(offset, limit, max_matches, cutoff)

                if filters: self._apply_filters(sphinx, filters)

                if grouping&GROUPING_NO_GROUP:
                    sphinx.AddQuery(text, self.index_name, "d_ng "+str(max_query_time))

                if grouping&GROUPING_GROUP:
                    sphinx.SetGroupBy("g", sphinxapi.SPH_GROUPBY_ATTR, group_order)
                    sphinx.AddQuery(text, self.index_name, "d_m "+str(max_query_time))

            results = sphinx.RunQueries()
            error = sphinx.GetLastError()
            if error:
                raise SphinxError(error)

            sphinx.used = True

        return results

    def _apply_filters(self, sphinx, filters):
        if "z" in filters:
            sphinx.SetFilterFloatRange('z', float(filters["z"][0]), float(filters["z"][1]))
        if "e" in filters:
            sphinx.SetFilterRange('e', filters["e"])
        if "ct" in filters:
            sphinx.SetFilter('ct', filters["ct"])
        if "src" in filters:
            sphinx.SetFilter('s', set(filters["src"]).difference(self.blocked_sources))

    def store_results(self, redisc, search_info, results):
        # walk the results and put them in the desired order
        subgroups = search_info["subgroups"]
        query = search_info["query"]
        query_key = search_info["query_key"]
        tries = search_info["tries"]

        # new version of the data
        version = search_info["version"]+1

        save_info = {VERSION_KEY+self.part: version}
        now = time()

        if subgroups:
            ''' Will store:
                - [part][sg] = results for the subgroups that returned matches '''
            for result, (sg, current) in izip(results, subgroups.iteritems()):
                current.extend((FULL_ID_STRUCT.pack(r["attrs"]["uri1"],r["attrs"]["uri2"],r["attrs"]["uri3"]), r["id"], version, r["attrs"]["r"], r["attrs"]["w"]) for r in result["matches"])
                current[0] = len(current) # the result count takes the place of the first element
                if current[0]>1: # do not store the subgroup if it adds no results
                    save_info[PART_SG_KEY+self.part+str(sg)] = format_data(current)
        else:
            # Grouping type
            grouping = query["g"]

            ''' Will store:
                - INFO: if applicable
                - [part]: results of the grouped search
                - [part][sg] = results of the ungrouped search, for the subgroups that returned matches '''
            # Grouped search information
            if grouping&GROUPING_GROUP:
                result = results[-1] # it is the last result; it can be index 0 or 1 depending on whether the ungrouped search was requested
                save_info[PART_KEY+self.part] = format_data((now, bool(result["warning"]), tries, result["time"],
                                        {r["attrs"]["g"]:(r["attrs"]["@count"], (FULL_ID_STRUCT.pack(r["attrs"]["uri1"],r["attrs"]["uri2"],r["attrs"]["uri3"]), r["id"], version, r["attrs"]["r"], r["attrs"]["w"]))
                                            for r in result["matches"]}))

            # Store the ungrouped search information, if it was requested
            if grouping&GROUPING_NO_GROUP:
                result = results[0]

                # Group the results by subgroup
                subgroups_extra = {}
                for r in result["matches"]:
                    sg = r["attrs"]["g"]
                    if sg in subgroups_extra:
                        subgroups_extra[sg].append((FULL_ID_STRUCT.pack(r["attrs"]["uri1"],r["attrs"]["uri2"],r["attrs"]["uri3"]), r["id"], version, r["attrs"]["r"], r["attrs"]["w"]))
                    else:
                        subgroups_extra[sg] = [] # the first result is not included, it is already in the summary

                # Build the lists to store
                for sg, files in subgroups_extra.iteritems():
                    if not files: continue # do not create groups without extra files
                    files.insert(0,len(files)+1)
                    if files[0]>1: # do not store the subgroup if it adds no results
                        save_info[PART_SG_KEY+self.part+str(sg)] = format_data(files)

            # generate the main information if needed
            if search_info["generate_info"]:
                save_info[INFO_KEY] = format_data([fix_sphinx_result(word["word"]).encode("utf-8") for word in results[0]["words"]])

        # store the data in redis
        if "delete_subgroups" in search_info:
            redisc.pipeline().hdel(query_key, search_info["delete_subgroups"]).hmset(query_key, save_info).execute()
        else:
            redisc.hmset(query_key, save_info)

        # announce that the main results are available
        if not subgroups:
            redisc.publish(RESULTS_CHANNEL, format_data((query_key, self.part, None)))
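
The class above only shows the server side of the pubsub protocol. Below is a hypothetical client-side sketch: it reuses the constants (QUERY_KEY, EXECUTE_CHANNEL, RESULTS_CHANNEL) and helpers (hash_dict, format_data, parse_data) from the example, the payload shape is inferred from how process_search_request unpacks it, and the blocking wait loop is an assumption about how callers consume the announcement.

def publish_search(redis_conn, query, subgroups=None):
    # Build the same query key the server derives in process_search_request.
    query_key = QUERY_KEY + hash_dict(query)

    # Subscribe to result announcements before publishing so none are missed.
    pubsub = redis_conn.pubsub()
    pubsub.subscribe(RESULTS_CHANNEL)

    # Mirrors the server's unpacking: request_id, info = parse_data(data)
    redis_conn.publish(EXECUTE_CHANNEL, format_data((query_key, (query, subgroups))))

    # Wait until some server part announces that results for this key are stored.
    for msg in pubsub.listen():
        if msg["type"] != "message":
            continue
        announced_key, part, extra = parse_data(msg["data"])
        if announced_key == query_key:
            return part
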
def main_loop(config):
    """
    Основной цикл приложения.

    :param config: конфигурация
    :type config: Config

    Алгоритм:
     * Открываем соединение с tarantool.queue, использую config.QUEUE_* настройки.
     * Создаем пул обработчиков.
     * Создаем очередь куда обработчики будут помещать выполненные задачи.
     * Пока количество обработчиков <= config.WORKER_POOL_SIZE, берем задачу из tarantool.queue
       и запускаем greenlet для ее обработки.
     * Посылаем уведомления о том, что задачи завершены в tarantool.queue.
     * Спим config.SLEEP секунд.
    """
    logger.info('Connect to queue server on {host}:{port} space #{space}.'.format(
        host=config.QUEUE_HOST, port=config.QUEUE_PORT, space=config.QUEUE_SPACE
    ))
    queue = tarantool_queue.Queue(
        host=config.QUEUE_HOST, port=config.QUEUE_PORT, space=config.QUEUE_SPACE
    )

    logger.info('Use tube [{tube}], take timeout={take_timeout}.'.format(
        tube=config.QUEUE_TUBE,
        take_timeout=config.QUEUE_TAKE_TIMEOUT
    ))

    tube = queue.tube(config.QUEUE_TUBE)

    logger.info('Create worker pool[{size}].'.format(size=config.WORKER_POOL_SIZE))
    worker_pool = Pool(config.WORKER_POOL_SIZE)

    processed_task_queue = gevent_queue.Queue()

    logger.info('Run main loop. Worker pool size={count}. Sleep time is {sleep}.'.format(
        count=config.WORKER_POOL_SIZE, sleep=config.SLEEP
    ))

    while run_application:
        free_workers_count = worker_pool.free_count()

        logger.debug('Pool has {count} free workers.'.format(count=free_workers_count))

        for number in xrange(free_workers_count):
            logger.debug('Get task from tube for worker#{number}.'.format(number=number))

            task = tube.take(config.QUEUE_TAKE_TIMEOUT)

            if task:
                logger.info('Start worker#{number} for task id={task_id}.'.format(
                    task_id=task.task_id, number=number
                ))

                worker = Greenlet(
                    notification_worker,
                    task,
                    processed_task_queue,
                    timeout=config.HTTP_CONNECTION_TIMEOUT,
                    verify=False
                )
                worker_pool.add(worker)
                worker.start()

        done_with_processed_tasks(processed_task_queue)

        sleep(config.SLEEP)
        if break_func_for_test():
            break
    else:
        logger.info('Stop application loop.')
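
notification_worker and done_with_processed_tasks are referenced by main_loop but not shown in this example. A minimal sketch of what they might look like, assuming the tarantool_queue task objects expose data/ack()/release() and that each task payload carries a callback URL (both assumptions):

import requests

def notification_worker(task, processed_task_queue, timeout=30, verify=False):
    # Hypothetical: deliver the notification over HTTP, then queue the verdict.
    try:
        requests.post(task.data.get('callback_url'), data=task.data,
                      timeout=timeout, verify=verify)
        processed_task_queue.put((task, 'ack'))
    except requests.RequestException:
        processed_task_queue.put((task, 'release'))

def done_with_processed_tasks(processed_task_queue):
    # Drain finished tasks and acknowledge or release them in tarantool.queue.
    while not processed_task_queue.empty():
        task, verdict = processed_task_queue.get()
        if verdict == 'ack':
            task.ack()
        else:
            task.release()
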
Esempio n. 34
0
class Worker:
    def __init__(self, seeds, done_que, run_que):

        self.showpercounts = 10
        self.timeout = 5
        self.starttime = time.time()
        self.oldtime = 0

        self.quit = 0
        self.https_enable = 0

        self.run_que = run_que
        self.done_que = done_que
        self.tasks = []
        self.done = 1

        self.errdone = set()
        self.err = Error()

        self.loadstate()

        self.blacklist = set(
            ('.blog.', '.taobao.com', '.baidu.com', '.edu', '.gov', '.mil',
             'mail', '.google', 'weibo.com', 't.cn', 'wikipedia', 'facebook',
             'twitter', 'dropbox'))
        self.allowdDomain = set(('com', 'net', 'org', 'cn', 'info', 'biz',
                                 'me', 'name', 'cc', 'tv'))

        self.httpget = self.httpget_requests  # down method self.httpget_requests | httpget_curl

        self.poolsize = 60
        self.poolmaxfree = 20
        self.freecount = 0
        self.down_pool = Pool(size=self.poolsize)

        self.totalnettime = 0
        self.cbcputime = 0
        self.totaldownsize = 0

        self.curspeed = 0

        self.debugnosave = 1
        self.tt = 1

        self.done_sites_fname = 'done_sites.bin'
        try:
            self.bfdone = BloomFilter.open(self.done_sites_fname)
        except:
            self.bfdone = BloomFilter(2**23, 10**(-5),
                                      self.done_sites_fname)  #8M

        if self.run_que.qsize() == 0:
            for seed in seeds:
                self.run_que.put(seed.split("http://")[1])

        if self.https_enable == 0:
            self.urlpatern = re.compile(r'href=["\']http://([^/?#\"\']+)',
                                        re.I)
        else:
            self.urlpatern = re.compile(r'href=["\']http[s]?://([^/?#\"\'"]+)',
                                        re.I)

    def cb_httpget(self, data=None):

        if not data:
            return
        seed, err, headers, content = data
        st = time.time()

        if err:
            self.handle_error(err, seed)
            return

        if self.https_enable == 0:
            seed = seed[7:]

        self.bfdone.add(seed)
        self.done += 1

        data = {'seed': seed, 'headers': headers, 'content': content}

        dat = cPickle.dumps(data)
        self.done_que.put(dat)

        et = time.time()
        self.cbcputime += (et - st)
        #self.tt=(et-st)

        if self.done % self.showpercounts == 0:
            self.out(seed)
            pass

    def out(self, seed):

        spendtime = time.time() - self.starttime
        spendtime = 1 if spendtime == 0 else spendtime
        nowh = str(int(spendtime) / 3600) + ":" if spendtime > 3600 else ""
        now = "%s%02d:%02d" % (nowh, spendtime % 3600 / 60, spendtime % 60)
        print "%s D:%-4d R:%-7d [Speed: T%.2f/s C%3d/s A%.2f] CB:%0.4f Active:%d %s %s" % (now, (self.done), self.run_que.qsize(), \
         (self.done)/(spendtime+self.oldtime), self.curspeed, self.tt, self.totalnettime / self.done ,self.poolsize-self.freecount, str(self.err), seed )

    def work(self):

        while self.quit == 0:

            st = time.time()
            curdone = self.done

            self.freecount = self.down_pool.free_count()

            if self.freecount > self.poolmaxfree:
                self.tasks = []
                minlen = min(self.freecount + 1, self.run_que.qsize())
                #if minlen <=0:break

                for i in range(minlen):
                    stt = time.time()
                    url = self.run_que.get()
                    ett = time.time()
                    if url in self.bfdone:  # 5%-10%
                        continue

                    url = "http://" + url
                    self.tasks.append(url)

                for url in self.tasks:
                    self.down_pool.apply_async(self.httpget, (url, ),
                                               callback=self.cb_httpget)

            time.sleep(0.1)
            et = time.time()
            self.curspeed = (self.done - curdone) / (et - st)
            #self.tt = (et-st)

        self.down_pool.join()
        print "All OVER"

    def handle_error(self, e, url):

        if e.find('DNSError') > 0:
            self.err.dns += 1
            self.err.rdns.append(url)
        elif e.find('reset') > 0:  #Connection reset
            self.err.reset += 1
            self.err.rreset.append(url)
        elif e.find('Max retries') > 0 or e.find('Connection aborted') > 0:  # connection timed out or aborted
            self.err.conntimeout += 1
            self.err.rconntimeout.append(url)
        elif e.find('refused') > 0:  #Connection refused
            self.err.refuse += 1
            self.err.rrefuse.append(url)

        else:
            self.err.others += 1
            self.err.rothers.append(url)
            print "Error", url, e

    # requests performed better in testing
    def httpget_requests(self, url):

        st = time.time()
        con = ""
        e = ""
        res_headers = ""
        headers = {
            'Accept-Language':
            'zh-CN,zh;q=0.8,zh-TW;q=0.6',
            'Accept-Encoding':
            'gzip,deflate',
            'Connection':
            'close',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
        }

        res = None
        try:
            # todo: resolve the site's IP via DNS before issuing the request
            req = requests
            req.max_redirects = 1
            res = req.get(url, timeout=(3, 2), headers=headers)
            if self.https_enable == 0 and res.url.lower().startswith('http:'):
                if 'content-type' not in res.headers.keys(
                ) or 'html' not in res.headers['content-type']:
                    return None
                con = res.content

            res.close()

        except KeyboardInterrupt:
            raise
        except Exception as e:
            e = str(e)
            if res:
                res.close()

            return url, e, None, None

        et = time.time()
        self.totalnettime += (et - st)
        self.tt = (et - st)
        return url, e, res.headers, con

    def savestate(self):

        self.quit = 1
        now = time.time()
        self.oldtime += (now - self.starttime)

        # should hold the signal until the data processing is done

        with open('state.txt', 'wb') as f:
            f.write(str(self.oldtime) + '\n')
            # tasks run_queue done
            f.write(str(len(self.tasks)) + '\n')
            for t in self.tasks:
                f.write(t + '\n')
            l = self.run_que.qsize()
            f.write(str(l) + '\n')
            while l > 0:
                f.write(self.run_que.pop() + '\n')
                l -= 1
            f.write(str((self.done)) + '\n')

        with open('err_records.pack', 'wb') as f:
            cPickle.dump(self.err, f, 2)

        print time.strftime("%Y-%m-%d %H:%M:%S",
                            time.localtime()), " Save state successfully."
        exit(0)

    def loadstate(self):

        try:
            with open('state.txt') as f:
                self.oldtime = float(f.readline())
                tasks = int(f.readline())
                for i in xrange(tasks):
                    self.run_que.add(f.readline().rstrip('\n'))

                runnings = int(f.readline())
                for i in xrange(runnings):
                    self.run_que.add(f.readline().rstrip('\n'))

                self.done = int(f.readline())

            with open('err_records.pack', 'rb') as f:
                self.err = cPickle.load(f)

            print time.strftime("%Y-%m-%d %H:%M:%S",
                                time.localtime()), " Load state successfuly."
        except Exception as e:
            print e
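
The Worker above is driven from the outside: a seed list plus two queue-like objects. A minimal, hypothetical wiring is sketched below; note that savestate()/loadstate() also call pop()/add() on run_que, so the original project presumably uses a custom queue class rather than the plain gevent Queue used here.

from gevent.queue import Queue

seeds = ["http://example.com"]
run_que = Queue()   # URLs waiting to be fetched
done_que = Queue()  # pickled {'seed', 'headers', 'content'} payloads for a consumer

worker = Worker(seeds, done_que, run_que)
worker.work()       # blocks; feeds down_pool from run_que and fills done_que
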
class FlowController:
    def __init__(self, base_data, plugin_data):
        #
        self.flow_init_result = True
        # Split the basic test task data into separate variables
        self.base_data = base_data
        self.base_task_id = base_data['task_id']
        self.base_exc_times = base_data['exc_times']
        self.base_vuser_num = base_data['v_user']
        self.plugin_data = plugin_data
        self.worker_info_id = app_config.getint("worker", "id")
        self.worker_info = {"id": self.worker_info_id}
        self.gevent_pool = None
        http_tell_test_task_status(task_id=self.base_task_id, status=2)
        self.parameters_storage = ParametersStorage()
        # Instantiate the log controllers
        self.init_log_controller = SyncLogController('tasklog',
                                                     self.base_task_id,
                                                     '_init')
        if self.init_log_controller.log_pool_make_result:
            app_logger.debug('Test task ID:%d base log controller initialized successfully' % self.base_task_id)
            self.trans_init_log('Base log controller initialized successfully')
        else:
            app_logger.error('Test task ID:%d base log controller initialization failed' % self.base_task_id)
            self.flow_init_result = False
        self.run_log_controller = AsyncLogController('tasklog',
                                                     self.base_task_id, '_run')
        if self.run_log_controller.log_pool_make_result:
            app_logger.debug('Test task ID:%d run log controller initialized successfully' % self.base_task_id)
            self.trans_init_log('Run log controller initialized successfully')
        else:
            app_logger.error('Test task ID:%d run log controller initialization failed' % self.base_task_id)
            self.flow_init_result = False
        if self.flow_init_result:
            # write some environment information
            self.trans_init_log("Starting the test task")
            # recurse over the raw plugin data
            self.trans_init_log("Preparing to initialize each virtual user's plugin tree")
            # self.recurse_plugin_tree(plugin_data[0])
            # self.trans_init_log("Plugin and flow controller initialization finished")
        else:
            http_tell_test_task_status(task_id=self.base_task_id, status=-2)

    def trans_init_log(self, msg, level=None):
        log = "%s %s Worker:%d " % (
            datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f'),
            'INFO' if level is None else 'ERROR', self.worker_info_id) + msg
        self.init_log_controller.trans(log)

    def init_plugin_tree(self, tree_data, vuser_index=None):
        # Only used the first time the plugin tree is initialized, for data validation
        not vuser_index and self.trans_init_log("Preparing to initialize the plugin tree")

        def recurse_plugin_tree(_data, parent_node=None):
            """
            递归初始化插件树
            :param _data: 插件原始数据
            :param parent_node: 父级节点实例
            :return: 无返回
            """
            # 对于暂不支持的插件,忽略其初始化
            if _data['originalId'] in all_plugins:
                if _data['status'] is True:
                    self_plugin = all_plugins[_data['originalId']](
                        base_data=self.base_data,
                        plugin_data=_data,
                        worker_info=self.worker_info,
                        vuser_index=vuser_index if vuser_index else 0,
                        parent_node=parent_node,
                        init_log_ctrl=self.init_log_controller,
                        run_log_ctrl=self.run_log_controller,
                        parameter_ctrl=self.parameters_storage)
                    if not self_plugin.plugin_check_result:
                        self.flow_init_result = False
                    not vuser_index and self.trans_init_log(
                        "Plugin '%s' initialization result: %s" %
                        (self_plugin.plugin_title,
                         'success' if self_plugin.plugin_check_result else
                         ('failed, %s' % self_plugin.plugin_check_log)))
                    if "children" in _data:
                        for child in _data["children"]:
                            recurse_plugin_tree(child, self_plugin)
                    if parent_node is None:
                        return self_plugin
                    else:
                        if self_plugin.__class__.__bases__[0] in [
                                ConfigurationPlugin, ParameterPlugin
                        ]:
                            parent_node.plugins_configuration.append(
                                self_plugin)
                        elif self_plugin.__class__.__bases__[0] in [
                                PreprocessorPlugin
                        ]:
                            parent_node.plugins_preprocessor.append(
                                self_plugin)
                        elif self_plugin.__class__.__bases__[0] in [
                                ControllerPlugin, RequestPlugin, TimerPlugin
                        ]:
                            parent_node.plugins_common.append(self_plugin)
                        elif self_plugin.__class__.__bases__[0] in [
                                AssertionPlugin
                        ]:
                            parent_node.plugins_assertion.append(self_plugin)
                        elif self_plugin.__class__.__bases__[0] in [
                                PostprocessorPlugin
                        ]:
                            parent_node.plugins_postprocessor.append(
                                self_plugin)
            else:
                not vuser_index and self.trans_init_log(
                    "Plugin '%s' initialization result: %s" % (_data['title'], 'failed, plugin not supported yet'))
                self.flow_init_result = False

        plugin_tree = recurse_plugin_tree(tree_data)
        not vuser_index and self.trans_init_log("Plugin tree initialization complete")
        return plugin_tree

    def vuser_excute(self, tree):
        # Sharing self.base_exc_times between virtual users would make the remaining execution count wrong
        base_exc_times = self.base_exc_times
        # number of times to execute
        while base_exc_times > 0:
            tree.run_test()
            base_exc_times -= 1

    def init_vusers(self):
        # First build one plugin tree from the raw data for a basic check
        self.init_plugin_tree(self.plugin_data[0])
        # If the basic initialization failed, do not touch the greenlet pool
        if self.flow_init_result:
            # Initialize the greenlet pool
            try:
                self.gevent_pool = GeventPool(self.base_vuser_num)
            except Exception as e:
                msg = 'Failed to create the virtual user pool for the test task: %s' % repr(e)
                self.flow_init_result = False
                app_logger.error(msg)
                self.trans_init_log(msg)
            else:
                msg = 'Virtual user pool for the test task created successfully'
                app_logger.debug(msg)
                self.trans_init_log(msg)
                vuser_index = 1
                free_count = self.gevent_pool.free_count()
                while free_count > 0:
                    # Each virtual user gets its own plugin tree, so they do not interfere with each other
                    plugin_tree = self.init_plugin_tree(
                        self.plugin_data[0], vuser_index)
                    self.gevent_pool.spawn(self.vuser_excute, plugin_tree)
                    self.trans_init_log("虚拟用户%d准备完毕" % vuser_index)
                    vuser_index += 1
                    free_count -= 1

    def run(self):
        # During the debugging phase, report completion right away
        http_tell_test_task_status(task_id=self.base_task_id, status=3)
        self.gevent_pool.join()
        self.run_log_controller.cancel()
        self.trans_init_log("测试结束")
        http_tell_test_task_status(task_id=self.base_task_id, status=10)
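
A hypothetical driver for the controller above. The base_data keys and the plugin payload shape are taken from __init__ and recurse_plugin_tree, but the 'originalId' value must match an entry in all_plugins, and the module-level dependencies (app_config, http_tell_test_task_status, the log controllers) still need to be configured.

base_data = {'task_id': 1, 'exc_times': 10, 'v_user': 5}
plugin_data = [{
    'originalId': 'http_request',   # assumed plugin id, must exist in all_plugins
    'status': True,
    'title': 'demo request',
    'children': [],
}]

controller = FlowController(base_data, plugin_data)
if controller.flow_init_result:
    controller.init_vusers()   # one plugin tree and one greenlet per virtual user
    controller.run()           # joins the pool and reports the final status
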
Esempio n. 36
0
            break
    #print "User: {user}; playtime: {playtime}; WaitTime: {wait}; Action:{action}".format(user=uid, playtime=playing_time,


#                                                                        wait=sleep_time, action=action)
    gevent.sleep(sleep_time)
    if action == "stop":
        inst.stop_instance()
    elif action == "noinput":
        inst.notify_instance('20')
    elif action == "crash":
        inst.notify_instance('11')
    else:
        pass
    print getcurrent()

pool = Pool(parallen)
pool.imap(cloud_play, range(1, 400))

weight['overtime'] = 0

while now_time < end_time - 30000:
    time.sleep(2)

    free_num = pool.free_count()
    print "==========", free_num
    if free_num > 0:
        pool.imap(cloud_play, range(end_num, end_num + free_num))
        end_num += free_num
    now_time = int(time.time() * 1000)
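
The loop above keeps the pool saturated by calling imap() again whenever free_count() reports idle slots, but cloud_play, inst and the timing variables are not shown. The self-contained sketch below reproduces the same keep-the-pool-topped-up idea with a stand-in task.

import time
import gevent
from gevent.pool import Pool

def play(user_id):
    gevent.sleep(1)          # stand-in for one simulated play session

pool = Pool(50)
next_id = 1
deadline = time.time() + 10  # run the load for ten seconds

while time.time() < deadline:
    for _ in xrange(pool.free_count()):
        pool.spawn(play, next_id)   # refill every idle slot
        next_id += 1
    gevent.sleep(0.5)

pool.join()
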
Esempio n. 37
0
#!/usr/bin/python

import time
import random

import gevent
from gevent import Greenlet
from gevent.pool import Pool


def thrFunc(n):
    print "sleep %d seconds start.\n" %n
    gevent.sleep(n)
    print "sleep %d seconds end.\n" %n

threadPool = Pool(size=3)
while True:
    sec = random.randint(3,6)
    #gThr = Greenlet(thrFunc,sec)
    #gThr.start()
    #gThr.join()
    print "+++free:",threadPool.free_count()
    threadPool.spawn(thrFunc,sec)
    #threadPool.apply_async(thrFunc,sec)
Esempio n. 38
0
class ArticalSpider(object):
    """协程捕捉URL爬虫并解析html,将结果存入数据库
    maxsize: 队列存储的最大值(默认为1000)
    poolSize:协程池最大同时激活greenlet个数(默认为5个)
    """
    def __init__(self):
        self.evt = Event()  # wait for initialization to complete
        self.initConfig()  # load the configuration file
        self.initModules()  # initialize modules

        self.q = Queue(maxsize=self.maxsize)  # bounded queue
        self.initQueue()  # initialize the queue

        self.crawlUrlsCount = 0  # number of links found so far
        self.crawlerID = 0  # greenlet ID counter
        self.pool = Pool(self.poolSize)  # greenlet pool
        self.isInitializeCompletely = False  # whether initialization has finished

        self.startTime = None  # crawler start time

    def initModules(self):
        """初始化模块"""
        logger.info('Initializing modules...')
        self.htmlParser = HtmlParser()  # 加载智能解析模块
        self.sqlManager = SQLManager()  # 加载数据库模块
        logger.info('Reading url md5 from mysql...')
        self.urlDict = self.sqlManager.getAllMd5()  # 加载已解析URL字典

    def initConfig(self):
        """读取配置文件信息"""
        logger.info('Initializing config...')
        with open('data.conf') as json_file:
            data = json.load(json_file)
            self.maxsize = data['maxUrlQueueSize']  # URL队列最大存储值
            self.poolSize = data['poolSize']  # 协程池最大同时激活greenlet个数
            self.fileName = data['urlQueueFileName']  # 队列url的保存文件名
            self.startUrls = data['startUrls']  # 队列初始化url
            self.filterUrlsRegular = data['filterUrlsRegular']  # 过滤的url
            self.saveTime = data['saveTime']  # 队列url定时保存到本地文件

    def initQueue(self):
        """初始化队列,提供起始url列表

        :param urls: url列表
        :return:
        """
        self.loadLastUrlQueue()
        for url in self.startUrls[:self.maxsize]:
            self.q.put(url)
        self.isInitializeCompletely = True
        self.evt.set()

    def loadLastUrlQueue(self):
        """加载上次保存的队列url"""
        logger.info('Initializing queue...')
        hasLastUrls = False
        if not os.path.exists(self.fileName): return hasLastUrls
        with open(self.fileName, 'rb') as f:
            for url in pickle.load(f)[:self.maxsize - 100]:
                hasLastUrls = True
                self.q.put(url.strip())  # note: strip the whitespace
        return hasLastUrls

    def getCrawlUrlsCount(self):
        """返回已捕捉到的URL数量"""
        return self.crawlUrlsCount

    def getQueueSize(self):
        """返回当前队列中URL数量"""
        return self.q.qsize()

    def saveQueueUrls(self):
        """将队列内容拷贝到文件"""
        # 拷贝队列进行遍历
        logger.info('Save queue urls')
        with open(self.fileName, 'wb') as f:
            urls = list(self.q.queue)
            pickle.dump(urls, f)

    def crawlURL(self, crawlerID):
        """每个工作者,搜索新的url"""
        # 为了减少协程的切换,每个新建的工作者会不断查找URL,直到队列空或满
        # 实际上因为有界队列的原因,协程仍然会不断切换
        while True:
            if not self.isInitializeCompletely:  # wait if initialization has not finished yet
                self.evt.wait()
            # periodically save the queue data so it can be restored next time
            if time.time() - self.startTime > self.saveTime:
                self.saveQueueUrls()
                self.startTime = time.time()

            gevent.sleep(random.uniform(0, 1))  # avoid crawling too fast
            try:
                url = self.q.get(timeout=0.1)  # releases the current greenlet when the queue is empty
                md5_url = MD5(url)
                if md5_url in self.urlDict: continue  # skip if already seen
                self.urlDict[md5_url] = True  # add to the dict

                headers = {
                    'User-Agent':
                    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
                }
                r = requests.get(url, timeout=5, headers=headers)
                if r.status_code == 200:
                    if r.encoding == 'ISO-8859-1':
                        charset = self.detCharset(r.text)
                        if charset != "" and charset.lower() in [
                                'utf-8', 'gb2312', 'gbk'
                        ]:
                            r.encoding = charset
                        else:
                            r.encoding = chardet.detect(
                                r.content)['encoding']  # determine the page encoding

                    # insert into the database
                    self.insertMysql(r.text, url, MD5(url))

                    # look for the next URLs
                    for link in re.findall('<a[^>]+href="(http.*?)"', r.text):
                        if len(self.filterUrlsRegular) != 0:
                            for filterUrl in self.filterUrlsRegular:
                                if filterUrl in link:
                                    # only add the URL while the queue is below its maximum size
                                    self.q.put(
                                        link.strip(),
                                        timeout=0.1)  # releases the current greenlet when the queue is full
                                    self.crawlUrlsCount += 1
                                    break
                        else:
                            if len(link.strip()) != 0:
                                self.q.put(link.strip(), timeout=0.1)
                                self.crawlUrlsCount += 1

                else:
                    logger.warning('Request error status: ' +
                                   str(r.status_code) + ': ' + url)
                    # a retry could be attempted here (omitted)

            except Empty:  # queue was empty on q.get()
                # logger.info('URL Queue is Empty! URLSpider-' + str(crawlerID) + ': stopping crawler...')
                break
            except Full:  # queue was full on q.put()
                # logger.info('URL Queue is Full! URLSpider-' + str(crawlerID) + ': stopping crawler...')
                break
            except requests.exceptions.ConnectionError:  # too many connections; sleep for a while
                logger.warning('Connection refused')
                time.sleep(3)
            except requests.exceptions.ReadTimeout:  # timed out
                logger.warning('Request readTimeout')
                # a retry could be attempted next (omitted)

    def insertMysql(self, html, url, md5):
        """将解析结果插入队列"""
        parseDict = self.htmlParser.extract_offline(html)
        content = parseDict['content']
        description = parseDict['description']
        keyword = parseDict['keyword']
        title = parseDict['title']
        # insert into the database
        if content != "":
            self.sqlManager.insert(
                Artical(content=content,
                        title=title,
                        keyword=keyword,
                        description=description,
                        url=url,
                        md5=md5))
            logger.info('Insert Mysql: ' + url)

    def detCharset(self, html):
        """检测网页编码"""
        charsetPattern = re.compile(
            '<\s*meta[^>]*?charset=["]?(.*?)"?\s*[/]>?', re.I | re.S)
        charset = charsetPattern.search(html)
        if charset: charset = charset.groups()[0]
        else: charset = ""
        return charset

    def run(self):
        """开启协程池,运行爬虫,在队列中无url时退出捕获"""
        if self.q.qsize() == 0:
            logger.error('Please init Queue first (Check your .conf file)')
            return
        logger.info('Starting crawler...')
        self.startTime = time.time()
        while True:
            # stop when no greenlet is working and the queue has no URLs left
            if self.q.empty() and self.pool.free_count() == self.poolSize:
                break

            # Each round, spawn as many greenlets as there are URLs in the queue
            # If the pool can run fewer greenlets than there are URLs, spawn as many as the pool allows
            # This keeps the pool at its maximum number of active greenlets
            for _ in range(min(self.pool.free_count(), self.q.qsize())):
                self.crawlerID += 1
                self.pool.spawn(self.crawlURL, self.crawlerID)

            # yield to other greenlets (switching otherwise only happens on I/O)
            gevent.sleep(0.1)
        logger.warning('All crawler stopping...')
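
initConfig() expects a JSON file named data.conf with the keys read above. The values below are illustrative only, and HtmlParser/SQLManager still have to be importable for the spider to start.

import json

sample_config = {
    "maxUrlQueueSize": 1000,
    "poolSize": 5,
    "urlQueueFileName": "url_queue.pkl",
    "startUrls": ["http://example.com"],
    "filterUrlsRegular": ["example.com"],
    "saveTime": 300,
}
with open('data.conf', 'w') as f:
    json.dump(sample_config, f, indent=4)

spider = ArticalSpider()
spider.run()   # returns once the queue is empty and every greenlet is idle
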
Esempio n. 39
0
# coding=gbk
import gevent
from gevent.queue import Queue
from gevent.pool import Pool
from gevent import getcurrent

def DoSomething():
	print "thread %s " % id(getcurrent())
	gevent.sleep(3)

# Observed in this test: a greenlet add()ed after the pool size limit is reached starts running anyway, so effectively pool size + 1 = the capacity limit
# greenlet objects can be reused in this sliding-window pattern
pool = Pool(2) # can run n + 1 tasks in parallel
print pool.free_count()
pool.add(gevent.spawn(DoSomething))
pool.join()

raw_input("waiting...")
# print "stage"
# for i in range(10):
# 	pool.add(gevent.spawn(DoSomething))
#pool.join()
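
The "size + 1" observation above follows from the fact that gevent.spawn() starts the greenlet immediately and pool.add() only blocks the caller once the pool is full, whereas pool.spawn() waits for a free slot before starting anything. A small sketch of the difference:

import gevent
from gevent.pool import Pool

def task(n):
    gevent.sleep(1)

pool = Pool(2)
for i in range(4):
    pool.spawn(task, i)        # the third and fourth call block until a slot frees up
    print pool.free_count()    # remaining free slots after each spawn
pool.join()
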
Esempio n. 40
0
def main_loop(config):
    """
    Основной цикл приложения.

    :param config: конфигурация
    :type config: Config

    Алгоритм:
     * Открываем соединение с tarantool.queue, использую config.QUEUE_* настройки.
     * Создаем пул обработчиков.
     * Создаем очередь куда обработчики будут помещать выполненные задачи.
     * Пока количество обработчиков <= config.WORKER_POOL_SIZE, берем задачу из tarantool.queue
       и запускаем greenlet для ее обработки.
     * Посылаем уведомления о том, что задачи завершены в tarantool.queue.
     * Спим config.SLEEP секунд.
    """
    logger.info(
        'Connect to queue server on {host}:{port} space #{space}.'.format(
            host=config.QUEUE_HOST,
            port=config.QUEUE_PORT,
            space=config.QUEUE_SPACE))
    queue = tarantool_queue.Queue(host=config.QUEUE_HOST,
                                  port=config.QUEUE_PORT,
                                  space=config.QUEUE_SPACE)

    logger.info('Use tube [{tube}], take timeout={take_timeout}.'.format(
        tube=config.QUEUE_TUBE, take_timeout=config.QUEUE_TAKE_TIMEOUT))

    tube = queue.tube(config.QUEUE_TUBE)

    logger.info(
        'Create worker pool[{size}].'.format(size=config.WORKER_POOL_SIZE))
    worker_pool = Pool(config.WORKER_POOL_SIZE)

    processed_task_queue = gevent_queue.Queue()

    logger.info(
        'Run main loop. Worker pool size={count}. Sleep time is {sleep}.'.
        format(count=config.WORKER_POOL_SIZE, sleep=config.SLEEP))

    while run_application:
        free_workers_count = worker_pool.free_count()

        logger.debug(
            'Pool has {count} free workers.'.format(count=free_workers_count))

        for number in xrange(free_workers_count):
            logger.debug('Get task from tube for worker#{number}.'.format(
                number=number))

            task = tube.take(config.QUEUE_TAKE_TIMEOUT)

            if task:
                logger.info(
                    'Start worker#{number} for task id={task_id}.'.format(
                        task_id=task.task_id, number=number))

                worker = Greenlet(notification_worker,
                                  task,
                                  processed_task_queue,
                                  timeout=config.HTTP_CONNECTION_TIMEOUT,
                                  verify=False)
                worker_pool.add(worker)
                worker.start()

        done_with_processed_tasks(processed_task_queue)

        sleep(config.SLEEP)
        if break_func_for_test():
            break
    else:
        logger.info('Stop application loop.')
class ArchivariusBridge(object):
    """Archivarius Bridge"""
    def __init__(self, config):
        self.config = config
        self.workers_config = {}
        self.log_dict = {}
        self.bridge_id = uuid.uuid4().hex
        self.api_host = self.config_get('resources_api_server')
        self.api_version = self.config_get('resources_api_version')

        # Workers settings
        for key in WORKER_CONFIG:
            self.workers_config[key] = (self.config_get(key)
                                        or WORKER_CONFIG[key])

        # Init config
        for key in DEFAULTS:
            value = self.config_get(key)
            setattr(self, key,
                    type(DEFAULTS[key])(value) if value else DEFAULTS[key])

        # Pools
        self.workers_pool = Pool(self.workers_max)
        self.retry_workers_pool = Pool(self.retry_workers_max)
        self.filter_workers_pool = Pool()

        # Queues
        self.api_clients_queue = Queue()
        if self.resource_items_queue_size == -1:
            self.resource_items_queue = Queue()
        else:
            self.resource_items_queue = Queue(self.resource_items_queue_size)
        if self.retry_resource_items_queue_size == -1:
            self.retry_resource_items_queue = Queue()
        else:
            self.retry_resource_items_queue = Queue(
                self.retry_resource_items_queue_size)

        # Default values for statistic variables
        for key in (
                'droped',
                'add_to_resource_items_queue',
                'add_to_retry',
                'exceptions_count',
                'not_found_count',
                'archived',
                'moved_to_public_archive',
                'dumped_to_secret_archive',
        ):
            self.log_dict[key] = 0

        if self.api_host != '' and self.api_host is not None:
            api_host = urlparse(self.api_host)
            if api_host.scheme == '' and api_host.netloc == '':
                raise ConfigError('Invalid \'resources_api_server\' url.')
        else:
            raise ConfigError('In config dictionary empty or missing'
                              ' \'resources_api_server\'')
        self.db = prepare_couchdb(self.couch_url, self.db_name, logger)
        self.archive_db = prepare_couchdb(self.couch_url, self.db_archive_name,
                                          logger)
        # TODO
        self.archive_db2 = prepare_couchdb(self.couch_url,
                                           self.db_archive_name + '_secret',
                                           logger)

        self.resources = {}
        for entry_point in iter_entry_points(
                'openprocurement.archivarius.resources'):
            self.resources[entry_point.name] = {
                'filter':
                entry_point.load(),
                'view_path':
                '_design/{}/_view/by_dateModified'.format(entry_point.name)
            }

    def create_api_client(self):
        client_user_agent = self.user_agent + '/' + self.bridge_id + '/' + uuid.uuid4(
        ).hex
        timeout = 0.1
        while True:
            try:
                api_client = APIClient(host_url=self.api_host,
                                       user_agent=client_user_agent,
                                       api_version=self.api_version,
                                       resource='RESOURCE',
                                       key=self.api_key)
                self.api_clients_queue.put({
                    'client': api_client,
                    'request_interval': 0
                })
                logger.info('Started api_client {}'.format(
                    api_client.session.headers['User-Agent']))
                break
            except RequestFailed as e:
                self.log_dict['exceptions_count'] += 1
                logger.error(
                    'Failed start api_client with status code {}'.format(
                        e.status_code))
                timeout = timeout * 2
                sleep(timeout)

    def fill_api_clients_queue(self):
        while self.api_clients_queue.qsize() == 0:
            self.create_api_client()

    def fill_resource_items_queue(self, resource):
        start_time = datetime.now(TZ)
        rows = self.db.iterview(self.resources[resource]['view_path'],
                                10**3,
                                include_docs=True)
        filter_func = partial(self.resources[resource]['filter'],
                              time=start_time)
        for row in ifilter(filter_func, rows):
            self.resource_items_queue.put({'id': row.id, 'resource': resource})
            self.log_dict['add_to_resource_items_queue'] += 1

    def queues_controller(self):
        while True:
            self.fill_api_clients_queue()
            #if self.workers_pool.free_count() > 0 and (self.resource_items_queue.qsize() > int((self.resource_items_queue_size / 100) * self.workers_inc_threshold)):
            if self.resource_items_queue.qsize(
            ) > 0 and self.workers_pool.free_count() > 0:
                w = ArchiveWorker.spawn(self.api_clients_queue,
                                        self.resource_items_queue, self.db,
                                        self.archive_db, self.archive_db2,
                                        self.workers_config,
                                        self.retry_resource_items_queue,
                                        self.log_dict)
                self.workers_pool.add(w)
                logger.info('Queue controller: Create main queue worker.')
            #elif self.resource_items_queue.qsize() < int((self.resource_items_queue_size / 100) * self.workers_dec_threshold):
            elif self.resource_items_queue.qsize() == 0:
                if len(self.workers_pool) > self.workers_min:
                    wi = self.workers_pool.greenlets.pop()
                    wi.shutdown()
                    logger.info('Queue controller: Kill main queue worker.')
            logger.info('Main resource items queue contains {} items'.format(
                self.resource_items_queue.qsize()))
            logger.info('Retry resource items queue contains {} items'.format(
                self.retry_resource_items_queue.qsize()))
            logger.info(
                'Status: add to queue - {add_to_resource_items_queue}, add to retry - {add_to_retry}, moved to public archive - {moved_to_public_archive}, dumped to secret archive - {dumped_to_secret_archive}, archived - {archived}, exceptions - {exceptions_count}, not found - {not_found_count}'
                .format(**self.log_dict))
            sleep(self.queues_controller_timeout)

    def gevent_watcher(self):
        self.fill_api_clients_queue()
        if not self.resource_items_queue.empty() and len(
                self.workers_pool) < self.workers_min:
            w = ArchiveWorker.spawn(self.api_clients_queue,
                                    self.resource_items_queue, self.db,
                                    self.archive_db, self.archive_db2,
                                    self.workers_config,
                                    self.retry_resource_items_queue,
                                    self.log_dict)
            self.workers_pool.add(w)
            logger.info('Watcher: Create main queue worker.')
        if not self.retry_resource_items_queue.empty() and len(
                self.retry_workers_pool) < self.retry_workers_min:
            w = ArchiveWorker.spawn(self.api_clients_queue,
                                    self.retry_resource_items_queue, self.db,
                                    self.archive_db, self.archive_db2,
                                    self.workers_config,
                                    self.retry_resource_items_queue,
                                    self.log_dict)
            self.retry_workers_pool.add(w)
            logger.info('Watcher: Create retry queue worker.')

    def run(self):
        logger.info('Start Archivarius Bridge',
                    extra={'MESSAGE_ID': 'edge_bridge_start_bridge'})
        for resource in self.resources:
            self.filter_workers_pool.spawn(self.fill_resource_items_queue,
                                           resource=resource)
        spawn(self.queues_controller)
        while True:
            self.gevent_watcher()
            if len(self.filter_workers_pool) == 0 and len(
                    self.workers_pool) == 0 and len(
                        self.retry_workers_pool) == 0:
                break
            sleep(self.watch_interval)

    def config_get(self, name):
        try:
            return self.config.get('main', name)
        except NoOptionError:
            return
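
config_get() only reads the [main] section of a ConfigParser object. A hypothetical minimal configuration is sketched below; option names beyond resources_api_server/resources_api_version come from DEFAULTS and WORKER_CONFIG, which are not shown in this example, so only the two options used directly above are set.

from ConfigParser import ConfigParser

config = ConfigParser()
config.add_section('main')
config.set('main', 'resources_api_server', 'https://api.example.org')  # illustrative value
config.set('main', 'resources_api_version', '2.4')                     # illustrative value

bridge = ArchivariusBridge(config)   # raises ConfigError if the url is invalid
bridge.run()
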
Esempio n. 42
0
class CoroutineWorker(Worker):

    DEFAULT_GREENLET_SIZE = 10 # control the pool size

    def __init__(self, cfg, file_logger=None, ppid=None, sockets=None):
        super(CoroutineWorker, self).__init__(cfg, file_logger, ppid, sockets)
        self.max_greenlets = int(self.cfg.max_greenlets or self.DEFAULT_GREENLET_SIZE)

    def patch(self):
        from gevent import monkey
        monkey.noisy = False

        # if the new version is used make sure to patch subprocess
        if gevent.version_info[0] == 0:
            monkey.patch_all()
        else:
            monkey.patch_all(subprocess=True)

    def init_process(self):
        super(CoroutineWorker, self).init_process()
        self.patch()
        self.pool = Pool(self.max_greenlets)
        self.mutex = threading.Semaphore()
        self._stop_event = threading.Event()

    def run(self):
        super(CoroutineWorker, self).run()
        while self.alive:
            if not self.pool.full():
                self.pool.spawn(self._run)
            self.file_logger.debug("pool greenlet size %d" % (self.pool.size - self.pool.free_count()))
            gevent.sleep(1.0)

        self._stop_event.wait()
        gevent.spawn(self.stop).join()

    def _run(self):
        if self.LISTENERS:
            while self.alive:
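                # rd_fds is shared by every _run greenlet, so the semaphore
                # serializes select() and the list updates around it.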
                self.mutex.acquire()
                ret = select.select(self.rd_fds, [], [], 1.0)
                self.file_logger.debug("Before: socket fd length: %d, greenlet:%d, listen in:%s" % (len(self.rd_fds), id(getcurrent()), self.LISTENERS[0] in self.rd_fds))
                if ret[0]:
                    sock = ret[0][0]
                    self.rd_fds.remove(sock)
                else:
                    sock = None
                self.mutex.release()
                if sock:
                    #for sock in ret[0]:
                    if sock in self.LISTENERS:
                        try:
                            client, addr = sock.accept()
                            client.setblocking(0)
                            close_on_exec(client)
                            self.rd_fds.append(client)
                        except socket.error as e:
                            if e.args[0] not in (errno.EAGAIN, errno.EWOULDBLOCK,
                                                 errno.ECONNABORTED):
                                self.file_logger.error(traceback.format_exc())

                        finally:
                            self.rd_fds.append(sock)
                    else:
                        r = self.handle_request(client=sock)
                        if r == -1:
                            sock.close()
                        else:
                            self.rd_fds.append(sock)

                if self.ppid and self.ppid != os.getppid():
                    self.file_logger.info("Parent changed, shutting down: %s", self)
                    return

        else:
            while self.alive:
                try:
                    self.handle_request()
                except:
                    self.file_logger.error(traceback.format_exc())

    def stop(self):
        Worker.stop(self)
        self.pool.join(timeout=1)

    def handle_quit(self, sig, frame):
        self.alive = False
        self._stop_event.set()
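
CoroutineWorker.run() only spawns another _run greenlet while the bounded pool has a free slot, and reports activity as pool.size - pool.free_count(). The following is a standalone sketch of that bounded-spawn loop, assuming a hypothetical handler() in place of _run:

import gevent
from gevent.pool import Pool

def handler(n):
    gevent.sleep(0.5)  # stand-in for the accept/handle_request work

pool = Pool(10)  # same idea as DEFAULT_GREENLET_SIZE
n = 0
while n < 25:
    if not pool.full():
        pool.spawn(handler, n)
        n += 1
    # Active greenlets, computed the same way as the debug log in run().
    print('active greenlets: %d' % (pool.size - pool.free_count()))
    gevent.sleep(0.1)
pool.join()
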
Example n. 43
0
class KittenServer(object):
    halting_signals = (
        signal.SIGINT,
        signal.SIGTERM,
    )

    def __init__(self, ns):
        self.ns = ns

        # Workers and queues
        self.pool = Pool(5)
        self.queue = Queue()

        # States
        self.working = None
        self.torn = False

        # Greenlets; to be populated when started
        self.listener = None
        self.worker = None

        self.log = logbook.Logger('Server-{0}'.format(self.ns.port))

    def start(self):
        self.setup()
        self.listener = gevent.spawn(self.listen_forever)
        self.worker = gevent.spawn(self.work_forever)

        return self.listener

    def stop(self, exit=True):
        self.log.warning('Stopping server')
        self.teardown(exit)

    def listen(self, socket):
        request = socket.recv_json()
        # Send the request for processing and handle any errors
        response = self.handle_request(request)
        socket.send_json(response)

        return True

    def listen_forever(self):
        try:
            socket = self.get_socket()
            while self.listen(socket):
                pass

        except Exception:
            self.log.exception('Server died.')

        finally:
            self.teardown()

    def teardown_listener(self):
        self.log.info('Stopping socket listener.')
        self.listener.kill(timeout=5)  # TODO: Configurable

    def handle_request(self, request):
        request = KittenRequest(request)
        self.queue.put(request)
        return request.ack()

    def work(self):
        if self.queue.empty():
            gevent.sleep(0.1)  # TODO: Configurable
            self.log.debug('Slept')
            return True

        request = self.queue.get()
        socket = self.get_socket(zmq.REQ, request.host)
        self.pool.spawn(request.process, socket)

        return True

    def work_forever(self):
        self.working = True

        while self.work():
            pass  # pragma: nocover

        self.working = False
        self.log.warning('Worker pool stopped.')

    def teardown_workers(self):
        free = self.pool.free_count()
        if free == self.pool.size:
            self.log.info('Workers idle. Killing without timeout.')
            self.pool.kill()
            return True

        timeout = 5  # TODO: Configurable
        count = self.pool.size - free
        self.log.info('Giving {0} requests {1}s to finish', count, timeout)
        self.pool.kill(timeout=timeout)
        self.log.info('Requests finished or timed out.')

    def get_socket(self, kind=zmq.REP, host=None):
        context = zmq.Context()
        socket = context.socket(kind)

        if not host:
            host = 'tcp://*:{0}'.format(self.ns.port)

        self.log.info(
            'Opening {1} socket on {0}',
            host,
            {zmq.REP: 'REP', zmq.REQ: 'REQ'}.get(kind, kind)
        )
        if kind == zmq.REP:
            socket.bind(host)
        else:
            socket.connect(host)

        return socket

    def setup(self):
        self.log.info('Setting up server')
        self.setup_signals()
        self.setup_pidfile()

    def teardown(self, exit=True):
        if self.torn:
            # The greenlets also react to the halting signals, so keep this
            # flag to make sure we do not run the teardown more than once.
            return False

        self.torn = True
        self.log.info('Tearing down server')
        self.teardown_workers()
        self.teardown_pidfile()
        self.teardown_listener()
        self.log.info('Server teardown complete.')

        if exit:
            self.log.info('Exiting.')
            sys.exit(0)

    def setup_signals(self):
        for sig in self.halting_signals:
            gevent.signal(sig, self.signal_handler)

    def signal_handler(self):
        self.log.warning('Received halting signal')
        self.stop(True)

    @property
    def pidfile(self):
        return conf.pidfile(self.ns.port)

    def setup_pidfile(self):
        pid = str(os.getpid())
        self.log.debug('Pid: {0}', pid)
        with open(self.pidfile, 'w') as pidfile:
            pidfile.write(pid)

    def teardown_pidfile(self):
        self.log.debug('Removing pidfile')
        os.remove(self.pidfile)
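
teardown_workers() above picks between an immediate kill and a timed kill by comparing pool.free_count() with pool.size. A reduced sketch of that shutdown decision, with a hypothetical slow_request() standing in for request.process:

import gevent
from gevent.pool import Pool

def slow_request(n):
    gevent.sleep(2)  # stand-in for request.process(socket)

def teardown(pool, timeout=5):
    free = pool.free_count()
    if free == pool.size:
        # Nothing is running, so there is no reason to wait.
        pool.kill()
        return
    busy = pool.size - free
    print('giving %d request(s) %ds to finish' % (busy, timeout))
    pool.kill(timeout=timeout)

pool = Pool(5)
for n in range(3):
    pool.spawn(slow_request, n)
gevent.sleep(0)  # let the greenlets start so the pool is not idle
teardown(pool, timeout=1)
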