コード例 #1
0
def do_by_gevent_pool(pool_size=100,
                      job_func=None,
                      loop_items=None,
                      timeout=None,
                      wait_timeout=5 * 60,
                      **kwargs):
    """Run *job_func* over *loop_items* concurrently on a gevent Pool.

    :param pool_size: maximum number of concurrent greenlets.
    :param job_func: callable invoked as ``job_func(item, **kwargs)``.
    :param loop_items: iterable of work items, or a zero-argument
        callable returning such an iterable.
    :param timeout: optional timeout forwarded to ``Pool.join``.
    :param wait_timeout: seconds to wait for a free pool slot before
        killing the pool's running greenlets to make room.
    :return: ``True`` when the pool joined cleanly, ``False`` on error,
        ``None`` when *job_func* or *loop_items* is missing.
    """
    if not job_func or not loop_items:
        return
    worker_pool = Pool(pool_size)
    # Accept either an iterable or a factory of one; this removes the
    # previously duplicated loop bodies for the two cases.
    items = loop_items() if callable(loop_items) else loop_items
    for item in items:
        # Back-pressure: block until a slot frees up; if nothing frees
        # within wait_timeout, kill stuck greenlets to make room.
        while worker_pool.full():
            try:
                worker_pool.wait_available(timeout=wait_timeout)
            except Timeout:
                worker_pool.kill()
        worker_pool.spawn(job_func, item, **kwargs)
    try:
        worker_pool.join(timeout=timeout)
        return True  # indicates processing completed
    except Exception:
        return False
コード例 #2
0
class GEventStatsdClient(StatsdClient):
    """Statsd client that ships stats over UDP via a gevent greenlet pool."""

    def __init__(self,
                 pool_size=None,
                 host=None,
                 port=None,
                 prefix=None,
                 sample_rate=None):
        """Build the gevent-enabled statsd client.

        :param pool_size: optional greenlet pool size; falls back to
            ``STATSD_GREEN_POOL_SIZE`` when falsy.
        :param host: hostname of the statsd server
        :param port: port of the statsd server
        :param prefix: user defined prefix
        :param sample_rate: rate at which stats are dropped
        """
        super(GEventStatsdClient, self).__init__(host, port, prefix,
                                                 sample_rate)
        size = pool_size or STATSD_GREEN_POOL_SIZE
        self._send_pool = Pool(size)
        self._socket = socket(AF_INET, SOCK_DGRAM)

    def _socket_send(self, stat):
        """Schedule a non-blocking UDP write of *stat*.

        Stats are silently dropped whenever the pool is saturated, so
        the calling greenlet never blocks.
        """
        if self._send_pool.full():
            return
        destination = (self._host, self._port)
        self._send_pool.spawn(self._socket.sendto, stat, destination)
コード例 #3
0
def process_jobs(todo, end):
    """Drain the *todo* job list through a gevent pool until *end*.

    :param todo: list of ``(func, args)`` pairs; consumed from the front.
    :param end: absolute ``time.time()`` deadline after which no new
        jobs are started (grace period end).
    :return: dict mapping still-pending/failed greenlets to their args.
    """
    pool = Pool(concurrency)
    jobs = {}

    try:
        while True:
            # Top up the pool while the grace period is still open.
            if not pool.full() and time.time() < end:
                try:
                    for _ in range(concurrency - len(jobs)):
                        func, args = todo.pop(0)  # raises IndexError if no more jobs todo
                        greenthread = pool.spawn(func, *args)
                        jobs[greenthread] = args
                except IndexError:
                    logging.warning("no more jobs todo")
            # Wait briefly; join() reports whether everything finished.
            done = pool.join(timeout=1)
            # Snapshot the items: the original iterated a dict view with
            # reversed() (a TypeError on py3) while also popping from the
            # dict inside the loop.  list() makes both safe.
            for greenthread, args in list(jobs.items()):
                if greenthread.successful():
                    logging.warning("success! args:{}, result:{}".format(args, greenthread.value))
                    jobs.pop(greenthread)
                elif greenthread.ready():
                    logging.warning("fail! args:{}, result:{}".format(args, greenthread.value))
            # if no more jobs and graceperiod ended then shutdown
            if time.time() > end and done:
                logging.warning("no more jobs, no more time")
                break
    except Exception:
        logging.exception("error")
    return jobs
コード例 #4
0
class GEventStatsdClient(StatsdClient):
    """ GEvent Enabled statsd client
    """
    def __init__(self, pool_size=None,
                 host=None, port=None, prefix=None, sample_rate=None):
        """
        Create GEvent enabled statsd client
        :param pool_size: Option size of the greenlet pool
            (falls back to STATSD_GREEN_POOL_SIZE when falsy)
        :param host: hostname for the statsd server
        :param port: port for the statsd server
        :param prefix: user defined prefix
        :param sample_rate: rate to which stats are dropped
        """
        super(GEventStatsdClient, self).__init__(host, port, prefix, sample_rate)
        self._send_pool = Pool(pool_size or STATSD_GREEN_POOL_SIZE)
        # Single UDP socket shared by all send greenlets.
        self._socket = socket(AF_INET, SOCK_DGRAM)

    def _socket_send(self, stat):
        """
        Override the subclasses send method to schedule a udp write.
        :param stat: Stat string to write
        """
        # if we exceed the pool we drop the stat on the floor
        if not self._send_pool.full():
            # We can't monkey patch this as we don't want to ever block the calling greenlet
            self._send_pool.spawn(self._socket.sendto, stat, (self._host, self._port))
コード例 #5
0
 def init_all_stock_tick(self):
     # Backfill tick data for every stock from 2015-01-01 up to today,
     # fanning the per-day work out over a small gevent pool.
     start_date = '2015-01-01'
     _today = datetime.now().strftime('%Y-%m-%d')
     num_days = delta_days(start_date, _today)
     # pd.date_range is fed an m/d/Y style start, so reformat start_date.
     start_date_dmy_format = time.strftime(
         "%m/%d/%Y", time.strptime(start_date, "%Y-%m-%d"))
     data_times = pd.date_range(start_date_dmy_format,
                                periods=num_days,
                                freq='D')
     date_only_array = np.vectorize(lambda s: s.strftime('%Y-%m-%d'))(
         data_times.to_pydatetime())
     # Most recent dates first.
     date_only_array = date_only_array[::-1]
     obj_pool = Pool(4)
     df = self.stock_info_client.get()
     # NOTE(review): Series.iteritems() was removed in pandas 2.0 --
     # confirm the pinned pandas version still provides it.
     for _, code_id in df.code.iteritems():
         # Reuse a cached stock object when available, else build one.
         _obj = self.stock_objs[
             code_id] if code_id in self.stock_objs else CStock(
                 self.dbinfo, code_id)
         for _date in date_only_array:
             if self.cal_client.is_trading_day(_date):
                 try:
                     # Crude back-pressure: drain the pool when it fills.
                     if obj_pool.full(): obj_pool.join()
                     obj_pool.spawn(_obj.set_ticket, _date)
                 except Exception as e:
                     logger.info(e)
     obj_pool.join()
     obj_pool.kill()
コード例 #6
0
ファイル: gevent_run.py プロジェクト: zhiiker/FarBox
def do_by_gevent_pool(pool_size=100,
                      job_func=None,
                      loop_items=None,
                      timeout=None,
                      wait_timeout=5 * 60,
                      callback_func=None,
                      **kwargs):
    """Run ``job_func(item, **kwargs)`` over *loop_items* on a gevent Pool.

    :param pool_size: maximum number of concurrent greenlets.
    :param job_func: callable applied to each item.
    :param loop_items: iterable of work items.
    :param timeout: optional timeout forwarded to the final ``Pool.join``.
    :param wait_timeout: seconds to wait for a free pool slot before
        killing running greenlets to make room.
    :param callback_func: optional zero-argument callable invoked after a
        clean join; any error it raises is swallowed.
    :return: ``True`` on success, ``False`` when the join fails, ``None``
        when *job_func* or *loop_items* is missing.
    """
    if not job_func or not loop_items:
        return
    worker_pool = Pool(pool_size)
    for item in loop_items:
        # Back-pressure: wait for capacity; on timeout, kill stragglers.
        while worker_pool.full():
            try:
                worker_pool.wait_available(timeout=wait_timeout)
            except Timeout:
                worker_pool.kill()
        worker_pool.spawn(job_func, item, **kwargs)
    try:
        worker_pool.join(timeout=timeout)
        if callback_func and hasattr(callback_func, "__call__"):
            try:
                callback_func()
            except:
                pass
        return True  # indicates processing completed
    except:
        return False
コード例 #7
0
class SocketPool(object):
    """Maintain up to 1000 greenlets, each polling one socket."""

    def __init__(self):
        # Cap concurrent listeners at 1000.  BUG FIX: the original also
        # called `self.pool.start()`; gevent's Group.start() requires a
        # greenlet argument, so the zero-argument call raised TypeError.
        # A gevent Pool needs no explicit start.
        self.pool = Pool(1000)

    def listen(self, socket):
        """Block this greenlet reading from *socket* forever."""
        while True:
            socket.recv()

    def add_handler(self, socket):
        """Spawn a listener greenlet for *socket*; raise when the pool is full."""
        if self.pool.full():
            raise Exception("At maximum pool size")
        else:
            # Otherwise run listen() for this socket in a new greenlet.
            self.pool.spawn(self.listen, socket)

    def shutdown(self):
        """Kill every listener greenlet in the pool."""
        self.pool.kill()
コード例 #8
0
 def collect_combination_runtime_data(self):
     # Run every combination object's run() concurrently, ten at a time.
     obj_pool = Pool(10)
     for code_id in self.combination_objs:
         try:
             # Crude back-pressure: drain the pool when it is saturated.
             if obj_pool.full(): obj_pool.join()
             obj_pool.spawn(self.combination_objs[code_id].run)
         except Exception as e:
             logger.info(e)
     obj_pool.join()
     obj_pool.kill()
コード例 #9
0
ファイル: gevent_core.py プロジェクト: abranches/pumba
class GeventExecutor(AbstractExecutor):
    """Executor that runs task instances on a gevent greenlet pool (python 2)."""

    def __init__(self, task_cls, max_threads, multiple_instances=False):
        super(GeventExecutor, self).__init__(task_cls)
        self._max_threads = max_threads
        self._multiple_instances = multiple_instances
        if multiple_instances:
            # One task object per greenlet, recycled through a queue.
            self._tasks_pool = Queue()
            for _ in xrange(max_threads):
                self._tasks_pool.put(task_cls())
        else:
            # A single task instance shared by every run.
            self._task = task_cls()
        self._thread_pool = Pool(size=max_threads)

    def setup_tasks(self):
        """Call setup() on every task instance this executor owns."""
        if self._multiple_instances:
            for task in self._tasks_pool.queue:
                task.setup()
        else:
            self._task.setup()

    def join(self, timeout=sys.maxint):
        # NOTE(review): `timeout` is accepted but never forwarded to the
        # pool join below -- confirm whether that is intentional.
        super(GeventExecutor, self).join()
        self._thread_pool.join()

    def available(self):
        """Return True when the pool can take another run."""
        is_it = not self._thread_pool.full()
        #if not is_it:
        #    gevent.sleep(0)
        # Yield to the hub so pending greenlets get a chance to progress.
        gevent.sleep(0)
        return is_it

    def wait_available(self):
        """Yield to the hub, then block until the pool has a free slot."""
        gevent.sleep(0)
        self._thread_pool.wait_available()

    def _run_task(self, run_id):
        # Schedule asynchronously; completion is reported via the
        # on_async_run_finished callback inside _run_on_thread_pool.
        self._thread_pool.apply_async(self._run_on_thread_pool, (run_id,))
        #gevent.sleep(0)

    def _run_on_thread_pool(self, run_id):
        """Run one task (borrowing a pooled instance if configured) and report."""
        try:
            if self._multiple_instances:
                try:
                    task = self._tasks_pool.get()
                    result = run_task_func_wrapper(task.run, run_id)
                finally:
                    # Always return the borrowed task instance to the pool.
                    self._tasks_pool.put(task)
            else:
                result = run_task_func_wrapper(self._task.run, run_id)
            self.on_async_run_finished(result)
        except:
            log.debug("DEUUU MEEERDA", exc_info=True)
コード例 #10
0
class RudiusAuthServer(DatagramServer):
    """UDP RADIUS authentication server backed by a bounded greenlet pool."""

    def __init__(self, adapter, host="0.0.0.0", port=1812, pool_size=32):
        DatagramServer.__init__(self, (host, port))
        self.pool = Pool(pool_size)
        self.adapter = adapter

    def handle(self, data, address):
        """Dispatch one datagram to the adapter, dropping it when saturated."""
        if self.pool.full():
            logger.error("radius auth workpool full")
        else:
            self.pool.spawn(self.adapter.handleAuth, self.socket, data, address)
コード例 #11
0
 def collect_stock_runtime_data(self):
     # For each tracked stock, fetch the latest tick data and feed it to
     # the stock object's run() inside a gevent pool.
     obj_pool = Pool(100)
     for code_id in self.stock_objs:
         try:
             # Crude back-pressure: drain the pool when it is saturated.
             if obj_pool.full(): obj_pool.join()
             ret, df = self.subscriber.get_tick_data(add_prifix(code_id))
             # ret == 0 signals a successful fetch here.
             if 0 == ret:
                 df = df.set_index('time')
                 df.index = pd.to_datetime(df.index)
                 obj_pool.spawn(self.stock_objs[code_id].run, df)
         except Exception as e:
             logger.info(e)
     obj_pool.join()
     obj_pool.kill()
コード例 #12
0
class SocketPool(object):
    """Bounded pool of greenlets, one recv loop per registered socket."""

    def __init__(self):
        self.pool = Pool(1000)
        self.pool.start()

    def listen(self, socket):
        """Poll *socket* forever from its own greenlet."""
        while True:
            socket.recv()

    def add_handler(self, socket):
        """Register *socket*; raise once the pool is at capacity."""
        if not self.pool.full():
            self.pool.spawn(self.listen, socket)
        else:
            raise Exception('At maximum pool size')

    def shutdown(self):
        """Kill every listener greenlet."""
        self.pool.kill()
コード例 #13
0
ファイル: rss.py プロジェクト: flybird1971/spider
class RssPool(object):
    """Greenlet pool that continuously schedules RSS spider jobs."""

    def __init__(self):
        self.pool = Pool(RSS_MAX_POOL_NUM)
        # True once the upstream config source reported no pending work.
        self.start = False
        # Counters driving the periodic upstream sync in syncDagrame().
        self.times = 0
        self.beginTime = int(time.time())

    def run(self):
        """Main loop: keep the pool topped up with spiders, else wait."""
        while True:
            # While work keeps arriving and the pool has room, add spiders.
            if (not self.start) and (not self.pool.full()):
                self.addRssSpider()
                # self.syncDagrame()
                continue

            self.start = False
            if self.pool.free_count() < RSS_MAX_POOL_NUM:
                # Spiders still running: wait for them to finish.
                logging.info("---------------join run ")
                self.pool.join()
            else:
                logging.info("---------------not data ,sleep %s senconds " % MAIN_LOOP_SLEEP_TIME)
                time.sleep(MAIN_LOOP_SLEEP_TIME)

    def syncDagrame(self):
        """Sync crawled data upstream every N runs or T seconds."""
        self.times += 1
        if self.times > RUN_SYNC_INTERVAL_TIMES or int(time.time()) - self.beginTime > RUN_SYNC_INTERVAL_TIME:
            logging.info("**********sync crawl infos ************")
            sync = SyncCrawlInfos()
            sync.index()
            self.times = 0
            self.beginTime = int(time.time())

    def addRssSpider(self):
        """Fetch pending crawl configs and spawn one spider for them."""
        configList = getCrawlRssRequest()
        if not configList:
            # Nothing to crawl right now; let run() fall through to waiting.
            self.start = True
            return True

        try:
            spider = CommonFeedRss()
            self.pool.spawn(spider.run, configList)
        # BUG FIX: `except Exception, e` is python-2-only syntax; the
        # `as` form works on python 2.6+ and python 3.
        except Exception as e:
            logging.info("------------------add spider exception : %s " % e)
コード例 #14
0
 def init_today_stock_tick(self):
     # Collect today's tick and K-line data for every stock, but only
     # when today is a trading day.
     _date = datetime.now().strftime('%Y-%m-%d')
     obj_pool = Pool(50)
     df = self.stock_info_client.get()
     if self.cal_client.is_trading_day(_date):
         # NOTE(review): Series.iteritems() was removed in pandas 2.0 --
         # confirm the pinned pandas version still provides it.
         for _, code_id in df.code.iteritems():
             # Reuse a cached stock object when available, else build one.
             _obj = self.stock_objs[
                 code_id] if code_id in self.stock_objs else CStock(
                     self.dbinfo, code_id)
             try:
                 # Crude back-pressure: drain the pool when it fills.
                 if obj_pool.full(): obj_pool.join()
                 obj_pool.spawn(_obj.set_ticket, _date)
                 obj_pool.spawn(_obj.set_k_data)
             except Exception as e:
                 logger.info(e)
     obj_pool.join()
     obj_pool.kill()
コード例 #15
0
class SocketPool(object):
    """Greenlet pool driving one recv loop per registered socket."""

    def __init__(self):
        self.pool = Pool(1000)
        self.pool.start()

    def listen(self, socket):
        """Receive from *socket* indefinitely."""
        while True:
            socket.recv()

    def add_handler(self, socket):
        """Register *socket*, raising once capacity is exhausted."""
        if not self.pool.full():
            self.pool.spawn(self.listen, socket)
        else:
            raise Exception('Maximum pool size reached')

    def shutdown(self):
        """Tear down every listener greenlet."""
        self.pool.kill()
コード例 #16
0
ファイル: gevent_server3.py プロジェクト: mabotech/maboss.py
class SocketPool(object):
    """Greenlet pool that unpacks msgpack payloads from client sockets (python 2)."""

    def __init__(self): 
        self.pool = Pool(1000)

    def listen(self, socket):
        """Read chunks until the peer closes, handing each to wait()."""
        while True:
            line =  socket.recv(10240) 
            #print line
            if  not line:
                # An empty read means the peer disconnected.
                #socket.close()
                break
            gevent.spawn(self.wait, socket, line)#.join()
            print 'after spawn'

    def add_handler(self, socket, address):
        """Server callback: one listener greenlet per accepted connection."""
        print address
        
        if self.pool.full(): 
            raise Exception("At maximum pool size")
        else: self.pool.spawn(self.listen, socket)

    
    def wait(self, socket,  line):
        """Simulate work, unpack the msgpack payload, echo a status reply."""
        gevent.sleep(1)
        # Random extra delay of 0--0.5s.
        gevent.sleep(random.randint(0,5)*0.1)
        #print line
        try:
            v = unpackb(line)
            print v
        except Exception, e:
            v = "error"
            print v
            print e
        
        try:
            socket.send( packb({'status':'ok', 'val':v}))
            print 'after sleep'
        except Exception, e:
            print e.message
            
            print 'socket closed'
コード例 #17
0
class SocketPool(object):
    """Keep a bounded pool of greenlets, one polling loop per socket."""

    def __init__(self):
        self.pool = Pool(1000)
        self.pool.start()

    def listen(self, socket):
        """Poll *socket* forever."""
        while True:
            socket.recv()

    def add_handler(self, socket):
        """Spawn a poller for *socket*, failing loudly when full."""
        if not self.pool.full():
            self.pool.spawn(self.listen, socket)
            return
        raise Exception("At maximum pool size")

    def shutdown(self):
        """Kill every polling greenlet."""
        self.pool.kill()

# 当构造gevent驱动的服务时,经常将围绕一个池结构的整个服务作为中心。
# 这个例子就是在各个socket上轮询的类。
コード例 #18
0
ファイル: test.py プロジェクト: madchoy/Lab
class SocketPool(object):
    """Bounded pool of greenlets, each polling one socket."""

# why is this example written without a way to start it?  Hopefully the other examples in this tutorial will show an example of this is actually used.
    def __init__(self):
        self.pool = Pool(1000)
        # NOTE(review): gevent's Group.start() expects a greenlet
        # argument; this zero-argument call looks like it would raise
        # TypeError -- confirm against the gevent version in use.
        self.pool.start()

    def listen(self, socket):
        # One greenlet per socket, polling forever.
        while True:
            socket.recv()

    def add_handler(self, socket):
        # Reject new sockets once the pool is saturated.
        if self.pool.full():
            raise Exception("At maximum pool size")
        else:
            self.pool.spawn(self.listen, socket)

    def shutdown(self):
        # Kill all listener greenlets.
        self.pool.kill()
コード例 #19
0
class SocketPool(object):
    """Bounded pool of greenlets, one recv loop per socket."""

    def __init__(self):
        self.pool = Pool(1000)
        # NOTE(review): Group.start() needs a greenlet argument; this
        # zero-argument call likely raises TypeError -- confirm.
        self.pool.start()

    def listen(self, socket):
        # Poll the socket forever inside its own greenlet.
        while True:
            socket.recv()

    def add_handler(self, socket):
        # Fail loudly instead of queueing when capacity is exhausted.
        if self.pool.full():
            raise Exception("At maximum pool size")
        self.pool.spawn(self.listen, socket)

    def shutdown(self):
        self.pool.kill()


# 当构造gevent驱动的服务时,经常将围绕一个池结构的整个服务作为中心。
# 这个例子就是在各个socket上轮询的类。
コード例 #20
0
ファイル: gevent_extractor.py プロジェクト: damnever/pigar
class GeventExtractor(BaseExtractor):
    """Extractor that resolves names concurrently on a gevent pool."""

    def __init__(self, names, max_workers=222):
        # BUG FIX: `super(self.__class__, self)` recurses forever as soon
        # as this class is subclassed; name the class explicitly.
        super(GeventExtractor, self).__init__(names, max_workers)
        self._pool = Pool(self._max_workers)
        self._exited_greenlets = 0

    def extract(self, job):
        """Spawn ``job(name)`` for every name, honoring pool back-pressure."""
        job = self._job_wrapper(job)
        for name in self._names:
            if self._pool.full():
                self._pool.wait_available()
            self._pool.spawn(job, name)

    def _job_wrapper(self, job):
        """Wrap *job* so killed greenlets are counted and errors logged."""
        def _job(name):
            result = None
            try:
                result = job(name)
            except greenlet.GreenletExit:
                self._exited_greenlets += 1
            except Exception:
                e = sys.exc_info()[1]
                logger.error('Extracting "{0}", got: {1}'.format(name, e))
            return result
        return _job

    def wait_complete(self):
        """Block until all spawned jobs finish."""
        self._pool.join()

    def shutdown(self):
        """Kill outstanding jobs, blocking until they are gone."""
        self._pool.kill(block=True)

    def final(self):
        """Report how many jobs were killed before completing."""
        count = self._exited_greenlets
        if count != 0:
            print(
                Color.YELLOW(
                    '** {0} running job exited.'.format(count)
                )
            )
コード例 #21
0
class PyMySQLPool(object):
    """Pool for pymysql

    """

    version = __version__

    def __init__(self, min=10):
        # BUG FIX: `min` was previously accepted but ignored (the pool
        # was hard-coded to 10); honor it, defaulting to the old size.
        # The original `self.pool.start()` call was also removed: gevent's
        # Group.start() requires a greenlet argument, so the zero-argument
        # call raised TypeError.
        self.pool = Pool(min)

    def addConnection(self, db):
        """Continuously read from *db* (runs inside a pool greenlet)."""
        while True:
            db.recv()

    def add_handler(self, socket):
        """Spawn a reader greenlet for *socket*; raise when the pool is full."""
        if self.pool.full():
            raise Exception("At maximum pool size")
        else:
            self.pool.spawn(self.addConnection, socket)

    def shutdown(self):
        """Terminate all reader greenlets."""
        self.pool.kill()
コード例 #22
0
class GeventExtractor(BaseExtractor):
    """Extractor that resolves names concurrently on a gevent pool."""

    def __init__(self, names, max_workers=222):
        # BUG FIX: `super(self.__class__, self)` recurses forever as soon
        # as this class is subclassed; name the class explicitly.
        super(GeventExtractor, self).__init__(names, max_workers)
        self._pool = Pool(self._max_workers)
        self._exited_greenlets = 0

    def extract(self, job):
        """Spawn ``job(name)`` for every name, honoring pool back-pressure."""
        job = self._job_wrapper(job)
        for name in self._names:
            if self._pool.full():
                self._pool.wait_available()
            self._pool.spawn(job, name)

    def _job_wrapper(self, job):
        """Wrap *job* so killed greenlets are counted and errors logged."""
        def _job(name):
            result = None
            try:
                result = job(name)
            except greenlet.GreenletExit:
                self._exited_greenlets += 1
            except Exception:
                e = sys.exc_info()[1]
                logger.error('Extracting "{0}", got: {1}'.format(name, e))
            return result
        return _job

    def wait_complete(self):
        """Block until all spawned jobs finish."""
        self._pool.join()

    def shutdown(self):
        """Kill outstanding jobs, blocking until they are gone."""
        self._pool.kill(block=True)

    def final(self):
        """Report how many jobs were killed before completing."""
        count = self._exited_greenlets
        if count != 0:
            print('** {0} running job exited.'.format(count))
コード例 #23
0
ファイル: testpoolsocket.py プロジェクト: bynoting/python
class SocketPool(object):
	def __init__(self):
		self.pool = Pool(1)
		self.pool.add(self.server())

	# 适合聊天室的按回车发送文本方式
	def listen( self, socket,address):
		f = socket.makefile()
		print "listen"

		while True:
			name = f.readline().strip()
			print name

	def listen2( self, socket,address):
		print "listen2"
		print self.pool.free_count()
		while True:
			name =socket.recv(1010).strip()
			print name

	def add_handler( self, socket,address):
		if self.pool.full():
			raise Exception( "At maximum pool size")
		else:
			print (" pool insert")
			s = self.pool.spawn(self.listen2(socket,address))

			# self. pool.spawn( self. listen, socket,address)

	def shutdown( self):
		self. pool. kill()

	def server(self):
		print "server"
		server = StreamServer(('0.0.0.0', 8000), self.add_handler)
		server.serve_forever()
コード例 #24
0
ファイル: cekresi.py プロジェクト: Vaziria/common
def run_gevent():
    """Drain the job generator produced by run() through a bounded gevent pool."""
    max_workers = _config.get('count_worker', 4)
    pool = Pool(max_workers)

    jobs = run()

    while True:
        # Throttle while the pool is saturated.
        if pool.full():
            time.sleep(1)
            continue

        # Pull the next job description and schedule it.
        try:
            job = next(jobs)
            pool.spawn(job['func'], *job['param'])
        except StopIteration:
            # Generator drained; exit once every worker slot is idle again.
            if pool.free_count() == max_workers:
                break

        time.sleep(0.01)
コード例 #25
0
ファイル: __init__.py プロジェクト: penpen/srq-python
class Queue(object):
    """Redis-backed task queue processed by a pool of gevent greenlets."""

    def __init__(self, redis, name, ttl=None):
        self._redis = redis
        self.name = name
        self.tasks_key = self._get_key_(name, 'tasks')
        self.result_key = self._get_key_(name, 'results')
        # For killing by timeout (memory leak)
        self.started = time.time()
        # Raw task payloads currently being processed.
        self.working = set()
        self._greenlets = []
        self.ttl = ttl
        self.worker = get_worker_name()
        # if show_stats:
        #    self.stats_start = time.time()
        #    self.tasks_processed = 0

    def _get_key_(self, name, token, modifier='queue'):
        """Build the namespaced redis key 'srq:<modifier>:<name>:<token>'."""
        return 'srq:{modifier}:{name}:{token}'.format(name=name, token=token, modifier=modifier)

    def process(self, func, pool=20, workers=[], stats=None):
        """Run the consume loop: spawn the fetcher, extra workers and stats.

        NOTE(review): the mutable default ``workers=[]`` is shared across
        calls; harmless while it is only iterated, but confirm.
        """
        try:
            self._pool = Pool(pool)
            self.func = func
            self.spawn(self._get_work_)
            # if self.show_stats:
            #    self.spawn(self._show_stats_)
            for worker in workers:
                self.spawn(worker)
            if stats:
                self.spawn(self.push_stats, stats)
            self._pool.join()
        except Exception:
            logger.error('Gevent error', exc_info=True)

    def push_stats(self, fn):
        """Periodically publish fn() output under a short-lived redis key.

        NOTE(review): ``self.token`` is never assigned in this class, so
        the setex call below would raise AttributeError as written --
        confirm the intended attribute (perhaps ``self.worker``).
        """
        while True:
            stats = fn()
            self._redis.setex('srqstats:%s:%s' % (self.name, self.token), 6, stats)
            if sleep:
                sleep(5)
            else:
                yield 5

    def _show_stats_(self):
        """Print throughput every 5s; stop the queue once ttl is exceeded.

        NOTE(review): relies on self.stats_start / self.tasks_processed,
        whose initialization is commented out in __init__ -- confirm.
        """
        while True:
            if time.time() - self.started > self.ttl:
                self.stop()
            elapsed = time.time() - self.stats_start
            speed = self.tasks_processed / elapsed
            print('Speed: %d t/s (%d tasks by %d sec)' % (speed, self.tasks_processed, elapsed))
            self.stats_start = time.time()
            self.tasks_processed = 0
            if sleep:
                sleep(5)
            else:
                yield 5

    def _get_work_(self):
        """Fetch loop: pop tasks from redis and hand them to the pool."""
        while True:
            if self.ttl:
                if time.time() - self.started > self.ttl:
                    self._pool.spawn(self.stop)
            if self._pool.full():
                # All workers busy; back off before polling redis again.
                sleep(5)
                continue
            task = self._redis.lpop(self.tasks_key)
            if task:
                self.spawn(self._work_, task)
            if sleep:
                sleep(5)
            else:
                yield 5

    def _work_(self, task_data):
        """Decode one task, run self.func, push its result back to redis.

        NOTE(review): ``self.show_stats`` is never set on this class; the
        stats branch below would raise AttributeError -- confirm.
        """
        self.working.add(task_data)
        task = json.loads(task_data)
        uuid, args, kwargs = task
        logger.debug('Got task: %s', uuid)
        try:
            result = self.func(*args, **kwargs)
            self._push_result_(uuid, result)
            logger.debug('Processed: %s', uuid)
            if self.show_stats:
                self.tasks_processed += 1
        except Exception:
            logger.error('Proccessing error: #%s', uuid, exc_info=True)
        try:
            self.working.remove(task_data)
        except KeyError:
            pass

    def _push_result_(self, uuid, result):
        """Serialize (uuid, result) and append it to the results list."""
        result = json.dumps((uuid, result))
        self._redis.rpush(self.result_key, result)

    @property
    def tasks(self):
        # Number of tasks still queued.
        return self._redis.llen(self.tasks_key)

    @property
    def results(self):
        # Number of results awaiting collection.
        return self._redis.llen(self.result_key)

    def request(self, *args, **kwargs):
        """Enqueue a call; return the uuid used to match its result."""
        logger.debug('Requesting {name}(*{args}, **{kwargs})'.format(name=self.name,
                                                                     args=str(args),
                                                                     kwargs=str(kwargs)))
        uuid = uuid4().hex
        task = (uuid, args, kwargs)
        self._redis.rpush(self.tasks_key, json.dumps(task))
        return uuid

    def pop_result(self):
        """Return one decoded result, or None when none is ready."""
        result = self._redis.lpop(self.result_key)
        if result:
            return json.loads(result)

    def pull_result(self):
        """Generator that polls redis, yielding results as they appear."""
        while True:
            result = self._redis.lpop(self.result_key)
            if result:
                yield json.loads(result)

    def spawn(self, fn, *args, **kwargs):
        """Spawn *fn* on the pool, remembering the greenlet for stop()."""
        greenlet = self._pool.spawn(fn, *args, **kwargs)
        self._greenlets.append(greenlet)
        return greenlet

    def stop(self):
        """Kill all greenlets and requeue any tasks that were in flight."""
        for greenlet in self._greenlets:
            self._pool.killone(greenlet)
        for task in self.working:
            self._redis.rpush(self.tasks_key, task)
コード例 #26
0
# pool = ProcessPool(48)

cnt = 0

def send(msg):
    """Forward *msg* (bytes, decoded as UTF-8) to the local HTTP sink."""
    res = requests.get('http://localhost:4567/?msg=' + msg.decode('UTF-8'))
    print(res)


# Relay loop: pull up to 1000 messages from the zmq socket and fan each
# out to send() via the greenlet pool.
# NOTE(review): `pool` is referenced below but its construction above is
# commented out -- confirm where the pool object is actually created.
while True:

    cnt += 1
    logging.debug('polling...')
    socks = dict(poll.poll())
    if recv in socks and socks[recv] == zmq.POLLIN:

        msg = recv.recv()
        print('got %s from upstream' % msg)

        # send(msg)

        # When the pool is saturated, wait briefly instead of spawning.
        if pool.full():
            pool.join(timeout=1)
        else:
            pool.spawn(send, msg)

        if cnt == 1000:
            break

pool.join()
コード例 #27
0
class ProxyPool:
    """Scrape free-proxy listing sites concurrently and verify each proxy's delay."""


    def __init__(self):
        # Running count/label used to tag verifier greenlets.
        self.THREAD_ID = 0
        # Proxies that passed verification.
        self.proxy_list = []
        self.wait_for_verify = Queue()
        # NOTE(review): Pool() is created without a size here, so the
        # full() check in add_thread presumably never triggers -- confirm
        # against the gevent version in use.
        self.thread_pool = Pool()
        self.output = []
        gevent.monkey.patch_socket()
        gevent.monkey.patch_ssl()
        # self.thread_pool.start()

    def http_headers(self):
        """Return browser-like request headers with a randomized UA build number."""
        headers = {
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Chrome/59.0.%d.%d Safari/537.36' % (random.randint(1000, 9999), random.randint(100, 999)),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.8'
        }
        return headers

    def add_thread(self, func, *args):
        """Spawn func(*args) on the greenlet pool, raising when it is full."""
        # print('test')
        if self.thread_pool.full():
            raise Exception("At maximum pool size")
        else:
            self.thread_pool.spawn(func, *args)
            # self.thread_pool.join()

    def add_proxy(self, proxy):
        """Record a verified proxy and rewrite proxy.json sorted by delay."""
        self.proxy_list.append(proxy)

        self.output.append(proxy.dic())
        # print(self.output)
        self.output = sorted(self.output, key=lambda k: k['delay'])
        open('proxy.json', 'w').write(json.dumps(self.output, ensure_ascii=True, indent=4))


    def kill_thread(self):
        """Kill every scraper/verifier greenlet."""
        self.thread_pool.kill()

    def start(self):
        """Launch all site scrapers and wait for them to finish."""
        self.add_thread(self.kuaidaili_com)
        self.add_thread(self.goubanjia_com)
        self.add_thread(self._66ip_cn)
        self.thread_pool.join()

    def get(self):
        # Not implemented.
        pass

    def get_all(self):
        """Return every verified proxy collected so far."""
        return self.proxy_list


    def kuaidaili_com(self, *args):
        """Fan out one scraper per kuaidaili.com listing category."""
        self.add_thread(self.kuaidaili_type_com, 'inha')
        self.add_thread(self.kuaidaili_type_com, 'intr')
        self.add_thread(self.kuaidaili_type_com, 'outha')
        self.add_thread(self.kuaidaili_type_com, 'outtr')

    def kuaidaili_type_com(self, t, *args):
        """Page through one kuaidaili.com category, queueing each proxy found."""
        logger.info('kuaidaili.com %s start' % t)
        i = 1
        self.THREAD_ID += 1
        rq = requests.Session()
        headers = self.http_headers()
        # Prime the session (cookies) before hitting the listing pages.
        rq.get('http://www.kuaidaili.com/', headers=headers)
        while(1):
            gevent.sleep(3)
            url = 'http://www.kuaidaili.com/free/%s/%d/' % (t, i)
            r = rq.get(url, headers=headers)
            # Anti-bot JS challenge page: evaluate it with PyV8 and retry.
            if 'qo=eval;qo(po);' in r.text:
                c = PyV8.JSContext()
                c.enter()
                f = c.eval(r.text)
                print(f)
                # exit()
                print(r.text)
                logger.debug('bypass...')
                continue
            if r.status_code == 404:
                break
            if r.status_code == 503:
                logger.error('%s return <%d>' % (url, r.status_code))
                continue
            try:
                html = BeautifulSoup(r.text, 'lxml')
                tbody = html.tbody
                if tbody is None:
                    print(html)
                    continue
                for tr in tbody.find_all('tr'):
                    # print(tr)

                    p = proxy()
                    p.ip = tr.find_all('td', {'data-title':"IP"})[0].text
                    p.port = int(tr.find_all('td', {'data-title':"PORT"})[0].text)
                    p.safe = tr.find_all('td', {'data-title':"匿名度"})[0].text
                    p.type = tr.find_all('td', {'data-title':"类型"})[0].text
                    p.place = tr.find_all('td', {'data-title':"位置"})[0].text

                        # print(tr.find_all('td', {'data-title':"响应速度"})[0].text)
                    # print(tr.find_all('td', {'data-title':"最后验证时间"})[0].text)
                    logger.debug('<get>%s' % p)
                    self.wait_for_verify.put(p)
                    self.THREAD_ID += 1
                    self.add_thread(self.verify_proxy_thread, self.THREAD_ID)
                logger.debug('%s ok' % url)
                gevent.sleep(1)
            except AttributeError as e:
                print(e)
                # print(r.text)

                logger.error('%s Error, sleep 10s' % url)
                gevent.sleep(10)
                continue

            # exit()
            i += 1


    def goubanjia_com(self, *args):
        """Page through goubanjia.com listings, queueing each proxy found."""
        logger.info('giubanjia.com start')
        i = 1
        self.THREAD_ID += 1
        while(1):
            url = 'http://www.goubanjia.com/free/index%d.shtml' % (i)
            r = requests.get(url, headers=self.http_headers())
            if r.status_code == 404:
                break
            try:
                html = BeautifulSoup(r.text, 'lxml')
                tbody = html.tbody
                for tr in tbody.find_all('tr'):
                    p = proxy()

                    # Strip decoy <p> nodes the site injects to confuse scrapers.
                    [x.extract() for x in tr.find_all('p')]


                    try:
                        _ = tr.find_all('td', {'class':"ip"})[0].text
                        _ = _.split(':')
                        p.ip = _[0]
                        p.port = int(_[1])
                        # p.port = int(tr.find_all('td', {'data-title':"PORT"})[0].text)

                        p.safe = tr.find_all('td')[1].text.replace(' ', '').replace('\n', '').replace('\t', '')
                        p.type = tr.find_all('td')[2].text.replace(' ', '').replace('\n', '').replace('\t', '')
                        p.place = tr.find_all('td')[3].text.replace(' ', '').replace('\n', '').replace('\t', '').replace('\r', '').replace('\xa0', '')
                        p.net = tr.find_all('td')[4].text.replace(' ', '').replace('\n', '').replace('\t', '')
                    except IndexError as e:
                        print(tr)
                        logger.error('%s is index error' % p)
                        # exit(0)

                    logger.debug('<get>%s' % p)
                    self.wait_for_verify.put(p)
                    self.THREAD_ID += 1
                    self.add_thread(self.verify_proxy_thread, self.THREAD_ID)
                logger.debug('%s ok' % url)
                gevent.sleep(1)
            except AttributeError as e:
                print(e)
                # print(r.text)
                gevent.sleep(10)
                logger.error('%s Error, sleep 10s' % url)
                continue

            # exit()
            i += 1

    def _66ip_cn(self, *args):
        """Page through 66ip.cn listings (gb2312-encoded), queueing proxies."""
        logger.info('giubanjia.com start')
        i = 1
        self.THREAD_ID += 1
        while(1):
            url = 'http://www.66ip.cn/%d.html' % (i)
            r = requests.get(url, headers=self.http_headers())
            if r.status_code == 404:
                break
            try:
                html = BeautifulSoup(r.content.decode('gb2312'), 'lxml')
                tbody = html.find_all('table')[2]

                for tr in tbody.find_all('tr'):
                    p = proxy()
                    _ = tr.find_all('td')[0].text
                    # Skip the header row.
                    if _ == 'ip':
                        continue
                    else:
                        p.ip = _

                    p.port = int(tr.find_all('td')[1].text)

                    p.place = tr.find_all('td')[2].text
                    p.safe = tr.find_all('td')[3].text



                    logger.debug('<get>%s' % p)
                    self.wait_for_verify.put(p)
                    self.THREAD_ID += 1
                    self.add_thread(self.verify_proxy_thread, self.THREAD_ID)
                logger.debug('%s ok' % url)
                gevent.sleep(1)
            except AttributeError as e:
                print(e)
                # print(r.text)

                logger.error('%s Error, sleep 10s' % url)
                gevent.sleep(10)
                continue

            # exit()
            i += 1



    def get_delay(self, p):
        """Return the round-trip time (seconds) through proxy *p*, or 0 on failure."""
        r = 0
        try:
            # r = requests.get('http://www.baidu.com', proxies={p.scheme: '%s:%d' % (p.ip, p.port)}).elapsed.microseconds/100000
             r = requests.get('http://www.baidu.com', proxies={p.scheme: '%s:%d' % (p.ip, p.port)}).elapsed
             r = r.seconds + (r.microseconds + 0.0)/1000000

        except requests.exceptions.ProxyError:
            return 0
        # except ConnectionError:
        #     return 0
        # except ConnectionResetError:
        #     return r
        except:
            # logger.error(str(p) + ' cannot get delay)
            return 0
        return r

    def verify_proxy_thread(self, thread_id):
        """Pop one proxy off the queue, measure it, keep it when reachable.

        Recurses on itself until the verify queue is empty.
        """
        # logger.debug('<thread %d> start' % thread_id)
        if self.wait_for_verify.empty():
            # logger.debug('<thread %d> exit' % thread_id)
            self.THREAD_ID -= 1
            return None
            # if t <= 0:
            #     logger.info('<thread %d> exit' %  thread_id)
            #     return
            # else:
            #     logger.debug('<thread %d> wait for 1s' %  thread_id)
            #     gevent.sleep(1)
            #     return self.verify_proxy_thread(thread_id, t-1)

        p = self.wait_for_verify.get()
        delay = self.get_delay(p)


        if delay > 0:
            p.delay = delay
            p.verify = time.time()
            self.add_proxy(p)
        # for td in tr.find_all('td'):
        #     print(td.text)
            logger.info('<thread %d> get a proxy %s' % (thread_id, p))
        else:
            pass
            # logger.debug('<thread %d> throw away a proxy %s' % (thread_id, p))
        return self.verify_proxy_thread(thread_id)
コード例 #28
0
class CoroutineWorker(Worker):
    """Gunicorn-style worker that serves requests on a gevent greenlet pool.

    Applies gevent monkey-patching at process init, then spawns up to
    ``max_greenlets`` copies of :meth:`_run`, each of which selects on the
    shared ``rd_fds`` list (listeners + accepted clients) under a semaphore
    so only one greenlet handles a given fd at a time.
    """

    # Default upper bound on concurrently running greenlets when the
    # config does not supply ``max_greenlets``.
    DEFAULT_GREENLET_SIZE = 10 # control the pool size

    def __init__(self, cfg, file_logger=None, ppid=None, sockets=None):
        """Store config and resolve the greenlet pool size.

        :param cfg: worker configuration; ``cfg.max_greenlets`` (if truthy)
                    overrides ``DEFAULT_GREENLET_SIZE``
        :param file_logger: logger used for debug/error output
        :param ppid: parent pid, watched in :meth:`_run` for orphan detection
        :param sockets: listening sockets passed through to the base Worker
        """
        super(CoroutineWorker, self).__init__(cfg, file_logger, ppid, sockets)
        self.max_greenlets = int(self.cfg.max_greenlets or self.DEFAULT_GREENLET_SIZE)

    def patch(self):
        """Monkey-patch the stdlib with gevent's cooperative versions."""
        from gevent import monkey
        monkey.noisy = False

        # if the new version is used make sure to patch subprocess
        if gevent.version_info[0] == 0:
            monkey.patch_all()
        else:
            monkey.patch_all(subprocess=True)

    def init_process(self):
        """Per-process setup: patch stdlib, build the pool and sync primitives.

        Note: ``threading.Semaphore``/``Event`` are created *after*
        :meth:`patch`, so they are gevent-cooperative objects.
        """
        super(CoroutineWorker, self).init_process()
        self.patch()
        self.pool = Pool(self.max_greenlets)
        self.mutex = threading.Semaphore()
        self._stop_event = threading.Event()

    def run(self):
        """Main loop: keep the pool topped up with ``_run`` greenlets.

        Once ``self.alive`` goes False, waits for :meth:`handle_quit` to set
        the stop event, then runs :meth:`stop` in its own greenlet.
        """
        super(CoroutineWorker, self).run()
        while self.alive:
            # Spawn another worker greenlet whenever there is free capacity.
            if not self.pool.full():
                self.pool.spawn(self._run)
            self.file_logger.debug("pool greenlet size %d" % (self.pool.size - self.pool.free_count()))
            gevent.sleep(1.0)

        self._stop_event.wait()
        gevent.spawn(self.stop).join()

    def _run(self):
        """Greenlet body: accept connections and dispatch requests.

        With listeners: selects on the shared ``rd_fds`` list. The chosen fd
        is removed from the list under ``self.mutex`` so no other greenlet
        can pick it, and re-appended when this greenlet is done with it.
        Without listeners: falls back to a plain handle_request loop.
        """
        if self.LISTENERS:
            while self.alive:
                # Critical section: select + claim one ready fd. Removing
                # the fd before releasing the mutex prevents two greenlets
                # from servicing the same socket.
                self.mutex.acquire()
                ret = select.select(self.rd_fds, [], [], 1.0)
                self.file_logger.debug("Before: socket fd length: %d, greenlet:%d, listen in:%s" % (len(self.rd_fds), id(getcurrent()), self.LISTENERS[0] in self.rd_fds))
                if ret[0]:
                    sock = ret[0][0]
                    self.rd_fds.remove(sock)
                else:
                    sock = None
                self.mutex.release()
                if sock:
                    #for sock in ret[0]:
                    if sock in self.LISTENERS:
                        # Listener is readable: accept a new client and add
                        # it to the select set as a non-blocking socket.
                        try:
                            client, addr = sock.accept()
                            client.setblocking(0)
                            close_on_exec(client)
                            self.rd_fds.append(client)
                        except socket.error as e:
                            # Transient accept errors are expected and ignored;
                            # anything else gets logged with a traceback.
                            if e.args[0] not in (errno.EAGAIN, errno.EWOULDBLOCK,
                                         errno.ECONNABORTED):
                                self.file_logger.error(traceback.format_exc())

                        finally:
                            # Always return the listener to the select set.
                            self.rd_fds.append(sock)
                    else:
                        # Client socket is readable: handle one request.
                        # NOTE(review): handle_request is defined on the base
                        # Worker; -1 appears to signal "close connection" —
                        # confirm against the base class.
                        r = self.handle_request(client=sock)
                        if r == -1:
                            sock.close()
                        else:
                            self.rd_fds.append(sock)

                # Orphan check: if our parent died we were reparented, so
                # shut this greenlet down.
                if self.ppid and self.ppid != os.getppid():
                    self.file_logger.info("Parent changed, shutting down: %s", self)
                    return

        else:
            # No listening sockets: best-effort request loop; errors are
            # logged and the loop continues.
            while self.alive:
                try:
                    self.handle_request()
                except:
                    self.file_logger.error(traceback.format_exc())

    def stop(self):
        """Stop the base worker, then give greenlets 1s to finish."""
        Worker.stop(self)
        self.pool.join(timeout=1)

    def handle_quit(self, sig, frame):
        """Signal handler: stop the loops and release run()'s final wait."""
        self.alive = False
        self._stop_event.set()