def wrapper(func):
    for table in tables:
        # hash ring for cache update
        queues = [Queue() for _ in range(workers)]
        hash_ring = ketama.Continuum()
        for q in queues:
            hash_ring[str(hash(q))] = q
        self.update_queues[table] = hash_ring

        cache_update = self._cache_update_gen(table, func, multi=multi)
        self.workers[table] = [
            Worker("%s_cache_update" % table, q, cache_update, multi=multi,
                   logger_name="%s.%s" % (self.name, table))
            for q in queues
        ]

        # single worker for cache delete
        delete_q = Queue()
        self.delete_queues[table] = delete_q
        cache_delete = self._cache_delete_gen(table)
        self.workers[table].append(
            Worker("%s_cache_delete" % table, delete_q, cache_delete, multi=True,
                   logger_name="%s.%s" % (self.name, table)))

        self.socket.setsockopt(zmq.SUBSCRIBE, asbytes(table))
    return func

def __init__(self, ketama_server_file):
    self.server_list = self.parse_server_file(ketama_server_file)
    self.continuum = ketama.Continuum(ketama_server_file)
    for hostname, port in self.server_list:
        server_string = "{0}:{1}".format(hostname, port)
        # create an empty record for lazy connection responses
        self.SERVERS.update({
            server_string: None,
        })

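# A minimal sketch of what parse_server_file might do, assuming the standard
# ketama server-list format of one "host:port<TAB>weight" entry per line (the
# same format the uhashring benchmark below writes out). This is an
# illustration, not the original implementation.
def parse_server_file(self, ketama_server_file):
    server_list = []
    with open(ketama_server_file) as f:
        for line in f:
            if not line.strip():
                continue  # skip blank lines
            address, _weight = line.split()  # "host:port" and its weight
            hostname, port = address.split(':')
            server_list.append((hostname, int(port)))
    return server_list
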
def center_node_dispather(self):
    """Task dispatching on the master node."""
    while True:
        self.logger.debug('Fetching newly added URLs...')
        tasks = []
        if self.server.llen('seeds'):
            tasks.append(self.server.lpop('seeds'))
        self.tasks.extend(tasks)

        state = self.spider_state_watcher()
        if state:
            self.logger.debug('Iterating over spider nodes and pausing each running spider...')
            spider_ids = []
            spider_ip_ids = []
            for spider_key in self.spiders:
                spider_ids.append(spider_key.split(':')[3])
                spider_ip_ids.append((spider_key.split(':')[2], spider_key.split(':')[3]))
            for spider_ip_id in spider_ip_ids:
                key = '{job}:status'.format(job=spider_ip_id[1])
                self.server.set(key, 'pause')
            time.sleep(4)

            self.logger.debug('Spider node membership changed; rebuilding the hash ring...')
            self.chose = ketama.Continuum(spider_ids)

            self.logger.debug('Reassigning per-site crawl tasks; do not start extra spiders during this window...')
            queue_keys = self.server.keys('*:queue')
            for queue_key in queue_keys:
                tasks.extend(self.server.zrange(queue_key, 0, -1))  # collect all URLs from the spider queues
                self.server.zremrangebyrank(queue_key, 0, -1)       # empty the spider queue

            self.logger.debug('Resuming the previously paused spider nodes...')
            for spider_ip_id in spider_ip_ids:
                key = '{job}:status'.format(job=spider_ip_id[1])
                self.server.set(key, 'running')

        self.logger.debug('Waiting; redistributing URLs...')
        for task_json in tasks:
            task = pickle.loads(task_json)
            if 'url' in task and 'spider_type' in task:
                extract = tldextract.TLDExtract()
                url = task['url']
                spider_type = task['spider_type']
                domain = extract(url).domain
                job_id = self.chose[url.encode('utf-8')]
                queue_key = '{spider_type}:{job_id}:{domain}:queue'.format(
                    spider_type=spider_type, job_id=job_id, domain=domain)
                priority = task['priority']
                self.server.zadd(queue_key, pickle.dumps(task), priority)
            else:
                self.logger.error("please input url and spider_type that you want to crawl!")

def status_from_redis(self):
    self.create_throttle_queues()
    self.expire_queues()
    status = self.redis_conn.get('{job}:status'.format(job=self.job_id))
    if status == 'pause':
        # pause this spider and rebuild the consistent-hash layout
        self.paused = True
        spiders = self.redis_conn.keys('stats:spider:*:*')
        spider_ids = []
        for spider in spiders:
            spider_ids.append(spider.split(':')[3])
        self.chose = ketama.Continuum(spider_ids)
        return
    if status == 'running':
        self.paused = False

def wrapper(func):
    for topic in topics:
        queues = [Queue() for _ in range(workers)]
        hash_ring = ketama.Continuum()
        for q in queues:
            hash_ring[str(hash(q))] = q
        self.worker_queues[topic] = hash_ring

        self.workers[topic] = WorkerPool(queues, topic, func, multi=multi,
                                         queue_limit=queue_limit,
                                         logger_name="%s.%s" % (self.name, topic))
        self.socket.setsockopt(zmq.SUBSCRIBE, asbytes(topic))
    return func

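# A hedged sketch of how a message might be routed with the rings built above:
# Continuum supports dict-style lookup (the crawler snippets below use
# self.chose[url] the same way), returning the entry that owns the key's hash
# point (here, one of the worker queues). The dispatch method name and message
# shape are hypothetical, not part of the original class.
def dispatch(self, topic, message_key, message):
    hash_ring = self.worker_queues[topic]
    queue = hash_ring[message_key.encode('utf-8')]  # consistent-hash lookup
    queue.put(message)  # hand off to the worker that owns this key
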
def initial_seeds(self):
    """Initialize the scheduler."""
    # block until at least one seed URL has been pushed to redis
    while True:
        initial_len = self.server.llen('seeds')
        if initial_len:
            break
        time.sleep(180)

    self.logger.debug('Fetching the initial seed list...')
    while True:
        tasks = self.server.lrange('seeds', 0, -1)
        self.server.ltrim('seeds', -1, 0)  # drain the seeds list (start > stop empties it)
        self.tasks.extend(tasks)
        if self.tasks:
            break

    self.logger.debug('Counting the initial spider processes...')
    self.spiders = self.server.keys('stats:spider:*:*')  # list of spider stat keys
    self.spider_count = len(self.spiders)
    if self.spider_count:
        self.logger.debug('Placing spider nodes on the consistent-hash ring...')
        job_ids = []
        for spider in self.spiders:
            job_ids.append(spider.split(':')[3])
        self.chose = ketama.Continuum(job_ids)

        self.logger.debug('Distributing the initial seed URL queues...')
        for task_json in self.tasks:
            task = pickle.loads(task_json)
            if 'url' in task and 'spider_type' in task:
                extract = tldextract.TLDExtract()
                url = task['url']
                spider_type = task['spider_type']
                domain = extract(url).domain
                job_id = self.chose[url.encode('utf-8')]
                queue_key = '{spider_type}:{job_id}:{domain}:queue'.format(
                    spider_type=spider_type, job_id=job_id, domain=domain)
                priority = task['priority']
                self.server.zadd(queue_key, pickle.dumps(task), priority)
            else:
                self.logger.error("please input url and spider_type that you want to crawl!")

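# A sketch of the seed payload the schedulers above expect: a pickled dict with
# 'url', 'spider_type' and 'priority' keys pushed onto the redis 'seeds' list.
# The connection parameters and the 'link' spider type are illustrative.
import pickle
import redis

server = redis.Redis(host='localhost', port=6379, db=0)
task = {'url': 'http://example.com/', 'spider_type': 'link', 'priority': 100}
server.rpush('seeds', pickle.dumps(task))  # the dispatcher pops from the head
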
def test_ketama_compatibility(ketama_config_file):
    if not ketama:
        return
    ring = HashRing(
        nodes={"127.0.0.1:11211": 600, "127.0.0.1:11212": 400},
        replicas=4,
        vnodes=40,
        compat=True,
    )
    continuum = ketama.Continuum(ketama_config_file)

    # compat mode must reproduce ketama's point placement exactly
    assert ring.get_points() == continuum.get_points()

    numhits = 1000
    numvalues = 10000
    for i in range(numhits):
        key = str(randint(1, numvalues))
        assert ring.get_server(key) == continuum.get_server(key)

def from_settings(cls, settings):
    server = redis.Redis(host=settings.get('REDIS_HOST'),
                         port=settings.get('REDIS_PORT'),
                         db=settings.get('REDIS_DB'))

    persist = settings.get('SCHEDULER_PERSIST', True)
    up_int = settings.get('SCHEDULER_QUEUE_REFRESH', 10)
    hits = settings.get('QUEUE_HITS', 10)
    window = settings.get('QUEUE_WINDOW', 60)
    mod = settings.get('QUEUE_MODERATED', False)
    timeout = settings.get('DUPEFILTER_TIMEOUT', 600)
    ip_refresh = settings.get('SCHEDULER_IP_REFRESH', 60)
    add_type = settings.get('SCHEDULER_TYPE_ENABLED', True)
    add_ip = settings.get('SCHEDULER_IP_ENABLED', False)
    retries = settings.get('SCHEDULER_ITEM_RETRIES', 3)
    ip_regex = settings.get('IP_ADDR_REGEX', '.*')
    backlog_blacklist = settings.get('SCHEDULER_BACKLOG_BLACKLIST', True)
    queue_timeout = settings.get('SCHEDULER_QUEUE_TIMEOUT', 3600)

    my_level = settings.get('SC_LOG_LEVEL', 'INFO')
    my_name = settings.get('SC_LOGGER_NAME', 'sc-logger')
    my_output = settings.get('SC_LOG_STDOUT', True)
    my_json = settings.get('SC_LOG_JSON', False)
    my_dir = settings.get('SC_LOG_DIR', 'logs')
    my_bytes = settings.get('SC_LOG_MAX_BYTES', '10MB')
    my_file = settings.get('SC_LOG_FILE', 'main.log')
    my_backups = settings.get('SC_LOG_BACKUPS', 5)

    logger = LogFactory.get_instance(json=my_json, name=my_name,
                                     stdout=my_output, level=my_level,
                                     dir=my_dir, file=my_file,
                                     bytes=my_bytes, backups=my_backups)

    # hard-coded spider ids seed the initial consistent-hash ring
    spider_ids = ['1', '2', '3']
    chose = ketama.Continuum(spider_ids)

    return cls(server, persist, up_int, timeout, retries, logger,
               hits, window, mod, ip_refresh, add_type, add_ip,
               ip_regex, backlog_blacklist, queue_timeout, chose)

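# Rather than hard-coding spider_ids, the ring could be seeded from the live
# spider stats keys, as status_from_redis and initial_seeds do above; a sketch
# assuming the redis connection returns str keys (decode_responses=True):
spiders = server.keys('stats:spider:*:*')
spider_ids = [spider.split(':')[3] for spider in spiders]
chose = ketama.Continuum(spider_ids)
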
def test_points(self):
    ring = ketama.Continuum(self.server_list_file)
    for i in range(100000):
        # get_server returns (hash point, server string); print the server
        print(i, ring.get_server(str(i))[1])

def test_points(self):
    cont = ketama.Continuum(self.valid_list_file)
    # 160 points per server, 3 servers in the list
    self.assertEqual(len(cont.get_points()), 160 * 3)

def test_server_modified_count(self):
    cont = ketama.Continuum(self.valid_list_file)
    self.assertEqual(cont.get_server_count(), 3)

def test_removal(self):
    cont = ketama.Continuum(self.valid_list_file)
    old_count = cont.get_server_count()
    cont.remove_server("127.0.0.1:11211")
    # the continuum should report one fewer server after removal
    self.assertEqual(cont.get_server_count(), old_count - 1)

def test_hashing(self):
    cont = ketama.Continuum(self.valid_list_file)
    self.assertEqual(cont.get_server("test"), (2959911115, '127.0.0.1:11211'))

def test_valid(self):
    cont = ketama.Continuum(self.valid_list_file)
    self.assertEqual(type(cont), ketama.Continuum)

import ketama
import sys

connections = {}
filename = ''

if len(sys.argv) < 2:
    print("Usage: test_python.py key_to_be_tested")
    sys.exit()
elif len(sys.argv) == 3:
    filename = sys.argv[2]
    print('Testing file: ' + filename)
    cont = ketama.Continuum(filename)
else:
    cont = ketama.Continuum('key:12324')

test_key = sys.argv[1]
print("Testing key: " + test_key)

servers = open('../ketama.servers')
for server in servers:
    server_info = server.split()
    server_name = server_info[0]
    memory = int(server_info[1])
    cont.add_server(server_name, memory)
    print("Adding server: " + server_name + ":" + str(memory))

info = cont.get_info()
print(info)

cont.sync_servers("node1:1000,node2:1000,node3:1000,node4:1000")
print(cont.get_info())

from time import time
from tempfile import NamedTemporaryFile

from uhashring import HashRing

# the ketama C binding is optional; its benchmark is skipped when missing
try:
    import ketama
except ImportError:
    ketama = None

num = 1000000
print('running {} key generation comparison'.format(num))

# ketama C binding
if ketama:
    with NamedTemporaryFile(prefix='benchmark_') as ketama_config_file:
        # NamedTemporaryFile opens in binary mode, so write bytes
        ketama_config_file.write(b"127.0.0.1:11211\t600\n")
        ketama_config_file.write(b"127.0.0.1:11212\t400\n")
        ketama_config_file.flush()

        kt = ketama.Continuum(ketama_config_file.name)
        pt = time()
        for i in range(num):
            key = 'myval-{}'.format(i)
            kt.get_server(key)
        print('ketama took {} s'.format(time() - pt))

# pure python implementation
ring = HashRing(
    nodes={'127.0.0.1:11211': 600, '127.0.0.1:11212': 400},
    replicas=4,
    vnodes=40,
    compat=True)

pt = time()
for i in range(num):
    key = 'myval-{}'.format(i)
    ring.get_server(key)
print('uhashring took {} s'.format(time() - pt))

def test_adding(self):
    cont = ketama.Continuum(self.valid_list_file)
    old_count = cont.get_server_count()
    cont.add_server("127.0.0.1:11213", 700)
    self.assertEqual(cont.get_server_count(), old_count + 1)

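# Putting the tested Continuum API together: a minimal end-to-end sketch. The
# 'ketama.servers' path is hypothetical; the file is assumed to use the
# "host:port<TAB>weight" format shown in the benchmark above.
import ketama

cont = ketama.Continuum('ketama.servers')
print(cont.get_server_count())              # number of servers on the ring
point, server = cont.get_server('user:42')  # (hash point, owning server)
cont.add_server('127.0.0.1:11213', 700)     # add a server with weight 700
cont.remove_server('127.0.0.1:11213')       # ring is rebuilt without it
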