def run(yaml_path):
    config = load_base_config(yaml_path)
    global ALL_CONFIG_DIC
    ALL_CONFIG_DIC = config

    shard_service_d = config.get("shard_service")
    logging.info("[start_for_service][service_num:{}][detail:{}]".format(
        len(shard_service_d.keys()),
        ",".join(shard_service_d.keys()),
    ))

    # queue
    sync_dq = deque()

    # consul
    consul_addr = config.get("consul").get("host")
    consul_port = int(config.get("consul").get("port"))
    consul_obj = Consul(consul_addr, consul_port)

    # Register services && initialize the hash map
    for service_name, ii in shard_service_d.items():
        nodes = ii.get("nodes")
        port = ii.get("port")
        ring = ConsistentHashRing(1000, nodes)
        SERVICE_HASH_MAP[service_name] = ring
        for host in nodes:
            res = consul_obj.register_service(service_name, host, int(port))
            logging.info("[register_service_res:{}][service:{}][node:{},port:{}]".format(
                res,
                service_name,
                host,
                port,
            ))
        M_SHARD_SERVICE_DES.labels(service_name=service_name,
                                   service_port=port).set(1)

    # Start the consumer thread for the watch-change result queue
    consumer_thread = Thread(target=consumer, kwargs={'sync_dq': sync_dq})
    consumer_thread.setDaemon(True)
    consumer_thread.start()

    # Start the consul watches
    for service_name in shard_service_d.keys():
        t = Thread(target=consul_obj.block_get_health,
                   args=(service_name, SERVICE_HASH_MAP, sync_dq))
        t.setDaemon(True)
        t.start()

    # metrics server
    start_http_server(int(ALL_CONFIG_DIC.get("http").get("port")))

    # Ticker: periodically refresh targets and send them
    run_sync_target_thread()

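The Consul-based snippets here build the ring as ConsistentHashRing(vnode_num, nodes) and rely on a .nodes attribute plus a get_node(key) lookup, but that class is not included. The following is a minimal sketch under those assumptions; the hashing scheme and internals are illustrative, not the original implementation.

# Minimal sketch of the ConsistentHashRing interface assumed by the Consul
# snippets (constructor: virtual-node count + node list; attributes/methods:
# ``nodes`` and ``get_node(key)``). Illustrative only.
import bisect
import hashlib


class ConsistentHashRing(object):
    def __init__(self, vnode_num, nodes):
        self.vnode_num = vnode_num
        self.nodes = list(nodes)
        self._ring = []           # sorted list of virtual-node hashes
        self._hash_to_node = {}   # virtual-node hash -> physical node
        for node in self.nodes:
            for i in range(vnode_num):
                h = self._hash("{}#{}".format(node, i))
                self._hash_to_node[h] = node
                bisect.insort(self._ring, h)

    @staticmethod
    def _hash(key):
        return int(hashlib.md5(key.encode("utf-8")).hexdigest(), 16)

    def get_node(self, key):
        """Return the physical node responsible for ``key``."""
        if not self._ring:
            return None
        h = self._hash(key)
        idx = bisect.bisect(self._ring, h) % len(self._ring)
        return self._hash_to_node[self._ring[idx]]
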
def process(udp_clients):
    # ConsistentHashRing here takes three arguments:
    #   first,  the list of nodes/servers
    #   second, the replication factor (default 3)
    #   third,  the number of virtual nodes required (default 8)
    REPLICATION_FACTOR = 2
    NUMBER_OF_VIRTUAL_NODES = 10
    client_ring = ConsistentHashRing(udp_clients, REPLICATION_FACTOR,
                                     NUMBER_OF_VIRTUAL_NODES)

    hash_codes = set()

    # PUT all users.
    for u in USERS:
        data_bytes, key = serialize_PUT(u)
        list_of_nodes = client_ring.get_node(key)
        # Replicate the data to every node returned by the ring.
        replication_flag = False
        for node in list_of_nodes:
            if replication_flag:
                print("Replicating to other nodes")
            response = node.send(data_bytes)
            print(response, '\n')
            hash_codes.add(str(response.decode()))
            replication_flag = True

    print(f"Number of Users={len(USERS)}\nNumber of Users Cached={len(hash_codes)}")

    # GET all users.
    for hc in hash_codes:
        print(hc)
        data_bytes, key = serialize_GET(hc)
        list_of_nodes = client_ring.get_node(key)
        # For a GET, query only the first (primary) node.
        for node in list_of_nodes:
            response = node.send(data_bytes)
            print(response)
            break

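process() only assumes each entry of udp_clients exposes a send(data_bytes) -> bytes call; a minimal sketch of such a client is below. The host, port, timeout, and buffer size are illustrative assumptions, not part of the original code.

# Minimal sketch of the node object assumed by process(): a thin UDP client
# with a blocking ``send`` that returns the server's reply.
import socket


class UDPClient(object):
    def __init__(self, host, port, timeout=2.0):
        self.host = host
        self.port = int(port)
        self.timeout = timeout

    def send(self, data_bytes):
        """Send a datagram to the server and return its reply bytes."""
        sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        sock.settimeout(self.timeout)
        try:
            sock.sendto(data_bytes, (self.host, self.port))
            reply, _ = sock.recvfrom(4096)
            return reply
        finally:
            sock.close()


# Example wiring (hypothetical addresses):
# udp_clients = [UDPClient("10.0.0.1", 4000), UDPClient("10.0.0.2", 4000)]
# process(udp_clients)
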
def run():
    yaml_path = "config.yaml"
    config = load_base_config(yaml_path)
    global ALL_CONFIG_DIC
    ALL_CONFIG_DIC = config

    shard_service_d = config.get("shard_service")
    logging.info("[start_for_service][service_num:{}][detail:{}]".format(
        len(shard_service_d.keys()),
        ",".join(shard_service_d.keys()),
    ))

    # ioloop
    loop = asyncio.get_event_loop()

    # consul
    consul_addr = config.get("consul").get("host")
    consul_port = int(config.get("consul").get("port"))
    consul_obj = Consul(consul_addr, consul_port, loop)

    # Register services && initialize the hash map
    for service_name, ii in shard_service_d.items():
        nodes = ii.get("nodes")
        port = ii.get("port")
        ring = ConsistentHashRing(1000, nodes)
        SERVICE_HASH_MAP[service_name] = ring
        for host in nodes:
            res = consul_obj.register_service(service_name, host, int(port))
            logging.info(
                "[register_service_res:{}][service:{}][node:{},port:{}]".format(
                    res, service_name, host, port))

    # Start the consumer thread for the watch-change result queue
    consumer_thread = Thread(target=consumer)
    consumer_thread.setDaemon(True)
    consumer_thread.start()

    # Start the consul watches on a dedicated event-loop thread
    loop_thread = Thread(target=start_thread_loop, args=(loop, ))
    loop_thread.setDaemon(True)
    loop_thread.start()
    for service_name in shard_service_d.keys():
        asyncio.run_coroutine_threadsafe(
            watch_service(service_name, consul_obj.aio_consul,
                          SERVICE_HASH_MAP, SYNC_DQ), loop)

    # Ticker: periodically refresh targets and send them
    run_sync_target_thread()

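start_thread_loop is referenced above but not shown; the usual minimal form, sketched here, simply pins the event loop to the watcher thread and runs it forever so run_coroutine_threadsafe can schedule the watch coroutines onto it.

# Likely shape of the helper assumed above: run the given event loop on the
# current (daemon) thread.
import asyncio


def start_thread_loop(loop):
    asyncio.set_event_loop(loop)
    loop.run_forever()
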
async def watch_service(service_name, async_consul, service_hash_map, dq):
    # It is usually better to pass ``loop`` explicitly, but this is not
    # mandatory; you can rely on the global event loop, e.g.:
    # port = 8500
    # c = consul.aio.Consul(host=host, port=port, loop=loop)
    index = None
    data = None

    # Same as the default blocking API, but awaited.
    while True:
        try:
            index, d = await async_consul.health.service(service_name,
                                                         passing=True,
                                                         index=index)
            if d:
                data = d
                new_nodes = []
                svc_name = ""
                for x in data:
                    sn = x.get("Service").get("Service")
                    address = x.get("Service").get("Address")
                    if address:
                        new_nodes.append(address)
                    if sn and not svc_name:
                        svc_name = sn
                old_nodes = service_hash_map[svc_name].nodes
                if set(old_nodes) != set(new_nodes):
                    print("[new_num:{} old_num:{}][new_nodes:{} old_nodes:{}]".format(
                        len(new_nodes),
                        len(old_nodes),
                        ",".join(new_nodes),
                        ",".join(old_nodes),
                    ))
                    new_ring = ConsistentHashRing(100, new_nodes)
                    service_hash_map[svc_name] = new_ring
                    dq.appendleft(str(service_name))
        except Timeout:
            # Gracefully handle request timeouts.
            continue
        except Exception as e:
            print("[watch_error,service:{},error:{}]".format(service_name, e))
            continue

def watch_service(self, service_name, service_hash_map, sync_q):
    index = None
    while True:
        try:
            last_index = index
            index, d = self.consul.health.service(service_name,
                                                  passing=True,
                                                  index=index,
                                                  wait='10s')
            if last_index is None or last_index == index:
                # An unchanged index means the result did not change, so there
                # is nothing to do. last_index is None means the first pass.
                continue

            msg = "[nodes changed, rebalance required][service:{}]".format(service_name)
            logging.warning(msg)
            data = d
            new_nodes = []
            for x in data:
                address = x.get("Service").get("Address")
                if address:
                    new_nodes.append(address)
            old_nodes = service_hash_map[service_name].nodes
            if set(old_nodes) != set(new_nodes):
                logging.info(
                    "[new_num:{} old_num:{}][new_nodes:{} old_nodes:{}]".format(
                        len(new_nodes),
                        len(old_nodes),
                        ",".join(new_nodes),
                        ",".join(old_nodes),
                    ))
                new_ring = ConsistentHashRing(100, new_nodes)
                service_hash_map[service_name] = new_ring
                sync_q.put(str(service_name))
        except Exception as e:
            logging.error("[watch_error,service:{},error:{}]".format(
                service_name, e))
            time.sleep(5)
            continue

def block_get_health(self, service_name, service_hash_map, dq):
    index = None
    while True:
        try:
            index, d = self.consul.health.service(service_name,
                                                  passing=True,
                                                  index=index)
            if d:
                data = d
                new_nodes = []
                for x in data:
                    address = x.get("Service").get("Address")
                    if address:
                        new_nodes.append(address)
                old_nodes = service_hash_map[service_name].nodes
                if set(old_nodes) != set(new_nodes):
                    logging.info(
                        "[new_num:{} old_num:{}][new_nodes:{} old_nodes:{}]".format(
                            len(new_nodes),
                            len(old_nodes),
                            ",".join(new_nodes),
                            ",".join(old_nodes),
                        ))
                    new_ring = ConsistentHashRing(100, new_nodes)
                    service_hash_map[service_name] = new_ring
                    dq.appendleft(str(service_name))
                    # dq.put(str(service_name))
                    M_SERVICE_CHANGES.labels(service_name=service_name,
                                             old_nodes=len(old_nodes),
                                             new_nodes=len(new_nodes)).set(len(new_nodes))
        except Exception as e:
            logging.error("[watch_error,service:{},error:{}]".format(
                service_name, e))
            time.sleep(5)
            continue

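The consumer referenced by the threaded run() above is not included. A hedged sketch of one plausible shape is below: it drains the deque that block_get_health() pushes changed service names into and triggers a re-sync for each. SERVICE_HASH_MAP is the module-level map used above; sync_targets_for_service is a hypothetical helper standing in for whatever re-shards and pushes targets in the real code.

# Hedged sketch of the deque-draining consumer thread (illustrative only).
import time
import logging


def consumer(sync_dq):
    while True:
        if not sync_dq:
            time.sleep(1)
            continue
        service_name = sync_dq.pop()
        logging.info("[consume_change][service:{}]".format(service_name))
        try:
            # Re-shard this service's targets with the refreshed hash ring.
            sync_targets_for_service(service_name, SERVICE_HASH_MAP)  # hypothetical helper
        except Exception as e:
            logging.error("[consumer_error][service:{}][error:{}]".format(
                service_name, e))
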
def run(yaml_path):
    config = load_base_config(yaml_path)
    shard_service_d = config.get("shard_service")
    logging.info("[start_for_service][service_num:{}][detail:{}]".format(
        len(shard_service_d.keys()),
        ",".join(shard_service_d.keys()),
    ))

    # 1. Create a Manager object
    manager = Manager()
    # 2. Create a global consistent-hash map shared between processes
    service_hash_map = manager.dict()

    # Sync queue used to signal which scrape service needs to rebalance
    sync_q = Queue()

    # consul
    consul_addr = config.get("consul").get("host")
    consul_port = int(config.get("consul").get("port"))
    consul_obj = Consul(consul_addr, consul_port)

    # step_1: register services && initialize the hash map
    # Fetch all services currently registered in consul
    all_service = consul_obj.get_all_service()
    for service_name, ii in shard_service_d.items():
        nodes = ii.get("nodes")
        port = ii.get("port")
        for host in nodes:
            one_service_id = "{}_{}_{}".format(service_name, host, port)
            this_service = all_service.get(one_service_id)
            if not this_service:
                # The service does not exist yet, so register it
                res = consul_obj.register_service(service_name, host, int(port))
                logging.info(
                    "[new_service_need_register][register_service_res:{}][service:{}][node:{},port:{}]"
                    .format(res, service_name, host, port))
        # Give newly registered services time to pass their health checks
        time.sleep(1)
        alive_nodes = consul_obj.get_service_health_node(service_name)
        ring = ConsistentHashRing(1000, alive_nodes)
        service_hash_map[service_name] = ring
        M_SHARD_SERVICE_DES.labels(service_name=service_name,
                                   service_port=port).set(1)

    # step_2: start the consumer process for the watch-change result queue
    p_consumer = Process(target=consumer,
                         args=(sync_q, service_hash_map, config))
    p_consumer.start()

    # step_3: start one consul watch process per service
    for service_name in shard_service_d.keys():
        p = Process(target=consul_obj.watch_service,
                    args=(service_name, service_hash_map, sync_q))
        p.start()

    # step_4: start the metrics server thread
    # Note: this library is thread based, so it does not work across processes
    http_port = int(config.get("http").get("port"))
    start_http_server(http_port)
    logging.info("[start_metrics_server:{}]".format(http_port))

    # step_5: main process: periodically sync targets and send them to the collectors
    run_sync_targets(service_hash_map, config)

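run_sync_targets is referenced above but not shown. A hedged sketch of the usual pattern is below: every scrape target (assumed to be a plain string such as "host:port") maps to exactly one healthy collector node via ring.get_node(), so each node only receives the slice of targets it owns. get_all_targets, send_targets_to_node, and the interval are illustrative assumptions, not the original helpers.

# Hedged sketch of periodic target sharding via the consistent hash ring.
import time
import logging


def run_sync_targets(service_hash_map, config, interval=60):
    while True:
        for service_name, ring in service_hash_map.items():
            targets = get_all_targets(service_name, config)   # hypothetical helper
            shard = {node: [] for node in ring.nodes}
            for target in targets:
                node = ring.get_node(target)
                shard[node].append(target)
            for node, node_targets in shard.items():
                # Push each node only the targets it owns.
                send_targets_to_node(node, service_name, node_targets)  # hypothetical helper
            logging.info("[sync_targets][service:{}][target_num:{}][node_num:{}]".format(
                service_name, len(targets), len(ring.nodes)))
        time.sleep(interval)
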
class MembershipStage(object):
    """ Stage for managing ring membership and failure detection."""

    def __init__(self, server=None, node_addresses=[], wait_time=30):
        self.logger = logging.getLogger('{}'.format(self.__class__.__name__))
        self.logger.debug('__init__')

        self._server = server
        self._wait_time = wait_time
        self._node_lookup = {util.get_hash(str(node_address)): node_address
                             for node_address in node_addresses}
        self._consistent_hash_ring = ConsistentHashRing(node_hashes=sorted(self._node_lookup.keys()))

        self.logger.debug('__init__. node_lookup: {}'.format(self._node_lookup))

        self._failed_to_contact_node_hashes = collections.defaultdict(dict)
        self._processor = self._handle_membership_checks()

    @property
    def node_hashes(self):
        return self._consistent_hash_ring.hash_ring

    def remove_node_hash(self, node_hash):
        success = self._consistent_hash_ring.remove_node_hash(node_hash)
        self._server.num_nodes = len(self.node_hashes)
        return success

    @util.coroutine
    def _handle_membership_checks(self):
        self.logger.debug('_handle_membership_checks')
        next_check_time = util.add_time(util.current_time(), self._wait_time)

        while True:
            # handle failures
            try:
                to_be_removed = []
                for node_hash in self._failed_to_contact_node_hashes:
                    count = self._failed_to_contact_node_hashes[node_hash]['count']
                    if count >= 3:
                        if node_hash in self.node_hashes:
                            self._server.internal_request_stage.handle_unannounced_failure(failure_node_hash=node_hash)
                        to_be_removed.append(node_hash)

                # flush stale contact failures
                for node_hash in self._failed_to_contact_node_hashes:
                    timeout = self._failed_to_contact_node_hashes[node_hash]['timeout']
                    if util.current_time() > timeout:
                        to_be_removed.append(node_hash)

                for node_hash in list(set(to_be_removed)):
                    try:
                        del self._failed_to_contact_node_hashes[node_hash]
                    except:
                        pass

                # retry contacting failed node hashes
                for node_hash in self._failed_to_contact_node_hashes:
                    if util.current_time() > self._failed_to_contact_node_hashes[node_hash]['timeout']:
                        self._server.internal_request_stage.handle_membership_check(gossip_node_hash=node_hash)

                if util.current_time() > next_check_time:
                    self._server.internal_request_stage.handle_membership_check()
                    next_check_time = util.add_time(util.current_time(), 1)
                    yield
                else:
                    yield
            except Exception as e:
                self.logger.error('_handle_membership_checks error: {}, {}'.format(e, sys.exc_info()))

    def process(self):
        self.logger.debug('process')
        return self._processor.next()

    def report_contact_failure(self, node_hash=None):
        self.logger.debug('report_contact_failure')
        try:
            self._failed_to_contact_node_hashes[node_hash]['count'] += 1
        except:
            self._failed_to_contact_node_hashes[node_hash]['count'] = 1

        try:
            timeout = self._failed_to_contact_node_hashes[node_hash]['timeout']
            new_timeout = util.add_time(timeout, 10)
        except:
            new_timeout = util.add_time(util.current_time(), 10)
        finally:
            self._failed_to_contact_node_hashes[node_hash]['timeout'] = new_timeout
            self._failed_to_contact_node_hashes[node_hash]['next_check_time'] = util.add_time(util.current_time(), 1)

    def node_address(self, node_hash=None):
        """ Returns the address of a node identified by its hash value in the node ring."""
        if node_hash:
            hostname, _, internal_port = self._node_lookup[node_hash].split(',')
            return hostname, internal_port

    def get_responsible_node_hashes(self, *args, **kwargs):
        """ Returns num_replicas number of node_hashes responsible for the key"""
        self.logger.debug('get_responsible_node_hashes')
        return self._consistent_hash_ring.get_responsible_node_hashes(*args, **kwargs)

    def get_gossip_node_hashes(self, num_gossip_nodes):
        """ Returns num_gossip_nodes number of random node_hashes for the gossip protocol """
        active_node_hashes = self.node_hashes
        if len(active_node_hashes) >= num_gossip_nodes:
            gossip_node_hashes = random.sample(active_node_hashes, num_gossip_nodes)
        else:
            gossip_node_hashes = random.sample(active_node_hashes, len(active_node_hashes))
        return gossip_node_hashes

    def get_unannounced_failure_repair_node_hashes(self, failure_node_hash=None):
        self.logger.debug('get_unannounced_failure_repair_node_hashes')
        try:
            node_hashes = self.node_hashes
            num_replicas = self._server.num_replicas
            failure_node_hash_index = node_hashes.index(failure_node_hash)
            repair_node_hashes = [node_hashes[i % len(node_hashes)]
                                  for i in xrange(failure_node_hash_index - num_replicas + 1,
                                                  failure_node_hash_index + num_replicas)]
            repair_node_hashes.remove(failure_node_hash)
        except:
            pass
        return repair_node_hashes

    def _partition_keys(self):
        """ Returns: a dict() where:
                key: node_hashes
                value: list of keys for which the given node_hash is responsible
        """
        keys = sorted(self._server.persistence_stage.keys())
        partition = dict()
        left_bound = 0
        for node_hash in self.node_hashes:
            right_bound = bisect.bisect_left(keys, node_hash)
            partition[node_hash] = keys[left_bound:right_bound]
            left_bound = right_bound
        else:
            # Keys beyond the last node hash wrap around to the first node.
            partition[self.node_hashes[0]] += keys[right_bound:]
        return partition

    def key_value_partition(self):
        """ Returns: a dict() where:
                key: node_hashes
                value: dict of {key: values} for which the given node_hash is responsible
        """
        key_partition = self._partition_keys()
        key_value_partition = collections.defaultdict(dict)
        for node_hash in key_partition:
            for key in key_partition[node_hash]:
                reply = self._server.persistence_stage.get(key)
                key_value_partition[node_hash][key] = {'value': reply['value'],
                                                       'timestamp': reply['timestamp']}
        return key_value_partition

    def _partition_for_failure(self, node_hash=None):
        if not node_hash:
            node_hash = self._server.node_hash

        index_self = self.node_hashes.index(node_hash)
        num_replicas = self._server.num_replicas
        old_partition = self.key_value_partition()
        new_partition = dict()

        for offset in xrange(1 - num_replicas, 1):
            index_old = index_self + offset
            index_new = (index_self + offset + num_replicas) % len(self.node_hashes)
            old_responsible_node_hash = self.node_hashes[index_old]
            new_responsible_node_hash = self.node_hashes[index_new]
            new_partition[new_responsible_node_hash] = old_partition[old_responsible_node_hash]
        return new_partition