Example #1
0
def run(yaml_path):
    """Bootstrap the shard-service dispatcher from a YAML config file.

    Steps, in order:

    1. Load the config via ``load_base_config`` and publish it as the
       module-level ``ALL_CONFIG_DIC``.
    2. For every entry under ``shard_service``: build a
       ``ConsistentHashRing`` over its nodes, store it in
       ``SERVICE_HASH_MAP`` and register each node with Consul.
    3. Start a daemon thread running ``consumer`` on a shared deque.
    4. Start one daemon thread per service running
       ``consul_obj.block_get_health``.
    5. Start the metrics HTTP server on the configured ``http.port``.
    6. Call ``run_sync_target_thread()``.

    :param yaml_path: path handed to ``load_base_config``.
    """
    config = load_base_config(yaml_path)
    global ALL_CONFIG_DIC
    ALL_CONFIG_DIC = config
    shard_service_d = config.get("shard_service")

    logging.info("[start_for_service][service_num:{}][detail:{}]".format(
        len(shard_service_d),
        ",".join(shard_service_d),
    ))

    # Queue shared by the watch threads (producers) and the consumer thread.
    sync_dq = deque()

    # Consul client
    consul_conf = config.get("consul")
    consul_addr = consul_conf.get("host")
    consul_port = int(consul_conf.get("port"))
    consul_obj = Consul(consul_addr, consul_port)

    # Register the services and initialise the per-service hash rings.
    for service_name, service_conf in shard_service_d.items():
        nodes = service_conf.get("nodes")
        port = service_conf.get("port")

        SERVICE_HASH_MAP[service_name] = ConsistentHashRing(1000, nodes)
        for host in nodes:
            res = consul_obj.register_service(service_name, host, int(port))
            logging.info("[register_service_res:{}][service:{}][node:{},port:{}]".format(
                res, service_name, host, port
            ))

        M_SHARD_SERVICE_DES.labels(service_name=service_name, service_port=port).set(1)

    # Start the thread that consumes watch-change notifications.
    # ``daemon`` is set as an attribute: Thread.setDaemon() is deprecated.
    consumer_thread = Thread(target=consumer, kwargs={'sync_dq': sync_dq})
    consumer_thread.daemon = True
    consumer_thread.start()

    # Start one Consul watch thread per service.
    for service_name in shard_service_d:
        watch_thread = Thread(
            target=consul_obj.block_get_health,
            args=(service_name, SERVICE_HASH_MAP, sync_dq),
        )
        watch_thread.daemon = True
        watch_thread.start()

    # Metrics server
    start_http_server(int(ALL_CONFIG_DIC.get("http").get("port")))

    # Ticker: refresh targets and send them.
    run_sync_target_thread()
Example #2
0
def process(udp_clients):
    """PUT every entry of ``USERS`` onto the ring, then GET each one back.

    A ``ConsistentHashRing`` is built over ``udp_clients``. Each serialized
    user is sent to every node the ring returns for its key, and the decoded
    responses are collected as hash codes. Afterwards each hash code is
    fetched again from the first node the ring returns for it. Responses and
    progress are printed; nothing is returned.

    :param udp_clients: nodes handed to ``ConsistentHashRing``; each must
        provide ``send(bytes)``.
    """
    # ConsistentHashRing arguments: the node list, the replication factor
    # (default 3) and the number of virtual nodes (default 8).
    replication_factor = 2
    virtual_node_count = 10
    ring = ConsistentHashRing(udp_clients, replication_factor,
                              virtual_node_count)

    cached_hashes = set()

    # PUT phase: write each user to every node responsible for its key.
    for user in USERS:
        payload, key = serialize_PUT(user)
        for position, node in enumerate(ring.get_node(key)):
            # Every node after the first one is a replica.
            if position:
                print("Replicating to other nodes")
            reply = node.send(payload)
            print(reply, '\n')
            cached_hashes.add(str(reply.decode()))

    print(
        f"Number of Users={len(USERS)}\nNumber of Users Cached={len(cached_hashes)}"
    )

    # GET phase: only the first node returned for the key is queried.
    for hash_code in cached_hashes:
        print(hash_code)
        payload, key = serialize_GET(hash_code)
        for node in ring.get_node(key):
            print(node.send(payload))
            break
Example #3
0
def run():
    """Bootstrap the asyncio-based shard-service dispatcher from ``config.yaml``.

    Loads the config and publishes it as the module-level ``ALL_CONFIG_DIC``,
    registers every configured node with Consul, builds one
    ``ConsistentHashRing`` per service in ``SERVICE_HASH_MAP``, starts the
    ``consumer`` daemon thread, runs the event loop in a second daemon thread
    via ``start_thread_loop`` and schedules one ``watch_service`` coroutine
    per service on it. Finally calls ``run_sync_target_thread()``.
    """
    yaml_path = "config.yaml"

    config = load_base_config(yaml_path)
    global ALL_CONFIG_DIC
    ALL_CONFIG_DIC = config
    shard_service_d = config.get("shard_service")

    logging.info("[start_for_service][service_num:{}][detail:{}]".format(
        len(shard_service_d),
        ",".join(shard_service_d),
    ))

    # Event loop shared by the Consul client and the watch coroutines.
    loop = asyncio.get_event_loop()

    # Consul client
    consul_conf = config.get("consul")
    consul_addr = consul_conf.get("host")
    consul_port = int(consul_conf.get("port"))
    consul_obj = Consul(consul_addr, consul_port, loop)

    # Register the services and initialise the per-service hash rings.
    for service_name, service_conf in shard_service_d.items():
        nodes = service_conf.get("nodes")
        port = service_conf.get("port")

        SERVICE_HASH_MAP[service_name] = ConsistentHashRing(1000, nodes)
        for host in nodes:
            res = consul_obj.register_service(service_name, host, int(port))
            logging.info(
                "[register_service_res:{}][service:{}][node:{},port:{}]".
                format(res, service_name, host, port))

    # Start the thread that consumes watch-change notifications.
    # ``daemon`` is set as an attribute: Thread.setDaemon() is deprecated.
    consumer_thread = Thread(target=consumer)
    consumer_thread.daemon = True
    consumer_thread.start()

    # Run the event loop in its own daemon thread, then schedule one Consul
    # watch coroutine per service on it from this thread.
    loop_thread = Thread(target=start_thread_loop, args=(loop, ))
    loop_thread.daemon = True
    loop_thread.start()
    for service_name in shard_service_d:
        asyncio.run_coroutine_threadsafe(
            watch_service(service_name, consul_obj.aio_consul,
                          SERVICE_HASH_MAP, SYNC_DQ), loop)

    # Ticker: refresh targets and send them.
    run_sync_target_thread()
Example #4
0
    def __init__(self, server=None, node_addresses=None, wait_time=30):
        """Set up membership state for ``server`` and the initial nodes.

        :param server: owning server object, stored as ``self._server``.
        :param node_addresses: iterable of node addresses; each is hashed
            with ``util.get_hash(str(address))`` to form the ring. ``None``
            (the default) means no initial nodes.
        :param wait_time: stored as ``self._wait_time``.
        """
        self.logger = logging.getLogger('{}'.format(self.__class__.__name__))
        self.logger.debug('__init__')

        # ``None`` replaces the former mutable ``[]`` default; omitting the
        # argument still yields an empty ring.
        if node_addresses is None:
            node_addresses = []

        self._server = server
        self._wait_time = wait_time
        # node hash -> node address
        self._node_lookup = {
            util.get_hash(str(node_address)): node_address
            for node_address in node_addresses
        }
        self._consistent_hash_ring = ConsistentHashRing(
            node_hashes=sorted(self._node_lookup.keys()))

        self.logger.debug('__init__.  node_lookup: {}'.format(self._node_lookup))

        # node hash -> dict of failure bookkeeping for that node
        self._failed_to_contact_node_hashes = collections.defaultdict(dict)

        self._processor = self._handle_membership_checks()
Example #5
0
async def watch_service(service_name, async_consul, service_hash_map, dq):
    """Watch one Consul service forever and rebuild its hash ring on change.

    Repeatedly awaits ``async_consul.health.service`` for ``service_name``
    (``passing=True``), feeding the returned index back into the next query.
    When the set of returned addresses differs from the ``nodes`` of the ring
    stored in ``service_hash_map[service_name]``, a new
    ``ConsistentHashRing`` replaces it and ``service_name`` is pushed onto the
    left of ``dq``. An empty result is ignored, so the previous ring stays.

    :param service_name: name of the service to watch.
    :param async_consul: client exposing an awaitable
        ``health.service(name, passing=..., index=...)`` that returns
        ``(index, entries)``.
    :param service_hash_map: mapping of service name to its current ring.
    :param dq: queue-like object with ``appendleft``; receives the service
        name each time its ring is replaced.
    """
    # Local import: only needed for the error back-off below.
    import asyncio

    index = None
    while True:
        try:
            index, entries = await async_consul.health.service(
                service_name, passing=True, index=index)
            if not entries:
                continue

            addresses = (
                entry.get("Service").get("Address") for entry in entries
            )
            new_nodes = [address for address in addresses if address]

            # Look the ring up under the name that was queried, rather than
            # one re-derived from the response (an empty derived name would
            # raise KeyError on every pass).
            old_nodes = service_hash_map[service_name].nodes

            if set(old_nodes) != set(new_nodes):
                print("[new_num:{} old_num:{}][new_nodes:{} old_nodes:{}]".
                      format(
                          len(new_nodes),
                          len(old_nodes),
                          ",".join(new_nodes),
                          ",".join(old_nodes),
                      ))
                service_hash_map[service_name] = ConsistentHashRing(
                    100, new_nodes)
                dq.appendleft(str(service_name))

        except Timeout:
            # Gracefully handle a request timeout: just query again.
            continue
        except Exception as e:
            print("[watch_error,service:{},error:{}]".format(service_name, e))
            # Back off before retrying so a persistent failure (e.g. Consul
            # unreachable) does not turn into a busy loop.
            await asyncio.sleep(5)
Example #6
0
    def watch_service(self, service_name, service_hash_map, sync_q):
        """Watch one Consul service forever and rebuild its hash ring on change.

        Repeatedly calls ``self.consul.health.service`` for ``service_name``
        (``passing=True``, ``wait='10s'``), feeding the returned index back
        into the next query. The first response and any response whose index
        is unchanged are skipped. Otherwise, if the set of returned addresses
        differs from the ``nodes`` of the ring in
        ``service_hash_map[service_name]``, a new ``ConsistentHashRing``
        replaces it and ``service_name`` is put on ``sync_q``.

        Any exception is logged and followed by a 5 second pause.

        :param service_name: name of the service to watch.
        :param service_hash_map: mapping of service name to its current ring.
        :param sync_q: queue-like object with ``put``; receives the service
            name each time its ring is replaced.
        """
        index = None
        while True:
            try:
                last_index = index

                index, entries = self.consul.health.service(service_name,
                                                            passing=True,
                                                            index=index,
                                                            wait='10s')
                # An unchanged index means the result did not change, so
                # there is nothing to do. ``last_index is None`` marks the
                # very first query, which is skipped as well.
                if last_index is None or last_index == index:
                    continue

                msg = "[节点变化,需要收敛][service:{}]".format(service_name)
                logging.warning(msg)

                addresses = (
                    entry.get("Service").get("Address") for entry in entries
                )
                new_nodes = [address for address in addresses if address]
                old_nodes = service_hash_map[service_name].nodes

                if set(old_nodes) != set(new_nodes):
                    logging.info(
                        "[new_num:{} old_num:{}][new_nodes:{} old_nodes:{}]".
                        format(
                            len(new_nodes),
                            len(old_nodes),
                            ",".join(new_nodes),
                            ",".join(old_nodes),
                        ))
                    service_hash_map[service_name] = ConsistentHashRing(
                        100, new_nodes)
                    sync_q.put(str(service_name))

            except Exception as e:
                logging.error("[watch_error,service:{},error:{}]".format(
                    service_name, e))
                time.sleep(5)
                continue
Example #7
0
    def block_get_health(self, service_name, service_hash_map, dq):
        """Poll Consul health for one service forever; swap its ring on change.

        Each pass calls ``self.consul.health.service`` with ``passing=True``
        and the index returned by the previous call. A non-empty result whose
        address set differs from the ``nodes`` of the ring currently stored
        under ``service_name`` causes, in order: an info log line, a new
        ``ConsistentHashRing`` stored in ``service_hash_map``, the service
        name pushed onto the left of ``dq``, and the ``M_SERVICE_CHANGES``
        gauge set to the new node count.

        Any exception is logged and followed by a 5 second pause.

        :param service_name: name of the service to watch.
        :param service_hash_map: mapping of service name to its current ring.
        :param dq: queue-like object with ``appendleft``.
        """
        index = None
        while True:
            try:
                index, entries = self.consul.health.service(
                    service_name, passing=True, index=index)
                if not entries:
                    # An empty result leaves the current ring untouched.
                    continue

                addresses = (
                    entry.get("Service").get("Address") for entry in entries
                )
                new_nodes = [addr for addr in addresses if addr]
                old_nodes = service_hash_map[service_name].nodes

                if set(old_nodes) == set(new_nodes):
                    continue

                new_count = len(new_nodes)
                old_count = len(old_nodes)
                logging.info(
                    "[new_num:{} old_num:{}][new_nodes:{} old_nodes:{}]"
                    .format(
                        new_count,
                        old_count,
                        ",".join(new_nodes),
                        ",".join(old_nodes),
                    ))
                service_hash_map[service_name] = ConsistentHashRing(
                    100, new_nodes)
                dq.appendleft(str(service_name))
                change_gauge = M_SERVICE_CHANGES.labels(
                    service_name=service_name,
                    old_nodes=old_count,
                    new_nodes=new_count)
                change_gauge.set(new_count)
            except Exception as err:
                logging.error("[watch_error,service:{},error:{}]".format(
                    service_name, err))
                time.sleep(5)
Example #8
0
def run(yaml_path):
    """Bootstrap the multi-process shard-service dispatcher.

    Steps, in order:

    1. Load the config and create the shared state: a ``Manager().dict()``
       holding one hash ring per service and a ``Queue`` used to announce
       which service changed.
    2. Register with Consul every configured node that Consul does not know
       yet, then build each service's ring from the nodes Consul reports as
       healthy.
    3. Start the ``consumer`` process.
    4. Start one ``consul_obj.watch_service`` process per service.
    5. Start the metrics HTTP server on the configured ``http.port``.
    6. Run ``run_sync_targets`` in the calling process.

    :param yaml_path: path handed to ``load_base_config``.
    """
    config = load_base_config(yaml_path)
    shard_service_d = config.get("shard_service")

    logging.info("[start_for_service][service_num:{}][detail:{}]".format(
        len(shard_service_d.keys()),
        ",".join(shard_service_d.keys()),
    ))
    # 1. Create a Manager object.
    manager = Manager()

    # 2. Create the global consistent-hash map shared between processes.
    service_hash_map = manager.dict()
    # Synchronisation queue: announces which collector service has changed.
    sync_q = Queue()

    # Consul client
    consul_addr = config.get("consul").get("host")
    consul_port = int(config.get("consul").get("port"))
    consul_obj = Consul(consul_addr, consul_port)

    # step_1: register services and initialise the hash map.
    # Fetch every service Consul currently knows about.
    all_service = consul_obj.get_all_service()
    for service_name, service_conf in shard_service_d.items():
        nodes = service_conf.get("nodes")
        port = service_conf.get("port")

        for host in nodes:
            one_service_id = "{}_{}_{}".format(service_name, host, port)
            this_service = all_service.get(one_service_id)
            if not this_service:
                # The service does not exist yet, so it must be registered.
                res = consul_obj.register_service(service_name, host,
                                                  int(port))
                logging.info(
                    "[new_service_need_register][register_service_res:{}][service:{}][node:{},port:{}]"
                    .format(res, service_name, host, port))
        # Give newly registered services time to be probed.
        time.sleep(1)
        alive_nodes = consul_obj.get_service_health_node(service_name)

        ring = ConsistentHashRing(1000, alive_nodes)
        service_hash_map[service_name] = ring

        M_SHARD_SERVICE_DES.labels(service_name=service_name,
                                   service_port=port).set(1)

    # step_2: start the process that consumes watch-change notifications.
    p_consumer = Process(target=consumer,
                         args=(sync_q, service_hash_map, config))
    p_consumer.start()

    # step_3: start one Consul watch process per service.
    for service_name in shard_service_d.keys():
        p = Process(target=consul_obj.watch_service,
                    args=(service_name, service_hash_map, sync_q))
        p.start()

    # step_4: start the metrics server.
    # Note from the original author: this library is thread-based and does
    # not work across multiple processes.
    http_port = int(config.get("http").get("port"))
    start_http_server(http_port)
    # Log the port the metrics server listens on. The previous code logged
    # ``port``, the leftover loop variable of the last shard service, and
    # raised NameError when no services were configured.
    logging.info("[start_metrics_server:{}]".format(http_port))

    # step_5: main process periodically syncs targets to the collectors.
    run_sync_targets(service_hash_map, config)
Example #9
0
class MembershipStage(object):
    """Stage for managing ring membership and failure detection.

    Holds a lookup from node hash to node address, a ``ConsistentHashRing``
    built from those hashes, and a table of nodes that recently failed to
    respond. ``process()`` advances a generator that evaluates that table and
    triggers membership checks through ``server.internal_request_stage``.
    """

    def __init__(self, server=None, node_addresses=None, wait_time=30):
        """Set up membership state for ``server`` and the initial nodes.

        :param server: owning server object, stored as ``self._server``.
        :param node_addresses: iterable of node addresses; each is hashed
            with ``util.get_hash(str(address))`` to form the ring. ``None``
            (the default) means no initial nodes.
        :param wait_time: amount passed to ``util.add_time`` to schedule the
            first periodic membership check.
        """
        self.logger = logging.getLogger('{}'.format(self.__class__.__name__))
        self.logger.debug('__init__')

        # ``None`` replaces the former mutable ``[]`` default; omitting the
        # argument still yields an empty ring.
        if node_addresses is None:
            node_addresses = []

        self._server = server
        self._wait_time = wait_time
        # node hash -> node address
        self._node_lookup = {util.get_hash(str(node_address)): node_address for node_address in node_addresses}
        self._consistent_hash_ring = ConsistentHashRing(node_hashes=sorted(self._node_lookup.keys()))

        self.logger.debug('__init__.  node_lookup: {}'.format(self._node_lookup))

        # node hash -> {'count', 'timeout', 'next_check_time'}, filled by
        # report_contact_failure()
        self._failed_to_contact_node_hashes = collections.defaultdict(dict)

        self._processor = self._handle_membership_checks()

    @property
    def node_hashes(self):
        """The ``hash_ring`` attribute of the underlying consistent hash ring."""
        return self._consistent_hash_ring.hash_ring

    def remove_node_hash(self, node_hash):
        """Remove ``node_hash`` from the ring and update ``server.num_nodes``.

        Returns whatever the ring's ``remove_node_hash`` returns.
        """
        success = self._consistent_hash_ring.remove_node_hash(node_hash)
        self._server.num_nodes = len(self.node_hashes)
        return success

    @util.coroutine
    def _handle_membership_checks(self):
        """Generator behind ``process()``; each resume runs one maintenance pass.

        One pass:

        * nodes with three or more reported contact failures are announced
          via ``handle_unannounced_failure`` (if still in the ring) and
          dropped from the failure table;
        * failure entries whose ``timeout`` has passed are dropped;
        * remaining entries whose ``timeout`` has passed get a targeted
          membership check;
        * once ``next_check_time`` has passed, a general membership check is
          triggered and the next one is scheduled.

        Errors are logged and the generator yields, so a persistent error
        costs one log line per ``process()`` call instead of spinning forever
        inside a single call.
        """
        self.logger.debug('_handle_membership_checks')
        next_check_time = util.add_time(util.current_time(), self._wait_time)

        while True:
            try:
                failures = self._failed_to_contact_node_hashes
                to_be_removed = set()

                # handle failures
                for node_hash in failures:
                    if failures[node_hash]['count'] >= 3:
                        if node_hash in self.node_hashes:
                            self._server.internal_request_stage.handle_unannounced_failure(failure_node_hash=node_hash)
                        to_be_removed.add(node_hash)

                # flush stale contact failures
                for node_hash in failures:
                    if util.current_time() > failures[node_hash]['timeout']:
                        to_be_removed.add(node_hash)

                for node_hash in to_be_removed:
                    failures.pop(node_hash, None)

                # retry contacting failure node hashes
                # NOTE(review): entries past their timeout were just flushed
                # above, so this condition can only hold if the clock moved
                # on in between; 'next_check_time' may have been intended.
                for node_hash in failures:
                    if util.current_time() > failures[node_hash]['timeout']:
                        self._server.internal_request_stage.handle_membership_check(gossip_node_hash=node_hash)

                if util.current_time() > next_check_time:
                    self._server.internal_request_stage.handle_membership_check()
                    next_check_time = util.add_time(util.current_time(), 1)
                yield
            except Exception as e:
                self.logger.error('_handle_membership_checks error: {}, {}'.format(e, sys.exc_info()))
                # Hand control back to the caller before retrying.
                yield

    def process(self):
        """Run one maintenance pass by advancing the membership generator."""
        self.logger.debug('process')
        # Built-in next() works on both Python 2.6+ and Python 3 generators.
        return next(self._processor)

    def report_contact_failure(self, node_hash=None):
        """Record one failed contact attempt for ``node_hash``.

        Increments the entry's ``count`` (starting at 1), pushes its
        ``timeout`` 10 units further (from the previous timeout, or from now
        for a new entry) and sets ``next_check_time`` to now plus 1.
        """
        self.logger.debug('report_contact_failure')
        entry = self._failed_to_contact_node_hashes[node_hash]
        entry['count'] = entry.get('count', 0) + 1

        if 'timeout' in entry:
            base_time = entry['timeout']
        else:
            base_time = util.current_time()
        entry['timeout'] = util.add_time(base_time, 10)

        entry['next_check_time'] = util.add_time(util.current_time(), 1)

    def node_address(self, node_hash=None):
        """ Returns the  address of a node identified by its hash value in the node ring.

        The stored address is split on ``,`` into exactly three fields; the
        first and third are returned as ``(hostname, internal_port)``.
        Returns ``None`` when ``node_hash`` is falsy.
        """
        if node_hash:
            hostname, _, internal_port = self._node_lookup[node_hash].split(',')
            return hostname, internal_port

    def get_responsible_node_hashes(self, *args, **kwargs):
        """ Returns num_replicas number of node_hashes responsible for the key"""
        self.logger.debug('get_responsible_node_hashes')
        return self._consistent_hash_ring.get_responsible_node_hashes(*args, **kwargs)

    def get_gossip_node_hashes(self, num_gossip_nodes):
        """ Returns up to num_gossip_nodes random node_hashes for gossip protocol.

        When the ring holds fewer nodes than requested, all of them are
        returned in random order.
        """
        active_node_hashes = self.node_hashes
        sample_size = min(num_gossip_nodes, len(active_node_hashes))
        return random.sample(active_node_hashes, sample_size)

    def get_unannounced_failure_repair_node_hashes(self, failure_node_hash=None):
        """Return the ring neighbours involved in repairing a failed node.

        Takes the ``num_replicas - 1`` ring positions on each side of
        ``failure_node_hash`` (wrapping around) and removes one occurrence of
        the failed hash itself. Returns an empty list when the neighbours
        cannot be determined, e.g. the hash is not in the ring.
        """
        self.logger.debug('get_unannounced_failure_repair_node_hashes')
        # Pre-initialised so the best-effort error path returns a list
        # instead of raising UnboundLocalError.
        repair_node_hashes = []
        try:
            node_hashes = self.node_hashes
            num_replicas = self._server.num_replicas
            failure_node_hash_index = node_hashes.index(failure_node_hash)
            first = failure_node_hash_index - num_replicas + 1
            last = failure_node_hash_index + num_replicas
            repair_node_hashes = [node_hashes[i % len(node_hashes)] for i in range(first, last)]
            repair_node_hashes.remove(failure_node_hash)
        except Exception as e:
            self.logger.debug('get_unannounced_failure_repair_node_hashes error: {}'.format(e))
        return repair_node_hashes

    def _partition_keys(self):
        """ Returns:
                a dict() where:
                    key: node_hashes
                    value: list of keys for which the given node_hash is responsible

            Each node receives the sorted keys that lie below its hash and
            were not already given to an earlier node; keys left over after
            the last node wrap around to the first node. An empty ring yields
            an empty dict.
        """
        keys = sorted(self._server.persistence_stage.keys())
        node_hashes = self.node_hashes
        partition = dict()
        if not node_hashes:
            return partition

        left_bound = 0
        for node_hash in node_hashes:
            right_bound = bisect.bisect_left(keys, node_hash)
            partition[node_hash] = keys[left_bound:right_bound]
            left_bound = right_bound

        # wrap-around: remaining keys belong to the first node
        partition[node_hashes[0]] += keys[left_bound:]

        return partition

    def key_value_partition(self):
        """ Returns:
                a dict() where:
                    key: node_hashes
                    value: dict of {key: values} for which the given node_hash is responsible
        """
        key_partition = self._partition_keys()
        key_value_partition = collections.defaultdict(dict)
        for node_hash in key_partition:
            for key in key_partition[node_hash]:
                reply = self._server.persistence_stage.get(key)
                key_value_partition[node_hash][key] = {'value': reply['value'], 'timestamp': reply['timestamp']}

        return key_value_partition

    def _partition_for_failure(self, node_hash=None):
        """Map the data of ``node_hash`` and its predecessors to new owners.

        For each offset in ``1 - num_replicas .. 0`` the partition currently
        held by the node at ``index + offset`` is assigned to the node
        ``num_replicas`` positions further along the ring (modulo ring size).
        ``node_hash`` defaults to ``server.node_hash``.

        Returns a dict of new responsible node hash -> ``{key: value-dict}``.
        """
        if not node_hash:
            node_hash = self._server.node_hash
        index_self = self.node_hashes.index(node_hash)
        num_replicas = self._server.num_replicas
        old_partition = self.key_value_partition()
        new_partition = dict()
        for offset in range(1 - num_replicas, 1):
            # index_old may be negative; list indexing then wraps around
            index_old = index_self + offset
            index_new = (index_self + offset + num_replicas) % len(self.node_hashes)

            old_responsible_node_hash = self.node_hashes[index_old]
            new_responsible_node_hash = self.node_hashes[index_new]
            new_partition[new_responsible_node_hash] = old_partition[old_responsible_node_hash]

        return new_partition