Example #1
class ZooWrap():
    def __init__(self):
        self.zk = KazooClient(hosts="%s,%s" % (
            str(cc.conf['zookeeper']['host']),
            str(cc.conf['zookeeper']['port'])))
        self.zk.start()
        self.root = cc.conf['zookeeper']['rootpath']
        self.log = logging.getLogger('L.ZOOKEEPER')
        self.zk.ensure_path('/%s/sleeping' % (self.root))
        self.whoami = cc.conf['whoami']

    def get_sleeping(self):
        return self.zk.get_children('/%s/sleeping' % (self.root))

    def sleep(self):
        try:
            self.zk.create('/%s/sleeping/%s' % (self.root, self.whoami))
            self.log.info('Sleeping correctly')
        except NodeExistsError:
            self.log.error('Node already sleeping... seems weird')

    def wake(self):
        try:
            self.zk.delete('/%s/sleeping/%s' % (self.root, self.whoami))
        except NoNodeError:
            self.log.error('Node was not sleeping... seems weird')
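
# A minimal usage sketch for the ZooWrap class above, assuming the surrounding
# module provides `cc.conf` (zookeeper host/port/rootpath, 'whoami') exactly as
# __init__ expects.
wrap = ZooWrap()
wrap.sleep()                  # registers this node under /<rootpath>/sleeping
print(wrap.get_sleeping())    # lists the currently "sleeping" nodes
wrap.wake()                   # removes the registration again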
Example #2
class ActorAddressBook(object):
    def __init__(self, zk_hosts, timeout=60.0):
        self.retry = KazooRetry(max_tries=10)
        self.zk = KazooClient(hosts=zk_hosts, timeout=timeout)
        self.zk.start()

    def lookup(self, path):
        return self.retry(self._lookup, path)

    def _lookup(self, path):
        actor_url, stat = self.zk.get(path)
        return RemoteActor(actor_url.decode('utf-8'))

    def register(self, path, actor_url):
        return self.retry(self._register, path, actor_url)

    def _register(self, path, actor_url):
        self.zk.ensure_path(path)
        self.zk.set(path, actor_url.encode('utf-8'))

    def delete(self, path):
        self.zk.delete(path, recursive=True)

    def __del__(self):
        self.zk.stop()
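
# A short usage sketch for ActorAddressBook, assuming a local ZooKeeper on
# 127.0.0.1:2181 and that RemoteActor (imported by the original module) accepts
# the stored URL string.
book = ActorAddressBook('127.0.0.1:2181')
book.register('/actors/worker-1', 'tcp://10.0.0.5:9999')
actor = book.lookup('/actors/worker-1')   # RemoteActor built from the znode data
book.delete('/actors/worker-1')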
Example #3
def cleanup(args):
    now = dt.utcnow()
    server = '{server}:{port}'.format(server=args.server, port=args.port)
    logging.info('Connecting to {}'.format(server))
    zk = KazooClient(hosts=server)
    zk.start()

    for path in args.zk_paths:
        zk_path = '{}/{}'.format(args.zk_root_path, path)
        nodes = zk.get_children(zk_path)
        logging.info("Found {} nodes under {}".format(len(nodes), zk_path))

        deleted = 0
        for node in nodes:
            node_path = '{}/{}'.format(zk_path, node)
            data, stat = zk.get(node_path)
            last_modified = dt.fromtimestamp(stat.mtime/1000.0)
            if ((now - last_modified).days > args.age) or (args.inclusive and (now - last_modified).days >= args.age):
                if not args.dry_run:
                    # Kazoo does not support recursive async deletes
                    if stat.children_count == 0:
                        res = zk.delete_async(node_path)
                    else:
                        zk.delete(node_path, recursive=True)
                deleted += 1

        logging.info("Deleted {} nodes".format(deleted))

    zk.stop()
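
# A hedged example of driving cleanup() without the original argument parser:
# a SimpleNamespace carrying the attributes the function reads. The server,
# paths and age below are purely illustrative.
from types import SimpleNamespace

cleanup(SimpleNamespace(
    server='127.0.0.1', port=2181,
    zk_root_path='/myapp', zk_paths=['locks', 'jobs'],
    age=30, inclusive=False, dry_run=True))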
Example #4
class ShellTestCase(unittest.TestCase):
    """ base class for all tests """

    def setUp(self):
        """
        make sure that the prefix dir is empty
        """
        self.tests_path = os.getenv("ZKSHELL_PREFIX_DIR", "/tests")
        self.zk_host = os.getenv("ZKSHELL_ZK_HOST", "localhost:2181")
        self.username = os.getenv("ZKSHELL_USER", "user")
        self.password = os.getenv("ZKSHELL_PASSWD", "user")
        self.digested_password = os.getenv("ZKSHELL_DIGESTED_PASSWD", "F46PeTVYeItL6aAyygIVQ9OaaeY=")
        self.super_password = os.getenv("ZKSHELL_SUPER_PASSWD", "secret")
        self.scheme = os.getenv("ZKSHELL_AUTH_SCHEME", "digest")

        self.client = KazooClient(self.zk_host, 5)
        self.client.start()
        self.client.add_auth(self.scheme, self.auth_id)
        if self.client.exists(self.tests_path):
            self.client.delete(self.tests_path, recursive=True)
        self.client.create(self.tests_path, str.encode(""))

        self.output = StringIO()
        self.shell = Shell([self.zk_host], 5, self.output, setup_readline=False, async=False)

        # Create an empty test dir (needed for some tests)
        self.temp_dir = tempfile.mkdtemp()

    @property
    def auth_id(self):
        return "%s:%s" % (self.username, self.password)

    @property
    def auth_digest(self):
        return "%s:%s" % (self.username, self.digested_password)

    def tearDown(self):
        self.output = None
        self.shell = None

        if os.path.isdir(self.temp_dir):
            shutil.rmtree(self.temp_dir)

        if self.client.exists(self.tests_path):
            self.client.delete(self.tests_path, recursive=True)

        self.client.stop()

    ###
    # Helpers.
    ##

    def create_compressed(self, path, value):
        """
        ZK Shell doesn't support creating directly from a bytes array so we use a Kazoo client
        to create a znode with zlib compressed content.
        """
        compressed = zlib.compress(bytes(value, "utf-8") if PYTHON3 else value)
        self.client.create(path, compressed, makepath=True)
Example #5
class ZooKeeperTestMixin(object):

    zk_hosts = None
    _zk_hosts_internal = None
    zk_base_path = None
    proxy = None

    def setup_zookeeper(self, base_path_prefix="/int_tests", use_proxy=False):

        zk_hosts = os.environ.get("ZK_HOSTS")
        if not zk_hosts:
            raise unittest.SkipTest("export ZK_HOSTS env to run ZooKeeper integration tests")

        if use_proxy:
            hosts_list = zk_hosts.split(",")
            if len(hosts_list) == 1:
                self.proxy = SocatProxy(zk_hosts)
                self.proxy.start()
                self.zk_hosts = self.proxy.address

            else:
                proxies = [SocatProxy(host) for host in hosts_list]
                self.proxy = MultiProxy(proxies)
                self.proxy.start()
                self.zk_hosts = ",".join(proxy.address for proxy in proxies)

            self._zk_hosts_internal = zk_hosts

        else:
            self.zk_hosts = self._zk_hosts_internal = zk_hosts

        self.zk_base_path = base_path_prefix + uuid.uuid4().hex

        if os.environ.get("EPU_USE_GEVENT"):
            from kazoo.handlers.gevent import SequentialGeventHandler

            handler = SequentialGeventHandler()
            self.use_gevent = True
        else:
            handler = None
            self.use_gevent = False

        self.kazoo = KazooClient(self._zk_hosts_internal + self.zk_base_path, handler=handler)
        self.kazoo.start()

    def teardown_zookeeper(self):
        if self.kazoo:
            try:
                self.kazoo.delete("/", recursive=True)
                self.kazoo.stop()
            except Exception:
                log.exception("Problem tearing down ZooKeeper")
        if self.proxy:
            self.proxy.stop()

    cleanup_zookeeper = teardown_zookeeper
Example #6
def _clean_up_zookeeper_autoscaling(context):
    """If max_instances was set for autoscaling, clean up zookeeper"""
    client = KazooClient(hosts='%s/mesos-testcluster' % get_service_connection_string('zookeeper'), read_only=True)
    client.start()
    try:
        client.delete('/autoscaling', recursive=True)
    except NoNodeError:
        pass
    client.stop()
    client.close()
Example #7
def do_zookeeper_delete(addr, path):
	print(path)
	zk = KazooClient(addr)
	zk.start()
	zk.delete(path)

	try:
		do_zookeeper_read(addr, path)
	except kazoo.exceptions.NoNodeError:
		print('deleted')
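
# do_zookeeper_read is referenced above but not shown; a minimal sketch of what
# it might look like (it only needs to raise kazoo.exceptions.NoNodeError for a
# missing node so that the caller can report 'deleted'):
def do_zookeeper_read(addr, path):
    zk = KazooClient(addr)
    zk.start()
    data, stat = zk.get(path)   # raises NoNodeError if the znode is gone
    print(data)
    zk.stop()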
Example #8
class CockRoach(object):
    def __init__(self, zkHost, stale_max_days=30, assume_yes=False, preview=False):
        self.ConsumerGroups = []
        self.zk_client = KazooClient(hosts=zkHost)
        self.zk_client.start()
        self.stale_max_days = stale_max_days
        self.assume_yes = assume_yes
        self.preview = preview
        if self.zk_client.exists("/consumers"):
            for cg_name in self.zk_client.get_children("/consumers"):
                self.ConsumerGroups.append(ConsumerGroup(cg_name, \
                                                         self.zk_client))

    def get_stale_cgroups(self, display):
        """
           get_stale_cgroups returns ConsumerGroups
           that were not used for stale_max_days
        """
        ret = []
        for consumergroup in self.ConsumerGroups:
            delta = datetime.now() - consumergroup.last_seen().mtime
            if delta.days > self.stale_max_days:
                if display:
                    print "Stale: %s" % (consumergroup)
                ret.append(consumergroup)
        return ret

    def delete_stale_cgroups(self):
        """ Delete consumer groups that are considered stale"""
        stale_cgroups = self.get_stale_cgroups(display=False)
        for stale_cg in stale_cgroups:
            print(stale_cg)
            if self.assume_yes is False:
                confirm = input("Delete?")
            else:
                confirm = "Y"

            if confirm == "Y":
                self.delete_cgroup(stale_cg)

    def delete_cgroup(self, consumergroup):
        """Deletes a consumer Group"""
        print "Deleting %s" % (consumergroup.gid)
        if self.preview is False:
            self.zk_client.delete("/consumers/%s" % consumergroup.gid, version=-1, recursive=True)
            print("executed")
        else:
            print("pass")


    def __str__(self):
        ret = ""
        for consumer in self.ConsumerGroups:
            ret += "%s" % (consumer)
        return ret
Example #9
def delete_zk_data():
    print("Deleting zk data...")

    zk = KazooClient(hosts=config["zk"])
    zk.start()

    zk.delete("/kafka/brokers/topics", recursive=True)
    zk.delete("/kafka/consumers", recursive=True)

    zk.stop()

    print "Deleted zk data."
Example #10
def zk():
    pytest.importorskip('kazoo')
    if zookeeper_host is None:
        pytest.skip('No ZOOKEEPER_HOST envar defined')

    from kazoo.client import KazooClient
    zk = KazooClient(hosts=zookeeper_host)
    try:
        zk.start()
        yield zk
    finally:
        zk.delete('/satyr', recursive=True)
        zk.stop()
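
# A hedged sketch of a test consuming the generator above, assuming it is
# registered as a pytest fixture named `zk`; '/satyr' is the subtree the
# fixture removes afterwards.
def test_roundtrip(zk):
    zk.create('/satyr/example', b'payload', makepath=True)
    data, _ = zk.get('/satyr/example')
    assert data == b'payload'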
Example #11
def zookeeper_command(hosts, command, path):
    try:
        zk = KazooClient(hosts=hosts)
        zk.start()
        try:
            if command == 'get':
                data, stat = zk.get(path)
                return data.decode("utf-8")
            elif command == 'delete':
                zk.delete(path, recursive=True)
                return 'Successfully deleted ' + path
            else:
                return False
        finally:
            zk.stop()
    except Exception:
        return False
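
# Illustrative calls for zookeeper_command above; the host string and paths are
# placeholders, and False is returned on any error.
value = zookeeper_command('127.0.0.1:2181', 'get', '/config/app')
deleted = zookeeper_command('127.0.0.1:2181', 'delete', '/config/app')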
Example #12
    def test_create_makepath_incompatible_acls(self):
        from kazoo.client import KazooClient
        from kazoo.security import make_digest_acl_credential, CREATOR_ALL_ACL
        credential = make_digest_acl_credential("username", "password")
        alt_client = KazooClient(self.cluster[0].address + self.client.chroot,
            max_retries=5, auth_data=[("digest", credential)])
        alt_client.start()
        alt_client.create("/1/2", b"val2", makepath=True, acl=CREATOR_ALL_ACL)

        try:
            self.assertRaises(NoAuthError, self.client.create, "/1/2/3/4/5",
                b"val2", makepath=True)
        finally:
            alt_client.delete('/', recursive=True)
            alt_client.stop()
Example #13
 def zk_command(self, command, path):
     if self.client is None:
         self._init_client()
     zk_url = self.client.zk_url()
     try:
         zk = KazooClient(hosts=zk_url)
         zk.start()
         res = False
         if command == "get":
             data, stat = zk.get(path)
             res = data.decode("utf-8")
         elif command == "delete":
             zk.delete(path, recursive=True)
             res = "Successfully deleted " + path
         zk.stop()
         return res
     except Exception as e:
         self.vlog(e)
         return False
Example #14
class MasterElection(object):
    def __init__(self):
        self.zk = KazooClient(hosts=state.ARGS.zookeeper)
        self.master_notified = False
        self.my_node = None
        self.zk.start()  # Stop never called
        self.zk.ensure_path(ELECTION_PATH)

    def start_election(self, master_callback):
        """
        Start the master election.

        If this node is the master, the callback will be called once.

        :params master_callback: Called if this node is the master
        """
        self.callback = master_callback
        self.my_node = self.zk.create(ELECTION_PATH + '/n_',
                                      ephemeral=True, sequence=True)
        self.zk.get_children(ELECTION_PATH, watch=self._election_update)
        self._election_update()

    def _election_update(self, data=None):
        def worker():
            try:
                self.master_notified = True
                self.callback()
            except Exception as e:
                self.master_notified = False
                log.info("Failed to activate master, redoing election: %r", e)
                self.zk.delete(self.my_node)
                self.my_node = self.zk.create(ELECTION_PATH + '/n_',
                                              ephemeral=True, sequence=True)
                self._election_update()

        if not self.master_notified and \
                sorted(self.zk.get_children(ELECTION_PATH))[0] == \
                os.path.basename(self.my_node):
            t = threading.Thread(target=worker)
            t.daemon = True
            t.start()
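
# A hedged sketch of wiring up MasterElection, assuming state.ARGS.zookeeper
# and ELECTION_PATH are provided by the surrounding module as used above.
def on_master():
    print("this node is now the master")

election = MasterElection()
election.start_election(on_master)   # on_master fires once if this node wins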
Example #15
class ZookeeperClient(object):

    def __init__(self, zk_hosts):
        self.zk = KazooClient(hosts=zk_hosts)
        self.zk.start()

    def put(self, entity, id, data):
        path = '/{}/{}'.format(entity, id)
        binary_value = json.dumps(data).encode('utf-8')
        try:
            self.zk.create(path, binary_value, makepath=True)
            return True
        except NodeExistsError:
            self.zk.set(path, binary_value)
            return False

    def get(self, entity, id):
        path = '/{}/{}'.format(entity, id)
        try:
            binary_data, _ = self.zk.get(path)
        except NoNodeError:
            return None
        return json.loads(binary_data.decode('utf-8'))

    def delete(self, entity, id):
        path = '/{}/{}'.format(entity, id)
        try:
            self.zk.delete(path)
            return True
        except NoNodeError:
            return False

    def list(self, entity):
        path = '/{}'.format(entity)
        try:
            children = self.zk.get_children(path)
            for child in children:
                value = self.get(entity, child)
                yield (child, value)
        except NoNodeError:
            yield []
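
# A short usage sketch for ZookeeperClient above (the host string is an example).
client = ZookeeperClient('127.0.0.1:2181')
client.put('users', '42', {'name': 'alice'})   # True on create, False on overwrite
print(client.get('users', '42'))               # {'name': 'alice'}
for user_id, value in client.list('users'):
    print(user_id, value)
client.delete('users', '42')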
Example #16
def test_init_servers(server_id, host = '127.0.0.1', port = 2181):
    zookeeper = KazooClient('%s:%s' % (host, port,))
    zookeeper.start()

    try:
        node = '/test/server_list'
        if zookeeper.exists(node) is None:
            zookeeper.create(node, json.dumps({'update_time' : time.time()}).encode('utf-8'), makepath = True)
    except kazoo.exceptions.NodeExistsError:
        pass

    try:
        node = '/test/server_list/s%s' % server_id
        if zookeeper.exists(node):
            zookeeper.delete(node)
        zookeeper.create(node, json.dumps({
            'update_time' : time.time(),
            'server_name' : 's%s' % server_id,
            'server_id'   : server_id,
        }).encode('utf-8'), makepath = True)
    except kazoo.exceptions.NodeExistsError:
        pass
Example #17
def main():
    (options, args) = parse_options(sys.argv[1:])

    data = options.znode_data_size * b"D"

    s = KazooClient(options.server)
    s.start()

    if s.exists(options.root_znode):
        children = s.get_children(options.root_znode)
        print("delete old entries: %d" % len(children))
        for child in children:
            s.delete("%s/%s" % (options.root_znode, child))
    else:
        s.create(options.root_znode, b"kazoo root znode")

    evaluation(s, options.root_znode, data, options)

    s.stop()

    print("Performance test complete")
Example #18
def zk_server(tmpdir):
    zk_container_name = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(6))
    # TODO(cmaloney): Add a python context manager for dockerized daemons
    subprocess.check_call(['docker', 'run', '-d', '-p', '2181:2181', '-p',
                           '2888:2888', '-p', '3888:3888', '--name', zk_container_name, 'jplock/zookeeper'])

    conn_retry_policy = KazooRetry(max_tries=-1, delay=0.1, max_delay=0.1)
    cmd_retry_policy = KazooRetry(max_tries=3, delay=0.3, backoff=1, max_delay=1, ignore_expire=False)
    zk = KazooClient(hosts=zk_hosts, connection_retry=conn_retry_policy, command_retry=cmd_retry_policy)
    zk.start()

    children = zk.get_children('/')
    for child in children:
        if child == 'zookeeper':
            continue
        zk.delete('/' + child, recursive=True)

    yield zk

    zk.stop()
    zk.close()
    subprocess.check_call(['docker', 'rm', '-f', zk_container_name])
Example #19
class ZookClient(object):

    def __init__(self, api_client=None):
        logging.basicConfig()
        # Create a client and start it
        self.zk = KazooClient()
        self.zk.start()

    def create_accounts_path(self, name, **kwargs):
        path = "/dso/" + name
        self.zk.ensure_path(path)
        self.zk.set(path, b"id: 7b4235ca-00fb-4dca-ad3e-8b6e3662631a\ngroupname: hr\ndescription: 人力资源")


    def create_accountinfo_path(self, account_id, **kwargs):
        self.zk.ensure_path("/app/someservice")

    def create_path(self, path, **kwargs):
        self.zk.ensure_path(path)

    def get_data(self, path):
        return self.zk.get(path)

    def test_tran(self):
        self.zk.delete("/app22")
        self.zk.create("/app22", b"" + '{"12": "12"}')

        tran = self.zk.transaction()
        tran.delete("/app22")
        tran.create("/app22", b"" + '{"22": "22"}')
        tran.commit()
        print "commit"




    def stop(self):
        # In the end, stop it
        self.zk.stop()
Example #20
class ClusterMonitor(threading.Thread):
    """periodically checks cluster member.
    This class is delegated to change state between ACT clustered and ACT declustered."""
    def __init__(self, zha):
        threading.Thread.__init__(self)
        self.zha = zha
        self.should_run = True
        self.zk = KazooClient(hosts=self.zha.config.get("connection_string","127.0.0.1:2181"), logger=logger)
        self.zk.add_listener(self._zk_listener)
        self.zk.start()
        self.zroot = self.zha.config.get("cluster_znode","/zha-state")
        self.znode = self.zroot + "/" + self.zha.config.get("id") 
        self._zk_register(first=True)
        self.not_alone = None
    def run(self):
        while self.should_run:
            time.sleep(self.zha.config.get("clustercheck_interval",3))
            self.zha.recheck()
            self._zk_register()
            self.check_cluster()
            self.trigger()
        if self.zha.is_clustered:
            self.zha.config.become_declustered()
            self.zha.is_clustered = False
        self.zk.delete(self.znode)
        logger.info("cluster monitor thread stopped.")
    def check_cluster(self):
        try:
            count = 0
            chs = self.zk.get_children(self.zroot)
            for ch in chs:
                data, stats = self.zk.get(self.zroot+"/"+ch)
                if data.decode().strip()=="SBY:HEALTHY" and ch != self.zha.config.get("id"):
                    count += 1
            if count != 0:
                self.not_alone = time.time()
            logger.debug("healthy sbys: %d"%(count,))
        except Exception as e:
            logger.warn("check cluster failed. Try next time.%s"%e)
Example #21
class ZooDict(object):

    def __init__(self, path_root=None):
        self.path_root = path_root or default_path_root

    def __enter__(self):
        self.zk = KazooClient(hosts=zk_hosts)
        self.zk.start(timeout=5)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.zk.stop()

    def start(self):
        return self.__enter__()

    def stop(self):
        return self.__exit__(None, None, None)

    def get(self, key):
        try:
            r = self.zk.get(self.path_root + '/' + key)

            if r and r[0]:
                return json.loads(r[0])
        except NoNodeError as no_node:
            return None

    def set(self, key, data_dict):
        self.zk.ensure_path(self.path_root + '/' + key)
        self.zk.set(self.path_root + '/' + key, json.dumps(data_dict).encode('utf-8'))

    def delete(self, key):
        self.zk.delete(self.path_root + '/' + key)

    def items(self):
        return [ (child, self.get(child)) for child in self.zk.get_children(self.path_root) ]
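
# A hedged usage sketch for ZooDict, assuming the module-level zk_hosts and
# default_path_root globals it relies on are defined (e.g. '127.0.0.1:2181'
# and '/zoodict').
with ZooDict() as zd:
    zd.set('feature_flags', {'beta': True})
    print(zd.get('feature_flags'))
    print(zd.items())
    zd.delete('feature_flags')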
Example #22
class MicroService(object):
	"""docstring for MicroService"""
	def __init__(self, hosts, app):
		from kazoo.client import KazooClient

		self.zk = KazooClient(hosts=hosts)
		self.app = "/" + app

	def zkStart(self):
		self.zk.start()

	def zkStop(self):
		self.zk.stop()
		
	def registerService(self, service, url):
		if self.zk.exists(self.app + "/" + service):
			self.zk.delete(self.app + "/" + service, recursive=True)
			self.zk.delete(self.app, recursive=True)
		self.zk.ensure_path(self.app + "/" + service)
		self.zk.set(self.app + "/" + service, bytes(url, encoding="utf-8"))

		self.__watchServiceChanged(service)

		return 'Service registed'

	def unregisterService(self, service):
		self.zk.delete(self.app + "/" + service, recursive=True)
		return "Service deleted"

	def findService(self, service):
		data, stat = self.zk.get(self.app + "/" + service)
		return data.decode("utf-8")

	def __watchServiceChanged(self, service):

		@self.zk.DataWatch(self.app + "/" + service)
		def changed(data, stat):
			print("Version: %s, data: %s" % (stat.version, data.decode("utf-8")))
Example #23
                        next_temp=str(key)+" "+str(alphadict1[key])
                        clientSocket.send(next_temp)
                        clientSocket.close()

                else :
                    pass

            else :
                connectionSocket.close()

        else :
            connectionSocket.close()

        zk.delete("/alphanode")

    else :
        print("I might become the master!")
        time.sleep(2)
        if zk.exists("/masternode1") :
            print("Someone else became the master so I can't! Going back to my regular duties")
            os.system("gnome-terminal -e 'python Server2.py'")
            quit()
        else :
            print('I have become the master! I am ready to receive! I will carry out my regular duties and master duties!\n')

            if zk.exists("/zlist") :
                zk.set("/zlist","12000 /masternode 12000 /masternode1 65000 /numnode 45678 /alphanode1 45876 /specnode")
                check_lis=zk.get("/zlist")
Example #24
class ShellTestCase(unittest.TestCase):
    """ base class for all tests """
    @classmethod
    def setUpClass(cls):
        get_global_cluster().start()

    def setUp(self):
        """
        make sure that the prefix dir is empty
        """
        self.tests_path = os.getenv("ZKSHELL_PREFIX_DIR", "/tests")
        self.zk_hosts = ",".join(server.address
                                 for server in get_global_cluster())
        self.username = os.getenv("ZKSHELL_USER", "user")
        self.password = os.getenv("ZKSHELL_PASSWD", "user")
        self.digested_password = os.getenv("ZKSHELL_DIGESTED_PASSWD",
                                           "F46PeTVYeItL6aAyygIVQ9OaaeY=")
        self.super_password = os.getenv("ZKSHELL_SUPER_PASSWD", "secret")
        self.scheme = os.getenv("ZKSHELL_AUTH_SCHEME", "digest")

        self.client = KazooClient(self.zk_hosts, 5)
        self.client.start()
        self.client.add_auth(self.scheme, self.auth_id)
        if self.client.exists(self.tests_path):
            self.client.delete(self.tests_path, recursive=True)
        self.client.create(self.tests_path, str.encode(""))

        self.output = XStringIO()
        self.shell = Shell([self.zk_hosts],
                           5,
                           self.output,
                           setup_readline=False,
                           async=False)

        # Create an empty test dir (needed for some tests)
        self.temp_dir = tempfile.mkdtemp()

    @property
    def auth_id(self):
        return "%s:%s" % (self.username, self.password)

    @property
    def auth_digest(self):
        return "%s:%s" % (self.username, self.digested_password)

    def tearDown(self):
        if self.output is not None:
            self.output.close()
            self.output = None

        if self.shell is not None:
            self.shell._disconnect()
            self.shell = None

        if os.path.isdir(self.temp_dir):
            shutil.rmtree(self.temp_dir)

        if self.client is not None:
            if self.client.exists(self.tests_path):
                self.client.delete(self.tests_path, recursive=True)

            self.client.stop()
            self.client.close()
            self.client = None

    ###
    # Helpers.
    ##

    def create_compressed(self, path, value):
        """
        ZK Shell doesn't support creating directly from a bytes array so we use a Kazoo client
        to create a znode with zlib compressed content.
        """
        compressed = zlib.compress(bytes(value, "utf-8") if PYTHON3 else value)
        self.client.create(path, compressed, makepath=True)
Example #25
# @Time     : 2018/11/19 9:05
# @describe :

from kazoo.client import KazooClient
from kazoo.client import KazooState
import logging
import time

logging.basicConfig()  # configure logging; avoids the "no handlers could be found" error when the connection fails and no logger was passed in
zk = KazooClient(hosts='39.107.88.145:2181', timeout=1)  # connect to ZooKeeper with a 1 second timeout


@zk.add_listener  # my_listener = zk.add_listener(my_listener)
def my_listener(state):
    """Watch the client connection state.
    This function is never called explicitly; it is invoked through the
    decorator registration above."""
    if state == KazooState.LOST:
        print('lost')
    elif state == KazooState.SUSPENDED:
        print('suspended')
    else:
        print('connected')


zk.start()

# zk.set("/lws/test/1", b"some data test")  # 更新给定节点数据,可以提供版本号,更新前需匹配
zk.delete("/lws/test/1", recursive=True)  # 删除zNode节点,可选择递归删除,也可提供版本号删除
time.sleep(2)
zk.create("/lws/test/1", b"value one")
Example #26
#!/usr/bin/env python
# coding=utf-8

# Jack Kang
from kazoo.client import KazooClient

zk = KazooClient("123.206.89.123:2181")
zk.start()
if zk.exists("/task/taobao") != None:
    zk.delete('task/taobao', recursive=True)
if len(zk.get_children("/signal/taobao")) != 0:
    zk.delete("/signal/taobao", recursive=True)
    zk.create("/signal/taobao")
print "begin"

zk.create("/signal/taobao/start")
zk.stop()
Example #27
class ZookeeperHandler:
    def __init__(self, zk_hosts, prefix, host=None, port=None):
        self.zk = KazooClient(hosts=zk_hosts)
        self.prefix = prefix
        self.is_service = host
        self.children = None
        self.root_path = self.prefix + "/{0}:{1}".format(host, port)

        @self.zk.add_listener
        def zk_listener(state):
            # this listener must also be registered before start()
            if state == KazooState.LOST:
                print("lost", state)
            elif state == KazooState.CONNECTED:
                # check whether this is a read-only client
                if self.zk.client_state == KeeperState.CONNECTED_RO:
                    print("Read only mode!")
                else:
                    print("Read/Write mode!")
            elif state == KazooState.SUSPENDED:
                print("SUSPENDED", state)
            else:
                print("other", state)

        self.zk.start()

    def register(self):
        """
        Service registration.
        :return:
        """
        if not self.is_service:
            return
        if not self.zk.exists(self.prefix):
            # create the prefix path if it does not exist yet; several levels can
            # be created at once (but no data can be stored on them)
            print("creating node", self.prefix)
            self.zk.ensure_path(self.prefix)
        if self.zk.exists(self.root_path):
            self.zk.delete(self.root_path, recursive=True)
        self.zk.create(self.root_path,
                       b'{"weight":1}',
                       ephemeral=True,
                       sequence=False)

        # watch for changes to the child nodes
        @self.zk.ChildrenWatch(self.prefix)
        def watch_children(children):
            self.children = children
            print("子节点变化了(需要更新服务列表了): %s" % children)

        # watch the node data in real time (useful for dynamic configuration)
        @self.zk.DataWatch(self.root_path)
        def watch_node(data, stat):
            print("节点下的数据变化了(这边可以写些处理逻辑):", data, stat)
            if stat:
                print("Version: %s, data: %s" %
                      (stat.version, data.decode("utf-8")))

    def discover(self):
        """
        Service discovery.
        :return:
        """
        # fetch the child nodes
        ret = self.zk.get_children(self.prefix)
        return ret
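
# A hedged usage sketch for ZookeeperHandler above, assuming a local ZooKeeper
# and an illustrative service prefix.
service = ZookeeperHandler('127.0.0.1:2181', '/services/web',
                           host='10.0.0.5', port=8080)
service.register()           # creates the ephemeral /services/web/10.0.0.5:8080 node
print(service.discover())    # currently registered instances under the prefix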
Example #28
class KazooCommandProxy():
    def __init__(self, module):
        self.module = module
        self.zk = KazooClient(module.params['hosts'])

    def absent(self):
        return self._absent(self.module.params['name'])

    def exists(self, znode):
        return self.zk.exists(znode)

    def list(self):
        children = self.zk.get_children(self.module.params['name'])
        return True, {'count': len(children), 'items': children, 'msg': 'Retrieved znodes in path.',
                      'znode': self.module.params['name']}

    def present(self):
        return self._present(self.module.params['name'], self.module.params['value'])

    def get(self):
        return self._get(self.module.params['name'])

    def shutdown(self):
        self.zk.stop()
        self.zk.close()

    def start(self):
        self.zk.start()

    def wait(self):
        return self._wait(self.module.params['name'], self.module.params['timeout'])

    def _absent(self, znode):
        if self.exists(znode):
            self.zk.delete(znode, recursive=self.module.params['recursive'])
            return True, {'changed': True, 'msg': 'The znode was deleted.'}
        else:
            return True, {'changed': False, 'msg': 'The znode does not exist.'}

    def _get(self, path):
        if self.exists(path):
            value, zstat = self.zk.get(path)
            stat_dict = {}
            for i in dir(zstat):
                if not i.startswith('_'):
                    attr = getattr(zstat, i)
                    if isinstance(attr, (int, str)):
                        stat_dict[i] = attr
            result = True, {'msg': 'The node was retrieved.', 'znode': path, 'value': value,
                            'stat': stat_dict}
        else:
            result = False, {'msg': 'The requested node does not exist.'}

        return result

    def _present(self, path, value):
        if self.exists(path):
            (current_value, zstat) = self.zk.get(path)
            if to_bytes(value) != current_value:
                self.zk.set(path, to_bytes(value))
                return True, {'changed': True, 'msg': 'Updated the znode value.', 'znode': path,
                              'value': value}
            else:
                return True, {'changed': False, 'msg': 'No changes were necessary.', 'znode': path, 'value': value}
        else:
            self.zk.create(path, to_bytes(value), makepath=True)
            return True, {'changed': True, 'msg': 'Created a new znode.', 'znode': path, 'value': value}

    def _wait(self, path, timeout, interval=5):
        lim = time.time() + timeout

        while time.time() < lim:
            if self.exists(path):
                return True, {'msg': 'The node appeared before the configured timeout.',
                              'znode': path, 'timeout': timeout}
            else:
                time.sleep(interval)

        return False, {'msg': 'The node did not appear before the operation timed out.', 'timeout': timeout,
                       'znode': path}
Example #29
class ZooKeeper(AbstractDCS):

    def __init__(self, config):
        super(ZooKeeper, self).__init__(config)

        hosts = config.get('hosts', [])
        if isinstance(hosts, list):
            hosts = ','.join(hosts)

        self._client = KazooClient(hosts, handler=PatroniSequentialThreadingHandler(config['retry_timeout']),
                                   timeout=config['ttl'], connection_retry=KazooRetry(max_delay=1, max_tries=-1,
                                   sleep_func=time.sleep), command_retry=KazooRetry(deadline=config['retry_timeout'],
                                   max_delay=1, max_tries=-1, sleep_func=time.sleep))
        self._client.add_listener(self.session_listener)

        self._fetch_cluster = True
        self._fetch_optime = True

        self._orig_kazoo_connect = self._client._connection._connect
        self._client._connection._connect = self._kazoo_connect

        self._client.start()

    def _kazoo_connect(self, *args):
        """Kazoo is using Ping's to determine health of connection to zookeeper. If there is no
        response on Ping after Ping interval (1/2 from read_timeout) it will consider current
        connection dead and try to connect to another node. Without this "magic" it was taking
        up to 2/3 from session timeout (ttl) to figure out that connection was dead and we had
        only small time for reconnect and retry.

        This method is needed to return different value of read_timeout, which is not calculated
        from negotiated session timeout but from value of `loop_wait`. And it is 2 sec smaller
        than loop_wait, because we can spend up to 2 seconds when calling `touch_member()` and
        `write_leader_optime()` methods, which also may hang..."""

        ret = self._orig_kazoo_connect(*args)
        return max(self.loop_wait - 2, 2)*1000, ret[1]

    def session_listener(self, state):
        if state in [KazooState.SUSPENDED, KazooState.LOST]:
            self.cluster_watcher(None)

    def optime_watcher(self, event):
        self._fetch_optime = True
        self.event.set()

    def cluster_watcher(self, event):
        self._fetch_cluster = True
        self.optime_watcher(event)

    def reload_config(self, config):
        self.set_retry_timeout(config['retry_timeout'])

        loop_wait = config['loop_wait']

        loop_wait_changed = self._loop_wait != loop_wait
        self._loop_wait = loop_wait
        self._client.handler.set_connect_timeout(loop_wait)

        # We need to reestablish connection to zookeeper if we want to change
        # read_timeout (and Ping interval respectively), because read_timeout
        # is calculated in `_kazoo_connect` method. If we are changing ttl at
        # the same time, set_ttl method will reestablish connection and return
        # `!True`, otherwise we will close existing connection and let kazoo
        # open the new one.
        if not self.set_ttl(int(config['ttl'] * 1000)) and loop_wait_changed:
            self._client._connection._socket.close()

    def set_ttl(self, ttl):
        """It is not possible to change ttl (session_timeout) in zookeeper without
        destroying old session and creating the new one. This method returns `!True`
        if session_timeout has been changed (`restart()` has been called)."""
        if self._client._session_timeout != ttl:
            self._client._session_timeout = ttl
            self._client.restart()
            return True

    @property
    def ttl(self):
        return self._client._session_timeout

    def set_retry_timeout(self, retry_timeout):
        retry = self._client.retry if isinstance(self._client.retry, KazooRetry) else self._client._retry
        retry.deadline = retry_timeout

    def get_node(self, key, watch=None):
        try:
            ret = self._client.get(key, watch)
            return (ret[0].decode('utf-8'), ret[1])
        except NoNodeError:
            return None

    def get_leader_optime(self, leader):
        watch = self.optime_watcher if not leader or leader.name != self._name else None
        optime = self.get_node(self.leader_optime_path, watch)
        self._fetch_optime = False
        return optime and int(optime[0]) or 0

    @staticmethod
    def member(name, value, znode):
        return Member.from_node(znode.version, name, znode.ephemeralOwner, value)

    def get_children(self, key, watch=None):
        try:
            return self._client.get_children(key, watch)
        except NoNodeError:
            return []

    def load_members(self, sync_standby):
        members = []
        for member in self.get_children(self.members_path, self.cluster_watcher):
            watch = member == sync_standby and self.cluster_watcher or None
            data = self.get_node(self.members_path + member, watch)
            if data is not None:
                members.append(self.member(member, *data))
        return members

    def _inner_load_cluster(self):
        self._fetch_cluster = False
        self.event.clear()
        nodes = set(self.get_children(self.client_path(''), self.cluster_watcher))
        if not nodes:
            self._fetch_cluster = True

        # get initialize flag
        initialize = (self.get_node(self.initialize_path) or [None])[0] if self._INITIALIZE in nodes else None

        # get global dynamic configuration
        config = self.get_node(self.config_path, watch=self.cluster_watcher) if self._CONFIG in nodes else None
        config = config and ClusterConfig.from_node(config[1].version, config[0], config[1].mzxid)

        # get timeline history
        history = self.get_node(self.history_path, watch=self.cluster_watcher) if self._HISTORY in nodes else None
        history = history and TimelineHistory.from_node(history[1].mzxid, history[0])

        # get synchronization state
        sync = self.get_node(self.sync_path, watch=self.cluster_watcher) if self._SYNC in nodes else None
        sync = SyncState.from_node(sync and sync[1].version, sync and sync[0])

        # get list of members
        sync_standby = sync.leader == self._name and sync.sync_standby or None
        members = self.load_members(sync_standby) if self._MEMBERS[:-1] in nodes else []

        # get leader
        leader = self.get_node(self.leader_path) if self._LEADER in nodes else None
        if leader:
            client_id = self._client.client_id
            if not self._ctl and leader[0] == self._name and client_id is not None \
                    and client_id[0] != leader[1].ephemeralOwner:
                logger.info('I am leader but not owner of the session. Removing leader node')
                self._client.delete(self.leader_path)
                leader = None

            if leader:
                member = Member(-1, leader[0], None, {})
                member = ([m for m in members if m.name == leader[0]] or [member])[0]
                leader = Leader(leader[1].version, leader[1].ephemeralOwner, member)
                self._fetch_cluster = member.index == -1

        # get last leader operation
        last_leader_operation = self._OPTIME in nodes and self.get_leader_optime(leader)

        # failover key
        failover = self.get_node(self.failover_path, watch=self.cluster_watcher) if self._FAILOVER in nodes else None
        failover = failover and Failover.from_node(failover[1].version, failover[0])

        return Cluster(initialize, config, leader, last_leader_operation, members, failover, sync, history)

    def _load_cluster(self):
        cluster = self.cluster
        if self._fetch_cluster or cluster is None:
            try:
                cluster = self._client.retry(self._inner_load_cluster)
            except Exception:
                logger.exception('get_cluster')
                self.cluster_watcher(None)
                raise ZooKeeperError('ZooKeeper is not responding properly')
        # Optime ZNode was updated or doesn't exist and we are not leader
        elif (self._fetch_optime and not self._fetch_cluster or not cluster.last_leader_operation) and\
                not (cluster.leader and cluster.leader.name == self._name):
            try:
                optime = self.get_leader_optime(cluster.leader)
                cluster = Cluster(cluster.initialize, cluster.config, cluster.leader, optime,
                                  cluster.members, cluster.failover, cluster.sync, cluster.history)
            except Exception:
                pass
        return cluster

    def _bypass_caches(self):
        self._fetch_cluster = True

    def _create(self, path, value, retry=False, ephemeral=False):
        try:
            if retry:
                self._client.retry(self._client.create, path, value, makepath=True, ephemeral=ephemeral)
            else:
                self._client.create_async(path, value, makepath=True, ephemeral=ephemeral).get(timeout=1)
            return True
        except Exception:
            logger.exception('Failed to create %s', path)
        return False

    def attempt_to_acquire_leader(self, permanent=False):
        ret = self._create(self.leader_path, self._name.encode('utf-8'), retry=True, ephemeral=not permanent)
        if not ret:
            logger.info('Could not take out TTL lock')
        return ret

    def _set_or_create(self, key, value, index=None, retry=False, do_not_create_empty=False):
        value = value.encode('utf-8')
        try:
            if retry:
                self._client.retry(self._client.set, key, value, version=index or -1)
            else:
                self._client.set_async(key, value, version=index or -1).get(timeout=1)
            return True
        except NoNodeError:
            if do_not_create_empty and not value:
                return True
            elif index is None:
                return self._create(key, value, retry)
            else:
                return False
        except Exception:
            logger.exception('Failed to update %s', key)
        return False

    def set_failover_value(self, value, index=None):
        return self._set_or_create(self.failover_path, value, index)

    def set_config_value(self, value, index=None):
        return self._set_or_create(self.config_path, value, index, retry=True)

    def initialize(self, create_new=True, sysid=""):
        sysid = sysid.encode('utf-8')
        return self._create(self.initialize_path, sysid, retry=True) if create_new \
            else self._client.retry(self._client.set, self.initialize_path, sysid)

    def touch_member(self, data, permanent=False):
        cluster = self.cluster
        member = cluster and cluster.get_member(self._name, fallback_to_leader=False)
        encoded_data = json.dumps(data, separators=(',', ':')).encode('utf-8')
        if member and (self._client.client_id is not None and member.session != self._client.client_id[0] or
                       not (deep_compare(member.data.get('tags', {}), data.get('tags', {})) and
                            member.data.get('version') == data.get('version') and
                            member.data.get('checkpoint_after_promote') == data.get('checkpoint_after_promote'))):
            try:
                self._client.delete_async(self.member_path).get(timeout=1)
            except NoNodeError:
                pass
            except Exception:
                return False
            member = None

        if member:
            if deep_compare(data, member.data):
                return True
        else:
            try:
                self._client.create_async(self.member_path, encoded_data, makepath=True,
                                          ephemeral=not permanent).get(timeout=1)
                return True
            except Exception as e:
                if not isinstance(e, NodeExistsError):
                    logger.exception('touch_member')
                    return False
        try:
            self._client.set_async(self.member_path, encoded_data).get(timeout=1)
            return True
        except Exception:
            logger.exception('touch_member')

        return False

    def take_leader(self):
        return self.attempt_to_acquire_leader()

    def _write_leader_optime(self, last_operation):
        return self._set_or_create(self.leader_optime_path, last_operation)

    def _update_leader(self):
        return True

    def _delete_leader(self):
        self._client.restart()
        return True

    def _cancel_initialization(self):
        node = self.get_node(self.initialize_path)
        if node:
            self._client.delete(self.initialize_path, version=node[1].version)

    def cancel_initialization(self):
        try:
            self._client.retry(self._cancel_initialization)
        except Exception:
            logger.exception("Unable to delete initialize key")

    def delete_cluster(self):
        try:
            return self._client.retry(self._client.delete, self.client_path(''), recursive=True)
        except NoNodeError:
            return True

    def set_history_value(self, value):
        return self._set_or_create(self.history_path, value)

    def set_sync_state_value(self, value, index=None):
        return self._set_or_create(self.sync_path, value, index, retry=True, do_not_create_empty=True)

    def delete_sync_state(self, index=None):
        return self.set_sync_state_value("{}", index)

    def watch(self, leader_index, timeout):
        if super(ZooKeeper, self).watch(leader_index, timeout) and not self._fetch_optime:
            self._fetch_cluster = True
        return self._fetch_cluster
Example #30
class ZkOperation(object):
    def __init__(self, zk_hosts, zk_root):
        self.zk = KazooClient(zk_hosts)
        self.root = zk_root
        self.tasks = set()
        self.event = threading.Event()

    def start(self):
        if self.zk.exists:
            self.zk.start()
            self.zk.add_auth('digest', 'publish:publish')
        if self.zk.connected:
            self.zk.ensure_path(self.root)

    def is_job_exist(self, job_id):
        if job_id == '':
            raise Exception('job_id is ""')
        node = self.root + '/jobs/' + job_id
        return self.zk.exists(node)

    def check_task_status(self, path):
        if path == '':
            raise Exception('path is ""')
        node = self.root + path
        data, _ = self.zk.get(node)
        return json.loads(data.decode())['Status']

    def _is_exist(self, node):
        if self.zk.connected and self.zk.exists(node):
            return True
        else:
            logger.error('zk not connected or node does not exist')
            return False

    def _create_node(self, node, value=None):
        if value is None:
            value = ''
        value = json.dumps(value)
        if self.zk.connected and not self.zk.exists(node):
            self.zk.create(node, makepath=True, value=value.encode())
            return True
        else:
            logger.error('zk not connected or node already exists')
            return False

    def _update_node(self, node, value):
        if self.zk.connected and self.zk.exists(node):
            tx = self.zk.transaction()
            tx.set_data(node, json.dumps(value).encode())
            tx.commit()
            return True
        else:
            logger.error(
                'update node failed: zk not connected or node does not exist')
            return False

    def _get_node(self, node):
        if self.zk.connected and self.zk.exists(node):
            node_value, _ = self.zk.get(node)
            if node_value:
                return json.loads(node_value.decode())
            else:
                return {}
        else:
            logger.error('zk not connected or node does not exist')
            return False

    def _delete_node(self, node):
        if self.zk.connected:
            if not self.zk.exists(node):
                return True
            else:
                self.zk.delete(node, recursive=True)
                return True
        else:
            logger.error('zk not connected')
            return False

    # is exist
    def is_exist_signal(self, job_id):
        node = '{}/signal/{}'.format(self.root, job_id)
        return self._is_exist(node)

    # CREATE
    def create_new_job(self, job_id, job_value=None):
        if job_value is None:
            job_value = ''
        if job_id != '':
            node = self.root + '/jobs/' + job_id
            ret = self._create_node(node, job_value)
            return ret
        else:
            logger.error('job_id is null')
            return False

    def create_new_target(self, job_id, target, target_value):
        node = '{}/jobs/{}/targets/{}'.format(self.root, job_id, target)
        ret = self._create_node(node, target_value)
        return ret

    def create_new_task(self, job_id, target, task):
        node = '{}/jobs/{}/targets/{}/tasks/{}'.format(
            self.root, job_id, target, task['task_id'])
        ret = self._create_node(node, task)
        return ret

    def create_job_signal(self, job_id):
        node = '{}/signal/{}'.format(self.root, job_id)
        ret = self._create_node(node, uuid.uuid4().hex)
        return ret

    # GET
    def get_job_info(self, job_id):
        job_node = '{}/jobs/{}'.format(self.root, job_id)
        job_value, _ = self.zk.get(job_node)
        job_info = json.loads(job_value.decode())
        return job_info

    def get_target_info(self, job_id, target):
        target_node = '{}/jobs/{}/targets/{}'.format(self.root, job_id, target)
        target_value, _ = self.zk.get(target_node)
        target_info = json.loads(target_value.decode())
        return target_info

    def get_task_info(self, job_id, target, task_id):
        task_node = '{}/jobs/{}/targets/{}/tasks/{}'.format(
            self.root, job_id, target, task_id)
        task_value, _ = self.zk.get(task_node)
        task_info = json.loads(task_value.decode())
        return task_info

    # UPDATE
    def update_job_status(self, job_id, task):
        if job_id != '' and task is not None:
            node = self.root + '/signal/' + job_id
        else:
            raise Exception('job_id is ""')
        if self.zk.connected and self.is_job_exist(job_id):
            tx = self.zk.transaction()
            tx.set_data(node, task.encode())
            tx.commit()

    def handler_task(self, job_id, task_id, task_name, task_message, status):
        # iterate the task nodes so that the caller does not have to pass the target back
        if not self.is_job_exist(job_id):
            logger.error("can not find this jobid: {}".format(job_id))
            return False
        job_node = "{}/jobs/{}/targets".format(self.root, job_id)
        for target in self.zk.get_children(job_node):
            target_node = "{}/{}/tasks".format(job_node, target)
            for task in self.zk.get_children(target_node):
                if task == task_id:
                    task_node = "{}/{}".format(target_node, task)
                    task_value, _ = self.zk.get(task_node)
                    new_task_value = json.loads(task_value.decode())
                    new_task_value['status'] = status
                    tx = self.zk.transaction()
                    tx.set_data(task_node, json.dumps(new_task_value).encode())
                    tx.commit()
                    task_value, _ = self.zk.get(task_node)
                    return True
        logger.error("can not find this taskid: {} in {}".format(
            task_id, job_id))
        return False

    def get_target_by_taskid(self, job_id, task_id):
        if self.is_job_exist(job_id):
            node = "{}/jobs/{}/targets".format(self.root, job_id)
            for target in self.zk.get_children(node):
                path = '{}/{}/tasks'.format(node, target)
                for taskid in self.zk.get_children(path):
                    if taskid == task_id:
                        return target
            return False
        else:
            logger.error("job is not exist: job_id={}".format(job_id))

    def send_signal(self, job_id):
        node = '{}/signal/{}'.format(self.root, job_id)
        logger.info("send singal: {}".format(job_id))
        tx = self.zk.transaction()
        tx.set_data(node, uuid.uuid4().bytes)
        tx.commit()

    # DELETE
    def delete_job(self, job_id):
        node = "{}/jobs/{}".format(self.root, job_id)
        logger.info("delete job: job_id={}".format(job_id))
        self._delete_node(node)

    def delete_signal(self, job_id):
        node = '{}/signal/{}'.format(self.root, job_id)
        logger.info("delete singal: {}".format(job_id))
        self._delete_node(node)

    def delete_target(self, job_id, target):
        target_node = '{}/jobs/{}/targets/{}'.format(self.root, job_id, target)
        logger.info("delete target: job_id={}, target={}".format(
            job_id, target))
        self._delete_node(target_node)

    def delete_task(self, job_id, target, task_id):
        task_node = '{}/jobs/{}/targets/{}/tasks/{}'.format(
            self.root, job_id, target, task_id)
        logger.info("delete task: job_id ={}, target={}, task_id={}".format(
            job_id, target, task_id))
        self._delete_node(task_node)

#################################
# CALLBACK
## exist CALLBACK

    def is_exist_callback(self, callback_node):
        node = "{}/callback/{}".format(self.root, callback_node)
        if self.zk.exists(node):
            return True
        else:
            return False

    ## INIT CALLBACK
    def init_callback_by_jobid(self, job_id):
        node = "{}/callback/{}".format(self.root, job_id)
        job_callback_value = {
            "job_id": job_id,
            "status": JobStatus.init.value,
            "messages": ""
        }
        callback = {
            "callback_level": CallbackLevel.job.value,
            "callback_info": job_callback_value
        }
        ret = self._create_node(node, callback)
        return ret

    def init_callback_by_target(self, job_id, target):
        node = "{}/callback/{}".format(self.root, job_id + "_" + target)
        target_callback_value = {
            "job_id": job_id,
            "target": target,
            "status": TargetStatus.init.value,
            "messages": ""
        }
        callback = {
            "callback_level": CallbackLevel.target.value,
            "callback_info": target_callback_value
        }
        ret = self._create_node(node, callback)
        return ret

    def init_callback_by_taskid(self, job_id, target, task_id, task_name):
        node = "{}/callback/{}".format(self.root, task_id)
        taskid_callback_value = {
            "job_id": job_id,
            "target": target,
            "task_name": task_name,
            "status": JobStatus.init.value,
            "messages": "",
        }
        callback = {
            "callback_level": CallbackLevel.task.value,
            "callback_info": taskid_callback_value
        }
        ret = self._create_node(node, callback)
        return ret

    ## GET CALLBACK
    def get_callback_info(self, callback):
        node = "{}/callback/{}".format(self.root, callback)
        if self.zk.exists(node):
            node_value = self._get_node(node)
            return node_value
        else:
            return False

    ## UPDATE CALLBACK
    def update_callback_by_jobid(self, job_id, status, messages=None):
        node = "{}/callback/{}".format(self.root, job_id)
        if not self.zk.exists(node):
            return False
        node_value = self._get_node(node)
        node_value["callback_info"]["status"] = status
        if messages is not None:
            node_value["callback_info"]["messages"] = messages
        ret = self._update_node(node, node_value)
        return ret

    def update_callback_by_target(self, job_id, target, status, messages=None):
        node = "{}/callback/{}".format(self.root, job_id + "_" + target)
        if not self.zk.exists(node):
            return False
        node_value = self._get_node(node)
        node_value["callback_info"]["status"] = status
        if messages is not None:
            node_value["callback_info"]["messages"] = messages
        ret = self._update_node(node, node_value)
        return ret

    def update_callback_by_taskid(self, job_id, taskid, status, messages=None):
        node = "{}/callback/{}".format(self.root, taskid)
        if not self.zk.exists(node):
            return False
        node_value = self._get_node(node)
        node_value["callback_info"]["status"] = status
        if messages is not None:
            node_value["callback_info"]["messages"] = messages
        ret = self._update_node(node, node_value)
        return ret

    ## DELETE CALLBACK
    def delete_callback_node(self, callback):
        node = "{}/callback/{}".format(self.root, callback)
        ret = self._delete_node(node)
        if ret:
            logger.info(
                "delete callback node success: callback={}".format(node))
        else:
            logger.error("delete callback node fail: callback={}".format(node))
        return ret
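
A minimal driver for the callback helpers above, as a sketch only: `zk_store` stands in for an instance of the surrounding class, and `JobStatus.success` is assumed to exist alongside `JobStatus.init`; neither name comes from the original code.

# Hypothetical walk through the callback lifecycle (names are assumptions).
zk_store.init_callback_by_jobid("job-42")
zk_store.update_callback_by_jobid("job-42", JobStatus.success.value,
                                  messages="all targets finished")
info = zk_store.get_callback_info("job-42")
if info:
    print(info["callback_info"]["status"])
zk_store.delete_callback_node("job-42")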
Beispiel #31
0
import argparse

import kazoo.exceptions
from kazoo.client import KazooClient

# The original snippet is truncated above; the argument definitions are
# reconstructed from how `args` is used below, so the flag names are assumed.
parser = argparse.ArgumentParser(description='Minimal ZooKeeper CLI')
parser.add_argument('item', help='ZooKeeper path to operate on')
parser.add_argument('-l', '--listing', action='store_true', default=False,
                    help='List children of /path')
parser.add_argument('-d', '--delete', action='store_true', default=False,
                    help='Delete items on /path recursively')

args = parser.parse_args()

zookeeper = {'server': '127.0.0.1', 'port': '2181'}

zk = KazooClient(hosts='%(server)s:%(port)s' % zookeeper)
zk.start()

if args.listing:
    try:
        answer = zk.get_children(args.item)
        print(answer)
    except kazoo.exceptions.NoNodeError:
        print('ERROR: No such path!')

elif args.delete:
    zk.delete(args.item, recursive=True)
    print(args.item + ' deleted!')

else:
    try:
        answer = zk.get(args.item)
        data = answer[0].decode('utf-8')
        print(data)
    except kazoo.exceptions.NoNodeError:
        print('none')

zk.stop()
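
One caveat with the script above: unlike the listing branch, the delete branch is not wrapped in a try/except, so a missing path raises kazoo.exceptions.NoNodeError and aborts the script. A small sketch of a more defensive variant, reusing the same client and exception class:

def delete_path(zk, path):
    """Recursively delete `path`, tolerating a missing node (sketch)."""
    try:
        zk.delete(path, recursive=True)
        print(path + ' deleted!')
    except kazoo.exceptions.NoNodeError:
        print('ERROR: No such path!')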
Beispiel #32
0
class ZK:
    """Opens a connection to a kafka zookeeper. "
    "To be used in the 'with' statement."""
    def __init__(self, cluster_config):
        self.cluster_config = cluster_config

    def __enter__(self):
        kazooRetry = KazooRetry(max_tries=5, )
        self.zk = KazooClient(
            hosts=self.cluster_config.zookeeper,
            read_only=True,
            connection_retry=kazooRetry,
        )
        _log.debug(
            "ZK: Creating new zookeeper connection: {zookeeper}".format(
                zookeeper=self.cluster_config.zookeeper), )
        self.zk.start()
        return self

    def __exit__(self, type, value, traceback):
        self.zk.stop()

    def get_children(self, path, watch=None):
        """Returns the children of the specified node."""
        _log.debug("ZK: Getting children of {path}".format(path=path), )
        return self.zk.get_children(path, watch)

    def get(self, path, watch=None):
        """Returns the data of the specified node."""
        _log.debug("ZK: Getting {path}".format(path=path), )
        return self.zk.get(path, watch)

    def set(self, path, value):
        """Sets and returns new data for the specified node."""
        _log.debug("ZK: Setting {path} to {value}".format(path=path,
                                                          value=value))
        return self.zk.set(path, value)

    def get_json(self, path, watch=None):
        """Reads the data of the specified node and converts it to json."""
        data, _ = self.get(path, watch)
        return load_json(data) if data else None

    def get_broker_metadata(self, broker_id):
        try:
            broker_json, _ = self.get(
                "/brokers/ids/{b_id}".format(b_id=broker_id))
        except NoNodeError:
            _log.error("broker '{b_id}' not found.".format(b_id=broker_id), )
            raise
        return load_json(broker_json)

    def get_brokers(self, names_only=False):
        """Get information on all the available brokers.

        :rtype : dict of brokers
        """
        try:
            broker_ids = self.get_children("/brokers/ids")
        except NoNodeError:
            _log.info("cluster is empty.")
            return {}
        # Return broker-ids only
        if names_only:
            return {int(b_id): None for b_id in broker_ids}
        return {
            int(b_id): self.get_broker_metadata(b_id)
            for b_id in broker_ids
        }

    def get_topic_config(self, topic):
        """Get configuration information for specified topic.

        :rtype : dict of configuration
        """
        try:
            config_data = load_json(
                self.get("/config/topics/{topic}".format(topic=topic))[0])
        except NoNodeError as e:

            # Kafka version before 0.8.1 does not have "/config/topics/<topic_name>" path in ZK and
            # if the topic exists, return default dict instead of raising an Exception.
            # Ref: https://cwiki.apache.org/confluence/display/KAFKA/Kafka+data+structures+in+Zookeeper.

            topics = self.get_topics(topic_name=topic,
                                     fetch_partition_state=False)
            if len(topics) > 0:
                _log.info(
                    "Configuration not available for topic {topic}.".format(
                        topic=topic))
                config_data = {"config": {}}
            else:
                _log.error("topic {topic} not found.".format(topic=topic))
                raise e
        return config_data

    def set_topic_config(self, topic, value, kafka_version=(
        0,
        10,
    )):
        """Set configuration information for specified topic.

        :topic : topic whose configuration needs to be changed
        :value : config value with which the topic needs to be
            updated. This would be of the form key=value.
            Example 'cleanup.policy=compact'
        :kafka_version : tuple of the kafka version the brokers are running.
            Defaults to (0, 10, x). Kafka versions 0.9 and 0.10
            support this feature.
        """
        config_data = dump_json(value)

        try:
            # Change value
            return_value = self.set(
                "/config/topics/{topic}".format(topic=topic), config_data)
            # Create change
            version = kafka_version[1]

            # this feature is supported in kafka 9 and kafka 10
            assert version in (
                9, 10), "Feature supported with kafka 9 and kafka 10"

            if version == 9:
                # https://github.com/apache/kafka/blob/0.9.0.1/
                #     core/src/main/scala/kafka/admin/AdminUtils.scala#L334
                change_node = dump_json({
                    "version": 1,
                    "entity_type": "topics",
                    "entity_name": topic
                })
            else:  # kafka 10
                # https://github.com/apache/kafka/blob/0.10.2.1/
                #     core/src/main/scala/kafka/admin/AdminUtils.scala#L574
                change_node = dump_json({
                    "version": 2,
                    "entity_path": "topics/" + topic,
                })

            self.create('/config/changes/config_change_',
                        change_node,
                        sequence=True)
        except NoNodeError as e:
            _log.error("topic {topic} not found.".format(topic=topic))
            raise e
        return return_value

    def get_topics(
        self,
        topic_name=None,
        names_only=False,
        fetch_partition_state=True,
    ):
        """Get information on all the available topics.

        Topic-data format with fetch_partition_state as False :-
        topic_data = {
            'version': 1,
            'partitions': {
                <p_id>: {
                    replicas: <broker-ids>
                }
            }
        }

        Topic-data format with fetch_partition_state as True:-
        topic_data = {
            'version': 1,
            'ctime': <timestamp>,
            'partitions': {
                <p_id>:{
                    replicas: [<broker_id>, <broker_id>, ...],
                    isr: [<broker_id>, <broker_id>, ...],
                    controller_epoch: <val>,
                    leader_epoch: <val>,
                    version: 1,
                    leader: <broker-id>,
                    ctime: <timestamp>,
                }
            }
        }
        Note: By default we also fetch partition-state, which results in
        accessing zookeeper twice. If just partition-replica information is
        required, fetch_partition_state should be set to False.
        """
        try:
            topic_ids = [topic_name] if topic_name else self.get_children(
                "/brokers/topics", )
        except NoNodeError:
            _log.error("Cluster is empty.")
            return {}

        if names_only:
            return topic_ids
        topics_data = {}
        for topic_id in topic_ids:
            try:
                topic_info = self.get(
                    "/brokers/topics/{id}".format(id=topic_id))
                topic_data = load_json(topic_info[0])
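                # ZooKeeper Stat times (ctime/mtime) are milliseconds since
                # the epoch, so divide by 1000.0 to get seconds.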
                topic_ctime = topic_info[1].ctime / 1000.0
                topic_data['ctime'] = topic_ctime
            except NoNodeError:
                _log.info(
                    "topic '{topic}' not found.".format(topic=topic_id), )
                return {}
            # Prepare data for each partition
            partitions_data = {}
            for p_id, replicas in six.iteritems(topic_data['partitions']):
                partitions_data[p_id] = {}
                if fetch_partition_state:
                    # Fetch partition-state from zookeeper
                    partition_state = self._fetch_partition_state(
                        topic_id, p_id)
                    partitions_data[p_id] = load_json(partition_state[0])
                    partitions_data[p_id][
                        'ctime'] = partition_state[1].ctime / 1000.0
                else:
                    # Fetch partition-info from zookeeper
                    partition_info = self._fetch_partition_info(topic_id, p_id)
                    partitions_data[p_id][
                        'ctime'] = partition_info.ctime / 1000.0
                partitions_data[p_id]['replicas'] = replicas
            topic_data['partitions'] = partitions_data
            topics_data[topic_id] = topic_data
        return topics_data

    def get_consumer_groups(self, consumer_group_id=None, names_only=False):
        """Get information on all the available consumer-groups.

        If names_only is True, only a dict of consumer-group ids (mapped to
        None) is returned. If names_only is False, consumer-group offset
        details are returned for all consumer-groups, or for the given
        consumer-group if one is specified, in the following dict format:

        {
            'group-id':
            {
                'topic':
                {
                    'partition': offset-value,
                    ...
                    ...
                }
            }
        }

        :rtype: dict of consumer-group offset details
        """
        if consumer_group_id is None:
            group_ids = self.get_children("/consumers")
        else:
            group_ids = [consumer_group_id]

        # Return consumer-group-ids only
        if names_only:
            return {g_id: None for g_id in group_ids}

        consumer_offsets = {}
        for g_id in group_ids:
            consumer_offsets[g_id] = self.get_group_offsets(g_id)
        return consumer_offsets

    def get_group_offsets(self, group, topic=None):
        """Fetch group offsets for given topic and partition otherwise all topics
        and partitions otherwise.


        {
            'topic':
            {
                'partition': offset-value,
                ...
                ...
            }
        }
        """
        group_offsets = {}
        try:
            all_topics = self.get_my_subscribed_topics(group)
        except NoNodeError:
            # No offset information of given consumer-group
            _log.warning(
                "No topics subscribed to consumer-group {group}.".format(
                    group=group, ), )
            return group_offsets
        if topic:
            if topic in all_topics:
                topics = [topic]
            else:
                _log.error(
                    "Topic {topic} not found in topic list {topics} for consumer"
                    "-group {consumer_group}.".format(
                        topic=topic,
                        topics=', '.join(topic for topic in all_topics),
                        consumer_group=group,
                    ), )
                return group_offsets
        else:
            topics = all_topics
        for topic in topics:
            group_offsets[topic] = {}
            try:
                partitions = self.get_my_subscribed_partitions(group, topic)
            except NoNodeError:
                _log.warning(
                    "No partition offsets found for topic {topic}. "
                    "Continuing to next one...".format(topic=topic), )
                continue
            # Fetch offsets for each partition
            for partition in partitions:
                path = "/consumers/{group_id}/offsets/{topic}/{partition}".format(
                    group_id=group,
                    topic=topic,
                    partition=partition,
                )
                try:
                    # Get current offset
                    offset_json, _ = self.get(path)
                    group_offsets[topic][partition] = load_json(offset_json)
                except NoNodeError:
                    _log.error("Path {path} not found".format(path=path))
                    raise
        return group_offsets

    def _fetch_partition_state(self, topic_id, partition_id):
        """Fetch partition-state for given topic-partition."""
        state_path = "/brokers/topics/{topic_id}/partitions/{p_id}/state"
        try:
            partition_state = self.get(
                state_path.format(topic_id=topic_id, p_id=partition_id), )
            return partition_state
        except NoNodeError:
            return {}  # The partition has no data

    def _fetch_partition_info(self, topic_id, partition_id):
        """Fetch partition info for given topic-partition."""
        info_path = "/brokers/topics/{topic_id}/partitions/{p_id}"
        try:
            _, partition_info = self.get(
                info_path.format(topic_id=topic_id, p_id=partition_id), )
            return partition_info
        except NoNodeError:
            return {}  # The partition has no data

    def get_my_subscribed_topics(self, groupid):
        """Get the list of topics that a consumer is subscribed to

        :param: groupid: The consumer group ID for the consumer
        :returns list of kafka topics
        :rtype: list
        """
        path = "/consumers/{group_id}/offsets".format(group_id=groupid)
        return self.get_children(path)

    def get_my_subscribed_partitions(self, groupid, topic):
        """Get the list of partitions of a topic
        that a consumer is subscribed to

        :param: groupid: The consumer group ID for the consumer
        :param: topic: The topic name
        :returns list of partitions
        :rtype: list
        """
        path = "/consumers/{group_id}/offsets/{topic}".format(
            group_id=groupid,
            topic=topic,
        )
        return self.get_children(path)

    def get_cluster_assignment(self):
        """Fetch the cluster layout in form of assignment from zookeeper"""
        plan = self.get_cluster_plan()
        assignment = {}
        for elem in plan['partitions']:
            assignment[(elem['topic'], elem['partition'])] = elem['replicas']

        return assignment

    def create(self,
               path,
               value='',
               acl=None,
               ephemeral=False,
               sequence=False,
               makepath=False):
        """Creates a Zookeeper node.

        :param: path: The zookeeper node path
        :param: value: Zookeeper node value
        :param: acl: ACL list
        :param: ephemeral: Boolean indicating whether this node is tied to
          this session.
        :param: sequence:  Boolean indicating whether path is suffixed
          with a unique index.
        :param: makepath: Whether the path should be created if it doesn't
          exist.
        """
        _log.debug("ZK: Creating node " + path)
        return self.zk.create(path, value, acl, ephemeral, sequence, makepath)

    def delete(self, path, recursive=False):
        """Deletes a Zookeeper node.

        :param: path: The zookeeper node path
        :param: recursive: Recursively delete node and all its children.
        """
        _log.debug("ZK: Deleting node " + path)
        return self.zk.delete(path, recursive=recursive)

    def delete_topic_partitions(self, groupid, topic, partitions):
        """Delete the specified partitions within the topic that the consumer
        is subscribed to.

        :param: groupid: The consumer group ID for the consumer.
        :param: topic: Kafka topic.
        :param: partitions: List of partitions within the topic to be deleted.
        :raises:
          NoNodeError: if the consumer is not subscribed to the topic

          ZookeeperError: if there is an error with Zookeeper
        """
        for partition in partitions:
            path = "/consumers/{groupid}/offsets/{topic}/{partition}".format(
                groupid=groupid, topic=topic, partition=partition)
            self.delete(path)

    def delete_topic(self, groupid, topic):
        path = "/consumers/{groupid}/offsets/{topic}".format(
            groupid=groupid,
            topic=topic,
        )
        self.delete(path, True)

    def delete_group(self, groupid):
        path = "/consumers/{groupid}".format(groupid=groupid, )
        self.delete(path, True)

    def execute_plan(self, plan, allow_rf_change=False):
        """Submit reassignment plan for execution."""
        reassignment_path = '{admin}/{reassignment_node}'\
            .format(admin=ADMIN_PATH, reassignment_node=REASSIGNMENT_NODE)
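        # ADMIN_PATH/REASSIGNMENT_NODE is expected to resolve to the znode the
        # Kafka controller watches for reassignments (typically
        # /admin/reassign_partitions); both constants are defined elsewhere.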
        plan_json = dump_json(plan)
        base_plan = self.get_cluster_plan()
        if not validate_plan(plan, base_plan, allow_rf_change=allow_rf_change):
            _log.error('Given plan is invalid. ABORTING reassignment...')
            return False
        # Send proposed-plan to zookeeper
        try:
            _log.info('Sending plan to Zookeeper...')
            self.create(reassignment_path, plan_json, makepath=True)
            _log.info(
                'Re-assign partitions node in Zookeeper updated successfully '
                'with {plan}'.format(plan=plan), )
            return True
        except NodeExistsError:
            _log.warning('Previous plan in progress. Exiting..')
            in_progress_plan = load_json(self.get(reassignment_path)[0])
            in_progress_partitions = [
                '{topic}-{p_id}'.format(
                    topic=p_data['topic'],
                    p_id=str(p_data['partition']),
                ) for p_data in in_progress_plan['partitions']
            ]
            _log.warning(
                '{count} partition(s) reassignment currently in progress:-'.
                format(count=len(in_progress_partitions)), )
            _log.warning(
                '{partitions}. ABORTING reassignment...'.format(
                    partitions=', '.join(in_progress_partitions), ), )
            return False
        except Exception as e:
            _log.error(
                'Could not re-assign partitions {plan}. Error: {e}'.format(
                    plan=plan, e=e), )
            return False

    def get_cluster_plan(self):
        """Fetch cluster plan from zookeeper."""

        _log.info('Fetching current cluster-topology from Zookeeper...')
        cluster_layout = self.get_topics(fetch_partition_state=False)
        # Re-format cluster-layout
        partitions = [{
            'topic': topic_id,
            'partition': int(p_id),
            'replicas': partitions_data['replicas']
        } for topic_id, topic_info in six.iteritems(cluster_layout)
                      for p_id, partitions_data in six.iteritems(
                          topic_info['partitions'])]
        return {'version': 1, 'partitions': partitions}

    def get_pending_plan(self):
        """Read the currently running plan on reassign_partitions node."""
        reassignment_path = '{admin}/{reassignment_node}'\
            .format(admin=ADMIN_PATH, reassignment_node=REASSIGNMENT_NODE)
        try:
            result = self.get(reassignment_path)
            return load_json(result[0])
        except NoNodeError:
            return {}
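
A short usage sketch of the ZK context manager above; `cluster_config` is assumed to be any object whose `zookeeper` attribute holds a ZooKeeper connect string (the attribute name comes from the constructor, the concrete object is an assumption):

# Sketch: read-only inspection of a Kafka cluster through the wrapper above.
with ZK(cluster_config) as zk:
    broker_ids = zk.get_brokers(names_only=True)   # {broker_id: None, ...}
    topic_names = zk.get_topics(names_only=True)   # list of topic names
    pending = zk.get_pending_plan()                # {} if nothing in flight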
Beispiel #33
0
class ZkClient:
    """
    Wrapper class over KazooClient. Provides utility methods for standalone
    failure tests to get details about processor group state stored in
    zookeeper.

    Instantiates a kazoo client to connect to the zookeeper server at
    zookeeper_host:zookeeper_port.
    """
    def __init__(self, zookeeper_host, zookeeper_port, app_name, app_id):
        self.kazoo_client = KazooClient(
            hosts='{0}:{1}'.format(zookeeper_host, zookeeper_port))
        self.zk_base_node = 'app-{0}-{1}/{2}-{3}-coordinationData'.format(
            app_name, app_id, app_name, app_id)

    def start(self):
        """
        Establishes connection with the zookeeper server at self.host_name:self.port.
        """
        self.kazoo_client.start()

    def stop(self):
        """
        Closes and releases the connection held with the zookeeper server.
        """
        self.kazoo_client.stop()

    def watch_job_model(self, watch_function):
        self.kazoo_client.ensure_path(
            '{0}/JobModelGeneration/jobModels/'.format(self.zk_base_node))
        self.kazoo_client.get_children(
            '{0}/JobModelGeneration/jobModels/'.format(self.zk_base_node),
            watch=watch_function)

    def get_latest_job_model(self):
        """
        Reads and returns the latest JobModel from zookeeper.
        """
        job_model_dict = {}
        try:
            childZkNodes = self.kazoo_client.get_children(
                '{0}/JobModelGeneration/jobModels/'.format(self.zk_base_node))
            if len(childZkNodes) > 0:
                childZkNodes.sort()
                childZkNodes.reverse()

                job_model_generation_path = '{0}/JobModelGeneration/jobModels/{1}/'.format(
                    self.zk_base_node, childZkNodes[0])
                job_model, _ = self.kazoo_client.get(job_model_generation_path)
                """
                ZkClient java library stores the data in the following format in zookeeper:
                        class_name, data_length, actual_data

                JobModel json manipulation: Delete all the characters before first occurrence of '{' in jobModel json string.

                Normal json deserialization without the above custom string massaging fails. This will be removed after SAMZA-1749.
                """

                first_curly_brace_index = job_model.find('{')
                job_model = job_model[first_curly_brace_index:]
                job_model_dict = json.loads(job_model)
                logger.info(
                    "Recent JobModel in zookeeper: {0}".format(job_model_dict))
        except:
            logger.error(traceback.format_exc(sys.exc_info()))
        return job_model_dict

    def get_leader_processor_id(self):
        """
        Determines the processorId of the current leader in a processors group.

        Returns the processorId of the leader if leader exists.
        Returns None otherwise.
        """
        leader_processor_id = None
        try:
            processors_path = '{0}/processors'.format(self.zk_base_node)
            childZkNodes = self.kazoo_client.get_children(processors_path)
            childZkNodes.sort()
            child_processor_path = '{0}/{1}'.format(processors_path,
                                                    childZkNodes[0])
            processor_data, _ = self.kazoo_client.get(child_processor_path)
            host, leader_processor_id = processor_data.split(" ")
        except:
            logger.error(traceback.format_exc(sys.exc_info()))
        return leader_processor_id

    def purge_all_nodes(self):
        """
        Recursively delete all zookeeper nodes from the base node: self.zk_base_node.
        """
        try:
            self.kazoo_client.delete(path=self.zk_base_node,
                                     version=-1,
                                     recursive=True)
        except:
            logger.error(traceback.format_exc(sys.exc_info()))

    def get_active_processors(self):
        """
        Determines the processor ids that are active in zookeeper.
        """
        processor_ids = []
        try:
            processors_path = '{0}/processors'.format(self.zk_base_node)
            childZkNodes = self.kazoo_client.get_children(processors_path)
            childZkNodes.sort()

            for childZkNode in childZkNodes:
                child_processor_path = '{0}/{1}'.format(
                    processors_path, childZkNode)
                processor_data, _ = self.kazoo_client.get(child_processor_path)
                host, processor_id = processor_data.split(" ")
                processor_ids.append(processor_id)
        except:
            logger.error(traceback.format_exc(sys.exc_info()))
        return processor_ids
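
A usage sketch for ZkClient; the connection details and application identifiers below are placeholders, not values from the original tests:

# Sketch: inspect the processor-group state of a standalone application.
client = ZkClient('localhost', 2181, app_name='my-app', app_id='1')
client.start()
try:
    print(client.get_latest_job_model())
    print(client.get_active_processors())
    print(client.get_leader_processor_id())
finally:
    client.stop()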
Beispiel #34
0
import logging
import time

from kazoo.client import KazooClient, KazooState

logger = logging.getLogger(__name__)


# The original snippet is truncated above; the imports, the listener
# signature and the LOST branch are reconstructed so the example parses.
def my_listener(state):
    if state == KazooState.LOST:
        # Handle the session being lost (ephemeral nodes are gone)
        logger.error('session was lost')
    elif state == KazooState.SUSPENDED:
        # Handle being disconnected from Zookeeper
        logger.error('session was suspended')
    elif state == KazooState.CONNECTED:
        logger.info('session is connected')
    else:
        # Handle being connected/reconnected to Zookeeper
        logger.error('session was being connected/reconnected')


zk = KazooClient(hosts='127.0.0.1:2181')
zk.add_listener(my_listener)
zk.start()
id = int(time.time())

if zk.exists('/batches'):
    zk.delete('/batches', recursive=True)

zk.ensure_path('/batches')


@zk.DataWatch('/batches/job_%s/node_1' % id)
def watch_node(data, stat):
    try:
        print("Version: %s, data: %s" % (stat.version, data.decode("utf-8")))
    except Exception as e:
        print(e)


@zk.ChildrenWatch('/batches')
def watch_children(children):
    try:
Beispiel #35
0
class zookeeper:
	def __init__(self, address):
		self.address = address
		self.zk = KazooClient(address)
		self.zk.start()

		self.arcus_cache_map = {} 
		self.arcus_node_map = {}

		self.force = False
		self.meta = ('', None)
		self.meta_mtime = None

	def __repr__(self):
		repr = '[ZooKeeper: %s] %s, %s' % (self.address, self.meta[0], str(self.meta[1]))

		for code, cache in self.arcus_cache_map.items():
			repr = '%s\n\n%s' % (repr, cache)

		return repr

	def set_force(self):
		self.force = True

	def zk_read(self, path):
		data, stat = self.zk.get(path)
		children = self.zk.get_children(path)
		return data, stat, children

	def zk_children(self, path, watch=None):
		if watch != None:
			return self.zk.get_children(path, watch = watch)
		else:
			return self.zk.get_children(path)

	def zk_children_if_exists(self, path, watch=None):
		if self.zk_exists(path) == False:
			return []

		return self.zk_children(path, watch)
	
	def zk_exists(self, path):
		if self.zk.exists(path) == None:
			return False

		return True

	def zk_create(self, path, value):
		try:
			self.zk.create(path, bytes(value, 'utf-8'))
		except NodeExistsError:
			if self.force == False:
				raise NodeExistsError
		
	def zk_delete(self, path):
		try:
			self.zk.delete(path)
		except NoNodeError:
			if self.force == False:
				raise NoNodeError
		
	def zk_delete_tree(self, path):
		try:
			self.zk.delete(path, recursive=True)
		except NoNodeError:
			if self.force == False:
				raise NoNodeError

	def zk_update(self, path, value):
		try:
			self.zk.set(path, bytes(value, 'utf-8'))
		except NoNodeError:
			if self.force == False:
				raise NoNodeError

	def get_arcus_cache_list(self):
		children = self.zk_children_if_exists('/arcus/cache_list/')
		children += self.zk_children_if_exists('/arcus_repl/cache_list/')

		return children

	def get_arcus_node_of_code(self, code, server):
		# repl case
		children = self.zk_children_if_exists('/arcus_repl/cache_list/' + code)
		children += self.zk_children_if_exists('/arcus/cache_list/' + code)
		ret = []
		for child in children:
			tmp = child.split('^', 2) # remove repl info
			if len(tmp) == 3:
				child = tmp[2]

			addr, name = child.split('-', 1)
			ip, port = addr.split(':', 1)

			if server != '' and (server != ip and server != name):
				continue # skip this

			node = arcus_node(ip, port)
			node.name = name
			ret.append(node)


		return ret

	def get_arcus_node_of_server(self, addr):
		ip = socket.gethostbyname(addr)

		children = self.zk_children_if_exists('/arcus_repl/cache_server_mapping/')
		children += self.zk_children_if_exists('/arcus/cache_server_mapping/')
		ret = []
		for child in children:
			l = len(ip)
			if child[:l] == ip:
				code = self.zk_children_if_exists('/arcus_repl/cache_server_mapping/' + child)
				if len(code) == 0:
					code = self.zk_children_if_exists('/arcus/cache_server_mapping/' + child)

				code = code[0]

				tmp = code.split('^') # remove repl info
				code = tmp[0]
				
				try:
					ip, port = child.split(':')
				except ValueError:
					print('No port defined in cache_server_mapping: %s' % child)
					continue

				node = arcus_node(ip, port)
				node.code = code
				ret.append(node)

		return ret

	def _get_arcus_node(self, child, results):
		code = self.zk_children_if_exists('/arcus_repl/cache_server_mapping/' + child)
		if len(code) == 0:
			code = self.zk_children_if_exists('/arcus/cache_server_mapping/' + child)

		if len(code) == 0:
			print('no children found in cache_server_mapping: %s' % child)
			print(code)
			return

		code = code[0]

		tmp = code.split('^') # remove repl info
		code = tmp[0]

		try:
			ip, port = child.split(':')
		except ValueError:
			print('No port defined in cache_server_mapping: %s' % child)
			ip = child
			port = '0'


		node = arcus_node(ip, port)
		node.code = code
		results.append(node)

	def get_arcus_node_all(self):
		children = self.zk_children_if_exists('/arcus_repl/cache_server_mapping/')
		children += self.zk_children_if_exists('/arcus/cache_server_mapping/')

		ret = []
		threads = []

		#print(children)
		for child in children:
			th = threading.Thread(target = self._get_arcus_node, args = (child, ret))
			th.start()
			threads.append(th)

		for th in threads:
			th.join()

		return ret

	def _get_arcus_meta(self, child, results):
		data, stat, children = self.zk_read('/arcus/meta/' + child)
		results[child] = [data.decode('utf-8'), stat]


	def get_arcus_meta_all(self):
		if self.zk.exists('/arcus/meta') == None:
			self.zk.create('/arcus/meta', b'arcus meta info')

		children = self.zk.get_children('/arcus/meta')
		print('# children')
		print(children)

		threads = []
		ret = {}

		#print(children)
		for child in children:
			th = threading.Thread(target = self._get_arcus_meta, args = (child, ret))
			th.start()
			threads.append(th)

		for th in threads:
			th.join()

		return ret


	def _match_code_and_nodes(self, code, cache, meta):
		#repl case
		children = self.zk_children_if_exists('/arcus_repl/cache_list/' + code)
		children += self.zk_children_if_exists('/arcus/cache_list/' + code)
		for child in children:
			tmp = child.split('^', 2) # remove repl info
			if len(tmp) == 3:
				child = tmp[2]

			addr, name = child.split('-')
			try:
				node = self.arcus_node_map[addr]
			except KeyError:
				print('%s of %s is not defined in cache_server_mapping' % (addr, code))
				ip, port = addr.split(':')
				node = arcus_node(ip, port)
				node.noport = True
		
			node.active = True
			cache.active_node.append(node)

		
		for node in cache.node:
			if node.active == False:
				cache.dead_node.append(node)

		if code in meta:
			cache.meta = meta[code]



	def load_all(self):
		codes = self.get_arcus_cache_list()
		for code in codes:
			cache = arcus_cache(self.address, code)
			self.arcus_cache_map[code] = cache

		print('# get_arcus_node_all()')
		nodes = self.get_arcus_node_all()
		print('# done')

		for node in nodes:
			self.arcus_node_map[node.ip + ":" + node.port] = node
			self.arcus_cache_map[node.code].node.append(node)

		# meta info 
		print('# get_arcus_meta_all()')
		meta = self.get_arcus_meta_all()
		print('# done')

		print('# match code & nodes')
		threads = []
		
		for code, cache in self.arcus_cache_map.items():
			th = threading.Thread(target = self._match_code_and_nodes, args = (code, cache, meta))
			th.start()
			threads.append(th)

		for th in threads:
			th.join()

		print('#done')

		if 'zookeeper' in meta:
			self.meta = meta['zookeeper']
			

	def _callback(self, event):
		child_list = self.zk.get_children(event.path)
		cloud = os.path.basename(event.path)
		cache = self.arcus_cache_map[cloud]

		event_list = { 'created':[], 'deleted':[] }
		current = {}
		print('##### active node')
		print(cache.active_node)

		children = []
		for child in child_list:
			addr = child.split('-')[0]
			children.append(addr)
		
		print('#### children')
		print(children)

		for node in cache.active_node:
			current[node.ip + ':' + node.port] = True

		print('##### current')
		print(current)

		for node in cache.active_node:
			addr = node.ip + ':' + node.port
			if addr not in children:
				event_list['deleted'].append(addr)
				cache.active_node.remove(node)


		for child in children:
			if child not in current:
				event_list['created'].append(child)
				ip, port = child.split(':')
				node = arcus_node(ip, port)
				cache.active_node.append(node)


		print('####### result')
		print(cache.active_node)

		self.callback(event, event_list)
		children = self.zk.get_children(event.path, watch = self._callback)
		

	def watch(self, callback):
		self.callback = callback
		for code, cache in self.arcus_cache_map.items():
			children = self.zk_children_if_exists('/arcus/cache_list/' + code, watch=self._callback)
			children += self.zk_children_if_exists('/arcus_repl/cache_list/' + code, watch=self._callback)
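
A usage sketch of the arcus zookeeper wrapper above; the address and the callback are placeholders:

# Sketch: load the full arcus topology once, then watch cache_list changes.
zoo = zookeeper('zk.example.com:2181')
zoo.load_all()
print(zoo)  # __repr__ dumps the meta info and every cache cloud
zoo.watch(lambda event, event_list: print(event.path, event_list))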
Beispiel #36
0
class BalancedConsumer(object):
    """
    A self-balancing consumer for Kafka that uses ZooKeeper to communicate
    with other balancing consumers.

    Maintains a single instance of SimpleConsumer, periodically using the
    consumer rebalancing algorithm to reassign partitions to this
    SimpleConsumer.
    """
    def __init__(self,
                 topic,
                 cluster,
                 consumer_group,
                 fetch_message_max_bytes=1024 * 1024,
                 num_consumer_fetchers=1,
                 auto_commit_enable=False,
                 auto_commit_interval_ms=60 * 1000,
                 queued_max_messages=2000,
                 fetch_min_bytes=1,
                 fetch_wait_max_ms=100,
                 offsets_channel_backoff_ms=1000,
                 offsets_commit_max_retries=5,
                 auto_offset_reset=OffsetType.EARLIEST,
                 consumer_timeout_ms=-1,
                 rebalance_max_retries=5,
                 rebalance_backoff_ms=2 * 1000,
                 zookeeper_connection_timeout_ms=6 * 1000,
                 zookeeper_connect='127.0.0.1:2181',
                 zookeeper=None,
                 auto_start=True,
                 reset_offset_on_start=False,
                 post_rebalance_callback=None,
                 use_rdkafka=False):
        """Create a BalancedConsumer instance

        :param topic: The topic this consumer should consume
        :type topic: :class:`pykafka.topic.Topic`
        :param cluster: The cluster to which this consumer should connect
        :type cluster: :class:`pykafka.cluster.Cluster`
        :param consumer_group: The name of the consumer group this consumer
            should join.
        :type consumer_group: bytes
        :param fetch_message_max_bytes: The number of bytes of messages to
            attempt to fetch with each fetch request
        :type fetch_message_max_bytes: int
        :param num_consumer_fetchers: The number of workers used to make
            FetchRequests
        :type num_consumer_fetchers: int
        :param auto_commit_enable: If true, periodically commit to kafka the
            offset of messages already fetched by this consumer. This also
            requires that `consumer_group` is not `None`.
        :type auto_commit_enable: bool
        :param auto_commit_interval_ms: The frequency (in milliseconds) at which
            the consumer's offsets are committed to kafka. This setting is
            ignored if `auto_commit_enable` is `False`.
        :type auto_commit_interval_ms: int
        :param queued_max_messages: The maximum number of messages buffered for
            consumption in the internal
            :class:`pykafka.simpleconsumer.SimpleConsumer`
        :type queued_max_messages: int
        :param fetch_min_bytes: The minimum amount of data (in bytes) that the
            server should return for a fetch request. If insufficient data is
            available, the request will block until sufficient data is available.
        :type fetch_min_bytes: int
        :param fetch_wait_max_ms: The maximum amount of time (in milliseconds)
            that the server will block before answering a fetch request if
            there isn't sufficient data to immediately satisfy `fetch_min_bytes`.
        :type fetch_wait_max_ms: int
        :param offsets_channel_backoff_ms: Backoff time to retry failed offset
            commits and fetches.
        :type offsets_channel_backoff_ms: int
        :param offsets_commit_max_retries: The number of times the offset commit
            worker should retry before raising an error.
        :type offsets_commit_max_retries: int
        :param auto_offset_reset: What to do if an offset is out of range. This
            setting indicates how to reset the consumer's internal offset
            counter when an `OffsetOutOfRangeError` is encountered.
        :type auto_offset_reset: :class:`pykafka.common.OffsetType`
        :param consumer_timeout_ms: Amount of time (in milliseconds) the
            consumer may spend without messages available for consumption
            before returning None.
        :type consumer_timeout_ms: int
        :param rebalance_max_retries: The number of times the rebalance should
            retry before raising an error.
        :type rebalance_max_retries: int
        :param rebalance_backoff_ms: Backoff time (in milliseconds) between
            retries during rebalance.
        :type rebalance_backoff_ms: int
        :param zookeeper_connection_timeout_ms: The maximum time (in
            milliseconds) that the consumer waits while establishing a
            connection to zookeeper.
        :type zookeeper_connection_timeout_ms: int
        :param zookeeper_connect: Comma-separated (ip1:port1,ip2:port2) strings
            indicating the zookeeper nodes to which to connect.
        :type zookeeper_connect: str
        :param zookeeper: A KazooClient connected to a Zookeeper instance.
            If provided, `zookeeper_connect` is ignored.
        :type zookeeper: :class:`kazoo.client.KazooClient`
        :param auto_start: Whether the consumer should begin communicating
            with zookeeper after __init__ is complete. If false, communication
            can be started with `start()`.
        :type auto_start: bool
        :param reset_offset_on_start: Whether the consumer should reset its
            internal offset counter to `self._auto_offset_reset` and commit that
            offset immediately upon starting up
        :type reset_offset_on_start: bool
        :param post_rebalance_callback: A function to be called when a rebalance is
            in progress. This function should accept three arguments: the
            :class:`pykafka.balancedconsumer.BalancedConsumer` instance that just
            completed its rebalance, a dict of partitions that it owned before the
            rebalance, and a dict of partitions it owns after the rebalance. These dicts
            map partition ids to the most recently known offsets for those partitions.
            This function can optionally return a dictionary mapping partition ids to
            offsets. If it does, the consumer will reset its offsets to the supplied
            values before continuing consumption.
            Note that the BalancedConsumer is in a poorly defined state at
            the time this callback runs, so that accessing its properties
            (such as `held_offsets` or `partitions`) might yield confusing
            results.  Instead, the callback should really rely on the
            provided partition-id dicts, which are well-defined.
        :type post_rebalance_callback: function
        :param use_rdkafka: Use librdkafka-backed consumer if available
        :type use_rdkafka: bool
        """
        self._cluster = cluster
        self._consumer_group = consumer_group
        self._topic = topic

        self._auto_commit_enable = auto_commit_enable
        self._auto_commit_interval_ms = auto_commit_interval_ms
        self._fetch_message_max_bytes = fetch_message_max_bytes
        self._fetch_min_bytes = fetch_min_bytes
        self._rebalance_max_retries = rebalance_max_retries
        self._num_consumer_fetchers = num_consumer_fetchers
        self._queued_max_messages = queued_max_messages
        self._fetch_wait_max_ms = fetch_wait_max_ms
        self._rebalance_backoff_ms = rebalance_backoff_ms
        self._consumer_timeout_ms = consumer_timeout_ms
        self._offsets_channel_backoff_ms = offsets_channel_backoff_ms
        self._offsets_commit_max_retries = offsets_commit_max_retries
        self._auto_offset_reset = auto_offset_reset
        self._zookeeper_connect = zookeeper_connect
        self._zookeeper_connection_timeout_ms = zookeeper_connection_timeout_ms
        self._reset_offset_on_start = reset_offset_on_start
        self._post_rebalance_callback = post_rebalance_callback
        self._running = False
        self._worker_exception = None
        self._worker_trace_logged = False

        if not rdkafka and use_rdkafka:
            raise ImportError("use_rdkafka requires rdkafka to be installed")
        self._use_rdkafka = rdkafka and use_rdkafka

        self._rebalancing_lock = cluster.handler.Lock()
        self._consumer = None
        self._consumer_id = "{hostname}:{uuid}".format(
            hostname=socket.gethostname(),
            uuid=uuid4()
        )
        self._setting_watches = True

        self._topic_path = '/consumers/{group}/owners/{topic}'.format(
            group=self._consumer_group,
            topic=self._topic.name)
        self._consumer_id_path = '/consumers/{group}/ids'.format(
            group=self._consumer_group)

        self._zookeeper = None
        self._owns_zookeeper = zookeeper is None
        if zookeeper is not None:
            self._zookeeper = zookeeper
        if auto_start is True:
            self.start()

    def __del__(self):
        log.debug("Finalising {}".format(self))
        self.stop()

    def __repr__(self):
        return "<{module}.{name} at {id_} (consumer_group={group})>".format(
            module=self.__class__.__module__,
            name=self.__class__.__name__,
            id_=hex(id(self)),
            group=self._consumer_group
        )

    def _raise_worker_exceptions(self):
        """Raises exceptions encountered on worker threads"""
        if self._worker_exception is not None:
            _, ex, tb = self._worker_exception
            if not self._worker_trace_logged:
                self._worker_trace_logged = True
                log.error("Exception encountered in worker thread:\n%s",
                          "".join(traceback.format_tb(tb)))
            raise ex

    def _setup_checker_worker(self):
        """Start the zookeeper partition checker thread"""
        self = weakref.proxy(self)

        def checker():
            while True:
                try:
                    if not self._running:
                        break
                    time.sleep(120)
                    if not self._check_held_partitions():
                        self._rebalance()
                except Exception as e:
                    if not isinstance(e, ReferenceError):
                        # surface all exceptions to the main thread
                        self._worker_exception = sys.exc_info()
                    break
            log.debug("Checker thread exiting")
        log.debug("Starting checker thread")
        return self._cluster.handler.spawn(checker)

    @property
    def partitions(self):
        return self._consumer.partitions if self._consumer else dict()

    @property
    def _partitions(self):
        """Convenient shorthand for set of partitions internally held"""
        return set(
            [] if self.partitions is None else itervalues(self.partitions))

    @property
    def held_offsets(self):
        """Return a map from partition id to held offset for each partition"""
        if not self._consumer:
            return None
        return self._consumer.held_offsets

    def start(self):
        """Open connections and join a cluster."""
        try:
            if self._zookeeper is None:
                self._setup_zookeeper(self._zookeeper_connect,
                                      self._zookeeper_connection_timeout_ms)
            self._zookeeper.ensure_path(self._topic_path)
            self._add_self()
            self._running = True
            self._set_watches()
            self._rebalance()
            self._setup_checker_worker()
        except Exception:
            log.error("Stopping consumer in response to error")
            self.stop()

    def stop(self):
        """Close the zookeeper connection and stop consuming.

        This method should be called as part of a graceful shutdown process.
        """
        log.debug("Stopping {}".format(self))
        with self._rebalancing_lock:
            # We acquire the lock in order to prevent a race condition where a
            # rebalance that is already underway might re-register the zk
            # nodes that we remove here
            self._running = False
        if self._consumer is not None:
            self._consumer.stop()
        if self._owns_zookeeper:
            # NB this should always come last, so we do not hand over control
            # of our partitions until consumption has really been halted
            self._zookeeper.stop()
        else:
            self._remove_partitions(self._get_held_partitions())
            try:
                self._zookeeper.delete(self._path_self)
            except NoNodeException:
                pass
        # additionally we'd want to remove watches here, but there are no
        # facilities for that in ChildrenWatch - as a workaround we check
        # self._running in the watcher callbacks (see further down)

    def _setup_zookeeper(self, zookeeper_connect, timeout):
        """Open a connection to a ZooKeeper host.

        :param zookeeper_connect: The 'ip:port' address of the zookeeper node to
            which to connect.
        :type zookeeper_connect: str
        :param timeout: Connection timeout (in milliseconds)
        :type timeout: int
        """
        self._zookeeper = KazooClient(zookeeper_connect, timeout=timeout / 1000)
        self._zookeeper.start()

    def _setup_internal_consumer(self, partitions=None, start=True):
        """Instantiate an internal SimpleConsumer instance"""
        self._consumer = self._get_internal_consumer(partitions=partitions, start=start)

    def _get_internal_consumer(self, partitions=None, start=True):
        """Instantiate a SimpleConsumer for internal use.

        If there is already a SimpleConsumer instance held by this object,
        disable its workers and mark it for garbage collection before
        creating a new one.
        """
        if partitions is None:
            partitions = []
        reset_offset_on_start = self._reset_offset_on_start
        if self._consumer is not None:
            self._consumer.stop()
            # only use this setting for the first call to
            # _get_internal_consumer. subsequent calls should not
            # reset the offsets, since they can happen at any time
            reset_offset_on_start = False
        Cls = (rdkafka.RdKafkaSimpleConsumer
               if self._use_rdkafka else SimpleConsumer)
        return Cls(
            self._topic,
            self._cluster,
            consumer_group=self._consumer_group,
            partitions=partitions,
            auto_commit_enable=self._auto_commit_enable,
            auto_commit_interval_ms=self._auto_commit_interval_ms,
            fetch_message_max_bytes=self._fetch_message_max_bytes,
            fetch_min_bytes=self._fetch_min_bytes,
            num_consumer_fetchers=self._num_consumer_fetchers,
            queued_max_messages=self._queued_max_messages,
            fetch_wait_max_ms=self._fetch_wait_max_ms,
            consumer_timeout_ms=self._consumer_timeout_ms,
            offsets_channel_backoff_ms=self._offsets_channel_backoff_ms,
            offsets_commit_max_retries=self._offsets_commit_max_retries,
            auto_offset_reset=self._auto_offset_reset,
            reset_offset_on_start=reset_offset_on_start,
            auto_start=start
        )

    def _decide_partitions(self, participants):
        """Decide which partitions belong to this consumer.

        Uses the consumer rebalancing algorithm described here
        http://kafka.apache.org/documentation.html

        It is very important that the participants array is sorted,
        since this algorithm runs on each consumer and indexes into the same
        array. The same array index operation must return the same
        result on each consumer.

        :param participants: Sorted list of ids of all other consumers in this
            consumer group.
        :type participants: Iterable of `bytes`
        """
        # Freeze and sort partitions so we always have the same results
        p_to_str = lambda p: '-'.join([str(p.topic.name), str(p.leader.id), str(p.id)])
        all_parts = self._topic.partitions.values()
        all_parts = sorted(all_parts, key=p_to_str)

        # get start point, # of partitions, and remainder
        participants = sorted(participants)  # just make sure it's sorted.
        idx = participants.index(self._consumer_id)
        parts_per_consumer = len(all_parts) // len(participants)
        remainder_ppc = len(all_parts) % len(participants)

        start = parts_per_consumer * idx + min(idx, remainder_ppc)
        num_parts = parts_per_consumer + (0 if (idx + 1 > remainder_ppc) else 1)
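        # Worked example: 10 partitions and 3 participants give
        # parts_per_consumer=3, remainder_ppc=1, so consumer 0 owns
        # partitions 0-3, consumer 1 owns 4-6 and consumer 2 owns 7-9.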

        # assign partitions from i*N to (i+1)*N - 1 to consumer Ci
        new_partitions = itertools.islice(all_parts, start, start + num_parts)
        new_partitions = set(new_partitions)
        log.info('Balancing %i participants for %i partitions.\nOwning %i partitions.',
                 len(participants), len(all_parts), len(new_partitions))
        log.debug('My partitions: %s', [p_to_str(p) for p in new_partitions])
        return new_partitions

    def _get_participants(self):
        """Use zookeeper to get the other consumers of this topic.

        :return: A sorted list of the ids of the other consumers of this
            consumer's topic
        """
        try:
            consumer_ids = self._zookeeper.get_children(self._consumer_id_path)
        except NoNodeException:
            log.debug("Consumer group doesn't exist. "
                      "No participants to find")
            return []

        participants = []
        for id_ in consumer_ids:
            try:
                topic, stat = self._zookeeper.get("%s/%s" % (self._consumer_id_path, id_))
                if topic == self._topic.name:
                    participants.append(id_)
            except NoNodeException:
                pass  # disappeared between ``get_children`` and ``get``
        participants = sorted(participants)
        return participants

    def _build_watch_callback(self, fn, proxy):
        """Return a function that's safe to use as a ChildrenWatch callback

        Fixes the issue from https://github.com/Parsely/pykafka/issues/345
        """
        def _callback(children):
            # discover whether the referenced object still exists
            try:
                proxy.__repr__()
            except ReferenceError:
                return False
            return fn(proxy, children)
        return _callback

    def _set_watches(self):
        """Set watches in zookeeper that will trigger rebalances.

        Rebalances should be triggered whenever a broker, topic, or consumer
        znode is changed in zookeeper. This ensures that the balance of the
        consumer group remains up-to-date with the current state of the
        cluster.
        """
        proxy = weakref.proxy(self)
        _brokers_changed = self._build_watch_callback(BalancedConsumer._brokers_changed, proxy)
        _topics_changed = self._build_watch_callback(BalancedConsumer._topics_changed, proxy)
        _consumers_changed = self._build_watch_callback(BalancedConsumer._consumers_changed, proxy)

        self._setting_watches = True
        # Set all our watches and then rebalance
        broker_path = '/brokers/ids'
        try:
            self._broker_watcher = ChildrenWatch(
                self._zookeeper, broker_path,
                _brokers_changed
            )
        except NoNodeException:
            raise Exception(
                'The broker_path "%s" does not exist in your '
                'ZooKeeper cluster -- is your Kafka cluster running?'
                % broker_path)

        self._topics_watcher = ChildrenWatch(
            self._zookeeper,
            '/brokers/topics',
            _topics_changed
        )

        self._consumer_watcher = ChildrenWatch(
            self._zookeeper, self._consumer_id_path,
            _consumers_changed
        )
        self._setting_watches = False

    def _add_self(self):
        """Register this consumer in zookeeper.

        This method ensures that the number of participants is at most the
        number of partitions.
        """
        participants = self._get_participants()
        if len(self._topic.partitions) <= len(participants):
            raise KafkaException("Cannot add consumer: more consumers than partitions")

        self._zookeeper.create(
            self._path_self, self._topic.name, ephemeral=True, makepath=True)

    @property
    def _path_self(self):
        """Path where this consumer should be registered in zookeeper"""
        return '{path}/{id_}'.format(
            path=self._consumer_id_path,
            id_=self._consumer_id
        )

    def _rebalance(self):
        """Claim partitions for this consumer.

        This method is called whenever a zookeeper watch is triggered.
        """
        if self._consumer is not None:
            self.commit_offsets()
        # this is necessary because we can't stop() while the lock is held
        # (it's not an RLock)
        should_stop = False
        with self._rebalancing_lock:
            if not self._running:
                raise ConsumerStoppedException
            log.info('Rebalancing consumer %s for topic %s.' % (
                self._consumer_id, self._topic.name)
            )

            for i in range(self._rebalance_max_retries):
                try:
                    # If retrying, be sure to make sure the
                    # partition allocation is correct.
                    participants = self._get_participants()
                    if self._consumer_id not in participants:
                        # situation that only occurs if our zk session expired
                        self._add_self()
                        participants.append(self._consumer_id)

                    new_partitions = self._decide_partitions(participants)
                    if not new_partitions:
                        should_stop = True
                        log.warning("No partitions assigned to consumer %s - stopping",
                                    self._consumer_id)
                        break

                    # Update zk with any changes:
                    # Note that we explicitly fetch our set of held partitions
                    # from zk, rather than assuming it will be identical to
                    # `self.partitions`.  This covers the (rare) situation
                    # where due to an interrupted connection our zk session
                    # has expired, in which case we'd hold zero partitions on
                    # zk, but `self._partitions` may be outdated and non-empty
                    current_zk_parts = self._get_held_partitions()
                    self._remove_partitions(current_zk_parts - new_partitions)
                    self._add_partitions(new_partitions - current_zk_parts)

                    # Only re-create internal consumer if something changed.
                    if new_partitions != self._partitions:
                        cns = self._get_internal_consumer(list(new_partitions))
                        if self._post_rebalance_callback is not None:
                            old_offsets = (self._consumer.held_offsets
                                           if self._consumer else dict())
                            new_offsets = cns.held_offsets
                            reset_offsets = self._post_rebalance_callback(
                                self, old_offsets, new_offsets)
                            if reset_offsets:
                                cns.reset_offsets(partition_offsets=[
                                    (cns.partitions[id_], offset) for
                                    (id_, offset) in iteritems(reset_offsets)])
                        self._consumer = cns

                    log.info('Rebalancing Complete.')
                    break
                except PartitionOwnedError as ex:
                    if i == self._rebalance_max_retries - 1:
                        log.warning('Failed to acquire partition %s after %d retries.',
                                    ex.partition, i)
                        raise
                    log.info('Unable to acquire partition %s. Retrying', ex.partition)
                    time.sleep(i * (self._rebalance_backoff_ms / 1000))
        if should_stop:
            self.stop()

    def _path_from_partition(self, p):
        """Given a partition, return its path in zookeeper.

        :type p: :class:`pykafka.partition.Partition`
        """
        return "%s/%s-%s" % (self._topic_path, p.leader.id, p.id)

    def _remove_partitions(self, partitions):
        """Remove partitions from the zookeeper registry for this consumer.

        :param partitions: The partitions to remove.
        :type partitions: Iterable of :class:`pykafka.partition.Partition`
        """
        for p in partitions:
            # TODO pass zk node version to make sure we still own this node
            self._zookeeper.delete(self._path_from_partition(p))

    def _add_partitions(self, partitions):
        """Add partitions to the zookeeper registry for this consumer.

        :param partitions: The partitions to add.
        :type partitions: Iterable of :class:`pykafka.partition.Partition`
        """
        for p in partitions:
            try:
                self._zookeeper.create(
                    self._path_from_partition(p),
                    value=get_bytes(self._consumer_id),
                    ephemeral=True
                )
            except NodeExistsError:
                raise PartitionOwnedError(p)

    def _get_held_partitions(self):
        """Build a set of partitions zookeeper says we own"""
        zk_partition_ids = set()
        all_partitions = self._zookeeper.get_children(self._topic_path)
        for partition_slug in all_partitions:
            try:
                owner_id, stat = self._zookeeper.get(
                    '{path}/{slug}'.format(
                        path=self._topic_path, slug=partition_slug))
                if owner_id == get_bytes(self._consumer_id):
                    zk_partition_ids.add(int(partition_slug.split('-')[1]))
            except NoNodeException:
                pass  # disappeared between ``get_children`` and ``get``
        return set(self._topic.partitions[_id] for _id in zk_partition_ids)

    def _check_held_partitions(self):
        """Double-check held partitions against zookeeper.

        Returns True if the partitions held by this consumer match the ones
        zookeeper thinks it holds, and False otherwise.
        """
        log.info("Checking held partitions against ZooKeeper")
        zk_partitions = self._get_held_partitions()
        if zk_partitions != self._partitions:
            log.warning("Internal partition registry doesn't match ZooKeeper!")
            log.debug("Internal partition ids: %s\nZooKeeper partition ids: %s",
                      self._partitions, zk_partitions)
            return False
        return True

    @_catch_thread_exception
    def _brokers_changed(self, brokers):
        if not self._running:
            return False  # `False` tells ChildrenWatch to disable this watch
        if self._setting_watches:
            return
        log.debug("Rebalance triggered by broker change ({})".format(
            self._consumer_id))
        self._rebalance()

    @_catch_thread_exception
    def _consumers_changed(self, consumers):
        if not self._running:
            return False  # `False` tells ChildrenWatch to disable this watch
        if self._setting_watches:
            return
        log.debug("Rebalance triggered by consumer change ({})".format(
            self._consumer_id))
        self._rebalance()

    @_catch_thread_exception
    def _topics_changed(self, topics):
        if not self._running:
            return False  # `False` tells ChildrenWatch to disable this watch
        if self._setting_watches:
            return
        log.debug("Rebalance triggered by topic change ({})".format(
            self._consumer_id))
        self._rebalance()

    def reset_offsets(self, partition_offsets=None):
        """Reset offsets for the specified partitions

        Issue an OffsetRequest for each partition and set the appropriate
        returned offset in the OwnedPartition

        :param partition_offsets: (`partition`, `offset`) pairs to reset
            where `partition` is the partition for which to reset the offset
            and `offset` is the new offset the partition should have
        :type partition_offsets: Iterable of
            (:class:`pykafka.partition.Partition`, int)
        """
        self._raise_worker_exceptions()
        if not self._consumer:
            raise ConsumerStoppedException("Internal consumer is stopped")
        self._consumer.reset_offsets(partition_offsets=partition_offsets)

    def consume(self, block=True):
        """Get one message from the consumer

        :param block: Whether to block while waiting for a message
        :type block: bool
        """

        def consumer_timed_out():
            """Indicates whether the consumer has received messages recently"""
            if self._consumer_timeout_ms == -1:
                return False
            disp = (time.time() - self._last_message_time) * 1000.0
            return disp > self._consumer_timeout_ms
        if not self._partitions:
            raise NoPartitionsForConsumerException()
        message = None
        self._last_message_time = time.time()
        while message is None and not consumer_timed_out():
            self._raise_worker_exceptions()
            try:
                message = self._consumer.consume(block=block)
            except ConsumerStoppedException:
                if not self._running:
                    raise
                continue
            if message:
                self._last_message_time = time.time()
            if not block:
                return message
        return message

    def __iter__(self):
        """Yield an infinite stream of messages until the consumer times out"""
        while True:
            message = self.consume(block=True)
            if not message:
                return  # PEP 479: returning ends the generator cleanly
            yield message

    def commit_offsets(self):
        """Commit offsets for this consumer's partitions

        Uses the offset commit/fetch API
        """
        self._raise_worker_exceptions()
        return self._consumer.commit_offsets()
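# A minimal usage sketch for the consumer above, assuming a reachable Kafka
# broker and ZooKeeper node and the standard pykafka entry points
# (KafkaClient, Topic.get_balanced_consumer); topic and group names are
# placeholders.
from pykafka import KafkaClient

client = KafkaClient(hosts="127.0.0.1:9092")
topic = client.topics[b"test.topic"]

# get_balanced_consumer wires up the BalancedConsumer machinery shown above
consumer = topic.get_balanced_consumer(
    consumer_group=b"example-group",
    zookeeper_connect="127.0.0.1:2181",
    auto_commit_enable=False,
)

try:
    for message in consumer:           # __iter__ ends when the consumer times out
        print(message.offset, message.value)
        consumer.commit_offsets()      # offset commit API, as defined above
finally:
    consumer.stop()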
Beispiel #37
0
class SimpleSwitch13(app_manager.RyuApp):
    OFP_VERSIONS = [ofproto_v1_3.OFP_VERSION]

    def __init__(self, *args, **kwargs):
        super(SimpleSwitch13, self).__init__(*args, **kwargs)
        self.mac_to_port = {}
        self.zkConf = {'root':'/multicontroller', 'topo':'/topology',
                       'swstat':'/swstat', 'counter': '/counter'}
        self.zk = KazooClient('127.0.0.1:2181')
        self.zk.start()
        self.ip = '202.201.3.51'
        self.sws = {}
        self.gid = random.randint(0, 10000)
        self.dps = {}
        self.links = []
        self.interval = 5
        self.role = OFPCR_ROLE_EQUAL
        self.topoThread = hub.spawn(self._topoThread)
        self.linkThread = hub.spawn(self._linkDiscover)
        self.clearLinkThread = hub.spawn(self._cleanLinks)
        self.clearLinkThread = hub.spawn(self._cleanSwitches)

    def _cleanSwitches(self):
        while True:
            self.sws = {k:self.sws[k] for k in self.sws if self.sws[k]}
            hub.sleep(self.interval)

    def _topoThread(self):
        while True:
            linkNode = self.zkConf['root'] + self.zkConf['topo'] \
                    + '/' + self.ip
            if self.zk.exists(linkNode):
                self.zk.set(linkNode, json.dumps(self.links))
            else:
                self.zk.create(linkNode, json.dumps(self.links))
            hub.sleep(self.interval)

    def _linkDiscover(self):
        while True:
            for dpid in self.dps:
                self.sendSlldp(dpid)
            hub.sleep(self.interval)

    def sendSlldp(self, dpid):
        dp = self.dps.get(dpid)
        if dp is None:
            return
        actions = [dp.ofproto_parser.OFPActionOutput(dp.ofproto.OFPP_FLOOD)]
        pkt = packet.Packet()
        pkt.add_protocol(ethernet.ethernet(ethertype=ETH_TYPE_SLLDP,
                        dst=SLLDP_MAC_DST, src=SLLDP_MAC_SRC))
        pkt.add_protocol(slldp(dp.id))
        pkt.serialize()
        slldpPacket = pkt.data
        out = dp.ofproto_parser.OFPPacketOut(
            datapath=dp, in_port=dp.ofproto.OFPP_CONTROLLER,
            buffer_id=dp.ofproto.OFP_NO_BUFFER, actions=actions,
            data=slldpPacket)
        dp.send_msg(out)

    def getLinks(self):
        topoNode = self.zkConf['root'] + self.zkConf['topo']
        ips = self.zk.get_children(topoNode)
        res = []
        for ip in ips:
            links = json.loads(self.zk.get(topoNode + '/' + ip)[0])
            for link in links:
                res.append(link)
        return res

    @set_ev_cls(ofp_event.EventOFPPacketIn, MAIN_DISPATCHER)
    def _packet_in_handler(self, ev):

        msg = ev.msg
        pkt = packet.Packet(msg.data)
        eth = pkt.get_protocols(ethernet.ethernet)[0]
        dst = eth.dst

        # SLLDP packet
        if dst == SLLDP_MAC_DST:
            self.handleSlldp(ev)
            return
        # process packet_in message in subclass
        self.packet_in_process(ev)

    def handleSlldp(self, ev):
        msg = ev.msg
        datapath = msg.datapath
        dpid = datapath.id
        inPort = msg.match['in_port']

        pkt = packet.Packet(msg.data)
        # the SLLDP payload is the unparsed remainder after the Ethernet header
        slldpBuff = pkt.protocols[-1]
        dpidSrc, _ = slldp.parser(slldpBuff)
        self.links.append({'srcdpid': dpidSrc,
            'dst': {'dpid': dpid, 'port': inPort},
            'time': time.time()})

    def _cleanLinks(self):
        while True:
            now = time.time()
            self.links = [l for l in self.links if now - l['time'] < self.interval]
            hub.sleep(self.interval)

    @abc.abstractmethod
    def packet_in_process(self, ev):
        pass

    @set_ev_cls(event.EventSwitchEnter)
    def switch_enter(self, ev):
        dpid = ev.datapath.id
        self.sws[dpid] = True
        self.dps[dpid] = ev.datapath
        dpNode = self.zkConf['root'] + self.zkConf['swstat'] \
                + '/' + dpid_to_str(dpid)
        self.zk.ensure_path(dpNode)
        if self.election(dpid):
            self.role = OFPCR_ROLE_MASTER
        else:
            self.role = OFPCR_ROLE_SLAVE
        self.countUp(dpid)
        self.roleRequest(dpid, self.role)
        mflag = dpNode + '/' + 'master'
        DataWatch(self.zk, mflag, self.masterWatcher)

    def masterWatcher(self, data, stat, ev):
        if ev and ev.type == 'DELETED':
            _, _, dpid, _ = ev.path.split('/')
            dpid = str_to_dpid(dpid)
            if self.sws.get(dpid):
                if self.election(dpid):
                    self.role = OFPCR_ROLE_MASTER
                    self.roleRequest(dpid, self.role)
            return self.sws.get(dpid)

    def election(self, dpid):
        dpNode = self.zkConf['root'] + self.zkConf['swstat'] \
                + '/' + dpid_to_str(dpid)
        mflag = dpNode + '/' + 'master'
        while not self.zk.exists(mflag):
            mlock = self.zk.Lock(dpNode + '/' + 'mlock', self.ip)
            with mlock:
                if not self.zk.exists(mflag):
                    self.zk.create(mflag, self.ip, ephemeral=True)
            if self.zk.exists(mflag):
                if self.zk.get(mflag)[0] == self.ip:
                    return True
                else:
                    return False
            else:
                time.sleep(random.randint(0, 100)/500.0)
        return False

    def roleRequest(self, dpid, role):
        dp = self.dps[dpid]
        msg = dp.ofproto_parser.OFPRoleRequest(dp, role, self.gid)
        dp.send_msg(msg)

    def getCount(self, dpid):
        dpNode =  self.zkConf['root'] + self.zkConf['swstat'] \
                + '/' + dpid_to_str(dpid)
        countNode = dpNode + self.zkConf['counter']
        counters = self.zk.get_children(countNode)
        return len(counters)

    def countUp(self, dpid):
        countNode = self.zkConf['root'] + self.zkConf['swstat'] \
                + '/' + dpid_to_str(dpid) + self.zkConf['counter']
        self.zk.ensure_path(countNode)
        self.zk.create(countNode + '/' + uuid4().hex, 'alive', ephemeral=True)

    @set_ev_cls(event.EventSwitchLeave)
    def switch_leave(self, ev):
        dpid = ev.datapath.id
        count = self.getCount(dpid)
        self.sws[dpid] = False
        if count == 0:
            dpNode = self.zkConf['root'] + self.zkConf['swstat'] \
                    + '/' + dpid_to_str(dpid)
            self.zk.delete(dpNode, recursive=True)
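# The election() method above pairs a kazoo Lock with an ephemeral "master"
# flag node. A self-contained sketch of the same idea, assuming a local
# ZooKeeper; the base path and controller id below are illustrative values.
from kazoo.client import KazooClient

def try_become_master(zk, base_path, my_id):
    """Best-effort election: the first caller to create the ephemeral
    'master' flag under base_path wins; later callers see the existing flag."""
    flag = base_path + '/master'
    lock = zk.Lock(base_path + '/mlock', my_id)
    with lock:
        if zk.exists(flag) is None:
            zk.create(flag, my_id.encode('utf-8'), ephemeral=True)
    data, _ = zk.get(flag)
    return data == my_id.encode('utf-8')

zk = KazooClient(hosts='127.0.0.1:2181')
zk.start()
zk.ensure_path('/multicontroller/swstat/0000000000000001')
print(try_become_master(zk, '/multicontroller/swstat/0000000000000001', '202.201.3.51'))
zk.stop()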
Beispiel #38
0
children = zk.get_children('/app')
printout("[INIT]", WHITE)
print(children)

portno = 0
portnum = 0
ranges = []
serverDied = False

if len(children) == 1:
    portno = 8080
    printout("[MASTER]", RED)
    print("MASTER INIT")
    if zk.exists('/meta'):
        zk.delete('/meta', recursive=True)

    zk.create('/meta/master', b'8080', makepath=True)
    zk.create('/meta/status', 'INIT'.encode("utf-8"), makepath=True)
    zk.create('/meta/lastport', '8080'.encode('utf-8'), makepath=True)
    time.sleep(5)
    children = zk.get_children('/app')
    config = {
        "mapper": {},
        "lastDead": {
            "backup": -1,
            # backup is the server port whose data the new server needs to retrieve
            "portno": -1
            # dead server portno
        },
        "numOfServers": len(children)
while True:
    print("cnt(%s)************************************************" % (cnt))
    for child in redis_list:
        child_path = root + '/' + child

        if child in conn:
            rs = conn[child]
        else:
            host, port = child.split(':')
            rs = redis.Redis(host=host, port=int(port))
            conn[child] = rs

        # queue the znode changes on a transaction and apply them at commit time
        transaction = zk.transaction()

        try:
            rs.ping()
        except ConnectionError:
            if zk.exists(child_path) is not None:
                transaction.delete(child_path)
            print("%s: error" % (child_path))
        else:
            if zk.exists(child_path) is None:
                transaction.create(child_path)
            print("%s: ok" % (child_path))
        transaction.commit()

    time.sleep(10)
    cnt += 1

zk.stop()
Beispiel #40
0
from kazoo.client import KazooClient
import sys

if __name__ == "__main__":
    kz = KazooClient('127.0.0.1:2181')
    kz.start()
    if len(sys.argv) >= 2 and sys.argv[1] == "clear":
        kz.ensure_path('/dimint/overlord/host_list')
        kz.delete('/dimint/overlord/host_list', recursive=True)
        kz.ensure_path('/dimint/overlord/host_list')
        kz.ensure_path('/dimint/node/list')
        kz.delete('/dimint/node/list', recursive=True)
        kz.ensure_path('/dimint/node/list')
        kz.ensure_path('/dimint/node/role')
        kz.delete('/dimint/node/role', recursive=True)
        kz.ensure_path('/dimint/node/role')
    else:
        kz.ensure_path('/dimint/overlord/host_list')
        print('overlord list : {0}'.format(
            kz.get_children('/dimint/overlord/host_list')))
        kz.ensure_path('/dimint/node/list')
        print('node list : {0}'.format(kz.get_children('/dimint/node/list')))
        kz.ensure_path('/dimint/node/role')
        for master in kz.get_children('/dimint/node/role'):
            print('master node : {0}'.format(master))
            for slave in kz.get_children(
                    '/dimint/node/role/{0}'.format(master)):
                print("\tslave node : {0}".format(slave))
    kz.stop()
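# The "clear" branch above repeats the same ensure/delete/ensure dance for each
# subtree. A small helper, sketched here against the same paths (it would have
# to be called before kz.stop()), keeps that in one place:
def reset_path(zk, path):
    """Delete everything under `path`, then recreate it as an empty node."""
    zk.ensure_path(path)               # make sure the recursive delete has a target
    zk.delete(path, recursive=True)
    zk.ensure_path(path)

# e.g. for p in ('/dimint/overlord/host_list', '/dimint/node/list', '/dimint/node/role'):
#          reset_path(kz, p)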
Beispiel #41
0
client = docker.from_env()

# A set of global variables to handle entering various sections of the code
not_called_by_scale = True
deleted_master = False
# Setting the initial active containers' pid and uuid and counting the number of requests
active_containers = {}
counter = 0

# Establish a connection with the Zookeeper
zk = KazooClient(hosts='10.0.2.3:2181')
zk.start()

# Recreate the parent node for the worker containers so we start from a clean state
if zk.exists("/producer"):
    zk.delete("/producer", recursive=True)

zk.ensure_path("/producer")

# Run the initial two workers of master and slave on the cry_cloud network
# The time.sleep argument is for the docker service to have enough time to create a new container
temp1 = client.containers.run("worker",
                              detach=True,
                              auto_remove=True,
                              network="cry_cloud")
time.sleep(10)
temp = client.containers.run("worker",
                             detach=True,
                             auto_remove=True,
                             network="cry_cloud")
time.sleep(10)
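# The snippet above only prepares the /producer parent and starts the worker
# containers. A common companion pattern (not part of the original) is for
# each worker to register an ephemeral child and for the orchestrator to
# watch membership; register_worker/watch_workers are hypothetical helpers.
def register_worker(zk, worker_id):
    """Run inside a worker container: the node lives only as long as its session."""
    zk.create("/producer/" + worker_id, b"alive", ephemeral=True, makepath=True)

def watch_workers(zk):
    """Run by the orchestrator: fires on every change in worker membership."""
    @zk.ChildrenWatch("/producer")
    def on_children(children):
        print("live workers:", children)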
Beispiel #42
0
class ZkStateManager(StateManager):
  """
  State manager which connects to zookeeper and
  gets and sets states from there.
  """

  def __init__(self, name, host, port, rootpath, tunnelhost):
    self.name = name
    self.host = host
    self.port = port
    self.tunnelhost = tunnelhost
    self.rootpath = rootpath

  def start(self):
    """ state Zookeeper """
    if self.is_host_port_reachable():
      self.client = KazooClient("%s:%s" % (self.host, self.port))
    else:
      localport = self.establish_ssh_tunnel()
      self.client = KazooClient("localhost:" + str(localport))
    self.client.start()

    def on_connection_change(state):
      """ callback to log """
      LOG.info("Connection state changed to: " + state)
    self.client.add_listener(on_connection_change)

  def stop(self):
    """ stop Zookeeper """
    self.client.stop()
    self.terminate_ssh_tunnel()

  # pylint: disable=function-redefined
  def get_topologies(self, callback=None):
    """ get topologies """
    isWatching = False

    # Temp dict used to return result
    # if callback is not provided.
    ret = {
        "result": None
    }
    if callback:
      isWatching = True
    else:
      def callback(data):
        """Custom callback to get the topologies right now."""
        ret["result"] = data

    try:
      self._get_topologies_with_watch(callback, isWatching)
    except NoNodeError:
      self.client.stop()
      path = self.get_topologies_path()
      raise StateException("Error required topology path '%s' not found" % (path),
                           StateException.EX_TYPE_NO_NODE_ERROR), None, sys.exc_info()[2]

    # The topologies are now populated with the data.
    return ret["result"]

  def _get_topologies_with_watch(self, callback, isWatching):
    """
    Helper function to get topologies with
    a callback. The future watch is placed
    only if isWatching is True.
    """
    path = self.get_topologies_path()
    if isWatching:
      LOG.info("Adding children watch for path: " + path)

    # pylint: disable=unused-variable
    @self.client.ChildrenWatch(path)
    def watch_topologies(topologies):
      """ callback to watch topologies """
      callback(topologies)

      # Returning False will result in no future watches
      # being triggered. If isWatching is True, then
      # the future watches will be triggered.
      return isWatching

  def get_topology(self, topologyName, callback=None):
    """ get topologies """
    isWatching = False

    # Temp dict used to return result
    # if callback is not provided.
    ret = {
        "result": None
    }
    if callback:
      isWatching = True
    else:
      def callback(data):
        """Custom callback to get the topologies right now."""
        ret["result"] = data

    self._get_topology_with_watch(topologyName, callback, isWatching)

    # The topologies are now populated with the data.
    return ret["result"]

  def _get_topology_with_watch(self, topologyName, callback, isWatching):
    """
    Helper function to get pplan with
    a callback. The future watch is placed
    only if isWatching is True.
    """
    path = self.get_topology_path(topologyName)
    if isWatching:
      LOG.info("Adding data watch for path: " + path)

    # pylint: disable=unused-variable, unused-argument
    @self.client.DataWatch(path)
    def watch_topology(data, stats):
      """ watch topology """
      if data:
        topology = Topology()
        topology.ParseFromString(data)
        callback(topology)
      else:
        callback(None)

      # Returning False will result in no future watches
      # being triggered. If isWatching is True, then
      # the future watches will be triggered.
      return isWatching

  def create_topology(self, topologyName, topology):
    """ crate topology """
    if not topology or not topology.IsInitialized():
      raise StateException("Topology protobuf not init properly",
                           StateException.EX_TYPE_PROTOBUF_ERROR), None, sys.exc_info()[2]

    path = self.get_topology_path(topologyName)
    LOG.info("Adding topology: {0} to path: {1}".format(
        topologyName, path))
    topologyString = topology.SerializeToString()
    try:
      self.client.create(path, value=topologyString, makepath=True)
      return True
    except NoNodeError:
      raise StateException("NoNodeError while creating topology",
                           StateException.EX_TYPE_NO_NODE_ERROR), None, sys.exc_info()[2]
    except NodeExistsError:
      raise StateException("NodeExistsError while creating topology",
                           StateException.EX_TYPE_NODE_EXISTS_ERROR), None, sys.exc_info()[2]
    except ZookeeperError:
      raise StateException("Zookeeper while creating topology",
                           StateException.EX_TYPE_ZOOKEEPER_ERROR), None, sys.exc_info()[2]
    except Exception:
      # Just re raise the exception.
      raise

  def delete_topology(self, topologyName):
    """ delete topology """
    path = self.get_topology_path(topologyName)
    LOG.info("Removing topology: {0} from path: {1}".format(
        topologyName, path))
    try:
      self.client.delete(path)
      return True
    except NoNodeError:
      raise StateException("NoNodeError while deteling topology",
                           StateException.EX_TYPE_NO_NODE_ERROR), None, sys.exc_info()[2]
    except NotEmptyError:
      raise StateException("NotEmptyError while deleting topology",
                           StateException.EX_TYPE_NOT_EMPTY_ERROR), None, sys.exc_info()[2]
    except ZookeeperError:
      raise StateException("Zookeeper while deleting topology",
                           StateException.EX_TYPE_ZOOKEEPER_ERROR), None, sys.exc_info()[2]
    except Exception:
      # Just re raise the exception.
      raise

  def get_packing_plan(self, topologyName, callback=None):
    """ get packing plan """
    isWatching = False

    # Temp dict used to return result
    # if callback is not provided.
    ret = {
        "result": None
    }
    if callback:
      isWatching = True
    else:
      def callback(data):
        """ Custom callback to get the topologies right now. """
        ret["result"] = data

    self._get_packing_plan_with_watch(topologyName, callback, isWatching)

    # The topologies are now populated with the data.
    return ret["result"]

  def _get_packing_plan_with_watch(self, topologyName, callback, isWatching):
    """
    Helper function to get packing_plan with
    a callback. The future watch is placed
    only if isWatching is True.
    """
    path = self.get_packing_plan_path(topologyName)
    if isWatching:
      LOG.info("Adding data watch for path: " + path)

    # pylint: disable=unused-argument,unused-variable
    @self.client.DataWatch(path)
    def watch_packing_plan(data, stats):
      """ watch the packing plan for updates """
      if data:
        packing_plan = PackingPlan()
        packing_plan.ParseFromString(data)
        callback(packing_plan)
      else:
        callback(None)

      # Returning False will result in no future watches
      # being triggered. If isWatching is True, then
      # the future watches will be triggered.
      return isWatching

  def get_pplan(self, topologyName, callback=None):
    """ get physical plan """
    isWatching = False

    # Temp dict used to return result
    # if callback is not provided.
    ret = {
        "result": None
    }
    if callback:
      isWatching = True
    else:
      def callback(data):
        """
        Custom callback to get the topologies right now.
        """
        ret["result"] = data

    self._get_pplan_with_watch(topologyName, callback, isWatching)

    # The topologies are now populated with the data.
    return ret["result"]

  def _get_pplan_with_watch(self, topologyName, callback, isWatching):
    """
    Helper function to get pplan with
    a callback. The future watch is placed
    only if isWatching is True.
    """
    path = self.get_pplan_path(topologyName)
    if isWatching:
      LOG.info("Adding data watch for path: " + path)

    # pylint: disable=unused-variable, unused-argument
    @self.client.DataWatch(path)
    def watch_pplan(data, stats):
      """ invoke callback to watch physical plan """
      if data:
        pplan = PhysicalPlan()
        pplan.ParseFromString(data)
        callback(pplan)
      else:
        callback(None)

      # Returning False will result in no future watches
      # being triggered. If isWatching is True, then
      # the future watches will be triggered.
      return isWatching

  def create_pplan(self, topologyName, pplan):
    """ create physical plan """
    if not pplan or not pplan.IsInitialized():
      raise StateException("Physical Plan protobuf not init properly",
                           StateException.EX_TYPE_PROTOBUF_ERROR), None, sys.exc_info()[2]

    path = self.get_pplan_path(topologyName)
    LOG.info("Adding topology: {0} to path: {1}".format(
        topologyName, path))
    pplanString = pplan.SerializeToString()
    try:
      self.client.create(path, value=pplanString, makepath=True)
      return True
    except NoNodeError:
      raise StateException("NoNodeError while creating pplan",
                           StateException.EX_TYPE_NO_NODE_ERROR), None, sys.exc_info()[2]
    except NodeExistsError:
      raise StateException("NodeExistsError while creating pplan",
                           StateException.EX_TYPE_NODE_EXISTS_ERROR), None, sys.exc_info()[2]
    except ZookeeperError:
      raise StateException("Zookeeper while creating pplan",
                           StateException.EX_TYPE_ZOOKEEPER_ERROR), None, sys.exc_info()[2]
    except Exception:
      # Just re raise the exception.
      raise

  def delete_pplan(self, topologyName):
    """ delete physical plan info """
    path = self.get_pplan_path(topologyName)
    LOG.info("Removing topology: {0} from path: {1}".format(
        topologyName, path))
    try:
      self.client.delete(path)
      return True
    except NoNodeError:
      raise StateException("NoNodeError while deleting pplan",
                           StateException.EX_TYPE_NO_NODE_ERROR), None, sys.exc_info()[2]
    except NotEmptyError:
      raise StateException("NotEmptyError while deleting pplan",
                           StateException.EX_TYPE_NOT_EMPTY_ERROR), None, sys.exc_info()[2]
    except ZookeeperError:
      raise StateException("Zookeeper while deleting pplan",
                           StateException.EX_TYPE_ZOOKEEPER_ERROR), None, sys.exc_info()[2]
    except Exception:
      # Just re raise the exception.
      raise

  def get_execution_state(self, topologyName, callback=None):
    """ get execution state """
    isWatching = False

    # Temp dict used to return result
    # if callback is not provided.
    ret = {
        "result": None
    }
    if callback:
      isWatching = True
    else:
      def callback(data):
        """
        Custom callback to get the topologies right now.
        """
        ret["result"] = data

    self._get_execution_state_with_watch(topologyName, callback, isWatching)

    # The topologies are now populated with the data.
    return ret["result"]

  def _get_execution_state_with_watch(self, topologyName, callback, isWatching):
    """
    Helper function to get execution state with
    a callback. The future watch is placed
    only if isWatching is True.
    """
    path = self.get_execution_state_path(topologyName)
    if isWatching:
      LOG.info("Adding data watch for path: " + path)

    # pylint: disable=unused-variable, unused-argument
    @self.client.DataWatch(path)
    def watch_execution_state(data, stats):
      """ invoke callback to watch execute state """
      if data:
        executionState = ExecutionState()
        executionState.ParseFromString(data)
        callback(executionState)
      else:
        callback(None)

      # Returning False will result in no future watches
      # being triggered. If isWatching is True, then
      # the future watches will be triggered.
      return isWatching

  def create_execution_state(self, topologyName, executionState):
    """ create execution state """
    if not executionState or not executionState.IsInitialized():
      raise StateException("Execution State protobuf not init properly",
                           StateException.EX_TYPE_PROTOBUF_ERROR), None, sys.exc_info()[2]

    path = self.get_execution_state_path(topologyName)
    LOG.info("Adding topology: {0} to path: {1}".format(
        topologyName, path))
    executionStateString = executionState.SerializeToString()
    try:
      self.client.create(path, value=executionStateString, makepath=True)
      return True
    except NoNodeError:
      raise StateException("NoNodeError while creating execution state",
                           StateException.EX_TYPE_NO_NODE_ERROR), None, sys.exc_info()[2]
    except NodeExistsError:
      raise StateException("NodeExistsError while creating execution state",
                           StateException.EX_TYPE_NODE_EXISTS_ERROR), None, sys.exc_info()[2]
    except ZookeeperError:
      raise StateException("Zookeeper while creating execution state",
                           StateException.EX_TYPE_ZOOKEEPER_ERROR), None, sys.exc_info()[2]
    except Exception:
      # Just re raise the exception.
      raise

  def delete_execution_state(self, topologyName):
    """ delete execution state """
    path = self.get_execution_state_path(topologyName)
    LOG.info("Removing topology: {0} from path: {1}".format(
        topologyName, path))
    try:
      self.client.delete(path)
      return True
    except NoNodeError:
      raise StateException("NoNodeError while deleting execution state",
                           StateException.EX_TYPE_NO_NODE_ERROR), None, sys.exc_info()[2]
    except NotEmptyError:
      raise StateException("NotEmptyError while deleting execution state",
                           StateException.EX_TYPE_NOT_EMPTY_ERROR), None, sys.exc_info()[2]
    except ZookeeperError:
      raise StateException("Zookeeper while deleting execution state",
                           StateException.EX_TYPE_ZOOKEEPER_ERROR), None, sys.exc_info()[2]
    except Exception:
      # Just re raise the exception.
      raise

  def get_tmaster(self, topologyName, callback=None):
    """ get tmaster """
    isWatching = False

    # Temp dict used to return result
    # if callback is not provided.
    ret = {
        "result": None
    }
    if callback:
      isWatching = True
    else:
      def callback(data):
        """
        Custom callback to get the topologies right now.
        """
        ret["result"] = data

    self._get_tmaster_with_watch(topologyName, callback, isWatching)

    # The topologies are now populated with the data.
    return ret["result"]

  def _get_tmaster_with_watch(self, topologyName, callback, isWatching):
    """
    Helper function to get pplan with
    a callback. The future watch is placed
    only if isWatching is True.
    """
    path = self.get_tmaster_path(topologyName)
    if isWatching:
      LOG.info("Adding data watch for path: " + path)

    # pylint: disable=unused-variable, unused-argument
    @self.client.DataWatch(path)
    def watch_tmaster(data, stats):
      """ invoke callback to watch tmaster """
      if data:
        tmaster = TMasterLocation()
        tmaster.ParseFromString(data)
        callback(tmaster)
      else:
        callback(None)

      # Returning False will result in no future watches
      # being triggered. If isWatching is True, then
      # the future watches will be triggered.
      return isWatching

  def get_scheduler_location(self, topologyName, callback=None):
    """ get scheduler location """
    isWatching = False

    # Temp dict used to return result
    # if callback is not provided.
    ret = {
        "result": None
    }
    if callback:
      isWatching = True
    else:
      def callback(data):
        """
        Custom callback to get the scheduler location right now.
        """
        ret["result"] = data

    self._get_scheduler_location_with_watch(topologyName, callback, isWatching)

    return ret["result"]

  def _get_scheduler_location_with_watch(self, topologyName, callback, isWatching):
    """
    Helper function to get scheduler location with
    a callback. The future watch is placed
    only if isWatching is True.
    """
    path = self.get_scheduler_location_path(topologyName)
    if isWatching:
      LOG.info("Adding data watch for path: " + path)

    # pylint: disable=unused-variable, unused-argument
    @self.client.DataWatch(path)
    def watch_scheduler_location(data, stats):
      """ invoke callback to watch scheduler location """
      if data:
        scheduler_location = SchedulerLocation()
        scheduler_location.ParseFromString(data)
        callback(scheduler_location)
      else:
        callback(None)

      # Returning False will result in no future watches
      # being triggered. If isWatching is True, then
      # the future watches will be triggered.
      return isWatching
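# Every _get_*_with_watch helper above follows the same shape: deserialize the
# protobuf if data is present, invoke the callback, and keep the watch alive
# only when a caller supplied one. A generic sketch of that pattern; proto_cls
# stands in for Topology, PhysicalPlan, ExecutionState, etc.
def watch_proto_node(client, path, proto_cls, callback, is_watching):
    """Attach a kazoo DataWatch that hands a parsed protobuf to callback."""
    @client.DataWatch(path)
    def watcher(data, stat):
        if data:
            msg = proto_cls()
            msg.ParseFromString(data)
            callback(msg)
        else:
            callback(None)
        # Returning False removes the watch after the first call
        return is_watching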
Beispiel #43
0
def start_kafka(zk_client_port, broker_listen_port, broker_id=0):
    if not os.path.exists(kafka_bdir):
        output, _ = call_command_("mkdir " + kafka_bdir)
    kafka_download = 'wget -nv --tries=3 -c -O ' + kafka_bdir + kafka_dl + \
        ' https://github.com/Juniper/contrail-third-party-cache/blob/master/kafka' + \
        kafka_dl + '?raw=true'
    if not os.path.exists(kafka_bdir + kafka_dl):
        process = subprocess.Popen(kafka_download.split(' '))
        process.wait()
        if process.returncode != 0:
            return False

    basefile = kafka_version
    kafkabase = "/tmp/kafka.%s.%d/" % (os.getenv('USER',
                                                 'None'), broker_listen_port)
    confdir = kafkabase + basefile + "/config/"
    output, _ = call_command_("rm -rf " + kafkabase)
    output, _ = call_command_("mkdir " + kafkabase)

    logging.info('Check zookeeper in %d' % zk_client_port)
    zk = KazooClient(hosts='127.0.0.1:' + str(zk_client_port), timeout=60.0)
    try:
        zk.start()
        zk.delete("/brokers", recursive=True)
        zk.delete("/consumers", recursive=True)
        zk.delete("/controller", recursive=True)
    except:
        logging.info("Zookeeper client cannot connect")
        zk.stop()
        return False
    zk.stop()
    logging.info('Installing kafka in ' + kafkabase)
    x = os.system("cat " + kafka_bdir + kafka_dl + " | tar -xpzf - -C " +
                  kafkabase)
    if 0 != x:
        logging.error("Cannot install kafka")
        return False

    logging.info('kafka Port %d' % broker_listen_port)

    replace_string_(confdir + "server.properties",
                    [("#listeners=PLAINTEXT://:9092",
                      "listeners=PLAINTEXT://:" + str(broker_listen_port))])

    # Replace the broker id and port number in the config file
    replace_string_(
        confdir + "server.properties",
        [("broker.id=0", "broker.id=" + str(broker_id)),
         ("zookeeper.connect=localhost:2181",
          "zookeeper.connect=localhost:%d" % zk_client_port),
         ("log.dirs=/tmp/kafka-logs", "log.dirs=" + kafkabase + "logs")])

    replace_string_(kafkabase + basefile + "/bin/kafka-server-stop.sh",
                    [("grep -v grep", "grep %s | grep -v grep" % kafkabase)])
    replace_string_(kafkabase + basefile + "/bin/kafka-server-stop.sh",
                    [("SIGINT", "SIGKILL")])
    replace_string_(kafkabase + basefile + "/bin/kafka-server-stop.sh",
                    [("#!/bin/sh", "#!/bin/sh -x")])
    output, _ = call_command_("chmod +x " + kafkabase + basefile +
                              "/bin/kafka-server-stop.sh")

    # Extra options for JMX : -Djava.net.preferIPv4Stack=true -Djava.rmi.server.hostname=xx.xx.xx.xx
    output, _ = call_command_(kafkabase + basefile +
                              "/bin/kafka-server-start.sh -daemon " +
                              kafkabase + basefile +
                              "/config/server.properties")

    count = 0
    start_wait = int(os.getenv('CONTRIAL_ANALYTICS_TEST_MAX_START_WAIT_TIME', 15))
    while count < start_wait:
        try:
            logging.info('Trying to connect...')
            kk = KafkaClient("localhost:%d" % broker_listen_port)
        except:
            count += 1
            time.sleep(1)
        else:
            return True

    logging.info("Kafka client cannot connect. Kafka logfile below:")
    with open(kafkabase + basefile + "/logs/kafkaServer.out", 'r') as fin:
        logging.info(fin.read())
    return False
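# An alternative readiness probe (a sketch, not part of the original) polls the
# broker registrations that Kafka itself writes under /brokers/ids in the same
# ZooKeeper ensemble, instead of retrying KafkaClient connections.
import time
from kazoo.client import KazooClient

def wait_for_brokers(zk_client_port, expected=1, timeout_s=15):
    """Return True once at least `expected` brokers are registered in ZooKeeper."""
    zk = KazooClient(hosts='127.0.0.1:' + str(zk_client_port))
    zk.start()
    try:
        deadline = time.time() + timeout_s
        while time.time() < deadline:
            if zk.exists('/brokers/ids') and \
                    len(zk.get_children('/brokers/ids')) >= expected:
                return True
            time.sleep(1)
        return False
    finally:
        zk.stop()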
class KazooCommandProxy():
    def __init__(self, module):
        self.module = module
        self.zk = KazooClient(module.params['hosts'])

    def absent(self):
        return self._absent(self.module.params['name'])

    def exists(self, znode):
        return self.zk.exists(znode)

    def list(self):
        children = self.zk.get_children(self.module.params['name'])
        return True, {'count': len(children), 'items': children, 'msg': 'Retrieved znodes in path.',
                      'znode': self.module.params['name']}

    def present(self):
        return self._present(self.module.params['name'], self.module.params['value'])

    def get(self):
        return self._get(self.module.params['name'])

    def shutdown(self):
        self.zk.stop()
        self.zk.close()

    def start(self):
        self.zk.start()

    def wait(self):
        return self._wait(self.module.params['name'], self.module.params['timeout'])

    def _absent(self, znode):
        if self.exists(znode):
            self.zk.delete(znode, recursive=self.module.params['recursive'])
            return True, {'changed': True, 'msg': 'The znode was deleted.'}
        else:
            return True, {'changed': False, 'msg': 'The znode does not exist.'}

    def _get(self, path):
        if self.exists(path):
            value, zstat = self.zk.get(path)
            stat_dict = {}
            for i in dir(zstat):
                if not i.startswith('_'):
                    attr = getattr(zstat, i)
                    if isinstance(attr, (int, str)):
                        stat_dict[i] = attr
            result = True, {'msg': 'The node was retrieved.', 'znode': path, 'value': value,
                            'stat': stat_dict}
        else:
            result = False, {'msg': 'The requested node does not exist.'}

        return result

    def _present(self, path, value):
        if self.exists(path):
            (current_value, zstat) = self.zk.get(path)
            if value != current_value:
                self.zk.set(path, value)
                return True, {'changed': True, 'msg': 'Updated the znode value.', 'znode': path,
                              'value': value}
            else:
                return True, {'changed': False, 'msg': 'No changes were necessary.', 'znode': path, 'value': value}
        else:
            self.zk.create(path, value, makepath=True)
            return True, {'changed': True, 'msg': 'Created a new znode.', 'znode': path, 'value': value}

    def _wait(self, path, timeout, interval=5):
        lim = time.time() + timeout

        while time.time() < lim:
            if self.exists(path):
                return True, {'msg': 'The node appeared before the configured timeout.',
                              'znode': path, 'timeout': timeout}
            else:
                time.sleep(interval)

        return False, {'msg': 'The node did not appear before the operation timed out.', 'timeout': timeout,
                       'znode': path}
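# Outside of Ansible, the proxy above can be driven by any object that exposes
# a params dict; a minimal sketch with a hypothetical stand-in module object
# and an illustrative znode path/value.
class FakeModule(object):
    """Hypothetical stand-in for AnsibleModule, providing only .params."""
    def __init__(self, params):
        self.params = params

module = FakeModule({'hosts': '127.0.0.1:2181', 'name': '/example/znode',
                     'value': b'hello', 'recursive': True, 'timeout': 30})
proxy = KazooCommandProxy(module)
proxy.start()
ok, result = proxy.present()       # creates or updates /example/znode
print(ok, result)
proxy.shutdown()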
Beispiel #45
0
class BalancedConsumer():
    """
    A self-balancing consumer for Kafka that uses ZooKeeper to communicate
    with other balancing consumers.

    Maintains a single instance of SimpleConsumer, periodically using the
    consumer rebalancing algorithm to reassign partitions to this
    SimpleConsumer.
    """
    def __init__(self,
                 topic,
                 cluster,
                 consumer_group,
                 fetch_message_max_bytes=1024 * 1024,
                 num_consumer_fetchers=1,
                 auto_commit_enable=False,
                 auto_commit_interval_ms=60 * 1000,
                 queued_max_messages=2000,
                 fetch_min_bytes=1,
                 fetch_wait_max_ms=100,
                 offsets_channel_backoff_ms=1000,
                 offsets_commit_max_retries=5,
                 auto_offset_reset=OffsetType.EARLIEST,
                 consumer_timeout_ms=-1,
                 rebalance_max_retries=5,
                 rebalance_backoff_ms=2 * 1000,
                 zookeeper_connection_timeout_ms=6 * 1000,
                 zookeeper_connect='127.0.0.1:2181',
                 zookeeper=None,
                 auto_start=True,
                 reset_offset_on_start=False):
        """Create a BalancedConsumer instance

        :param topic: The topic this consumer should consume
        :type topic: :class:`pykafka.topic.Topic`
        :param cluster: The cluster to which this consumer should connect
        :type cluster: :class:`pykafka.cluster.Cluster`
        :param consumer_group: The name of the consumer group this consumer
            should join.
        :type consumer_group: bytes
        :param fetch_message_max_bytes: The number of bytes of messages to
            attempt to fetch with each fetch request
        :type fetch_message_max_bytes: int
        :param num_consumer_fetchers: The number of workers used to make
            FetchRequests
        :type num_consumer_fetchers: int
        :param auto_commit_enable: If true, periodically commit to kafka the
            offset of messages already fetched by this consumer. This also
            requires that `consumer_group` is not `None`.
        :type auto_commit_enable: bool
        :param auto_commit_interval_ms: The frequency (in milliseconds) at which
            the consumer's offsets are committed to kafka. This setting is
            ignored if `auto_commit_enable` is `False`.
        :type auto_commit_interval_ms: int
        :param queued_max_messages: The maximum number of messages buffered for
            consumption in the internal
            :class:`pykafka.simpleconsumer.SimpleConsumer`
        :type queued_max_messages: int
        :param fetch_min_bytes: The minimum amount of data (in bytes) that the
            server should return for a fetch request. If insufficient data is
            available, the request will block until sufficient data is available.
        :type fetch_min_bytes: int
        :param fetch_wait_max_ms: The maximum amount of time (in milliseconds)
            that the server will block before answering a fetch request if
            there isn't sufficient data to immediately satisfy `fetch_min_bytes`.
        :type fetch_wait_max_ms: int
        :param offsets_channel_backoff_ms: Backoff time to retry failed offset
            commits and fetches.
        :type offsets_channel_backoff_ms: int
        :param offsets_commit_max_retries: The number of times the offset commit
            worker should retry before raising an error.
        :type offsets_commit_max_retries: int
        :param auto_offset_reset: What to do if an offset is out of range. This
            setting indicates how to reset the consumer's internal offset
            counter when an `OffsetOutOfRangeError` is encountered.
        :type auto_offset_reset: :class:`pykafka.common.OffsetType`
        :param consumer_timeout_ms: Amount of time (in milliseconds) the
            consumer may spend without messages available for consumption
            before returning None.
        :type consumer_timeout_ms: int
        :param rebalance_max_retries: The number of times the rebalance should
            retry before raising an error.
        :type rebalance_max_retries: int
        :param rebalance_backoff_ms: Backoff time (in milliseconds) between
            retries during rebalance.
        :type rebalance_backoff_ms: int
        :param zookeeper_connection_timeout_ms: The maximum time (in
            milliseconds) that the consumer waits while establishing a
            connection to zookeeper.
        :type zookeeper_connection_timeout_ms: int
        :param zookeeper_connect: Comma-separated (ip1:port1,ip2:port2) strings
            indicating the zookeeper nodes to which to connect.
        :type zookeeper_connect: str
        :param zookeeper: A KazooClient connected to a Zookeeper instance.
            If provided, `zookeeper_connect` is ignored.
        :type zookeeper: :class:`kazoo.client.KazooClient`
        :param auto_start: Whether the consumer should begin communicating
            with zookeeper after __init__ is complete. If false, communication
            can be started with `start()`.
        :type auto_start: bool
        :param reset_offset_on_start: Whether the consumer should reset its
            internal offset counter to `self._auto_offset_reset` and commit that
            offset immediately upon starting up
        :type reset_offset_on_start: bool
        """
        self._cluster = cluster
        self._consumer_group = consumer_group
        self._topic = topic

        self._auto_commit_enable = auto_commit_enable
        self._auto_commit_interval_ms = auto_commit_interval_ms
        self._fetch_message_max_bytes = fetch_message_max_bytes
        self._fetch_min_bytes = fetch_min_bytes
        self._rebalance_max_retries = rebalance_max_retries
        self._num_consumer_fetchers = num_consumer_fetchers
        self._queued_max_messages = queued_max_messages
        self._fetch_wait_max_ms = fetch_wait_max_ms
        self._rebalance_backoff_ms = rebalance_backoff_ms
        self._consumer_timeout_ms = consumer_timeout_ms
        self._offsets_channel_backoff_ms = offsets_channel_backoff_ms
        self._offsets_commit_max_retries = offsets_commit_max_retries
        self._auto_offset_reset = auto_offset_reset
        self._zookeeper_connect = zookeeper_connect
        self._zookeeper_connection_timeout_ms = zookeeper_connection_timeout_ms
        self._reset_offset_on_start = reset_offset_on_start
        self._running = False

        self._rebalancing_lock = cluster.handler.Lock()
        self._consumer = None
        self._consumer_id = "{hostname}:{uuid}".format(
            hostname=socket.gethostname(),
            uuid=uuid4()
        )
        self._partitions = set()
        self._setting_watches = True

        self._topic_path = '/consumers/{group}/owners/{topic}'.format(
            group=self._consumer_group,
            topic=self._topic.name)
        self._consumer_id_path = '/consumers/{group}/ids'.format(
            group=self._consumer_group)

        self._zookeeper = None
        if zookeeper is not None:
            self._zookeeper = zookeeper
        if auto_start is True:
            self.start()

    def __repr__(self):
        return "<{module}.{name} at {id_} (consumer_group={group})>".format(
            module=self.__class__.__module__,
            name=self.__class__.__name__,
            id_=hex(id(self)),
            group=self._consumer_group
        )

    def _setup_checker_worker(self):
        """Start the zookeeper partition checker thread"""
        def checker():
            while True:
                time.sleep(120)
                if not self._running:
                    break
                self._check_held_partitions()
            log.debug("Checker thread exiting")
        log.debug("Starting checker thread")
        return self._cluster.handler.spawn(checker)

    @property
    def partitions(self):
        return self._consumer.partitions if self._consumer else None

    @property
    def held_offsets(self):
        """Return a map from partition id to held offset for each partition"""
        if not self._consumer:
            return None
        return dict((p.partition.id, p.last_offset_consumed)
                    for p in self._consumer._partitions_by_id.itervalues())

    def start(self):
        """Open connections and join a cluster."""
        if self._zookeeper is None:
            self._setup_zookeeper(self._zookeeper_connect,
                                  self._zookeeper_connection_timeout_ms)
        self._zookeeper.ensure_path(self._topic_path)
        self._add_self()
        self._set_watches()
        self._rebalance()
        self._running = True
        self._setup_checker_worker()

    def stop(self):
        """Close the zookeeper connection and stop consuming.

        This method should be called as part of a graceful shutdown process.
        """
        self._zookeeper.stop()
        self._consumer.stop()
        self._running = False

    def _setup_zookeeper(self, zookeeper_connect, timeout):
        """Open a connection to a ZooKeeper host.

        :param zookeeper_connect: The 'ip:port' address of the zookeeper node to
            which to connect.
        :type zookeeper_connect: str
        :param timeout: Connection timeout (in milliseconds)
        :type timeout: int
        """
        self._zookeeper = KazooClient(zookeeper_connect, timeout=timeout / 1000)
        self._zookeeper.start()

    def _setup_internal_consumer(self, start=True):
        """Instantiate an internal SimpleConsumer.

        If there is already a SimpleConsumer instance held by this object,
        disable its workers and mark it for garbage collection before
        creating a new one.
        """
        reset_offset_on_start = self._reset_offset_on_start
        if self._consumer is not None:
            self._consumer.stop()
            # only use this setting for the first call to
            # _setup_internal_consumer. subsequent calls should not
            # reset the offsets, since they can happen at any time
            reset_offset_on_start = False
        self._consumer = SimpleConsumer(
            self._topic,
            self._cluster,
            consumer_group=self._consumer_group,
            partitions=list(self._partitions),
            auto_commit_enable=self._auto_commit_enable,
            auto_commit_interval_ms=self._auto_commit_interval_ms,
            fetch_message_max_bytes=self._fetch_message_max_bytes,
            fetch_min_bytes=self._fetch_min_bytes,
            num_consumer_fetchers=self._num_consumer_fetchers,
            queued_max_messages=self._queued_max_messages,
            fetch_wait_max_ms=self._fetch_wait_max_ms,
            consumer_timeout_ms=self._consumer_timeout_ms,
            offsets_channel_backoff_ms=self._offsets_channel_backoff_ms,
            offsets_commit_max_retries=self._offsets_commit_max_retries,
            auto_offset_reset=self._auto_offset_reset,
            reset_offset_on_start=reset_offset_on_start,
            auto_start=start
        )

    def _decide_partitions(self, participants):
        """Decide which partitions belong to this consumer.

        Uses the consumer rebalancing algorithm described here
        http://kafka.apache.org/documentation.html

        It is very important that the participants array is sorted,
        since this algorithm runs on each consumer and indexes into the same
        array. The same array index operation must return the same
        result on each consumer.

        :param participants: Sorted list of ids of all other consumers in this
            consumer group.
        :type participants: Iterable of `bytes`
        """
        # Freeze and sort partitions so we always have the same results
        p_to_str = lambda p: '-'.join([str(p.topic.name), str(p.leader.id), str(p.id)])
        all_parts = self._topic.partitions.values()
        all_parts = sorted(all_parts, key=p_to_str)

        # get start point, # of partitions, and remainder
        participants = sorted(participants)  # just make sure it's sorted.
        idx = participants.index(self._consumer_id)
        parts_per_consumer = len(all_parts) // len(participants)
        remainder_ppc = len(all_parts) % len(participants)

        start = parts_per_consumer * idx + min(idx, remainder_ppc)
        num_parts = parts_per_consumer + (0 if (idx + 1 > remainder_ppc) else 1)

        # assign partitions from i*N to (i+1)*N - 1 to consumer Ci
        new_partitions = itertools.islice(all_parts, int(start), int(start + num_parts))
        new_partitions = set(new_partitions)
        log.info('Balancing %i participants for %i partitions.\nOwning %i partitions.',
                 len(participants), len(all_parts), len(new_partitions))
        log.debug('My partitions: %s', [p_to_str(p) for p in new_partitions])
        return new_partitions

    def _get_participants(self):
        """Use zookeeper to get the other consumers of this topic.

        :return: A sorted list of the ids of the other consumers of this
            consumer's topic
        """
        try:
            consumer_ids = self._zookeeper.get_children(self._consumer_id_path)
        except NoNodeException:
            log.debug("Consumer group doesn't exist. "
                      "No participants to find")
            return []

        participants = []
        for id_ in consumer_ids:
            try:
                topic, stat = self._zookeeper.get("%s/%s" % (self._consumer_id_path, id_))
                if topic == self._topic.name:
                    participants.append(id_)
            except NoNodeException:
                pass  # disappeared between ``get_children`` and ``get``
        participants = sorted(participants)
        return participants

    def _set_watches(self):
        """Set watches in zookeeper that will trigger rebalances.

        Rebalances should be triggered whenever a broker, topic, or consumer
        znode is changed in zookeeper. This ensures that the balance of the
        consumer group remains up-to-date with the current state of the
        cluster.
        """
        self._setting_watches = True
        # Set all our watches and then rebalance
        broker_path = '/brokers/ids'
        try:
            self._broker_watcher = ChildrenWatch(
                self._zookeeper, broker_path,
                self._brokers_changed
            )
        except NoNodeException:
            raise Exception(
                'The broker_path "%s" does not exist in your '
                'ZooKeeper cluster -- is your Kafka cluster running?'
                % broker_path)

        self._topics_watcher = ChildrenWatch(
            self._zookeeper,
            '/brokers/topics',
            self._topics_changed
        )

        self._consumer_watcher = ChildrenWatch(
            self._zookeeper, self._consumer_id_path,
            self._consumers_changed
        )
        self._setting_watches = False

    def _add_self(self):
        """Register this consumer in zookeeper.

        This method ensures that the number of participants is at most the
        number of partitions.
        """
        participants = self._get_participants()
        if len(self._topic.partitions) <= len(participants):
            raise KafkaException("Cannot add consumer: more consumers than partitions")

        path = '{path}/{id_}'.format(
            path=self._consumer_id_path,
            id_=self._consumer_id
        )
        self._zookeeper.create(
            path, self._topic.name, ephemeral=True, makepath=True)

    def _rebalance(self):
        """Claim partitions for this consumer.

        This method is called whenever a zookeeper watch is triggered.
        """
        if self._consumer is not None:
            self.commit_offsets()
        with self._rebalancing_lock:
            log.info('Rebalancing consumer %s for topic %s.',
                     self._consumer_id, self._topic.name)

            for i in range(self._rebalance_max_retries):
                try:
                    # If retrying, be sure to make sure the
                    # partition allocation is correct.
                    participants = self._get_participants()
                    partitions = self._decide_partitions(participants)

                    old_partitions = self._partitions - partitions
                    self._remove_partitions(old_partitions)

                    new_partitions = partitions - self._partitions
                    self._add_partitions(new_partitions)

                    # Only re-create internal consumer if something changed.
                    if old_partitions or new_partitions:
                        self._setup_internal_consumer()

                    log.info('Rebalancing Complete.')
                    break
                except PartitionOwnedError as ex:
                    if i == self._rebalance_max_retries - 1:
                        log.warning('Failed to acquire partition %s after %d retries.',
                                    ex.partition, i)
                        raise
                    log.info('Unable to acquire partition %s. Retrying', ex.partition)
                    time.sleep(i * (self._rebalance_backoff_ms / 1000))

    def _path_from_partition(self, p):
        """Given a partition, return its path in zookeeper.

        :type p: :class:`pykafka.partition.Partition`
        """
        return "%s/%s-%s" % (self._topic_path, p.leader.id, p.id)

    def _remove_partitions(self, partitions):
        """Remove partitions from the zookeeper registry for this consumer.

        Also remove these partitions from the consumer's internal
        partition registry.

        :param partitions: The partitions to remove.
        :type partitions: Iterable of :class:`pykafka.partition.Partition`
        """
        for p in partitions:
            assert p in self._partitions
            self._zookeeper.delete(self._path_from_partition(p))
        self._partitions -= partitions

    def _add_partitions(self, partitions):
        """Add partitions to the zookeeper registry for this consumer.

        Also add these partitions to the consumer's internal partition registry.

        :param partitions: The partitions to add.
        :type partitions: Iterable of :class:`pykafka.partition.Partition`
        """
        for p in partitions:
            try:
                self._zookeeper.create(
                    self._path_from_partition(p),
                    value=get_bytes(self._consumer_id),
                    ephemeral=True
                )
                self._partitions.add(p)
            except NodeExistsError:
                raise PartitionOwnedError(p)

    def _check_held_partitions(self):
        """Double-check held partitions against zookeeper

        Ensure that the partitions held by this consumer are the ones that
        zookeeper thinks it's holding. If not, rebalance.
        """
        log.info("Checking held partitions against ZooKeeper")
        # build a set of partition ids zookeeper says we own
        zk_partition_ids = set()
        all_partitions = self._zookeeper.get_children(self._topic_path)
        for partition_slug in all_partitions:
            owner_id, stat = self._zookeeper.get(
                '{path}/{slug}'.format(
                    path=self._topic_path, slug=partition_slug))
            if owner_id == self._consumer_id:
                zk_partition_ids.add(int(partition_slug.split('-')[1]))
        # build a set of partition ids we think we own
        internal_partition_ids = set([p.id for p in self._partitions])
        # compare the two sets, rebalance if necessary
        if internal_partition_ids != zk_partition_ids:
            log.warning("Internal partition registry doesn't match ZooKeeper!")
            log.debug("Internal partition ids: %s\nZooKeeper partition ids: %s",
                      internal_partition_ids, zk_partition_ids)
            self._rebalance()

    def _brokers_changed(self, brokers):
        if self._setting_watches:
            return
        log.debug("Rebalance triggered by broker change")
        self._rebalance()

    def _consumers_changed(self, consumers):
        if self._setting_watches:
            return
        log.debug("Rebalance triggered by consumer change")
        self._rebalance()

    def _topics_changed(self, topics):
        if self._setting_watches:
            return
        log.debug("Rebalance triggered by topic change")
        self._rebalance()

    def reset_offsets(self, partition_offsets=None):
        """Reset offsets for the specified partitions

        Issue an OffsetRequest for each partition and set the appropriate
        returned offset in the OwnedPartition

        :param partition_offsets: (`partition`, `offset`) pairs to reset
            where `partition` is the partition for which to reset the offset
            and `offset` is the new offset the partition should have
        :type partition_offsets: Iterable of
            (:class:`pykafka.partition.Partition`, int)
        """
        if not self._consumer:
            raise ConsumerStoppedException("Internal consumer is stopped")
        self._consumer.reset_offsets(partition_offsets=partition_offsets)

    def consume(self, block=True):
        """Get one message from the consumer

        :param block: Whether to block while waiting for a message
        :type block: bool
        """

        def consumer_timed_out():
            """Indicates whether the consumer has received messages recently"""
            if self._consumer_timeout_ms == -1:
                return False
            disp = (time.time() - self._last_message_time) * 1000.0
            return disp > self._consumer_timeout_ms
        message = None
        self._last_message_time = time.time()
        while message is None and not consumer_timed_out():
            try:
                message = self._consumer.consume(block=block)
            except ConsumerStoppedException:
                if not self._running:
                    return
                continue
            if message:
                self._last_message_time = time.time()
            if not block:
                return message
        return message

    def commit_offsets(self):
        """Commit offsets for this consumer's partitions

        Uses the offset commit/fetch API
        """
        return self._consumer.commit_offsets()
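A minimal standalone sketch (function and consumer names here are hypothetical, not part of the example above) of the same slice arithmetic used by _decide_partitions(): each consumer takes one contiguous block of partitions, and the first "remainder" consumers take one extra partition.

def assign_slice(num_partitions, consumer_ids, me):
    # Mirror of the arithmetic above: sorted ids, contiguous blocks,
    # remainder spread over the first consumers.
    consumer_ids = sorted(consumer_ids)
    idx = consumer_ids.index(me)
    per_consumer, remainder = divmod(num_partitions, len(consumer_ids))
    start = per_consumer * idx + min(idx, remainder)
    count = per_consumer + (1 if idx < remainder else 0)
    return list(range(start, start + count))

# 10 partitions across 3 consumers -> [0..3], [4..6], [7..9]
for c in ('c0', 'c1', 'c2'):
    print(c, assign_slice(10, ['c0', 'c1', 'c2'], c))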
Beispiel #46
0
class ZooKeeper(AbstractDCS):
    def __init__(self, name, config):
        super(ZooKeeper, self).__init__(name, config)

        hosts = config.get('hosts', [])
        if isinstance(hosts, list):
            hosts = ','.join(hosts)

        self.exhibitor = None
        if 'exhibitor' in config:
            exhibitor = config['exhibitor']
            interval = exhibitor.get('poll_interval', 300)
            self.exhibitor = ExhibitorEnsembleProvider(exhibitor['hosts'],
                                                       exhibitor['port'],
                                                       poll_interval=interval)
            hosts = self.exhibitor.zookeeper_hosts

        self.client = KazooClient(
            hosts=hosts,
            timeout=(config.get('session_timeout', None) or 30),
            command_retry={
                'deadline': (config.get('reconnect_timeout', None) or 10),
                'max_delay': 1,
                'max_tries': -1
            },
            connection_retry={'max_delay': 1, 'max_tries': -1})
        self.client.add_listener(self.session_listener)

        self._my_member_data = None
        self.fetch_cluster = True
        self.last_leader_operation = 0

        self.client.start(None)

    def session_listener(self, state):
        if state in [KazooState.SUSPENDED, KazooState.LOST]:
            self.cluster_watcher(None)

    def cluster_watcher(self, event):
        self.fetch_cluster = True
        self.event.set()

    def get_node(self, key, watch=None):
        try:
            ret = self.client.get(key, watch)
            return (ret[0].decode('utf-8'), ret[1])
        except NoNodeError:
            return None

    @staticmethod
    def member(name, value, znode):
        return Member.from_node(znode.version, name, znode.ephemeralOwner,
                                value)

    def get_children(self, key, watch=None):
        try:
            return self.client.get_children(key, watch)
        except NoNodeError:
            return []

    def load_members(self):
        members = []
        for member in self.get_children(self.members_path,
                                        self.cluster_watcher):
            data = self.get_node(self.members_path + member)
            if data is not None:
                members.append(self.member(member, *data))
        return members

    def _inner_load_cluster(self):
        self.fetch_cluster = False
        self.event.clear()
        nodes = set(
            self.get_children(self.client_path(''), self.cluster_watcher))
        if not nodes:
            self.fetch_cluster = True

        # get initialize flag
        initialize = (self.get_node(self.initialize_path)
                      or [None])[0] if self._INITIALIZE in nodes else None

        # get list of members
        members = self.load_members() if self._MEMBERS[:-1] in nodes else []

        # get leader
        leader = self.get_node(
            self.leader_path) if self._LEADER in nodes else None
        if leader:
            client_id = self.client.client_id
            if leader[0] == self._name and client_id is not None and client_id[
                    0] != leader[1].ephemeralOwner:
                logger.info(
                    'I am leader but not owner of the session. Removing leader node'
                )
                self.client.delete(self.leader_path)
                leader = None

            if leader:
                member = Member(-1, leader[0], None, {})
                member = ([m for m in members if m.name == leader[0]]
                          or [member])[0]
                leader = Leader(leader[1].version, leader[1].ephemeralOwner,
                                member)
                self.fetch_cluster = member.index == -1

        # failover key
        failover = self.get_node(
            self.failover_path,
            watch=self.cluster_watcher) if self._FAILOVER in nodes else None
        if failover:
            failover = Failover.from_node(failover[1].version, failover[0])

        # get last leader operation
        optime = self.get_node(
            self.leader_optime_path
        ) if self._OPTIME in nodes and self.fetch_cluster else None
        self.last_leader_operation = 0 if optime is None else int(optime[0])
        self._cluster = Cluster(initialize, leader, self.last_leader_operation,
                                members, failover)

    def _load_cluster(self):
        if self.exhibitor and self.exhibitor.poll():
            self.client.set_hosts(self.exhibitor.zookeeper_hosts)

        if self.fetch_cluster:
            try:
                self.client.retry(self._inner_load_cluster)
            except:
                logger.exception('get_cluster')
                self.session_listener(KazooState.LOST)
                raise ZooKeeperError('ZooKeeper is not responding properly')

    def _create(self, path, value, **kwargs):
        try:
            self.client.retry(self.client.create, path, value.encode('utf-8'),
                              **kwargs)
            return True
        except:
            return False

    def attempt_to_acquire_leader(self):
        ret = self._create(self.leader_path,
                           self._name,
                           makepath=True,
                           ephemeral=True)
        ret or logger.info('Could not take out TTL lock')
        return ret

    def set_failover_value(self, value, index=None):
        try:
            self.client.retry(self.client.set,
                              self.failover_path,
                              value.encode('utf-8'),
                              version=index or -1)
            return True
        except NoNodeError:
            return value == '' or (not index
                                   and self._create(self.failover_path, value))
        except:
            logger.exception('set_failover_value')
            return False

    def initialize(self, create_new=True, sysid=""):
        return self._create(self.initialize_path, sysid, makepath=True) if create_new \
            else self.client.retry(self.client.set, self.initialize_path,  sysid.encode("utf-8"))

    def touch_member(self, data, ttl=None):
        cluster = self.cluster
        me = cluster and ([m for m in cluster.members if m.name == self._name]
                          or [None])[0]
        path = self.member_path
        data = data.encode('utf-8')
        create = not me
        if me and self.client.client_id is not None and me.session != self.client.client_id[
                0]:
            try:
                self.client.retry(self.client.delete, path)
            except NoNodeError:
                pass
            except:
                return False
            create = True

        if not create and data == self._my_member_data:
            return True

        try:
            if create:
                self.client.retry(self.client.create,
                                  path,
                                  data,
                                  makepath=True,
                                  ephemeral=True)
            else:
                self.client.retry(self.client.set, path, data)
            self._my_member_data = data
            return True
        except NodeExistsError:
            try:
                self.client.retry(self.client.set, path, data)
                self._my_member_data = data
                return True
            except:
                logger.exception('touch_member')
        except:
            logger.exception('touch_member')
        return False

    def take_leader(self):
        return self.attempt_to_acquire_leader()

    def write_leader_optime(self, last_operation):
        last_operation = last_operation.encode('utf-8')
        if last_operation != self.last_leader_operation:
            self.last_leader_operation = last_operation
            path = self.leader_optime_path
            try:
                self.client.retry(self.client.set, path, last_operation)
            except NoNodeError:
                try:
                    self.client.retry(self.client.create,
                                      path,
                                      last_operation,
                                      makepath=True)
                except:
                    logger.exception('Failed to create %s', path)
            except:
                logger.exception('Failed to update %s', path)

    def update_leader(self):
        return True

    def delete_leader(self):
        self.client.restart()
        self._my_member_data = None
        return True

    def _cancel_initialization(self):
        node = self.get_node(self.initialize_path)
        if node:
            self.client.delete(self.initialize_path, version=node[1].version)

    def cancel_initialization(self):
        try:
            self.client.retry(self._cancel_initialization)
        except:
            logger.exception("Unable to delete initialize key")

    def watch(self, timeout):
        if super(ZooKeeper, self).watch(timeout):
            self.fetch_cluster = True
        return self.fetch_cluster
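A minimal sketch (standalone, with a hypothetical client and path) of the retry-wrapped write pattern the DCS methods above rely on: kazoo's KazooClient.retry() re-runs the operation on connection loss, and NoNodeError/NodeExistsError decide whether to fall back to create or set.

from kazoo.client import KazooClient
from kazoo.exceptions import NoNodeError, NodeExistsError

def write_key(client, path, value):
    """Set the node if it exists, otherwise create it (with parents)."""
    data = value.encode('utf-8')
    try:
        client.retry(client.set, path, data)
    except NoNodeError:
        try:
            client.retry(client.create, path, data, makepath=True)
        except NodeExistsError:
            # Lost a race with another writer; a plain set is now safe.
            client.retry(client.set, path, data)

client = KazooClient(hosts='localhost:2181')
client.start()
write_key(client, '/service/leader/optime', '42')
client.stop()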
Beispiel #47
0
class ZooKeeperJobStore(BaseJobStore):
    """
    Stores jobs in a ZooKeeper tree. Any leftover keyword arguments are directly passed to
    kazoo's `KazooClient
    <http://kazoo.readthedocs.io/en/latest/api/client.html>`_.

    Plugin alias: ``zookeeper``

    :param str path: path to store jobs in
    :param client: a :class:`~kazoo.client.KazooClient` instance to use instead of
        providing connection arguments
    :param int pickle_protocol: pickle protocol level to use (for serialization), defaults to the
        highest available
    """
    def __init__(self,
                 path='/apscheduler',
                 client=None,
                 close_connection_on_exit=False,
                 pickle_protocol=pickle.HIGHEST_PROTOCOL,
                 **connect_args):
        super().__init__()
        self.pickle_protocol = pickle_protocol
        self.close_connection_on_exit = close_connection_on_exit

        if not path:
            raise ValueError('The "path" parameter must not be empty')

        self.path = path

        if client:
            self.client = maybe_ref(client)
        else:
            self.client = KazooClient(**connect_args)
        self._ensured_path = False

    def _ensure_paths(self):
        if not self._ensured_path:
            self.client.ensure_path(self.path)
        self._ensured_path = True

    def start(self, scheduler, alias):
        super().start(scheduler, alias)
        if not self.client.connected:
            self.client.start()

    def lookup_job(self, job_id):
        self._ensure_paths()
        node_path = os.path.join(self.path, job_id)
        try:
            content, _ = self.client.get(node_path)
            doc = pickle.loads(content)
            job = self._reconstitute_job(doc['job_state'])
            return job
        except BaseException:
            return None

    def get_due_jobs(self, now):
        timestamp = datetime_to_utc_timestamp(now)
        jobs = [
            job_def['job'] for job_def in self._get_jobs()
            if job_def['next_run_time'] is not None
            and job_def['next_run_time'] <= timestamp
        ]
        return jobs

    def get_next_run_time(self):
        next_runs = [
            job_def['next_run_time'] for job_def in self._get_jobs()
            if job_def['next_run_time'] is not None
        ]
        return utc_timestamp_to_datetime(
            min(next_runs)) if len(next_runs) > 0 else None

    def get_all_jobs(self):
        jobs = [job_def['job'] for job_def in self._get_jobs()]
        self._fix_paused_jobs_sorting(jobs)
        return jobs

    def add_job(self, job):
        self._ensure_paths()
        node_path = os.path.join(self.path, str(job.id))
        value = {
            'next_run_time': datetime_to_utc_timestamp(job.next_run_time),
            'job_state': job.__getstate__()
        }
        data = pickle.dumps(value, self.pickle_protocol)
        try:
            self.client.create(node_path, value=data)
        except NodeExistsError:
            raise ConflictingIdError(job.id)

    def update_job(self, job):
        self._ensure_paths()
        node_path = os.path.join(self.path, str(job.id))
        changes = {
            'next_run_time': datetime_to_utc_timestamp(job.next_run_time),
            'job_state': job.__getstate__()
        }
        data = pickle.dumps(changes, self.pickle_protocol)
        try:
            self.client.set(node_path, value=data)
        except NoNodeError:
            raise JobLookupError(job.id)

    def remove_job(self, job_id):
        self._ensure_paths()
        node_path = os.path.join(self.path, str(job_id))
        try:
            self.client.delete(node_path)
        except NoNodeError:
            raise JobLookupError(job_id)

    def remove_all_jobs(self):
        try:
            self.client.delete(self.path, recursive=True)
        except NoNodeError:
            pass
        self._ensured_path = False

    def shutdown(self):
        if self.close_connection_on_exit:
            self.client.stop()
            self.client.close()

    def _reconstitute_job(self, job_state):
        job = Job.__new__(Job)
        job.__setstate__(job_state)
        job._scheduler = self._scheduler
        job._jobstore_alias = self._alias
        return job

    def _get_jobs(self):
        self._ensure_paths()
        jobs = []
        failed_job_ids = []
        all_ids = self.client.get_children(self.path)
        for node_name in all_ids:
            try:
                node_path = os.path.join(self.path, node_name)
                content, _ = self.client.get(node_path)
                doc = pickle.loads(content)
                job_def = {
                    'job_id':
                    node_name,
                    'next_run_time':
                    doc['next_run_time'] if doc['next_run_time'] else None,
                    'job_state':
                    doc['job_state'],
                    'job':
                    self._reconstitute_job(doc['job_state']),
                    'creation_time':
                    _.ctime
                }
                jobs.append(job_def)
            except BaseException:
                self._logger.exception(
                    'Unable to restore job "%s" -- removing it' % node_name)
                failed_job_ids.append(node_name)

        # Remove all the jobs we failed to restore
        if failed_job_ids:
            for failed_id in failed_job_ids:
                self.remove_job(failed_id)
        paused_sort_key = datetime(9999, 12, 31, tzinfo=utc)
        return sorted(jobs,
                      key=lambda job_def:
                      (job_def['job'].next_run_time or paused_sort_key,
                       job_def['creation_time']))

    def __repr__(self):
        self._logger.exception('<%s (client=%s)>' %
                               (self.__class__.__name__, self.client))
        return '<%s (client=%s)>' % (self.__class__.__name__, self.client)
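A minimal usage sketch (assuming a local ZooKeeper at localhost:2181): APScheduler can pick this store up through its "zookeeper" plugin alias, and extra keyword arguments such as hosts are forwarded to KazooClient.

from apscheduler.schedulers.background import BackgroundScheduler

def tick():
    print('tick')

scheduler = BackgroundScheduler()
# Extra keyword arguments (here: hosts) are forwarded to KazooClient.
scheduler.add_jobstore('zookeeper', path='/apscheduler', hosts='localhost:2181')
scheduler.add_job(tick, 'interval', seconds=30, id='tick')
scheduler.start()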
Beispiel #48
0
class AnalyticsDiscovery(gevent.Greenlet):

    def _sandesh_connection_info_update(self, status, message):

        new_conn_state = getattr(ConnectionStatus, status)
        ConnectionState.update(conn_type = ConnectionType.ZOOKEEPER,
                name = self._svc_name, status = new_conn_state,
                message = message,
                server_addrs = self._zk_server.split(','))

        if (self._conn_state and self._conn_state != ConnectionStatus.DOWN and
                new_conn_state == ConnectionStatus.DOWN):
            msg = 'Connection to Zookeeper down: %s' %(message)
            self._logger.error(msg)
        if (self._conn_state and self._conn_state != new_conn_state and
                new_conn_state == ConnectionStatus.UP):
            msg = 'Connection to Zookeeper ESTABLISHED'
            self._logger.error(msg)

        self._conn_state = new_conn_state
        #import pdb; pdb.set_trace()
    # end _sandesh_connection_info_update

    def _zk_listen(self, state):
        self._logger.error("Analytics Discovery listen %s" % str(state))
        if state == KazooState.CONNECTED:
            if self._conn_state != ConnectionStatus.UP:
                self._sandesh_connection_info_update(status='UP', message='')
                self._logger.error("Analytics Discovery to publish %s" % str(self._pubinfo))
                self._reconnect = True
            else:
                self._logger.error("Analytics Discovery already connected")
        else:
            self._logger.error("Analytics Discovery NOT connected")
            if self._conn_state == ConnectionStatus.UP:
                self._sandesh_connection_info_update(status='DOWN', message='')

    def _zk_datawatch(self, watcher, child, data, stat, event="unknown"):
        self._logger.error(\
                "Analytics Discovery %s ChildData : child %s, data %s, event %s" % \
                (watcher, child, data, event))
        if data:
            data_dict = json.loads(data)
            self._wchildren[watcher][child] = OrderedDict(sorted(data_dict.items()))
        else:
            if child in self._wchildren[watcher]:
                del self._wchildren[watcher][child]
        if self._watchers[watcher]:
            self._pendingcb.add(watcher)

    def _zk_watcher(self, watcher, children):
        self._logger.error("Analytics Discovery Children %s" % children)
        self._reconnect = True

    def __init__(self, logger, zkservers, svc_name, inst,
                watchers={}, zpostfix="", freq=10):
        gevent.Greenlet.__init__(self)
        self._svc_name = svc_name
        self._inst = inst
        self._zk_server = zkservers
        # initialize logging and other stuff
        if logger is None:
            logging.basicConfig()
            self._logger = logging
        else:
            self._logger = logger
        self._conn_state = None
        self._sandesh_connection_info_update(status='INIT', message='')
        self._zk = KazooClient(hosts=zkservers)
        self._pubinfo = None
        self._watchers = watchers
        self._wchildren = {}
        self._pendingcb = set()
        self._zpostfix = zpostfix
        self._basepath = "/analytics-discovery-" + self._zpostfix
        self._reconnect = None
        self._freq = freq

    def publish(self, pubinfo):
        self._pubinfo = pubinfo
        #import pdb; pdb.set_trace()
        if self._conn_state == ConnectionStatus.UP:
            try:
                self._logger.error("ensure %s" % (self._basepath + "/" + self._svc_name))
                self._logger.error("zk state %s (%s)" % (self._zk.state, self._zk.client_state))
                self._zk.ensure_path(self._basepath + "/" + self._svc_name)
                self._logger.error("check for %s/%s/%s" % \
                                (self._basepath, self._svc_name, self._inst))
                if pubinfo is not None:
                    if self._zk.exists("%s/%s/%s" % \
                            (self._basepath, self._svc_name, self._inst)):
                        self._zk.set("%s/%s/%s" % \
                                (self._basepath, self._svc_name, self._inst),
                                self._pubinfo)
                    else:
                        self._zk.create("%s/%s/%s" % \
                                (self._basepath, self._svc_name, self._inst),
                                self._pubinfo, ephemeral=True)
                else:
                    if self._zk.exists("%s/%s/%s" % \
                            (self._basepath, self._svc_name, self._inst)):
                        self._logger.error("withdrawing published info!")
                        self._zk.delete("%s/%s/%s" % \
                                (self._basepath, self._svc_name, self._inst))

            except Exception as ex:
                template = "Exception {0} in AnalyticsDiscovery publish. Args:\n{1!r}"
                messag = template.format(type(ex).__name__, ex.args)
                self._logger.error("%s : traceback %s for %s info %s" % \
                        (messag, traceback.format_exc(), self._svc_name, str(self._pubinfo)))
                self._sandesh_connection_info_update(status='DOWN', message='')
                self._reconnect = True
        else:
            self._logger.error("Analytics Discovery cannot publish while down")

    def _run(self):
        while True:
            try:
                self._zk.start()
                break
            except gevent.Timeout as e:
                # Update connection info
                self._sandesh_connection_info_update(status='DOWN',
                                                         message=str(e))
                gevent.sleep(1)
                # Zookeeper is also throwing exception due to delay in master election
            except Exception as e:
                # Update connection info
                self._sandesh_connection_info_update(status='DOWN',
                                                     message=str(e))
                gevent.sleep(1)

        try:
            # Update connection info
            self._sandesh_connection_info_update(status='UP', message='')
            self._reconnect = False
            # Done connecting to ZooKeeper

            self._zk.add_listener(self._zk_listen)
            for wk in self._watchers.keys():
                self._zk.ensure_path(self._basepath + "/" + wk)
                self._wchildren[wk] = {}
                self._zk.ChildrenWatch(self._basepath + "/" + wk,
                        partial(self._zk_watcher, wk))

            # Trigger the initial publish
            self._reconnect = True

            while True:
                try:
                    if not self._reconnect:
                        pending_list = list(self._pendingcb)
                        self._pendingcb = set()
                        for wk in pending_list:
                            if self._watchers[wk]:
                                self._watchers[wk](\
                                        sorted(self._wchildren[wk].values()))

                    # If a reconnect happens during processing, don't lose it
                    while self._reconnect:
                        self._logger.error("Analytics Discovery %s reconnect" \
                                % self._svc_name)
                        self._reconnect = False
                        self._pendingcb = set()
                        self.publish(self._pubinfo)

                        for wk in self._watchers.keys():
                            self._zk.ensure_path(self._basepath + "/" + wk)
                            children = self._zk.get_children(self._basepath + "/" + wk)

                            old_children = set(self._wchildren[wk].keys())
                            new_children = set(children)

                            # Remove contents for the children who are gone
                            # (DO NOT remove the watch)
                            for elem in old_children - new_children:
                                del self._wchildren[wk][elem]

                            # Overwrite existing children, or create new ones
                            for elem in new_children:
                                # Create a watch for new children
                                if elem not in self._wchildren[wk]:
                                    self._zk.DataWatch(self._basepath + "/" + \
                                            wk + "/" + elem,
                                            partial(self._zk_datawatch, wk, elem))

                                data_str, _ = self._zk.get(\
                                        self._basepath + "/" + wk + "/" + elem)
                                data_dict = json.loads(data_str)
                                self._wchildren[wk][elem] = \
                                        OrderedDict(sorted(data_dict.items()))

                                self._logger.error(\
                                    "Analytics Discovery %s ChildData : child %s, data %s, event %s" % \
                                    (wk, elem, self._wchildren[wk][elem], "GET"))
                            if self._watchers[wk]:
                                self._watchers[wk](sorted(self._wchildren[wk].values()))

                    gevent.sleep(self._freq)
                except gevent.GreenletExit:
                    self._logger.error("Exiting AnalyticsDiscovery for %s" % \
                            self._svc_name)
                    self._zk.stop()
                    break

                except Exception as ex:
                    template = "Exception {0} in AnalyticsDiscovery reconnect. Args:\n{1!r}"
                    messag = template.format(type(ex).__name__, ex.args)
                    self._logger.error("%s : traceback %s for %s info %s" % \
                        (messag, traceback.format_exc(), self._svc_name, str(self._pubinfo)))
                    self._reconnect = True

        except Exception as ex:
            template = "Exception {0} in AnalyticsDiscovery run. Args:\n{1!r}"
            messag = template.format(type(ex).__name__, ex.args)
            self._logger.error("%s : traceback %s for %s info %s" % \
                    (messag, traceback.format_exc(), self._svc_name, str(self._pubinfo)))
            raise SystemExit
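A hypothetical caller sketch for the class above: watch one service's child list and publish this process's own endpoint. The service names, hosts and payload are made up; the callback receives the sorted per-child data dicts, as in _run() above.

import json
import logging

def on_collectors(entries):
    logging.info("collectors changed: %s", entries)

ad = AnalyticsDiscovery(
    logging.getLogger("discovery"),
    "zk1:2181,zk2:2181",
    "my-service", "instance-0",
    watchers={"collector": on_collectors},
    zpostfix="demo", freq=10)
ad.start()                                    # spawn the greenlet (_run)
ad.publish(json.dumps({"ip": "10.0.0.1", "port": 8089}))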
Beispiel #49
0
from kazoo.client import KazooClient

zk = KazooClient('zk:2181')
zk.start()
zk.delete('/lymph', recursive=True)
zk.stop()
Beispiel #50
0
class SimpleSwitch13(app_manager.RyuApp):
    OFP_VERSIONS = [ofproto_v1_3.OFP_VERSION]

    def __init__(self, *args, **kwargs):
        super(SimpleSwitch13, self).__init__(*args, **kwargs)
        self.mac_to_port = {}
        self.zkConf = {
            'root': '/multicontroller',
            'topo': '/topology',
            'swstat': '/swstat',
            'counter': '/counter'
        }
        self.zk = KazooClient('127.0.0.1:2181')
        self.zk.start()
        self.ip = '202.201.3.51'
        self.sws = {}
        self.gid = random.randint(0, 10000)
        self.dps = {}
        self.links = []
        self.interval = 5
        self.role = OFPCR_ROLE_EQUAL
        self.topoThread = hub.spawn(self._topoThread)
        self.linkThread = hub.spawn(self._linkDiscover)
        self.clearLinkThread = hub.spawn(self._cleanLinks)
        self.clearLinkThread = hub.spawn(self._cleanSwitches)

    def _cleanSwitches(self):
        while True:
            self.sws = {k: self.sws[k] for k in self.sws if self.sws[k]}
            hub.sleep(self.interval)

    def _topoThread(self):
        # Periodically publish this controller's link list under its own node.
        linkNode = self.zkConf['root'] + self.zkConf['topo'] + '/' + self.ip
        while True:
            if self.zk.exists(linkNode):
                self.zk.set(linkNode, json.dumps(self.links))
            else:
                self.zk.create(linkNode, json.dumps(self.links))
            hub.sleep(self.interval)

    def _linkDiscover(self):
        while True:
            for dpid in self.dps:
                self.sendSlldp(dpid)
            hub.sleep(self.interval)

    def sendSlldp(self, dpid):
        dp = self.dps.get(dpid)
        if dp is None:
            return
        actions = [dp.ofproto_parser.OFPActionOutput(dp.ofproto.OFPP_FLOOD)]
        pkt = packet.Packet()
        pkt.add_protocol(
            ethernet.ethernet(ethertype=ETH_TYPE_SLLDP,
                              dst=SLLDP_MAC_DST,
                              src=SLLDP_MAC_SRC))
        pkt.add_protocol(slldp(dp.id))
        pkt.serialize()
        slldpPacket = pkt.data
        out = dp.ofproto_parser.OFPPacketOut(
            datapath=dp,
            in_port=dp.ofproto.OFPP_CONTROLLER,
            buffer_id=dp.ofproto.OFP_NO_BUFFER,
            actions=actions,
            data=slldpPacket)
        dp.send_msg(out)

    def getLinks(self):
        topoNode = self.zkConf['root'] + self.zkConf['topo']
        ips = self.zk.get_children(topoNode)
        res = []
        for ip in ips:
            links = json.loads(self.zk.get(topoNode + '/' + ip)[0])
            for link in links:
                res.append(link)
        return res

    @set_ev_cls(ofp_event.EventOFPPacketIn, MAIN_DISPATCHER)
    def _packet_in_handler(self, ev):

        msg = ev.msg
        pkt = packet.Packet(msg.data)
        eth = pkt.get_protocols(ethernet.ethernet)[0]
        dst = eth.dst

        # SLLDP packet
        if dst == SLLDP_MAC_DST:
            self.handleSlldp(ev)
            return
        # process packet_in message in subclass
        self.packet_in_process(ev)

    def handleSlldp(self, ev):
        msg = ev.msg
        datapath = msg.datapath
        dpid = datapath.id
        inPort = msg.match['in_port']

        pkt = packet.Packet(msg.data)
        slldpBuff = pkt.get_protocols(ethernet.ethernet)[2]
        dpidSrc, _ = slldp.parser(slldpBuff)
        self.links.append({
            'srcdpid': dpidSrc,
            'dst': {
                'dpid': dpid,
                'port': inPort
            },
            'time': time.time()
        })

    def _cleanLinks(self):
        while True:
            now = time.time()
            self.links = [
                l for l in self.links if now - l['time'] < self.interval
            ]
            hub.sleep(self.interval)

    @abc.abstractmethod
    def packet_in_process(self, ev):
        pass

    @set_ev_cls(event.EventSwitchEnter)
    def switch_enter(self, ev):
        dpid = ev.datapath.id
        self.sws[dpid] = True
        self.dps[dpid] = ev.datapath
        dpNode = self.zkConf['root'] + self.zkConf['swstat'] \
                + '/' + dpid_to_str(dpid)
        self.zk.ensure_path(dpNode)
        if self.election(dpid):
            self.role = OFPCR_ROLE_MASTER
        else:
            self.role = OFPCR_ROLE_SLAVE
        self.countUp(dpid)
        self.roleRequest(dpid, self.role)
        mflag = dpNode + '/' + 'master'
        DataWatch(self.zk, mflag, self.masterWatcher)

    def masterWatcher(self, data, stat, ev):
        if ev and ev.type == 'DELETED':
            _, _, dpid, _ = ev.path.split('/')
            dpid = str_to_dpid(dpid)
            if self.sws.get(dpid):
                if self.election(dpid):
                    self.role = OFPCR_ROLE_MASTER
                    self.roleRequest(dpid, self.role)
            return self.sws.get(dpid)

    def election(self, dpid):
        dpNode = self.zkConf['root'] + self.zkConf['swstat'] \
                + '/' + dpid_to_str(dpid)
        mflag = dpNode + '/' + 'master'
        while not self.zk.exists(mflag):
            mlock = self.zk.Lock(dpNode + '/' + 'mlock', self.ip)
            with mlock:
                if not self.zk.exists(mflag):
                    self.zk.create(mflag, self.ip, ephemeral=True)
            if self.zk.exists(mflag):
                if self.zk.get(mflag)[0] == self.ip:
                    return True
                else:
                    return False
            else:
                time.sleep(random.randint(0, 100) / 500.0)
        return False

    def roleRequest(self, dpid, role):
        dp = self.dps.get(dpid)
        if dp is None:
            return
        msg = dp.ofproto_parser.OFPRoleRequest(dp, role, self.gid)
        dp.send_msg(msg)

    def getCount(self, dpid):
        dpNode =  self.zkConf['root'] + self.zkConf['swstat'] \
                + '/' + dpid_to_str(dpid)
        countNode = dpNode + self.zkConf['counter']
        counters = self.zk.get_children(countNode)
        return len(counters)

    def countUp(self, dpid):
        countNode = self.zkConf['root'] + self.zkConf['swstat'] \
                + '/' + dpid_to_str(dpid) + self.zkConf['counter']
        self.zk.ensure_path(countNode)
        self.zk.create(countNode + '/' + uuid4().hex, 'alive', ephemeral=True)

    @set_ev_cls(event.EventSwitchLeave)
    def switch_leave(self, ev):
        dpid = ev.datapath.id
        count = self.getCount(dpid)
        self.sws[dpid] = False
        if count == 0:
            dpNode = self.zkConf['root'] + self.zkConf['swstat'] \
                    + '/' + dpid_to_str(dpid)
            self.zk.delete(dpNode, recursive=True)
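A standalone sketch of the election pattern used above: the first controller to create the ephemeral "master" flag while holding a lock wins, and everyone reads the flag back to learn who the master is. The paths and controller id here are hypothetical.

from kazoo.client import KazooClient

def elect(zk, sw_path, my_id):
    mflag = sw_path + '/master'
    mlock = zk.Lock(sw_path + '/mlock', my_id)
    with mlock:
        if not zk.exists(mflag):
            # Ephemeral: the flag disappears if the master's session dies.
            zk.create(mflag, my_id.encode(), ephemeral=True)
    data, _ = zk.get(mflag)
    return data.decode() == my_id

zk = KazooClient(hosts='127.0.0.1:2181')
zk.start()
zk.ensure_path('/multicontroller/swstat/0000000000000001')
print(elect(zk, '/multicontroller/swstat/0000000000000001', '202.201.3.51'))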
Beispiel #51
0
class ArcusZooKeeper:
    """
  ZooKeeper helper for Arcus
  """

    def __init__(self, hostports, timeout):
        self.hostports = hostports
        self.timeout = timeout
        self.zk = KazooClient(hosts=hostports, read_only=False)

    def start(self):
        self.zk.start()

    def stop(self):
        self.zk.stop()

    def init_structure(self):
        if self.zk.exists("/arcus"):
            print "init_arcus_structure: fail (/arcus exists)"
            return False

        tx = self.zk.transaction()
        tx.create("/arcus", b"")
        tx.create("/arcus/cache_list", b"")
        tx.create("/arcus/client_list", b"")
        tx.create("/arcus/cache_server_mapping", b"")
        results = tx.commit()
        if len(results) > 0:
            print results
            return False

        print "init_structure: success"
        return True

    def drop_structure(self):
        self.zk.delete("/arcus", recursive=True)
        print "delete_structure: success"

    def get_structure(self):
        return self.zk.get_children("/arcus")

    def get_mapping_for_service(self, service_code):
        result = []
        cache_list = "/arcus/cache_list/%s" % service_code
        mapping = "/arcus/cache_server_mapping"

        all = self.zk.get_children(mapping)
        for ipport in all:
            codes = self.zk.get_children("%s/%s" % (mapping, ipport))
            if len(codes) > 0:
                if codes[0] == service_code:
                    result.append("%s/%s" % (mapping, ipport))

        return result

    def get_config_for_service(self, service_code):
        cache_list = "/arcus/cache_list/%s" % service_code
        data, stat = self.zk.get(cache_list)
        return json.loads(data), data, stat

    def update_service_code(self, cluster):
        cache_list = "/arcus/cache_list/%s" % cluster["serviceCode"]
        client_list = "/arcus/client_list/%s" % cluster["serviceCode"]
        mapping = "/arcus/cache_server_mapping"

        try:
            delete_list = self.get_mapping_for_service(cluster["serviceCode"])

            # 0. Create a transaction
            tx = self.zk.transaction()

            # 1. Cache list
            if self.zk.exists(cache_list):
                c1, c2, c3 = self.get_config_for_service(cluster["serviceCode"])
                cluster["created"] = c1.get("created")
                cluster["modified"] = str(datetime.datetime.now())
                tx.set_data(cache_list, json.dumps(cluster))
            else:
                cluster["created"] = str(datetime.datetime.now())
                tx.create("/arcus/cache_list/%s" % cluster["serviceCode"], json.dumps(cluster))

            # 2. Client list
            if self.zk.exists(client_list):
                pass
            else:
                tx.create("/arcus/client_list/%s" % cluster["serviceCode"], b"")

            # 3. Mapping
            for each in delete_list:
                tx.delete("%s/%s" % (each, cluster["serviceCode"]))
                tx.delete(each)

            for server in cluster["servers"]:
                global_config = cluster.get("config", {})
                per_node_config = server.get("config", {})
                config = dict(global_config.items() + per_node_config.items())

                if len(config) == 0:
                    print "update_service_code: config not found for {0}".format(server)
                    continue

                map_ip = "/arcus/cache_server_mapping/%s:%s" % (server["ip"], config["port"])
                map_code = "%s/%s" % (map_ip, cluster["serviceCode"])

                tx.create(map_ip, json.dumps(config))
                tx.create(map_code, b"")

            # 4. Commit
            results = tx.commit()
            print results
        except Exception as e:
            traceback.print_exc()

    def delete_service_code(self, cluster):
        cache_list = "/arcus/cache_list/%s" % cluster["serviceCode"]
        client_list = "/arcus/client_list/%s" % cluster["serviceCode"]
        mapping = "/arcus/cache_server_mapping"

        try:
            delete_list = self.get_mapping_for_service(cluster["serviceCode"])

            # 0. Create a transaction
            tx = self.zk.transaction()

            # 1. Cache list
            tx.delete("/arcus/cache_list/%s" % cluster["serviceCode"])

            # 2. Client list
            tx.delete("/arcus/client_list/%s" % cluster["serviceCode"])

            # 3. Mapping
            for each in delete_list:
                tx.delete("%s/%s" % (each, cluster["serviceCode"]))
                tx.delete(each)

            # 4. Commit
            results = tx.commit()
            print results
        except Exception as e:
            traceback.print_exc()

    def list_all_service_code(self):
        result = []
        cache_list = "/arcus/cache_list"

        try:
            list = self.zk.get_children(cache_list)
            for each in list:
                result.append(self.list_service_code(each))
            return result
        except Exception as e:
            traceback.print_exc()

    def list_service_code(self, service_code):
        result = {}
        cache_list = "/arcus/cache_list/%s" % service_code
        client_list = "/arcus/client_list/%s" % service_code
        mapping = "/arcus/cache_server_mapping"

        try:
            data, stat = self.zk.get(cache_list)
            static_list = self.get_mapping_for_service(service_code)
            current_list = self.zk.get_children(cache_list)

            # sort the lists
            static_list.sort()
            current_list.sort()

            # get clusterConfig
            cluster = json.loads(data)

            # get clusterStatus
            static_set = set([each.split("/")[-1] for each in static_list])
            current_set = set([each.split("-")[0] for each in current_list])
            offline = static_set - current_set
            online = static_set - offline
            undefined = current_set - static_set

            result["serviceCode"] = service_code
            result["config"] = cluster
            result["online"] = list(online)
            result["offline"] = list(offline)
            result["undefined"] = list(undefined)
            result["created"] = cluster.get("created")
            result["modified"] = cluster.get("modified")
            return result

        except Exception as e:
            traceback.print_exc()
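A hypothetical usage sketch for the helper above; the service code, hosts and config values are made up. update_service_code() expects a dict shaped like the one built here (note the "servers" and "config" keys it reads).

zk = ArcusZooKeeper("zk1:2181,zk2:2181", timeout=15)
zk.start()
zk.init_structure()
zk.update_service_code({
    "serviceCode": "test-service",
    "config": {"port": "11211", "memlimit": "2048"},
    "servers": [
        {"ip": "10.0.0.11"},
        {"ip": "10.0.0.12", "config": {"port": "11212"}},
    ],
})
print(zk.list_service_code("test-service"))
zk.stop()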
Beispiel #52
0
class ZooKeeperClient:
    """
    Handles basic ZooKeeper events
    """
    def __init__(self, zk_hosts, log_name="ZooKeeperClient", prefix=""):
        """
        :param zk_hosts: List of ZooKeeper hosts
        :param log_name: Name of the logger to use
        :param prefix: Prefix to all paths
        """
        self._zk = KazooClient(zk_hosts)
        self._zk.add_listener(self.__conn_listener)
        self.__prefix = prefix

        # Session state
        self.__connected = False
        self.__online = False
        self.__stop = False

        # Notification queue
        self._logger = logging.getLogger(log_name)
        self._queue = ThreadPool(1, 1, logname=log_name)

    @property
    def prefix(self):
        """
        Prefix to all ZooKeeper nodes
        """
        return self.__prefix

    @property
    def connected(self):
        """
        ZooKeeper client state: connected to the quorum
        """
        return self.__connected

    @property
    def online(self):
        """
        ZooKeeper client state: connected & online (session active)
        """
        return self.__online

    @property
    def stopped(self):
        """
        ZooKeeper client status (stop requested)
        """
        return self.__stop

    def __conn_listener(self, state):
        """
        Connection event listener

        :param state: The new connection state
        """
        if state == KazooState.CONNECTED:
            self.__online = True
            if not self.__connected:
                self.__connected = True
                self._logger.info("Connected to ZooKeeper")
                self._queue.enqueue(self.on_first_connection)
            else:
                self._logger.warning("Re-connected to ZooKeeper")
                self._queue.enqueue(self.on_client_reconnection)
        elif state == KazooState.SUSPENDED:
            self._logger.warning("Connection suspended")
            self.__online = False
        elif state == KazooState.LOST:
            self.__online = False
            self.__connected = False

            if self.__stop:
                self._logger.info("Disconnected from ZooKeeper (requested)")
            else:
                self._logger.warning("Connection lost")

    def start(self):
        """
        Starts the connection
        """
        self.__stop = False
        self._queue.start()
        self._zk.start()

    def stop(self):
        """
        Stops the connection
        """
        self.__stop = True
        self._queue.stop()
        self._zk.stop()

    @staticmethod
    def on_first_connection():
        """
        Called when the client is connected for the first time
        """
        pass

    @staticmethod
    def on_client_reconnection():
        """
        Called when the client is reconnected to the server
        """
        pass

    def __path(self, path):
        """
        Adds the prefix to the given path

        :param path: Z-Path
        :return: Prefixed Z-Path
        """
        if path.startswith(self.__prefix):
            return path

        return "{}{}".format(self.__prefix, path)

    def create(self, path, data, ephemeral=False, sequence=False):
        """
        Creates a ZooKeeper node

        :param path: Z-Path
        :param data: Node Content
        :param ephemeral: Ephemeral flag
        :param sequence: Sequential flag
        """
        return self._zk.create(self.__path(path),
                               data,
                               ephemeral=ephemeral,
                               sequence=sequence)

    def ensure_path(self, path):
        """
        Ensures that a path exists, creates it if necessary

        :param path: Z-Path
        """
        return self._zk.ensure_path(self.__path(path))

    def get(self, path, watch=None):
        """
        Gets the content of a ZooKeeper node

        :param path: Z-Path
        :param watch: Watch method
        """
        return self._zk.get(self.__path(path), watch=watch)

    def get_children(self, path, watch=None):
        """
        Gets the list of children of a node

        :param path: Z-Path
        :param watch: Watch method
        """
        return self._zk.get_children(self.__path(path), watch=watch)

    def set(self, path, data):
        """
        Sets the content of a ZooKeeper node

        :param path: Z-Path
        :param data: New content
        """
        return self._zk.set(self.__path(path), data)

    def delete(self, path):
        """
        Deletes a node

        :param path: Z-Path
        """
        return self._zk.delete(self.__path(path))
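A hypothetical subclass sketch: register an ephemeral node once the first connection is established. The prefix, paths and instance id are made up; the hook is invoked through the internal notification queue (see __conn_listener above).

class RegistrationClient(ZooKeeperClient):
    """Registers this process under /myapp/instances once connected."""
    def __init__(self, zk_hosts, instance_id):
        super().__init__(zk_hosts, log_name="Registration", prefix="/myapp")
        self._instance_id = instance_id

    def on_first_connection(self):
        # Paths are automatically prefixed with /myapp by __path().
        self.ensure_path("/instances")
        self.create("/instances/" + self._instance_id,
                    b"alive", ephemeral=True)

client = RegistrationClient("localhost:2181", "worker-1")
client.start()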
Beispiel #53
0
class ZooKeeper(AbstractDCS):

    def __init__(self, config):
        super(ZooKeeper, self).__init__(config)

        hosts = config.get('hosts', [])
        if isinstance(hosts, list):
            hosts = ','.join(hosts)

        self._client = KazooClient(hosts, handler=PatroniSequentialThreadingHandler(config['retry_timeout']),
                                   timeout=config['ttl'], connection_retry={'max_delay': 1, 'max_tries': -1},
                                   command_retry={'deadline': config['retry_timeout'], 'max_delay': 1, 'max_tries': -1})
        self._client.add_listener(self.session_listener)

        self._my_member_data = None
        self._fetch_cluster = True
        self._last_leader_operation = 0

        self._orig_kazoo_connect = self._client._connection._connect
        self._client._connection._connect = self._kazoo_connect

        self._client.start()

    def _kazoo_connect(self, host, port):

        """Kazoo is using Ping's to determine health of connection to zookeeper. If there is no
        response on Ping after Ping interval (1/2 from read_timeout) it will consider current
        connection dead and try to connect to another node. Without this "magic" it was taking
        up to 2/3 from session timeout (ttl) to figure out that connection was dead and we had
        only small time for reconnect and retry.

        This method is needed to return different value of read_timeout, which is not calculated
        from negotiated session timeout but from value of `loop_wait`. And it is 2 sec smaller
        than loop_wait, because we can spend up to 2 seconds when calling `touch_member()` and
        `write_leader_optime()` methods, which also may hang..."""

        ret = self._orig_kazoo_connect(host, port)
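        # kazoo's _connect returns a (read_timeout, connect_timeout) pair; keep
        # its connect_timeout but replace the read timeout (in milliseconds)
        # with a value derived from loop_wait, as explained in the docstring.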
        return max(self.loop_wait - 2, 2)*1000, ret[1]

    def session_listener(self, state):
        if state in [KazooState.SUSPENDED, KazooState.LOST]:
            self.cluster_watcher(None)

    def cluster_watcher(self, event):
        self._fetch_cluster = True
        self.event.set()

    def reload_config(self, config):
        self.set_retry_timeout(config['retry_timeout'])

        loop_wait = config['loop_wait']

        loop_wait_changed = self._loop_wait != loop_wait
        self._loop_wait = loop_wait
        self._client.handler.set_connect_timeout(loop_wait)

        # We need to reestablish connection to zookeeper if we want to change
        # read_timeout (and Ping interval respectively), because read_timeout
        # is calculated in `_kazoo_connect` method. If we are changing ttl at
        # the same time, set_ttl method will reestablish connection and return
        # `!True`, otherwise we will close existing connection and let kazoo
        # open the new one.
        if not self.set_ttl(int(config['ttl'] * 1000)) and loop_wait_changed:
            self._client._connection._socket.close()

    def set_ttl(self, ttl):
        """It is not possible to change ttl (session_timeout) in zookeeper without
        destroying old session and creating the new one. This method returns `!True`
        if session_timeout has been changed (`restart()` has been called)."""
        if self._client._session_timeout != ttl:
            self._client._session_timeout = ttl
            self._client.restart()
            return True

    def set_retry_timeout(self, retry_timeout):
        self._client._retry.deadline = retry_timeout

    def get_node(self, key, watch=None):
        try:
            ret = self._client.get(key, watch)
            return (ret[0].decode('utf-8'), ret[1])
        except NoNodeError:
            return None

    @staticmethod
    def member(name, value, znode):
        return Member.from_node(znode.version, name, znode.ephemeralOwner, value)

    def get_children(self, key, watch=None):
        try:
            return self._client.get_children(key, watch)
        except NoNodeError:
            return []

    def load_members(self):
        members = []
        for member in self.get_children(self.members_path, self.cluster_watcher):
            data = self.get_node(self.members_path + member)
            if data is not None:
                members.append(self.member(member, *data))
        return members

    def _inner_load_cluster(self):
        self._fetch_cluster = False
        self.event.clear()
        nodes = set(self.get_children(self.client_path(''), self.cluster_watcher))
        if not nodes:
            self._fetch_cluster = True

        # get initialize flag
        initialize = (self.get_node(self.initialize_path) or [None])[0] if self._INITIALIZE in nodes else None

        # get global dynamic configuration
        config = self.get_node(self.config_path, watch=self.cluster_watcher) if self._CONFIG in nodes else None
        config = config and ClusterConfig.from_node(config[1].version, config[0], config[1].mzxid)

        # get list of members
        members = self.load_members() if self._MEMBERS[:-1] in nodes else []

        # get leader
        leader = self.get_node(self.leader_path) if self._LEADER in nodes else None
        if leader:
            client_id = self._client.client_id
            if leader[0] == self._name and client_id is not None and client_id[0] != leader[1].ephemeralOwner:
                logger.info('I am leader but not owner of the session. Removing leader node')
                self._client.delete(self.leader_path)
                leader = None

            if leader:
                member = Member(-1, leader[0], None, {})
                member = ([m for m in members if m.name == leader[0]] or [member])[0]
                leader = Leader(leader[1].version, leader[1].ephemeralOwner, member)
                self._fetch_cluster = member.index == -1

        # failover key
        failover = self.get_node(self.failover_path, watch=self.cluster_watcher) if self._FAILOVER in nodes else None
        failover = failover and Failover.from_node(failover[1].version, failover[0])

        # get last leader operation
        optime = self.get_node(self.leader_optime_path) if self._OPTIME in nodes and self._fetch_cluster else None
        self._last_leader_operation = 0 if optime is None else int(optime[0])
        self._cluster = Cluster(initialize, config, leader, self._last_leader_operation, members, failover)

    def _load_cluster(self):
        if self._fetch_cluster or self._cluster is None:
            try:
                self._client.retry(self._inner_load_cluster)
            except Exception:
                logger.exception('get_cluster')
                self.cluster_watcher(None)
                raise ZooKeeperError('ZooKeeper is not responding properly')

    def _create(self, path, value, **kwargs):
        try:
            self._client.retry(self._client.create, path, value.encode('utf-8'), **kwargs)
            return True
        except:
            return False

    def attempt_to_acquire_leader(self, permanent=False):
        ret = self._create(self.leader_path, self._name, makepath=True, ephemeral=not permanent)
        if not ret:
            logger.info('Could not take out TTL lock')
        return ret

    def set_failover_value(self, value, index=None):
        try:
            self._client.retry(self._client.set, self.failover_path, value.encode('utf-8'), version=index or -1)
            return True
        except NoNodeError:
            return value == '' or (index is None and self._create(self.failover_path, value))
        except:
            logging.exception('set_failover_value')
            return False

    def set_config_value(self, value, index=None):
        try:
            self._client.retry(self._client.set, self.config_path, value.encode('utf-8'), version=index or -1)
            return True
        except NoNodeError:
            return index is None and self._create(self.config_path, value)
        except Exception:
            logging.exception('set_config_value')
            return False

    def initialize(self, create_new=True, sysid=""):
        return self._create(self.initialize_path, sysid, makepath=True) if create_new \
            else self._client.retry(self._client.set, self.initialize_path, sysid.encode("utf-8"))

    def touch_member(self, data, ttl=None, permanent=False):
        cluster = self.cluster
        member = cluster and ([m for m in cluster.members if m.name == self._name] or [None])[0]
        data = data.encode('utf-8')
        if member and self._client.client_id is not None and member.session != self._client.client_id[0]:
            try:
                self._client.delete_async(self.member_path).get(timeout=1)
            except NoNodeError:
                pass
            except:
                return False
            member = None

        if member:
            if data == self._my_member_data:
                return True
        else:
            try:
                self._client.create_async(self.member_path, data, makepath=True, ephemeral=not permanent).get(timeout=1)
                self._my_member_data = data
                return True
            except Exception as e:
                if not isinstance(e, NodeExistsError):
                    logger.exception('touch_member')
                    return False
        try:
            self._client.set_async(self.member_path, data).get(timeout=1)
            self._my_member_data = data
            return True
        except:
            logger.exception('touch_member')

        return False

    def take_leader(self):
        return self.attempt_to_acquire_leader()

    def write_leader_optime(self, last_operation):
        last_operation = last_operation.encode('utf-8')
        if last_operation != self._last_leader_operation:
            try:
                self._client.set_async(self.leader_optime_path, last_operation).get(timeout=1)
                self._last_leader_operation = last_operation
            except NoNodeError:
                try:
                    self._client.create_async(self.leader_optime_path, last_operation, makepath=True).get(timeout=1)
                    self._last_leader_operation = last_operation
                except:
                    logger.exception('Failed to create %s', self.leader_optime_path)
            except:
                logger.exception('Failed to update %s', self.leader_optime_path)

    def update_leader(self):
        return True

    def delete_leader(self):
        self._client.restart()
        self._my_member_data = None
        return True

    def _cancel_initialization(self):
        node = self.get_node(self.initialize_path)
        if node:
            self._client.delete(self.initialize_path, version=node[1].version)

    def cancel_initialization(self):
        try:
            self._client.retry(self._cancel_initialization)
        except:
            logger.exception("Unable to delete initialize key")

    def delete_cluster(self):
        try:
            return self._client.retry(self._client.delete, self.client_path(''), recursive=True)
        except NoNodeError:
            return True

    def watch(self, timeout):
        if super(ZooKeeper, self).watch(timeout):
            self._fetch_cluster = True
        return self._fetch_cluster
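The set_failover_value and set_config_value methods above rely on ZooKeeper znode versions as an optimistic-concurrency check: a set() with an explicit version succeeds only if the node has not been modified since it was read, while version=-1 overwrites unconditionally. A minimal standalone sketch of that pattern with a plain KazooClient; the host string and the /demo/config path are placeholders, not taken from the example above.

from kazoo.client import KazooClient
from kazoo.exceptions import BadVersionError, NoNodeError

zk = KazooClient(hosts='127.0.0.1:2181')  # placeholder host
zk.start()

path = '/demo/config'  # hypothetical znode for illustration
zk.ensure_path(path)

# Read the current value together with its stat; stat.version is the counter
# ZooKeeper increments on every successful set().
data, stat = zk.get(path)

try:
    # Conditional write: succeeds only if nobody changed the node since the
    # read above, otherwise kazoo raises BadVersionError.
    zk.set(path, b'{"ttl": 30}', version=stat.version)
except BadVersionError:
    print('Node was modified concurrently; not overwriting')
except NoNodeError:
    print('Node disappeared; nothing to update')

zk.stop()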
Beispiel #54
0
class ZK:
    """Opens a connection to a kafka zookeeper. "
    "To be used in the 'with' statement."""
    def __init__(self, cluster_config):
        self.cluster_config = cluster_config

    def __enter__(self):
        kazoo_retry = KazooRetry(max_tries=5)
        self.zk = KazooClient(
            hosts=self.cluster_config.zookeeper,
            read_only=True,
            connection_retry=kazoo_retry,
        )
        _log.debug(
            "ZK: Creating new zookeeper connection: {zookeeper}".format(
                zookeeper=self.cluster_config.zookeeper), )
        self.zk.start()
        return self

    def __exit__(self, type, value, traceback):
        self.zk.stop()

    def get_children(self, path, watch=None):
        """Returns the children of the specified node."""
        _log.debug("ZK: Getting children of {path}".format(path=path), )
        return self.zk.get_children(path, watch)

    def get(self, path, watch=None):
        """Returns the data of the specified node."""
        _log.debug("ZK: Getting {path}".format(path=path), )
        return self.zk.get(path, watch)

    def set(self, path, value):
        """Sets and returns new data for the specified node."""
        _log.debug("ZK: Setting {path} to {value}".format(path=path,
                                                          value=value))
        return self.zk.set(path, value)

    def get_json(self, path, watch=None):
        """Reads the data of the specified node and converts it to json."""
        data, _ = self.get(path, watch)
        return json.loads(data) if data else None

    def get_broker_metadata(self, broker_id):
        try:
            broker_json, _ = self.get(
                "/brokers/ids/{b_id}".format(b_id=broker_id))
        except NoNodeError:
            _log.error("broker '{b_id}' not found.".format(b_id=broker_id), )
            raise
        return json.loads(broker_json)

    def get_brokers(self, names_only=False):
        """Get information on all the available brokers.

        :rtype : dict of brokers
        """
        broker_ids = self.get_children("/brokers/ids")

        # Return broker-ids only
        if names_only:
            return {int(b_id): None for b_id in broker_ids}
        return {
            int(b_id): self.get_broker_metadata(b_id)
            for b_id in broker_ids
        }

    def get_topic_config(self, topic):
        """Get configuration information for specified topic.

        :rtype : dict of configuration
        """
        try:
            config_data = json.loads(
                self.get("/config/topics/{topic}".format(topic=topic))[0])
        except NoNodeError as e:
            _log.error("topic {topic} not found.".format(topic=topic))
            raise e
        return config_data

    def set_topic_config(self, topic, value):
        """Set configuration information for specified topic.

        :rtype : dict of new configuration"""
        try:
            config_data = json.dumps(value)
            # Change value
            return_value = self.set(
                "/config/topics/{topic}".format(topic=topic), config_data)
            # Create change
            self.create('/config/changes/config_change_', topic, sequence=True)
        except NoNodeError as e:
            _log.error("topic {topic} not found.".format(topic=topic))
            raise e
        return return_value

    def get_topics(
        self,
        topic_name=None,
        names_only=False,
        fetch_partition_state=True,
    ):
        """Get information on all the available topics.

        Topic-data format with fetch_partition_state as False :-
        topic_data = {
            'version': 1,
            'partitions': {
                <p_id>: {
                    replicas: <broker-ids>
                }
            }
        }

        Topic-data format with fetch_partition_state as True:-
        topic_data = {
            'version': 1,
            'partitions': {
                <p_id>:{
                    replicas: [<broker_id>, <broker_id>, ...],
                    isr: [<broker_id>, <broker_id>, ...],
                    controller_epoch: <val>,
                    leader_epoch: <val>,
                    version: 1,
                    leader: <broker-id>,
                }
            }
        }
        Note: By default we also fetch partition-state which results in
        accessing the zookeeper twice. If just partition-replica information is
        required fetch_partition_state should be set to False.
        """
        topic_ids = [topic_name] if topic_name else self.get_children(
            "/brokers/topics", )
        if names_only:
            return topic_ids
        topics_data = {}
        for topic_id in topic_ids:
            try:
                topic_data = json.loads(
                    self.get("/brokers/topics/{id}".format(id=topic_id))[0], )
            except NoNodeError:
                _log.error(
                    "topic '{topic}' not found.".format(topic=topic_id), )
                return {}
            # Prepare data for each partition
            partitions_data = {}
            for p_id, replicas in topic_data['partitions'].iteritems():
                partitions_data[p_id] = {}
                if fetch_partition_state:
                    # Fetch partition-state from zookeeper
                    partitions_data[p_id] = self._fetch_partition_state(
                        topic_id, p_id)
                partitions_data[p_id]['replicas'] = replicas
            topic_data['partitions'] = partitions_data
            topics_data[topic_id] = topic_data
        return topics_data

    def get_consumer_groups(self, consumer_group_id=None, names_only=False):
        """Get information on all the available consumer-groups.

        If names_only is True, only a dict of consumer-group ids is returned.
        If names_only is False, consumer-group offset details are returned for
        all consumer-groups, or for the given consumer-group, in the following
        dict format:

        {
            'group-id':
            {
                'topic':
                {
                    'partition': offset-value,
                    ...
                    ...
                }
            }
        }

        :rtype: dict of consumer-group offset details
        """
        if consumer_group_id is None:
            group_ids = self.get_children("/consumers")
        else:
            group_ids = [consumer_group_id]

        # Return consumer-group-ids only
        if names_only:
            return {g_id: None for g_id in group_ids}

        consumer_offsets = {}
        for g_id in group_ids:
            consumer_offsets[g_id] = self.get_group_offsets(g_id)
        return consumer_offsets

    def get_group_offsets(self, group, topic=None):
        """Fetch group offsets for given topic and partition otherwise all topics
        and partitions otherwise.


        {
            'topic':
            {
                'partition': offset-value,
                ...
                ...
            }
        }
        """
        group_offsets = {}
        try:
            all_topics = self.get_my_subscribed_topics(group)
        except NoNodeError:
            # No offset information of given consumer-group
            _log.warning(
                "No topics subscribed to consumer-group {group}.".format(
                    group=group, ), )
            return group_offsets
        if topic:
            if topic in all_topics:
                topics = [topic]
            else:
                _log.error(
                    "Topic {topic} not found in topic list {topics} for consumer"
                    "-group {consumer_group}.".format(
                        topic=topic,
                        topics=', '.join(topic for topic in all_topics),
                        consumer_group=group,
                    ), )
                return group_offsets
        else:
            topics = all_topics
        for topic in topics:
            group_offsets[topic] = {}
            try:
                partitions = self.get_my_subscribed_partitions(group, topic)
            except NoNodeError:
                _log.warning(
                    "No partition offsets found for topic {topic}. "
                    "Continuing to next one...".format(topic=topic), )
                continue
            # Fetch offsets for each partition
            for partition in partitions:
                path = "/consumers/{group_id}/offsets/{topic}/{partition}".format(
                    group_id=group,
                    topic=topic,
                    partition=partition,
                )
                try:
                    # Get current offset
                    offset_json, _ = self.get(path)
                    group_offsets[topic][partition] = json.loads(offset_json)
                except NoNodeError:
                    _log.error("Path {path} not found".format(path=path))
                    raise
        return group_offsets

    def _fetch_partition_state(self, topic_id, partition_id):
        """Fetch partition-state for given topic-partition."""
        state_path = "/brokers/topics/{topic_id}/partitions/{p_id}/state"
        try:
            partition_json, _ = self.get(
                state_path.format(topic_id=topic_id, p_id=partition_id), )
            return json.loads(partition_json)
        except NoNodeError:
            return {}  # The partition has no data

    def get_my_subscribed_topics(self, groupid):
        """Get the list of topics that a consumer is subscribed to

        :param: groupid: The consumer group ID for the consumer
        :returns list of kafka topics
        :rtype: list
        """
        path = "/consumers/{group_id}/offsets".format(group_id=groupid)
        return self.get_children(path)

    def get_my_subscribed_partitions(self, groupid, topic):
        """Get the list of partitions of a topic
        that a consumer is subscribed to

        :param: groupid: The consumer group ID for the consumer
        :param: topic: The topic name
        :returns list of partitions
        :rtype: list
        """
        path = "/consumers/{group_id}/offsets/{topic}".format(
            group_id=groupid,
            topic=topic,
        )
        return self.get_children(path)

    def get_cluster_assignment(self):
        """Fetch the cluster layout in form of assignment from zookeeper"""
        plan = self.get_cluster_plan()
        assignment = {}
        for elem in plan['partitions']:
            assignment[(elem['topic'], elem['partition'])] = elem['replicas']

        return assignment

    def create(self,
               path,
               value='',
               acl=None,
               ephemeral=False,
               sequence=False,
               makepath=False):
        """Creates a Zookeeper node.

        :param: path: The zookeeper node path
        :param: value: Zookeeper node value
        :param: acl: ACL list
        :param: ephemeral: Boolean indicating where this node is tied to
          this session.
        :param: sequence:  Boolean indicating whether path is suffixed
          with a unique index.
        :param: makepath: Whether the path should be created if it doesn't
          exist.
        """
        _log.debug("ZK: Creating node " + path)
        return self.zk.create(path, value, acl, ephemeral, sequence, makepath)

    def delete(self, path, recursive=False):
        """Deletes a Zookeeper node.

        :param: path: The zookeeper node path
        :param: recursive: Recursively delete node and all its children.
        """
        _log.debug("ZK: Deleting node " + path)
        return self.zk.delete(path, recursive=recursive)

    def delete_topic_partitions(self, groupid, topic, partitions):
        """Delete the specified partitions within the topic that the consumer
        is subscribed to.

        :param: groupid: The consumer group ID for the consumer.
        :param: topic: Kafka topic.
        :param: partitions: List of partitions within the topic to be deleted.
        :raises:
          NoNodeError: if the consumer is not subscribed to the topic

          ZookeeperError: if there is an error with Zookeeper
        """
        for partition in partitions:
            path = "/consumers/{groupid}/offsets/{topic}/{partition}".format(
                groupid=groupid, topic=topic, partition=partition)
            self.delete(path)

    def delete_topic(self, groupid, topic):
        path = "/consumers/{groupid}/offsets/{topic}".format(
            groupid=groupid,
            topic=topic,
        )
        self.delete(path, True)

    def delete_group(self, groupid):
        path = "/consumers/{groupid}".format(groupid=groupid, )
        self.delete(path, True)

    def execute_plan(self, plan):
        """Submit reassignment plan for execution."""
        reassignment_path = '{admin}/{reassignment_node}'\
            .format(admin=ADMIN_PATH, reassignment_node=REASSIGNMENT_NODE)
        plan_json = json.dumps(plan)
        base_plan = self.get_cluster_plan()
        if not validate_plan(plan, base_plan):
            _log.error('Given plan is invalid. ABORTING reassignment...')
            return False
        # Send proposed-plan to zookeeper
        try:
            _log.info('Sending plan to Zookeeper...')
            self.create(reassignment_path, plan_json, makepath=True)
            _log.info(
                'Re-assign partitions node in Zookeeper updated successfully '
                'with {plan}'.format(plan=plan), )
            return True
        except NodeExistsError:
            _log.warning('Previous plan in progress. Exiting..')
            in_progress_plan = json.loads(self.get(reassignment_path)[0])
            in_progress_partitions = [
                '{topic}-{p_id}'.format(
                    topic=p_data['topic'],
                    p_id=str(p_data['partition']),
                ) for p_data in in_progress_plan['partitions']
            ]
            _log.warning(
                '{count} partition(s) reassignment currently in progress:-'.
                format(count=len(in_progress_partitions)), )
            _log.warning(
                '{partitions}. ABORTING reassignment...'.format(
                    partitions=', '.join(in_progress_partitions), ), )
            return False
        except Exception as e:
            _log.error(
                'Could not re-assign partitions {plan}. Error: {e}'.format(
                    plan=plan, e=e), )
            return False

    def get_cluster_plan(self):
        """Fetch cluster plan from zookeeper."""

        _log.info('Fetching current cluster-topology from Zookeeper...')
        cluster_layout = self.get_topics(fetch_partition_state=False)
        # Re-format cluster-layout
        partitions = [
            {
                'topic': topic_id,
                'partition': int(p_id),
                'replicas': partitions_data['replicas']
            } for topic_id, topic_info in cluster_layout.iteritems()
            for p_id, partitions_data in topic_info['partitions'].iteritems()
        ]
        return {'version': 1, 'partitions': partitions}

    def get_pending_plan(self):
        """Read the currently running plan on reassign_partitions node."""
        reassignment_path = '{admin}/{reassignment_node}'\
            .format(admin=ADMIN_PATH, reassignment_node=REASSIGNMENT_NODE)
        try:
            result = self.get(reassignment_path)
            return json.loads(result[0])
        except NoNodeError:
            return {}
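Because the ZK wrapper above implements __enter__ and __exit__, it is intended to be used as a context manager. A hedged usage sketch follows; ClusterConfig here is only a stand-in namespace providing the zookeeper host string that the constructor reads, not the configuration class the original project uses.

from collections import namedtuple

# Hypothetical stand-in for the cluster_config object expected by ZK.
ClusterConfig = namedtuple('ClusterConfig', ['zookeeper'])
cluster_config = ClusterConfig(zookeeper='127.0.0.1:2181')

with ZK(cluster_config) as zk:
    # Broker ids and their metadata as stored under /brokers/ids
    brokers = zk.get_brokers()
    # Topic layout without the extra per-partition state lookups
    topics = zk.get_topics(fetch_partition_state=False)
    print(sorted(brokers), sorted(topics))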
Beispiel #55
0
class BalancedConsumer(object):
    """
    A self-balancing consumer for Kafka that uses ZooKeeper to communicate
    with other balancing consumers.

    Maintains a single instance of SimpleConsumer, periodically using the
    consumer rebalancing algorithm to reassign partitions to this
    SimpleConsumer.
    """
    def __init__(self,
                 topic,
                 cluster,
                 consumer_group,
                 fetch_message_max_bytes=1024 * 1024,
                 num_consumer_fetchers=1,
                 auto_commit_enable=False,
                 auto_commit_interval_ms=60 * 1000,
                 queued_max_messages=2000,
                 fetch_min_bytes=1,
                 fetch_error_backoff_ms=500,
                 fetch_wait_max_ms=100,
                 offsets_channel_backoff_ms=1000,
                 offsets_commit_max_retries=5,
                 auto_offset_reset=OffsetType.EARLIEST,
                 consumer_timeout_ms=-1,
                 rebalance_max_retries=5,
                 rebalance_backoff_ms=2 * 1000,
                 zookeeper_connection_timeout_ms=6 * 1000,
                 zookeeper_connect=None,
                 zookeeper_hosts='127.0.0.1:2181',
                 zookeeper=None,
                 auto_start=True,
                 reset_offset_on_start=False,
                 post_rebalance_callback=None,
                 use_rdkafka=False,
                 compacted_topic=False,
                 membership_protocol=RangeProtocol,
                 deserializer=None,
                 reset_offset_on_fetch=True):
        """Create a BalancedConsumer instance

        :param topic: The topic this consumer should consume
        :type topic: :class:`pykafka.topic.Topic`
        :param cluster: The cluster to which this consumer should connect
        :type cluster: :class:`pykafka.cluster.Cluster`
        :param consumer_group: The name of the consumer group this consumer
            should join. Consumer group names are namespaced at the cluster level,
            meaning that two consumers consuming different topics with the same group name
            will be treated as part of the same group.
        :type consumer_group: str
        :param fetch_message_max_bytes: The number of bytes of messages to
            attempt to fetch with each fetch request
        :type fetch_message_max_bytes: int
        :param num_consumer_fetchers: The number of workers used to make
            FetchRequests
        :type num_consumer_fetchers: int
        :param auto_commit_enable: If true, periodically commit to kafka the
            offset of messages already returned from consume() calls. Requires that
            `consumer_group` is not `None`.
        :type auto_commit_enable: bool
        :param auto_commit_interval_ms: The frequency (in milliseconds) at which
            the consumer's offsets are committed to kafka. This setting is
            ignored if `auto_commit_enable` is `False`.
        :type auto_commit_interval_ms: int
        :param queued_max_messages: The maximum number of messages buffered for
            consumption in the internal
            :class:`pykafka.simpleconsumer.SimpleConsumer`
        :type queued_max_messages: int
        :param fetch_min_bytes: The minimum amount of data (in bytes) that the
            server should return for a fetch request. If insufficient data is
            available, the request will block until sufficient data is available.
        :type fetch_min_bytes: int
        :param fetch_error_backoff_ms: *UNUSED*.
            See :class:`pykafka.simpleconsumer.SimpleConsumer`.
        :type fetch_error_backoff_ms: int
        :param fetch_wait_max_ms: The maximum amount of time (in milliseconds)
            that the server will block before answering a fetch request if
            there isn't sufficient data to immediately satisfy `fetch_min_bytes`.
        :type fetch_wait_max_ms: int
        :param offsets_channel_backoff_ms: Backoff time to retry failed offset
            commits and fetches.
        :type offsets_channel_backoff_ms: int
        :param offsets_commit_max_retries: The number of times the offset commit
            worker should retry before raising an error.
        :type offsets_commit_max_retries: int
        :param auto_offset_reset: What to do if an offset is out of range. This
            setting indicates how to reset the consumer's internal offset
            counter when an `OffsetOutOfRangeError` is encountered.
        :type auto_offset_reset: :class:`pykafka.common.OffsetType`
        :param consumer_timeout_ms: Amount of time (in milliseconds) the
            consumer may spend without messages available for consumption
            before returning None.
        :type consumer_timeout_ms: int
        :param rebalance_max_retries: The number of times the rebalance should
            retry before raising an error.
        :type rebalance_max_retries: int
        :param rebalance_backoff_ms: Backoff time (in milliseconds) between
            retries during rebalance.
        :type rebalance_backoff_ms: int
        :param zookeeper_connection_timeout_ms: The maximum time (in
            milliseconds) that the consumer waits while establishing a
            connection to zookeeper.
        :type zookeeper_connection_timeout_ms: int
        :param zookeeper_connect: Deprecated::2.7,3.6 Comma-separated
            (ip1:port1,ip2:port2) string indicating the zookeeper nodes to which
            to connect.
        :type zookeeper_connect: str
        :param zookeeper_hosts: KazooClient-formatted string of ZooKeeper hosts to which
            to connect.
        :type zookeeper_hosts: str
        :param zookeeper: A KazooClient connected to a Zookeeper instance.
            If provided, `zookeeper_connect` is ignored.
        :type zookeeper: :class:`kazoo.client.KazooClient`
        :param auto_start: Whether the consumer should begin communicating
            with zookeeper after __init__ is complete. If false, communication
            can be started with `start()`.
        :type auto_start: bool
        :param reset_offset_on_start: Whether the consumer should reset its
            internal offset counter to `self._auto_offset_reset` and commit that
            offset immediately upon starting up
        :type reset_offset_on_start: bool
        :param post_rebalance_callback: A function to be called when a rebalance is
            in progress. This function should accept three arguments: the
            :class:`pykafka.balancedconsumer.BalancedConsumer` instance that just
            completed its rebalance, a dict of partitions that it owned before the
            rebalance, and a dict of partitions it owns after the rebalance. These dicts
            map partition ids to the most recently known offsets for those partitions.
            This function can optionally return a dictionary mapping partition ids to
            offsets. If it does, the consumer will reset its offsets to the supplied
            values before continuing consumption.
            Note that the BalancedConsumer is in a poorly defined state at
            the time this callback runs, so that accessing its properties
            (such as `held_offsets` or `partitions`) might yield confusing
            results.  Instead, the callback should really rely on the
            provided partition-id dicts, which are well-defined.
        :type post_rebalance_callback: function
        :param use_rdkafka: Use librdkafka-backed consumer if available
        :type use_rdkafka: bool
        :param compacted_topic: Set to read from a compacted topic. Forces
            consumer to use less stringent message ordering logic because compacted
            topics do not provide offsets in strict incrementing order.
        :type compacted_topic: bool
        :param membership_protocol: The group membership protocol to which this consumer
            should adhere
        :type membership_protocol: :class:`pykafka.membershipprotocol.GroupMembershipProtocol`
        :param deserializer: A function defining how to deserialize messages returned
            from Kafka. A function with the signature d(value, partition_key) that
            returns a tuple of (deserialized_value, deserialized_partition_key). The
            arguments passed to this function are the bytes representations of a
            message's value and partition key, and the returned data should be these
            fields transformed according to the client code's serialization logic.
            See `pykafka.utils.__init__` for stock implementations.
        :type deserializer: function
        :param reset_offset_on_fetch: Whether to update offsets during fetch_offsets.
               Disable for read-only use cases to prevent side-effects.
        :type reset_offset_on_fetch: bool
        """
        self._cluster = cluster
        try:
            self._consumer_group = get_string(consumer_group).encode('ascii')
        except UnicodeEncodeError:
            raise UnicodeException("Consumer group name '{}' contains non-ascii "
                                   "characters".format(consumer_group))
        self._topic = topic

        self._auto_commit_enable = auto_commit_enable
        self._auto_commit_interval_ms = valid_int(auto_commit_interval_ms)
        self._fetch_message_max_bytes = valid_int(fetch_message_max_bytes)
        self._fetch_min_bytes = valid_int(fetch_min_bytes)
        self._rebalance_max_retries = valid_int(rebalance_max_retries, allow_zero=True)
        self._num_consumer_fetchers = valid_int(num_consumer_fetchers)
        self._queued_max_messages = valid_int(queued_max_messages)
        self._fetch_wait_max_ms = valid_int(fetch_wait_max_ms, allow_zero=True)
        self._rebalance_backoff_ms = valid_int(rebalance_backoff_ms)
        self._consumer_timeout_ms = valid_int(consumer_timeout_ms,
                                              allow_zero=True, allow_negative=True)
        self._offsets_channel_backoff_ms = valid_int(offsets_channel_backoff_ms)
        self._offsets_commit_max_retries = valid_int(offsets_commit_max_retries,
                                                     allow_zero=True)
        self._auto_offset_reset = auto_offset_reset
        self._zookeeper_connect = zookeeper_connect or zookeeper_hosts
        self._zookeeper_connection_timeout_ms = valid_int(zookeeper_connection_timeout_ms,
                                                          allow_zero=True)
        self._reset_offset_on_start = reset_offset_on_start
        self._post_rebalance_callback = post_rebalance_callback
        self._generation_id = -1
        self._running = False
        self._worker_exception = None
        self._is_compacted_topic = compacted_topic
        self._membership_protocol = membership_protocol
        self._deserializer = deserializer
        self._reset_offset_on_fetch = reset_offset_on_fetch

        if not rdkafka and use_rdkafka:
            raise ImportError("use_rdkafka requires rdkafka to be installed")
        if GEventHandler and isinstance(self._cluster.handler, GEventHandler) and use_rdkafka:
            raise ImportError("use_rdkafka cannot be used with gevent")
        self._use_rdkafka = rdkafka and use_rdkafka

        self._rebalancing_lock = cluster.handler.Lock()
        self._rebalancing_in_progress = self._cluster.handler.Event()
        self._internal_consumer_running = self._cluster.handler.Event()
        self._consumer = None
        self._consumer_id = get_bytes("{hostname}:{uuid}".format(
            hostname=socket.gethostname(),
            uuid=uuid4()
        ))
        self._setting_watches = True

        self._topic_path = '/consumers/{group}/owners/{topic}'.format(
            group=get_string(self._consumer_group),
            topic=self._topic.name)
        self._consumer_id_path = '/consumers/{group}/ids'.format(
            group=get_string(self._consumer_group))

        self._zookeeper = None
        self._owns_zookeeper = zookeeper is None
        if zookeeper is not None:
            self._zookeeper = zookeeper
        if auto_start is True:
            self.start()

    def __del__(self):
        log.debug("Finalising {}".format(self))
        if self._running:
            self.stop()

    def __repr__(self):
        return "<{module}.{name} at {id_} (consumer_group={group})>".format(
            module=self.__class__.__module__,
            name=self.__class__.__name__,
            id_=hex(id(self)),
            group=get_string(self._consumer_group)
        )

    def _raise_worker_exceptions(self):
        """Raises exceptions encountered on worker threads"""
        if self._worker_exception is not None:
            reraise(*self._worker_exception)

    @property
    def topic(self):
        """The topic this consumer consumes"""
        return self._topic

    @property
    def partitions(self):
        """A list of the partitions that this consumer consumes"""
        return self._consumer.partitions if self._consumer else dict()

    @property
    def _partitions(self):
        """Convenient shorthand for set of partitions internally held"""
        return set(
            [] if self.partitions is None else itervalues(self.partitions))

    @property
    def held_offsets(self):
        """Return a map from partition id to held offset for each partition"""
        if not self._consumer:
            return None
        return self._consumer.held_offsets

    def start(self):
        """Open connections and join a consumer group."""
        try:
            if self._zookeeper is None:
                self._setup_zookeeper(self._zookeeper_connect,
                                      self._zookeeper_connection_timeout_ms)
            self._zookeeper.ensure_path(self._topic_path)
            self._add_self()
            self._running = True
            self._set_watches()
            self._rebalance()
        except Exception:
            log.exception("Stopping consumer in response to error")
            self.stop()

    def stop(self):
        """Close the zookeeper connection and stop consuming.

        This method should be called as part of a graceful shutdown process.
        """
        log.debug("Stopping {}".format(self))
        with self._rebalancing_lock:
            # We acquire the lock in order to prevent a race condition where a
            # rebalance that is already underway might re-register the zk
            # nodes that we remove here
            self._running = False
        if self._consumer is not None:
            self._consumer.stop()
        if self._owns_zookeeper:
            # NB this should always come last, so we do not hand over control
            # of our partitions until consumption has really been halted
            self._zookeeper.stop()
        else:
            self._remove_partitions(self._get_held_partitions())
            try:
                self._zookeeper.delete(self._path_self)
            except NoNodeException:
                pass
        # additionally we'd want to remove watches here, but there are no
        # facilities for that in ChildrenWatch - as a workaround we check
        # self._running in the watcher callbacks (see further down)

    def _setup_zookeeper(self, zookeeper_connect, timeout):
        """Open a connection to a ZooKeeper host.

        :param zookeeper_connect: The 'ip:port' address of the zookeeper node to
            which to connect.
        :type zookeeper_connect: str
        :param timeout: Connection timeout (in milliseconds)
        :type timeout: int
        """
        kazoo_kwargs = {'timeout': timeout / 1000}
        if GEventHandler and isinstance(self._cluster.handler, GEventHandler):
            kazoo_kwargs['handler'] = SequentialGeventHandler()
        self._zookeeper = KazooClient(zookeeper_connect, **kazoo_kwargs)
        self._zookeeper.start()

    def _setup_internal_consumer(self, partitions=None, start=True):
        """Instantiate an internal SimpleConsumer instance"""
        if partitions is None:
            partitions = []
        # Only re-create internal consumer if something changed.
        if partitions != self._partitions:
            cns = self._get_internal_consumer(partitions=list(partitions), start=start)
            if self._post_rebalance_callback is not None:
                old_offsets = (self._consumer.held_offsets
                               if self._consumer else dict())
                new_offsets = cns.held_offsets
                try:
                    reset_offsets = self._post_rebalance_callback(
                        self, old_offsets, new_offsets)
                except Exception:
                    log.exception("post rebalance callback threw an exception")
                    self._worker_exception = sys.exc_info()
                    return False

                if reset_offsets:
                    cns.reset_offsets(partition_offsets=[
                        (cns.partitions[id_], offset) for
                        (id_, offset) in iteritems(reset_offsets)])
            self._consumer = cns
        if self._consumer and self._consumer._running:
            if not self._internal_consumer_running.is_set():
                self._internal_consumer_running.set()
        else:
            if self._internal_consumer_running.is_set():
                self._internal_consumer_running.clear()
        return True

    def _get_internal_consumer(self, partitions=None, start=True):
        """Instantiate a SimpleConsumer for internal use.

        If there is already a SimpleConsumer instance held by this object,
        disable its workers and mark it for garbage collection before
        creating a new one.
        """
        if partitions is None:
            partitions = []
        reset_offset_on_start = self._reset_offset_on_start
        if self._consumer is not None:
            self._consumer.stop()
            # only use this setting for the first call to
            # _get_internal_consumer. subsequent calls should not
            # reset the offsets, since they can happen at any time
            reset_offset_on_start = False
        Cls = (rdkafka.RdKafkaSimpleConsumer
               if self._use_rdkafka else SimpleConsumer)
        cns = Cls(
            self._topic,
            self._cluster,
            consumer_group=self._consumer_group,
            partitions=partitions,
            auto_commit_enable=self._auto_commit_enable,
            auto_commit_interval_ms=self._auto_commit_interval_ms,
            fetch_message_max_bytes=self._fetch_message_max_bytes,
            fetch_min_bytes=self._fetch_min_bytes,
            num_consumer_fetchers=self._num_consumer_fetchers,
            queued_max_messages=self._queued_max_messages,
            fetch_wait_max_ms=self._fetch_wait_max_ms,
            consumer_timeout_ms=self._consumer_timeout_ms,
            offsets_channel_backoff_ms=self._offsets_channel_backoff_ms,
            offsets_commit_max_retries=self._offsets_commit_max_retries,
            auto_offset_reset=self._auto_offset_reset,
            reset_offset_on_start=reset_offset_on_start,
            auto_start=False,
            compacted_topic=self._is_compacted_topic,
            deserializer=self._deserializer,
            reset_offset_on_fetch=self._reset_offset_on_fetch
        )
        cns.consumer_id = self._consumer_id
        cns.generation_id = self._generation_id
        if start:
            cns.start()
        return cns

    def _get_participants(self):
        """Use zookeeper to get the other consumers of this topic.

        :return: A sorted list of the ids of other consumers of this
            consumer's topic
        """
        try:
            consumer_ids = self._zookeeper.get_children(self._consumer_id_path)
        except NoNodeException:
            log.debug("Consumer group doesn't exist. "
                      "No participants to find")
            return []

        participants = []
        for id_ in consumer_ids:
            try:
                topic, stat = self._zookeeper.get("%s/%s" % (self._consumer_id_path, id_))
                if topic == self._topic.name:
                    participants.append(get_bytes(id_))
            except NoNodeException:
                pass  # node disappeared between ``get_children`` and ``get``
        participants = sorted(participants)
        return participants

    def _build_watch_callback(self, fn, proxy):
        """Return a function that's safe to use as a ChildrenWatch callback

        Fixes the issue from https://github.com/Parsely/pykafka/issues/345
        """
        def _callback(children):
            # discover whether the referenced object still exists
            try:
                proxy.__repr__()
            except ReferenceError:
                return False
            return fn(proxy, children)
        return _callback

    def _set_watches(self):
        """Set watches in zookeeper that will trigger rebalances.

        Rebalances should be triggered whenever a broker, topic, or consumer
        znode is changed in zookeeper. This ensures that the balance of the
        consumer group remains up-to-date with the current state of the
        cluster.
        """
        proxy = weakref.proxy(self)
        _brokers_changed = self._build_watch_callback(BalancedConsumer._brokers_changed, proxy)
        _topics_changed = self._build_watch_callback(BalancedConsumer._topics_changed, proxy)
        _consumers_changed = self._build_watch_callback(BalancedConsumer._consumers_changed, proxy)

        self._setting_watches = True
        # Set all our watches and then rebalance
        broker_path = '/brokers/ids'
        try:
            self._broker_watcher = ChildrenWatch(
                self._zookeeper, broker_path,
                _brokers_changed
            )
        except NoNodeException:
            raise Exception(
                'The broker_path "%s" does not exist in your '
                'ZooKeeper cluster -- is your Kafka cluster running?'
                % broker_path)

        self._topics_watcher = ChildrenWatch(
            self._zookeeper,
            '/brokers/topics',
            _topics_changed
        )

        self._consumer_watcher = ChildrenWatch(
            self._zookeeper, self._consumer_id_path,
            _consumers_changed
        )
        self._setting_watches = False

    def _add_self(self):
        """Register this consumer in zookeeper."""
        self._zookeeper.create(
            self._path_self, self._topic.name, ephemeral=True, makepath=True)

    @property
    def _path_self(self):
        """Path where this consumer should be registered in zookeeper"""
        return '{path}/{id_}'.format(
            path=self._consumer_id_path,
            # get_string is necessary to avoid writing literal "b'" to zookeeper
            id_=get_string(self._consumer_id)
        )

    def _update_member_assignment(self):
        """Decide and assign new partitions for this consumer"""
        for i in range(self._rebalance_max_retries):
            try:
                # If retrying, be sure to make sure the
                # partition allocation is correct.
                participants = self._get_participants()
                if self._consumer_id not in participants:
                    # situation that only occurs if our zk session expired
                    self._add_self()
                    participants.append(self._consumer_id)

                new_partitions = self._membership_protocol.decide_partitions(
                    participants, self._topic.partitions, self._consumer_id)
                if not new_partitions:
                    log.warning("No partitions assigned to consumer %s",
                                self._consumer_id)

                # Update zk with any changes:
                # Note that we explicitly fetch our set of held partitions
                # from zk, rather than assuming it will be identical to
                # `self.partitions`.  This covers the (rare) situation
                # where due to an interrupted connection our zk session
                # has expired, in which case we'd hold zero partitions on
                # zk, but `self._partitions` may be outdated and non-empty
                current_zk_parts = self._get_held_partitions()
                self._remove_partitions(current_zk_parts - new_partitions)
                self._add_partitions(new_partitions - current_zk_parts)
                if self._setup_internal_consumer(new_partitions):
                    log.info('Rebalancing Complete.')
                break
            except PartitionOwnedError as ex:
                if i == self._rebalance_max_retries - 1:
                    log.warning('Failed to acquire partition %s after %d retries.',
                                ex.partition, i)
                    raise
                log.info('Unable to acquire partition %s. Retrying', ex.partition)
                self._cluster.handler.sleep(i * (self._rebalance_backoff_ms / 1000))

    def _rebalance(self):
        """Start the rebalancing process for this consumer

        This method is called whenever a zookeeper watch is triggered.
        """
        # This Event is used to notify SimpleConsumer's consume() that a rebalance is in progress.
        if not self._rebalancing_in_progress.is_set():
            self._rebalancing_in_progress.set()

        if self._consumer is not None:
            self.commit_offsets()
        # this is necessary because we can't stop() while the lock is held
        # (it's not an RLock)
        with self._rebalancing_lock:
            if not self._running:
                raise ConsumerStoppedException
            log.info('Rebalancing consumer "%s" for topic "%s".' % (
                self._consumer_id, self._topic.name))
            self._update_member_assignment()

        if self._rebalancing_in_progress.is_set():
            self._rebalancing_in_progress.clear()

    def _path_from_partition(self, p):
        """Given a partition, return its path in zookeeper.

        :type p: :class:`pykafka.partition.Partition`
        """
        return "%s/%s-%s" % (self._topic_path, p.leader.id, p.id)

    def _remove_partitions(self, partitions):
        """Remove partitions from the zookeeper registry for this consumer.

        :param partitions: The partitions to remove.
        :type partitions: Iterable of :class:`pykafka.partition.Partition`
        """
        for p in partitions:
            try:
                # TODO pass zk node version to make sure we still own this node
                self._zookeeper.delete(self._path_from_partition(p))
            except NoNodeException:
                pass

    def _add_partitions(self, partitions):
        """Add partitions to the zookeeper registry for this consumer.

        :param partitions: The partitions to add.
        :type partitions: Iterable of :class:`pykafka.partition.Partition`
        """
        for p in partitions:
            try:
                self._zookeeper.create(
                    self._path_from_partition(p),
                    value=get_bytes(self._consumer_id),
                    ephemeral=True
                )
            except NodeExistsError:
                raise PartitionOwnedError(p)

    def _get_held_partitions(self):
        """Build a set of partitions zookeeper says we own"""
        zk_partition_ids = set()
        all_partitions = self._zookeeper.get_children(self._topic_path)
        for partition_slug in all_partitions:
            try:
                owner_id, stat = self._zookeeper.get(
                    '{path}/{slug}'.format(
                        path=self._topic_path, slug=partition_slug))
                if owner_id == get_bytes(self._consumer_id):
                    zk_partition_ids.add(int(partition_slug.split('-')[1]))
            except NoNodeException:
                pass  # disappeared between ``get_children`` and ``get``
        return set(self._topic.partitions[_id] for _id in zk_partition_ids)

    @_catch_thread_exception
    def _brokers_changed(self, brokers):
        if not self._running:
            return False  # `False` tells ChildrenWatch to disable this watch
        if self._setting_watches:
            return
        log.debug("Rebalance triggered by broker change ({})".format(
            self._consumer_id))
        self._rebalance()

    @_catch_thread_exception
    def _consumers_changed(self, consumers):
        if not self._running:
            return False  # `False` tells ChildrenWatch to disable this watch
        if self._setting_watches:
            return
        log.debug("Rebalance triggered by consumer change ({})".format(
            self._consumer_id))
        self._rebalance()

    @_catch_thread_exception
    def _topics_changed(self, topics):
        if not self._running:
            return False  # `False` tells ChildrenWatch to disable this watch
        if self._setting_watches:
            return
        log.debug("Rebalance triggered by topic change ({})".format(
            self._consumer_id))
        self._rebalance()

    def reset_offsets(self, partition_offsets=None):
        """Reset offsets for the specified partitions

        For each value provided in `partition_offsets`: if the value is an integer,
        immediately reset the partition's internal offset counter to that value. If
        it's a `datetime.datetime` instance or a valid `OffsetType`, issue a
        `ListOffsetRequest` using that timestamp value to discover the latest offset
        in the latest log segment before that timestamp, then set the partition's
        internal counter to that value.

        :param partition_offsets: (`partition`, `timestamp_or_offset`) pairs to
            reset where `partition` is the partition for which to reset the offset
            and `timestamp_or_offset` is EITHER the timestamp before which to find
            a valid offset to set the partition's counter to OR the new offset the
            partition's counter should be set to
        :type partition_offsets: Sequence of tuples of the form
            (:class:`pykafka.partition.Partition`, int OR `datetime.datetime`)
        """
        self._raise_worker_exceptions()
        if not self._consumer:
            raise ConsumerStoppedException("Internal consumer is stopped")
        self._consumer.reset_offsets(partition_offsets=partition_offsets)

    def consume(self, block=True):
        """Get one message from the consumer

        :param block: Whether to block while waiting for a message
        :type block: bool
        """

        def consumer_timed_out():
            """Indicates whether the consumer has received messages recently"""
            if self._consumer_timeout_ms == -1:
                return False
            disp = (time.time() - self._last_message_time) * 1000.0
            return disp > self._consumer_timeout_ms
        message = None
        self._last_message_time = time.time()
        while message is None and not consumer_timed_out():
            if not self._internal_consumer_running.is_set():
                self._cluster.handler.sleep()
                self._raise_worker_exceptions()
                self._internal_consumer_running.wait(self._consumer_timeout_ms / 1000)
            try:
                # acquire the lock to ensure that we don't start trying to consume from
                # a _consumer that might soon be replaced by an in-progress rebalance
                with self._rebalancing_lock:
                    message = self._consumer.consume(block=block, unblock_event=self._rebalancing_in_progress)

                # If Gevent is used, waiting to acquire _rebalancing lock introduces a race condition.
                # This sleep would ensure that the _rebalance method acquires the _rebalancing_lock
                # Issue: https://github.com/Parsely/pykafka/issues/671
                if self._rebalancing_in_progress.is_set():
                    self._cluster.handler.sleep()
            except (ConsumerStoppedException, AttributeError):
                if not self._running:
                    raise ConsumerStoppedException
            if message:
                self._last_message_time = time.time()
            if not block:
                return message
        return message

    def __iter__(self):
        """Yield an infinite stream of messages until the consumer times out"""
        while True:
            message = self.consume(block=True)
            if not message:
                return
            yield message

    def commit_offsets(self, partition_offsets=None):
        """Commit offsets for this consumer's partitions

        Uses the offset commit/fetch API

        :param partition_offsets: (`partition`, `offset`) pairs to
            commit where `partition` is the partition for which to commit the offset
            and `offset` is the offset to commit for the partition. Note that using
            this argument when `auto_commit_enable` is enabled can cause inconsistencies
            in committed offsets. For best results, use *either* this argument *or*
            `auto_commit_enable`.
        :type partition_offsets: Sequence of tuples of the form
            (:class:`pykafka.partition.Partition`, int)
        """
        self._raise_worker_exceptions()
        if not self._consumer:
            raise KafkaException("Cannot commit offsets - consumer not started")
        return self._consumer.commit_offsets(partition_offsets=partition_offsets)
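A BalancedConsumer is normally obtained through pykafka's Topic.get_balanced_consumer helper rather than constructed directly. A minimal consumption loop under that assumption; the broker and ZooKeeper addresses, topic name, and group name below are placeholders.

from pykafka import KafkaClient

client = KafkaClient(hosts='127.0.0.1:9092')        # placeholder broker
topic = client.topics[b'my.topic']                  # placeholder topic name

consumer = topic.get_balanced_consumer(
    consumer_group=b'my-group',                     # placeholder group name
    zookeeper_connect='127.0.0.1:2181',             # placeholder ZooKeeper
    auto_commit_enable=True,
)
try:
    # Iterates until stop() is called, or until the consumer times out if
    # consumer_timeout_ms has been set to a non-negative value.
    for message in consumer:
        if message is not None:
            print(message.offset, message.value)
finally:
    consumer.stop()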
Beispiel #56
0
class ZooKeeper(AbstractDCS):
    def __init__(self, config):
        super(ZooKeeper, self).__init__(config)

        hosts = config.get('hosts', [])
        if isinstance(hosts, list):
            hosts = ','.join(hosts)

        self._client = KazooClient(hosts,
                                   handler=PatroniSequentialThreadingHandler(
                                       config['retry_timeout']),
                                   timeout=config['ttl'],
                                   connection_retry={
                                       'max_delay': 1,
                                       'max_tries': -1
                                   },
                                   command_retry={
                                       'deadline': config['retry_timeout'],
                                       'max_delay': 1,
                                       'max_tries': -1
                                   })
        self._client.add_listener(self.session_listener)

        self._my_member_data = None
        self._fetch_cluster = True

        self._orig_kazoo_connect = self._client._connection._connect
        self._client._connection._connect = self._kazoo_connect

        self._client.start()

    def _kazoo_connect(self, host, port):
        """Kazoo is using Ping's to determine health of connection to zookeeper. If there is no
        response on Ping after Ping interval (1/2 from read_timeout) it will consider current
        connection dead and try to connect to another node. Without this "magic" it was taking
        up to 2/3 from session timeout (ttl) to figure out that connection was dead and we had
        only small time for reconnect and retry.

        This method is needed to return different value of read_timeout, which is not calculated
        from negotiated session timeout but from value of `loop_wait`. And it is 2 sec smaller
        than loop_wait, because we can spend up to 2 seconds when calling `touch_member()` and
        `write_leader_optime()` methods, which also may hang..."""

        ret = self._orig_kazoo_connect(host, port)
        return max(self.loop_wait - 2, 2) * 1000, ret[1]

    def session_listener(self, state):
        if state in [KazooState.SUSPENDED, KazooState.LOST]:
            self.cluster_watcher(None)

    def cluster_watcher(self, event):
        self._fetch_cluster = True
        self.event.set()

    def reload_config(self, config):
        self.set_retry_timeout(config['retry_timeout'])

        loop_wait = config['loop_wait']

        loop_wait_changed = self._loop_wait != loop_wait
        self._loop_wait = loop_wait
        self._client.handler.set_connect_timeout(loop_wait)

        # We need to reestablish the connection to zookeeper if we want to change
        # read_timeout (and the Ping interval with it), because read_timeout is
        # calculated in the `_kazoo_connect` method. If we are changing ttl at the
        # same time, the set_ttl method will reestablish the connection and return
        # `!True`; otherwise we close the existing connection and let kazoo open a
        # new one.
        if not self.set_ttl(int(config['ttl'] * 1000)) and loop_wait_changed:
            self._client._connection._socket.close()

    def set_ttl(self, ttl):
        """It is not possible to change ttl (session_timeout) in zookeeper without
        destroying old session and creating the new one. This method returns `!True`
        if session_timeout has been changed (`restart()` has been called)."""
        if self._client._session_timeout != ttl:
            self._client._session_timeout = ttl
            self._client.restart()
            return True

    def set_retry_timeout(self, retry_timeout):
        self._client._retry.deadline = retry_timeout

    def get_node(self, key, watch=None):
        try:
            ret = self._client.get(key, watch)
            return (ret[0].decode('utf-8'), ret[1])
        except NoNodeError:
            return None

    @staticmethod
    def member(name, value, znode):
        return Member.from_node(znode.version, name, znode.ephemeralOwner,
                                value)

    def get_children(self, key, watch=None):
        try:
            return self._client.get_children(key, watch)
        except NoNodeError:
            return []

    def load_members(self):
        members = []
        for member in self.get_children(self.members_path,
                                        self.cluster_watcher):
            data = self.get_node(self.members_path + member)
            if data is not None:
                members.append(self.member(member, *data))
        return members

    def _inner_load_cluster(self):
        self._fetch_cluster = False
        self.event.clear()
        nodes = set(
            self.get_children(self.client_path(''), self.cluster_watcher))
        if not nodes:
            self._fetch_cluster = True

        # get initialize flag
        initialize = (self.get_node(self.initialize_path)
                      or [None])[0] if self._INITIALIZE in nodes else None

        # get global dynamic configuration
        config = self.get_node(
            self.config_path,
            watch=self.cluster_watcher) if self._CONFIG in nodes else None
        config = config and ClusterConfig.from_node(config[1].version,
                                                    config[0], config[1].mzxid)

        # get last leader operation
        last_leader_operation = self._OPTIME in nodes and self._fetch_cluster and self.get_node(
            self.leader_optime_path)
        last_leader_operation = last_leader_operation and int(
            last_leader_operation[0]) or 0

        # get list of members
        members = self.load_members() if self._MEMBERS[:-1] in nodes else []

        # get leader
        leader = self.get_node(
            self.leader_path) if self._LEADER in nodes else None
        if leader:
            client_id = self._client.client_id
            if not self._ctl and leader[0] == self._name and client_id is not None \
                    and client_id[0] != leader[1].ephemeralOwner:
                logger.info(
                    'I am leader but not owner of the session. Removing leader node'
                )
                self._client.delete(self.leader_path)
                leader = None

            if leader:
                member = Member(-1, leader[0], None, {})
                member = ([m for m in members if m.name == leader[0]]
                          or [member])[0]
                leader = Leader(leader[1].version, leader[1].ephemeralOwner,
                                member)
                self._fetch_cluster = member.index == -1

        # failover key
        failover = self.get_node(
            self.failover_path,
            watch=self.cluster_watcher) if self._FAILOVER in nodes else None
        failover = failover and Failover.from_node(failover[1].version,
                                                   failover[0])

        self._cluster = Cluster(initialize, config, leader,
                                last_leader_operation, members, failover)

    def _load_cluster(self):
        if self._fetch_cluster or self._cluster is None:
            try:
                self._client.retry(self._inner_load_cluster)
            except Exception:
                logger.exception('get_cluster')
                self.cluster_watcher(None)
                raise ZooKeeperError('ZooKeeper is not responding properly')

    def _create(self, path, value, **kwargs):
        try:
            self._client.retry(self._client.create, path,
                               value.encode('utf-8'), **kwargs)
            return True
        except:
            return False

    def attempt_to_acquire_leader(self, permanent=False):
        ret = self._create(self.leader_path,
                           self._name,
                           makepath=True,
                           ephemeral=not permanent)
        if not ret:
            logger.info('Could not take out TTL lock')
        return ret

    def set_failover_value(self, value, index=None):
        try:
            self._client.retry(self._client.set,
                               self.failover_path,
                               value.encode('utf-8'),
                               version=index or -1)
            return True
        except NoNodeError:
            return value == '' or (index is None
                                   and self._create(self.failover_path, value))
        except:
            logger.exception('set_failover_value')
            return False

    def set_config_value(self, value, index=None):
        try:
            self._client.retry(self._client.set,
                               self.config_path,
                               value.encode('utf-8'),
                               version=index or -1)
            return True
        except NoNodeError:
            return index is None and self._create(self.config_path, value)
        except Exception:
            logger.exception('set_config_value')
            return False

    def initialize(self, create_new=True, sysid=""):
        return self._create(self.initialize_path, sysid, makepath=True) if create_new \
            else self._client.retry(self._client.set, self.initialize_path,  sysid.encode("utf-8"))

    def touch_member(self, data, ttl=None, permanent=False):
        cluster = self.cluster
        member = cluster and cluster.get_member(self._name,
                                                fallback_to_leader=False)
        data = data.encode('utf-8')
        if member and self._client.client_id is not None and member.session != self._client.client_id[
                0]:
            try:
                self._client.delete_async(self.member_path).get(timeout=1)
            except NoNodeError:
                pass
            except:
                return False
            member = None

        if member:
            if data == self._my_member_data:
                return True
        else:
            try:
                self._client.create_async(
                    self.member_path,
                    data,
                    makepath=True,
                    ephemeral=not permanent).get(timeout=1)
                self._my_member_data = data
                return True
            except Exception as e:
                if not isinstance(e, NodeExistsError):
                    logger.exception('touch_member')
                    return False
        try:
            self._client.set_async(self.member_path, data).get(timeout=1)
            self._my_member_data = data
            return True
        except:
            logger.exception('touch_member')

        return False

    def take_leader(self):
        return self.attempt_to_acquire_leader()

    def _write_leader_optime(self, last_operation):
        last_operation = last_operation.encode('utf-8')
        try:
            self._client.set_async(self.leader_optime_path,
                                   last_operation).get(timeout=1)
            return True
        except NoNodeError:
            try:
                self._client.create_async(self.leader_optime_path,
                                          last_operation,
                                          makepath=True).get(timeout=1)
                return True
            except:
                logger.exception('Failed to create %s',
                                 self.leader_optime_path)
        except:
            logger.exception('Failed to update %s', self.leader_optime_path)
        return False

    def update_leader(self):
        return True

    def delete_leader(self):
        self._client.restart()
        self._my_member_data = None
        return True

    def _cancel_initialization(self):
        node = self.get_node(self.initialize_path)
        if node:
            self._client.delete(self.initialize_path, version=node[1].version)

    def cancel_initialization(self):
        try:
            self._client.retry(self._cancel_initialization)
        except:
            logger.exception("Unable to delete initialize key")

    def delete_cluster(self):
        try:
            return self._client.retry(self._client.delete,
                                      self.client_path(''),
                                      recursive=True)
        except NoNodeError:
            return True

    def watch(self, timeout):
        if super(ZooKeeper, self).watch(timeout):
            self._fetch_cluster = True
        return self._fetch_cluster
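The leader handling above relies on an ephemeral leader key whose ephemeralOwner must match the current session. That pattern can be illustrated with plain kazoo, outside of Patroni; the following is only a sketch under assumed names (the path, member name and host are hypothetical, and none of this is Patroni's actual API).

from kazoo.client import KazooClient
from kazoo.exceptions import NodeExistsError, NoNodeError

LEADER_PATH = "/demo/service/leader"   # hypothetical path
MY_NAME = "node-1"                     # hypothetical member name

zk = KazooClient(hosts="127.0.0.1:2181")
zk.start()

def attempt_to_acquire_leader():
    """Try to take the leader lock by creating an ephemeral znode."""
    try:
        zk.create(LEADER_PATH, MY_NAME.encode("utf-8"),
                  ephemeral=True, makepath=True)
        return True
    except NodeExistsError:
        return False

def owns_leader_key():
    """Check that the existing leader key belongs to this member and session."""
    try:
        value, stat = zk.get(LEADER_PATH)
    except NoNodeError:
        return False
    session_id = zk.client_id[0] if zk.client_id else None
    return value.decode("utf-8") == MY_NAME and stat.ephemeralOwner == session_id

if attempt_to_acquire_leader() or owns_leader_key():
    print("acting as leader")
zk.stop()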
Beispiel #57
0
class ZkStateManager(StateManager):
  """
  State manager which connects to zookeeper and
  gets and sets states from there.
  """

  def __init__(self, name, host, port, rootpath, tunnelhost):
    self.name = name
    self.host = host
    self.port = port
    self.tunnelhost = tunnelhost
    self.rootpath = rootpath
    self.hostport = self.host + ":" + str(self.port)  # used by start() when directly reachable

  def start(self):
    """ state Zookeeper """
    if self.is_host_port_reachable():
      self.client = KazooClient(self.hostport)
    else:
      localport = self.establish_ssh_tunnel()
      self.client = KazooClient("localhost:" + str(localport))
    self.client.start()

    def on_connection_change(state):
      """ callback to log """
      LOG.info("Connection state changed to: " + state)
    self.client.add_listener(on_connection_change)

  def stop(self):
    """ stop Zookeeper """
    self.client.stop()
    self.terminate_ssh_tunnel()

  # pylint: disable=function-redefined
  def get_topologies(self, callback=None):
    """ get topologies """
    isWatching = False

    # Temp dict used to return result
    # if callback is not provided.
    ret = {
        "result": None
    }
    if callback:
      isWatching = True
    else:
      def callback(data):
        """Custom callback to get the topologies right now."""
        ret["result"] = data

    try:
      self._get_topologies_with_watch(callback, isWatching)
    except NoNodeError as err:
      self.client.stop()
      path = self.get_topologies_path()
      raise StateException("Error required topology path '%s' not found" % (path),
                           StateException.EX_TYPE_NO_NODE_ERROR), None, sys.exc_info()[2]

    # The topologies are now populated with the data.
    return ret["result"]

  def _get_topologies_with_watch(self, callback, isWatching):
    """
    Helper function to get topologies with
    a callback. The future watch is placed
    only if isWatching is True.
    """
    path = self.get_topologies_path()
    if isWatching:
      LOG.info("Adding children watch for path: " + path)

    # pylint: disable=unused-variable
    @self.client.ChildrenWatch(path)
    def watch_topologies(topologies):
      """ callback to watch topologies """
      callback(topologies)

      # Returning False will result in no future watches
      # being triggered. If isWatching is True, then
      # the future watches will be triggered.
      return isWatching

  def get_topology(self, topologyName, callback=None):
    """ get topologies """
    isWatching = False

    # Temp dict used to return result
    # if callback is not provided.
    ret = {
        "result": None
    }
    if callback:
      isWatching = True
    else:
      def callback(data):
        """Custom callback to get the topologies right now."""
        ret["result"] = data

    self._get_topology_with_watch(topologyName, callback, isWatching)

    # The topologies are now populated with the data.
    return ret["result"]

  def _get_topology_with_watch(self, topologyName, callback, isWatching):
    """
    Helper function to get topology with
    a callback. The future watch is placed
    only if isWatching is True.
    """
    path = self.get_topology_path(topologyName)
    if isWatching:
      LOG.info("Adding data watch for path: " + path)

    # pylint: disable=unused-variable, unused-argument
    @self.client.DataWatch(path)
    def watch_topology(data, stats):
      """ watch topology """
      if data:
        topology = Topology()
        topology.ParseFromString(data)
        callback(topology)
      else:
        callback(None)

      # Returning False will result in no future watches
      # being triggered. If isWatching is True, then
      # the future watches will be triggered.
      return isWatching

  def create_topology(self, topologyName, topology):
    """ crate topology """
    if not topology or not topology.IsInitialized():
      raise StateException("Topology protobuf not init properly",
                           StateException.EX_TYPE_PROTOBUF_ERROR), None, sys.exc_info()[2]

    path = self.get_topology_path(topologyName)
    LOG.info("Adding topology: {0} to path: {1}".format(
        topologyName, path))
    topologyString = topology.SerializeToString()
    try:
      self.client.create(path, value=topologyString, makepath=True)
      return True
    except NoNodeError:
      raise StateException("NoNodeError while creating topology",
                           StateException.EX_TYPE_NO_NODE_ERROR), None, sys.exc_info()[2]
    except NodeExistsError:
      raise StateException("NodeExistsError while creating topology",
                           StateException.EX_TYPE_NODE_EXISTS_ERROR), None, sys.exc_info()[2]
    except ZookeeperError:
      raise StateException("Zookeeper while creating topology",
                           StateException.EX_TYPE_ZOOKEEPER_ERROR), None, sys.exc_info()[2]
    except Exception:
      # Just re raise the exception.
      raise

  def delete_topology(self, topologyName):
    """ delete topology """
    path = self.get_topology_path(topologyName)
    LOG.info("Removing topology: {0} from path: {1}".format(
        topologyName, path))
    try:
      self.client.delete(path)
      return True
    except NoNodeError:
      raise StateException("NoNodeError while deteling topology",
                           StateException.EX_TYPE_NO_NODE_ERROR), None, sys.exc_info()[2]
    except NotEmptyError:
      raise StateException("NotEmptyError while deleting topology",
                           StateException.EX_TYPE_NOT_EMPTY_ERROR), None, sys.exc_info()[2]
    except ZookeeperError:
      raise StateException("Zookeeper while deleting topology",
                           StateException.EX_TYPE_ZOOKEEPER_ERROR), None, sys.exc_info()[2]
    except Exception:
      # Just re raise the exception.
      raise

  def get_pplan(self, topologyName, callback=None):
    """ get physical plan """
    isWatching = False

    # Temp dict used to return result
    # if callback is not provided.
    ret = {
        "result": None
    }
    if callback:
      isWatching = True
    else:
      def callback(data):
        """
        Custom callback to get the pplan right now.
        """
        ret["result"] = data

    self._get_pplan_with_watch(topologyName, callback, isWatching)

    # The topologies are now populated with the data.
    return ret["result"]

  def _get_pplan_with_watch(self, topologyName, callback, isWatching):
    """
    Helper function to get pplan with
    a callback. The future watch is placed
    only if isWatching is True.
    """
    path = self.get_pplan_path(topologyName)
    if isWatching:
      LOG.info("Adding data watch for path: " + path)

    # pylint: disable=unused-variable, unused-argument
    @self.client.DataWatch(path)
    def watch_pplan(data, stats):
      """ invoke callback to watch physical plan """
      if data:
        pplan = PhysicalPlan()
        pplan.ParseFromString(data)
        callback(pplan)
      else:
        callback(None)

      # Returning False will result in no future watches
      # being triggered. If isWatching is True, then
      # the future watches will be triggered.
      return isWatching

  def create_pplan(self, topologyName, pplan):
    """ create physical plan """
    if not pplan or not pplan.IsInitialized():
      raise StateException("Physical Plan protobuf not init properly",
                           StateException.EX_TYPE_PROTOBUF_ERROR), None, sys.exc_info()[2]

    path = self.get_pplan_path(topologyName)
    LOG.info("Adding topology: {0} to path: {1}".format(
        topologyName, path))
    pplanString = pplan.SerializeToString()
    try:
      self.client.create(path, value=pplanString, makepath=True)
      return True
    except NoNodeError:
      raise StateException("NoNodeError while creating pplan",
                           StateException.EX_TYPE_NO_NODE_ERROR), None, sys.exc_info()[2]
    except NodeExistsError:
      raise StateException("NodeExistsError while creating pplan",
                           StateException.EX_TYPE_NODE_EXISTS_ERROR), None, sys.exc_info()[2]
    except ZookeeperError:
      raise StateException("Zookeeper while creating pplan",
                           StateException.EX_TYPE_ZOOKEEPER_ERROR), None, sys.exc_info()[2]
    except Exception:
      # Just re raise the exception.
      raise

  def delete_pplan(self, topologyName):
    """ delete physical plan info """
    path = self.get_pplan_path(topologyName)
    LOG.info("Removing topology: {0} from path: {1}".format(
        topologyName, path))
    try:
      self.client.delete(path)
      return True
    except NoNodeError:
      raise StateException("NoNodeError while deleting pplan",
                           StateException.EX_TYPE_NO_NODE_ERROR), None, sys.exc_info()[2]
    except NotEmptyError:
      raise StateException("NotEmptyError while deleting pplan",
                           StateException.EX_TYPE_NOT_EMPTY_ERROR), None, sys.exc_info()[2]
    except ZookeeperError:
      raise StateException("Zookeeper while deleting pplan",
                           StateException.EX_TYPE_ZOOKEEPER_ERROR), None, sys.exc_info()[2]
    except Exception:
      # Just re raise the exception.
      raise

  def get_execution_state(self, topologyName, callback=None):
    """ get execution state """
    isWatching = False

    # Temp dict used to return result
    # if callback is not provided.
    ret = {
        "result": None
    }
    if callback:
      isWatching = True
    else:
      def callback(data):
        """
        Custom callback to get the execution state right now.
        """
        ret["result"] = data

    self._get_execution_state_with_watch(topologyName, callback, isWatching)

    # The topologies are now populated with the data.
    return ret["result"]

  def _get_execution_state_with_watch(self, topologyName, callback, isWatching):
    """
    Helper function to get execution state with
    a callback. The future watch is placed
    only if isWatching is True.
    """
    path = self.get_execution_state_path(topologyName)
    if isWatching:
      LOG.info("Adding data watch for path: " + path)

    # pylint: disable=unused-variable, unused-argument
    @self.client.DataWatch(path)
    def watch_execution_state(data, stats):
      """ invoke callback to watch execute state """
      if data:
        executionState = ExecutionState()
        executionState.ParseFromString(data)
        callback(executionState)
      else:
        callback(None)

      # Returning False will result in no future watches
      # being triggered. If isWatching is True, then
      # the future watches will be triggered.
      return isWatching

  def create_execution_state(self, topologyName, executionState):
    """ create execution state """
    if not executionState or not executionState.IsInitialized():
      raise StateException("Execution State protobuf not init properly",
                           StateException.EX_TYPE_PROTOBUF_ERROR), None, sys.exc_info()[2]

    path = self.get_execution_state_path(topologyName)
    LOG.info("Adding topology: {0} to path: {1}".format(
        topologyName, path))
    executionStateString = executionState.SerializeToString()
    try:
      self.client.create(path, value=executionStateString, makepath=True)
      return True
    except NoNodeError:
      raise StateException("NoNodeError while creating execution state",
                           StateException.EX_TYPE_NO_NODE_ERROR), None, sys.exc_info()[2]
    except NodeExistsError:
      raise StateException("NodeExistsError while creating execution state",
                           StateException.EX_TYPE_NODE_EXISTS_ERROR), None, sys.exc_info()[2]
    except ZookeeperError:
      raise StateException("Zookeeper while creating execution state",
                           StateException.EX_TYPE_ZOOKEEPER_ERROR), None, sys.exc_info()[2]
    except Exception:
      # Just re raise the exception.
      raise

  def delete_execution_state(self, topologyName):
    """ delete execution state """
    path = self.get_execution_state_path(topologyName)
    LOG.info("Removing topology: {0} from path: {1}".format(
        topologyName, path))
    try:
      self.client.delete(path)
      return True
    except NoNodeError:
      raise StateException("NoNodeError while deleting execution state",
                           StateException.EX_TYPE_NO_NODE_ERROR), None, sys.exc_info()[2]
    except NotEmptyError:
      raise StateException("NotEmptyError while deleting execution state",
                           StateException.EX_TYPE_NOT_EMPTY_ERROR), None, sys.exc_info()[2]
    except ZookeeperError:
      raise StateException("Zookeeper while deleting execution state",
                           StateException.EX_TYPE_ZOOKEEPER_ERROR), None, sys.exc_info()[2]
    except Exception:
      # Just re raise the exception.
      raise

  def get_tmaster(self, topologyName, callback=None):
    """ get tmaster """
    isWatching = False

    # Temp dict used to return result
    # if callback is not provided.
    ret = {
        "result": None
    }
    if callback:
      isWatching = True
    else:
      def callback(data):
        """
        Custom callback to get the tmaster right now.
        """
        ret["result"] = data

    self._get_tmaster_with_watch(topologyName, callback, isWatching)

    # The topologies are now populated with the data.
    return ret["result"]

  def _get_tmaster_with_watch(self, topologyName, callback, isWatching):
    """
    Helper function to get tmaster with
    a callback. The future watch is placed
    only if isWatching is True.
    """
    path = self.get_tmaster_path(topologyName)
    if isWatching:
      LOG.info("Adding data watch for path: " + path)

    # pylint: disable=unused-variable, unused-argument
    @self.client.DataWatch(path)
    def watch_tmaster(data, stats):
      """ invoke callback to watch tmaster """
      if data:
        tmaster = TMasterLocation()
        tmaster.ParseFromString(data)
        callback(tmaster)
      else:
        callback(None)

      # Returning False will result in no future watches
      # being triggered. If isWatching is True, then
      # the future watches will be triggered.
      return isWatching

  def get_scheduler_location(self, topologyName, callback=None):
    """ get scheduler location """
    isWatching = False

    # Temp dict used to return result
    # if callback is not provided.
    ret = {
        "result": None
    }
    if callback:
      isWatching = True
    else:
      def callback(data):
        """
        Custom callback to get the scheduler location right now.
        """
        ret["result"] = data

    self._get_scheduler_location_with_watch(topologyName, callback, isWatching)

    return ret["result"]

  def _get_scheduler_location_with_watch(self, topologyName, callback, isWatching):
    """
    Helper function to get scheduler location with
    a callback. The future watch is placed
    only if isWatching is True.
    """
    path = self.get_scheduler_location_path(topologyName)
    if isWatching:
      LOG.info("Adding data watch for path: " + path)

    # pylint: disable=unused-variable, unused-argument
    @self.client.DataWatch(path)
    def watch_scheduler_location(data, stats):
      """ invoke callback to watch scheduler location """
      if data:
        scheduler_location = SchedulerLocation()
        scheduler_location.ParseFromString(data)
        callback(scheduler_location)
      else:
        callback(None)

      # Returning False will result in no future watches
      # being triggered. If isWatching is True, then
      # the future watches will be triggered.
      return isWatching
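All of the *_with_watch helpers above rely on the same kazoo behaviour: a DataWatch (or ChildrenWatch) callback is invoked immediately with the current value, and returning False from it cancels any future invocations. A small standalone sketch of that pattern follows; the path is a placeholder, unrelated to Heron's state paths.

from kazoo.client import KazooClient

zk = KazooClient(hosts="127.0.0.1:2181")
zk.start()
zk.ensure_path("/demo/state")            # placeholder node

def fetch_once(path):
    """Read a node once by registering a DataWatch that cancels itself."""
    result = {"data": None}

    @zk.DataWatch(path)
    def _watch(data, stat):
        result["data"] = data
        return False                      # no future watch events

    return result["data"]

print(fetch_once("/demo/state"))
zk.stop()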
Beispiel #58
0
class Zk(object):
    def __init__(self, run_id, conf):
        self.run_id = run_id
        self.conf = conf
        self._zk = KazooClient(hosts=metadata.zk)
        self._zk.start()

    def clean(self):
        #if self._zk.exists(metadata.topics_path):
        #  self._zk.delete(metadata.topics_path,recursive=True)

        #if self._zk.exists(metadata.leader_path):
        #  self._zk.delete(metadata.leader_path,recursive=True)

        #if self._zk.exists(metadata.rb_path):
        #  self._zk.delete(metadata.rb_path,recursive=True)

        if self._zk.exists(metadata.experiment_path):
            self._zk.delete(metadata.experiment_path, recursive=True)

    def stop(self):
        self._zk.stop()

    def setup(self):
        self.create_paths()
        self.install_watches()

    def wait(self, barrier):
        if (barrier == 'subscriber'):
            self.sub_barrier.wait()
        elif (barrier == 'finished'):
            self.finished_barrier.wait()
        elif (barrier == 'monitoring'):
            self.monitoring_barrier.wait()
        else:
            print('invalid barrier name')

    def create_paths(self):
        #create zk path for subscribers
        for client in self.conf.client_numSubscribers.keys():
            sub_path='%s/%s/sub/region_%s/%s'%\
              (metadata.experiment_path,self.run_id,client[3:client.index('-')],client)
            self._zk.ensure_path(sub_path)

        #create zk path for publishers
        for client in self.conf.client_numPublishers.keys():
            pub_path='%s/%s/pub/region_%s/%s'%\
              (metadata.experiment_path,self.run_id,client[3:client.index('-')],client)
            self._zk.ensure_path(pub_path)

        #create zk path to track joined subscribers and publishers
        joined_sub_path='%s/%s/joined_sub'%\
          (metadata.experiment_path,self.run_id)
        self._zk.ensure_path(joined_sub_path)
        joined_pub_path='%s/%s/joined_pub'%\
          (metadata.experiment_path,self.run_id)
        self._zk.ensure_path(joined_pub_path)

        #create zk path to track subscribers have left
        #self._zk.ensure_path('%s/%s/left_sub'%(metadata.experiment_path,self.run_id))

        #create zk path to track monitoring processes have joined
        eb_monitoring_path = '%s/%s/monitoring/eb' % (metadata.experiment_path,
                                                      self.run_id)
        self._zk.ensure_path(eb_monitoring_path)
        self.eb_monitors_exited = False

        rb_monitoring_path = '%s/%s/monitoring/rb' % (metadata.experiment_path,
                                                      self.run_id)
        self._zk.ensure_path(rb_monitoring_path)
        self.rb_monitors_exited = False

        #create barrier paths
        sub_barrier_path = '%s/%s/barriers/sub' % (metadata.experiment_path,
                                                   self.run_id)
        self._zk.ensure_path(sub_barrier_path)
        pub_barrier_path = '%s/%s/barriers/pub' % (metadata.experiment_path,
                                                   self.run_id)
        self._zk.ensure_path(pub_barrier_path)
        finished_barrier_path = '%s/%s/barriers/finished' % (
            metadata.experiment_path, self.run_id)
        self._zk.ensure_path(finished_barrier_path)
        monitoring_barrier_path = '%s/%s/barriers/monitoring' % (
            metadata.experiment_path, self.run_id)
        self._zk.ensure_path(monitoring_barrier_path)

        #create barriers
        self.sub_barrier = Barrier(client=self._zk, path=sub_barrier_path)
        self.finished_barrier = Barrier(client=self._zk,
                                        path=finished_barrier_path)
        self.pub_barrier = Barrier(client=self._zk, path=pub_barrier_path)
        self.monitoring_barrier = Barrier(client=self._zk,
                                          path=monitoring_barrier_path)

    def install_watches(self):
        region_joined_subscriber_clients={region:0 for region in \
          self.conf.region_clientsubscribers_map.keys()}

        region_joined_publisher_clients={region:0 for region in \
          self.conf.region_clientpublishers_map.keys()}

        def _joined_endpoint_listener(children, event):
            if event and event.type == EventType.CHILD:
                if 'sub' in event.path:
                    client = event.path.rpartition('/')[2]
                    region = client[3:client.index('-')]
                    if (len(children) ==
                            self.conf.client_numSubscribers[client]):
                        print('All subscribers have joined on client:%s\n' %
                              (client))
                        region_joined_subscriber_clients[region] += 1
                        if (region_joined_subscriber_clients[region]==\
                          self.conf.region_clientsubscribers_map[region]):
                            self._zk.ensure_path('%s/%s/joined_sub/region_%s'%\
                              (metadata.experiment_path,self.run_id,region))
                    if (len(children) == 0):
                        print('All subscribers on client:%s have exited\n' %
                              (client))
                        region_joined_subscriber_clients[region] -= 1
                        if (region_joined_subscriber_clients[region] == 0):
                            self._zk.delete('%s/%s/joined_sub/region_%s'\
                              %(metadata.experiment_path,self.run_id,region))
                        return False
                if 'pub' in event.path:
                    client = event.path.rpartition('/')[2]
                    region = client[3:client.index('-')]
                    if (len(children) == self.conf.client_numPublishers[client]
                        ):
                        print('All publishers have joined on client:%s\n' %
                              (client))
                        region_joined_publisher_clients[region] += 1
                        if (region_joined_publisher_clients[region]==\
                          self.conf.region_clientpublishers_map[region]):
                            self._zk.ensure_path('%s/%s/joined_pub/region_%s'%\
                              (metadata.experiment_path,self.run_id,region))
                        return False

        def _open_barrier(children, event):
            if event and event.type == EventType.CHILD:
                if 'joined_sub' in event.path:
                    if (len(children) == len(
                            self.conf.region_clientsubscribers_map)):
                        print(
                            "All subscribers have joined. Opening subscriber barrier\n"
                        )
                        self.sub_barrier.remove()
                    if (len(children) == 0):
                        print(
                            "All subscribers have left. Opening finished barrier\n"
                        )
                        self.finished_barrier.remove()
                        return False
                if 'joined_pub' in event.path:
                    if (len(children) == len(
                            self.conf.region_clientpublishers_map)):
                        print(
                            "All publishers have joined. Opening publisher barrier\n"
                        )
                        self.pub_barrier.remove()
                        return False
                if 'monitoring/eb' in event.path:
                    if (len(children) == 0):
                        print('All eb monitors have exited')
                        self.eb_monitors_exited = True
                        if (self.eb_monitors_exited
                                and self.rb_monitors_exited):
                            print(
                                'All monitors have exited. Opening monitoring barrier'
                            )
                            self.monitoring_barrier.remove()
                        return False
                if 'monitoring/rb' in event.path:
                    if (len(children) == 0):
                        print('All rb monitors have exited')
                        self.rb_monitors_exited = True
                        if (self.eb_monitors_exited
                                and self.rb_monitors_exited):
                            print(
                                'All monitors have exited. Opening monitoring barrier'
                            )
                            self.monitoring_barrier.remove()
                        return False

        sub_barrier_opener_watch=ChildrenWatch(client=self._zk,\
          path='%s/%s/joined_sub'%(metadata.experiment_path,self.run_id),\
          func=_open_barrier,send_event=True)

        pub_barrier_opener_watch=ChildrenWatch(client=self._zk,\
          path='%s/%s/joined_pub'%(metadata.experiment_path,self.run_id),\
          func=_open_barrier,send_event=True)

        eb_monitoring_watch=ChildrenWatch(client=self._zk,\
          path='%s/%s/monitoring/eb'%(metadata.experiment_path,self.run_id),\
          func=_open_barrier,send_event=True)

        rb_monitoring_watch=ChildrenWatch(client=self._zk,\
          path='%s/%s/monitoring/rb'%(metadata.experiment_path,self.run_id),\
          func=_open_barrier,send_event=True)

        joined_sub_watches=[ChildrenWatch(client=self._zk,\
          path='%s/%s/sub/region_%s/%s'%\
            (metadata.experiment_path,self.run_id,client[3:client.index('-')],client),\
          func=_joined_endpoint_listener,send_event=True)\
          for client in self.conf.client_numSubscribers.keys()]

        joined_pub_watches=[ChildrenWatch(client=self._zk,\
          path='%s/%s/pub/region_%s/%s'%\
           (metadata.experiment_path,self.run_id,client[3:client.index('-')],client),\
          func=_joined_endpoint_listener,send_event=True)\
          for client in self.conf.client_numPublishers.keys()]
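The Zk class above coordinates publishers and subscribers with the kazoo Barrier recipe: workers block in wait() and a coordinator lifts the barrier with remove(). A minimal sketch of that recipe on its own (the path is a placeholder):

from kazoo.client import KazooClient
from kazoo.recipe.barrier import Barrier

zk = KazooClient(hosts="127.0.0.1:2181")
zk.start()

barrier = Barrier(client=zk, path="/demo/barriers/start")  # placeholder path
barrier.create()      # put the barrier in place

# Workers elsewhere would call barrier.wait(), which blocks while the
# barrier node exists; the coordinator opens it for everyone:
barrier.remove()
zk.stop()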
Beispiel #59
0
class TestServiceDiscovery(unittest.TestCase):

    def setUp(self):
        self.maxDiff = None
        logging.basicConfig(format="%(asctime)s %(levelname)s %(module)s[%(lineno)d] %(threadName)s %(message)s", level=logging.WARN)
        self.log = logging.getLogger()
        self.basePath = "/discovery_test_%x" % int(time.time())
        self.log.info("Using base path: %s" % self.basePath)
        self.client = KazooClient(hosts="127.0.0.1:2181") 
        self.client.start()
        self._clean()

        self.discovery = ServiceDiscovery(self.client, self.basePath)

    def _clean(self):
        try:
            self.client.delete(self.basePath,recursive=True)
        except NoNodeError:
            pass

    def tearDown(self):
        self.discovery.close()
        self._clean()
        self.client.stop()
        self.client.close()

    def test_paths(self):
        svc1 = ServiceInstance.builder().id("instance1").name("service1").build()
        svc2 = ServiceInstance.builder().id("instance2").name("service1").build()
        svc3 = ServiceInstance.builder().id("foo1").name("foo").build()

        self.assertEquals(self.basePath + "/service1/instance1", self.discovery.pathForInstance(svc1.getName(),svc1.getId()))
        self.assertEquals(self.basePath + "/service1/instance2", self.discovery.pathForInstance(svc2.getName(),svc2.getId()))
        self.assertEquals(self.basePath + "/foo/foo1", self.discovery.pathForInstance(svc3.getName(),svc3.getId()))

    def test_reg_and_dereg(self):
        svc1 = ServiceInstance.builder().id("instance1").name("service1").build()
        svc2 = ServiceInstance.builder().id("instance2").name("service1").build()
        svc3 = ServiceInstance.builder().id("foo1").name("foo").build()

        self.discovery.registerService(svc1)
        self.discovery.registerService(svc2)
        self.discovery.registerService(svc3)

        self.assertTrue(self.client.exists(self.discovery.pathForInstance(svc1.getName(),svc1.getId())))
        self.assertTrue(self.client.exists(self.discovery.pathForInstance(svc2.getName(),svc2.getId())))
        self.assertTrue(self.client.exists(self.discovery.pathForInstance(svc3.getName(),svc3.getId())))

        self.discovery.unregisterService(svc1)
        self.discovery.unregisterService(svc2)
        self.discovery.unregisterService(svc3)

        self.assertFalse(self.client.exists(self.discovery.pathForInstance(svc1.getName(),svc1.getId())))
        self.assertFalse(self.client.exists(self.discovery.pathForInstance(svc2.getName(),svc2.getId())))
        self.assertFalse(self.client.exists(self.discovery.pathForInstance(svc3.getName(),svc3.getId())))

    def test_query(self):
        svc1 = ServiceInstance.builder().id("instance1").name("service1").build()
        svc2 = ServiceInstance.builder().id("instance2").name("service1").build()
        svc3 = ServiceInstance.builder().id("foo1").name("foo").build()

        self.discovery.registerService(svc1)
        self.discovery.registerService(svc2)
        self.discovery.registerService(svc3)

        self.assertEquals(sorted(["foo", "service1"]), sorted(self.discovery.queryForNames()))
        instances = self.discovery.queryForInstances("service1")
        self.assertEquals(2, len(instances))
        self.assertEquals(sorted([svc1, svc2]), sorted(instances))

        # make sure unregister works
        self.discovery.unregisterService(svc2)
        instances = self.discovery.queryForInstances("service1")
        self.assertEquals(1, len(instances))
        self.assertEquals(sorted([svc1]), sorted(instances))

        instance = self.discovery.queryForInstance("service1","instance1")
        self.assertTrue(instance)
        self.assertEquals(svc1, instance)
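ServiceDiscovery and ServiceInstance above belong to the library under test; for orientation, a rough plain-kazoo equivalent of register/unregister under a base path (all names and the payload are made up) could look like this:

import json
from kazoo.client import KazooClient

zk = KazooClient(hosts="127.0.0.1:2181")
zk.start()
BASE = "/discovery_demo"                      # hypothetical base path

def register(name, instance_id, payload):
    """Create an ephemeral instance node under <base>/<service>/<instance>."""
    path = "%s/%s/%s" % (BASE, name, instance_id)
    zk.create(path, json.dumps(payload).encode("utf-8"),
              ephemeral=True, makepath=True)

def unregister(name, instance_id):
    zk.delete("%s/%s/%s" % (BASE, name, instance_id))

register("service1", "instance1", {"address": "10.0.0.1", "port": 8080})
print(zk.get_children(BASE + "/service1"))    # -> ['instance1']
unregister("service1", "instance1")
zk.stop()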
Beispiel #60
0
class Server(threading.Thread):
    '''
    Worker server (it is also a ZooKeeper client)
    '''
    # Lock that serializes printed output. Note: this is a single-machine lock;
    # the distributed lock is what this example implements, so using it here is
    # not putting the cart before the horse.
    print_mutex = threading.Lock()

    DELAY_TIME = 3

    def __init__(self, zk_server_address, lock_base_path, host, serve_mode):
        threading.Thread.__init__(self)
        # Root path of the lock nodes
        self.lock_base_path = lock_base_path
        # Host IP
        self.host = host
        # Serve mode: read ("R") or write ("W")
        self.serve_mode = serve_mode
        # Event, initially unset
        self.event = threading.Event()

        # Create a ZooKeeper client
        self.zkclient = KazooClient(zk_server_address)
        # Add a connection state listener
        self.zkclient.add_listener(self.zk_connect_listener)
        # Open the connection to ZooKeeper
        self.zkclient.start()
        
    
    # Connection state listener
    def zk_connect_listener(self, state):
        # Acquire the print lock
        Server.print_mutex.acquire()
        if state == KazooState.CONNECTED:
            print self.host + " is up and running..."
        elif state == KazooState.LOST:
            print self.host + " has stopped serving..."
        else:
            raise Exception(self.host + " did not start properly...")
        # Release the print lock
        Server.print_mutex.release()
      
        
    # Thread entry point
    def run(self):
        # Create the lock node, e.g. /shared_lock/192.168.0.0-R-0000000001
        self.create_lock_node()
        # Acquire the lock
        self.acquire_lock()
        # Do the work
        self.work()
        # Release the lock
        self.release_lock()
        # Prepare to stop
        self.stop()


    def create_lock_node(self):
        # Check the parent node first; if it does not exist,
        if not self.zkclient.exists(self.lock_base_path):
            # create the parent node
            self.zkclient.create(self.lock_base_path)
        # Assemble the full path of this server's child node
        node_path = self.lock_base_path + "/" + self.host + "-" + self.serve_mode + "-"
        # Create an ephemeral sequential node
        self.node_path = self.zkclient.create(node_path, "", self.zkclient.default_acl, True, True)
    
    
    # Handler for the delete event of the watched node
    def pre_node_delete_watch(self, data, stat, event):
        if event and event.type == EventType.DELETED:
            # Set the event flag
            self.event.set()


    # Acquire the lock
    def acquire_lock(self):
        # Extract our own node name
        node_name = self.node_path.split("/")[-1]
        # Get the sorted list of children of /shared_lock
        sorted_children = self.get_sorted_children()
        # Get this node's index
        node_index = sorted_children.index(node_name)

        # Find the last write node before us
        def get_last_write_node_index():
            # Iterate in reverse
            for i in range(node_index)[::-1]:
                # The serve mode is the second part of the node name
                serve_mode = sorted_children[i].split("-")[1]
                # As soon as a write request is found, return its index
                if serve_mode == "W":
                    return i
            # If all of them are read requests, return -1
            return -1

        # If this is a write request,
        if self.serve_mode == "W":
            # check whether we are the node with the smallest sequence number
            if node_index == 0:
                # If so, return immediately: we hold the lock and can start writing
                return
            # If not, register a watch on the last node smaller than us
            else:
                # Assemble the path of the previous node
                pre_node_path = self.lock_base_path + "/" + sorted_children[node_index - 1]
                # Watch for the delete event of the previous node
                self.zkclient.DataWatch(pre_node_path, self.pre_node_delete_watch)
                # Wait for the lock here
                self.event.wait()
        # If this is a read request
        else:
            # Get the index of the last write node among all nodes smaller than us
            last_write_node_index = get_last_write_node_index()
            # Check whether either of the following holds:
            # 1) there is no child node with a smaller sequence number, or
            # 2) all child nodes smaller than us are read requests.
            # If so,
            if node_index == 0 or last_write_node_index < 0:
                # return immediately: we hold the shared lock and can start reading
                return
            # If not, register a watch on the last write node smaller than us
            else:
                # Assemble the path of that write node
                pre_node_path = self.lock_base_path + "/" + sorted_children[last_write_node_index]
                # Watch for the delete event of that node
                self.zkclient.DataWatch(pre_node_path, self.pre_node_delete_watch)
                # Wait for the lock here
                self.event.wait()
    
    
    def work(self):
        # Acquire the print lock
        Server.print_mutex.acquire()
        # If this is a write request,
        if self.serve_mode == "W":
            # write data for a while, then delete the node and close the session
            print self.host + " is writing data..."
        else:
            # read data for a while, then delete the node and close the session
            print self.host + " is reading data..."
        Server.print_mutex.release()
        # Pause for a few seconds to simulate time spent working
        sleep(self.DELAY_TIME)
    
    
    # Release the lock
    def release_lock(self):
        # Delete our own node
        self.zkclient.delete(self.node_path)


    # Get the sorted list of children of /shared_lock
    def get_sorted_children(self):
        # Get the list of children of /shared_lock
        children = self.zkclient.get_children(self.lock_base_path)
        ###############################################################
        # The comparison for sort() is built from two helper functions,
        # which is a neat trick.
        ###############################################################
        # Return the sequence number of a lock node
        def get_lock_node_seq(node_name):
            # Split the name, take the last element and convert it to an integer
            return string.atoi(node_name.split("-")[-1])
        # Comparator on sequence numbers
        def sequence_compare(node1, node2):
            return get_lock_node_seq(node1) - get_lock_node_seq(node2)
        # Sort the list
        children.sort(cmp = sequence_compare)

        return children


    # Stop working
    def stop(self):
        # Remove the connection state listener
        self.zkclient.remove_listener(self.zk_connect_listener)
        # Close the session
        self.zkclient.stop()
        self.zkclient.close()
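A usage sketch for the shared read/write lock above, written for the same Python 2 environment as the example; the ZooKeeper address, lock path and host labels are placeholders.

ZK_ADDRESS = "127.0.0.1:2181"   # placeholder ZooKeeper address
LOCK_PATH = "/shared_lock"      # placeholder lock root

servers = [
    Server(ZK_ADDRESS, LOCK_PATH, "192.168.0.1", "R"),
    Server(ZK_ADDRESS, LOCK_PATH, "192.168.0.2", "W"),
    Server(ZK_ADDRESS, LOCK_PATH, "192.168.0.3", "R"),
]
for server in servers:
    server.start()   # each thread creates its lock node and queues for the lock
for server in servers:
    server.join()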