def test_lockNode_multi(self):
    node = zk.Node('100')
    self.zk.lockNode(node)
    with testtools.ExpectedException(
        npe.ZKLockException, "Did not get lock on .*"
    ):
        self.zk.lockNode(node, blocking=False)
def registerNodeFromConfig(self, count, provider_name, pool_name,
                           static_node):
    '''
    Register a static node from the config with ZooKeeper.

    A node can be registered multiple times to support max-parallel-jobs.
    These nodes will share a hostname.

    :param int count: Number of times to register this node.
    :param str provider_name: Name of the provider.
    :param str pool_name: Name of the pool owning the node.
    :param dict static_node: The node definition from the config file.
    '''
    host_keys = self.checkHost(static_node)

    for i in range(0, count):
        node = zk.Node()
        node.state = zk.READY
        node.provider = provider_name
        node.pool = pool_name
        node.launcher = "static driver"
        node.type = static_node["labels"]
        node.hostname = static_node["name"]
        node.username = static_node["username"]
        node.interface_ip = static_node["name"]
        node.connection_port = static_node["connection-port"]
        node.connection_type = static_node["connection-type"]
        nodeutils.set_node_ip(node)
        node.host_keys = host_keys
        self.zk.storeNode(node)
        self.log.debug("Registered static node %s", node.hostname)
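# A hedged usage sketch, not part of the driver: the static_node mapping
# keys below are exactly the ones registerNodeFromConfig() reads above;
# the values, and the `provider` object, are hypothetical.
static_node = {
    "name": "static01.example.com",
    "labels": "centos-7",
    "username": "zuul",
    "connection-port": 22,
    "connection-type": "ssh",
}
# count=2 (e.g. derived from the node's max-parallel-jobs setting) stores
# two READY node records in ZooKeeper that share the hostname above.
provider.registerNodeFromConfig(2, "static-provider", "main", static_node)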
def _cleanupLeakedInstances(self):
    '''
    Delete any leaked server instances.

    Remove any servers we find in providers we know about that are not
    recorded in the ZooKeeper data.
    '''
    zk_conn = self._nodepool.getZK()
    for provider in self._nodepool.config.providers.values():
        manager = self._nodepool.getProviderManager(provider.name)

        for server in manager.listNodes():
            meta = server.get('metadata', {})

            if 'nodepool_provider_name' not in meta:
                continue

            if meta['nodepool_provider_name'] != provider.name:
                # Another launcher, sharing this provider but configured
                # with a different name, owns this.
                continue

            if not zk_conn.getNode(meta['nodepool_node_id']):
                self.log.warning(
                    "Deleting leaked instance %s (%s) in %s "
                    "(unknown node id %s)",
                    server.name, server.id, provider.name,
                    meta['nodepool_node_id'])
                # Create an artificial node to use for deleting the server.
                node = zk.Node()
                node.external_id = server.id
                node.provider = provider.name
                self._deleteInstance(node)

        manager.cleanupLeakedResources()
def test_getNode(self):
    n = zk.Node('100')
    n.state = zk.BUILDING
    path = self.zk._nodePath(n.id)
    self.zk.client.create(path, value=n.serialize(), makepath=True)

    o = self.zk.getNode(n.id)
    self.assertIsInstance(o, zk.Node)
    self.assertEqual(n.id, o.id)
def test_lockNode_unlockNode(self):
    node = zk.Node('100')
    self.zk.lockNode(node)
    self.assertIsNotNone(node.lock)
    self.assertIsNotNone(
        self.zk.client.exists(self.zk._nodeLockPath(node.id)))

    self.zk.unlockNode(node)
    self.assertIsNone(node.lock)
def test_mixed_launch(self, mock_launch):
    configfile = self.setup_config('node.yaml')
    self._setup(configfile)
    handler = self._createHandler(1)

    mock_launch.side_effect = [None, Exception()]
    n1 = zk.Node()
    n1.state = zk.BUILDING
    n1.type = 'fake-label'
    n2 = zk.Node()
    n2.state = zk.BUILDING
    n2.type = 'fake-label'
    self._launch(handler, n1)
    self._launch(handler, n2)
    while not handler.pollLauncher():
        time.sleep(0)
    self.assertEqual(len(handler._failed_nodes), 1)
    self.assertEqual(len(handler._ready_nodes), 1)
def test_mixed_launch(self, mock_launch):
    configfile = self.setup_config('node.yaml')
    self._setup(configfile)

    mock_launch.side_effect = [None, Exception()]
    n1 = zk.Node()
    n1.state = zk.BUILDING
    n1.type = 'fake-label'
    n2 = zk.Node()
    n2.state = zk.BUILDING
    n2.type = 'fake-label'
    mgr = OpenStackNodeLaunchManager(self.zk, self.provider_pool,
                                     self.pmanager, 'zuul', 1)
    mgr.launch(n1)
    mgr.launch(n2)
    while not mgr.poll():
        time.sleep(0)
    self.assertEqual(len(mgr.failed_nodes), 1)
    self.assertEqual(len(mgr.ready_nodes), 1)
def _create_node(self):
    node = zk.Node()
    node.state = zk.BUILDING
    node.provider = 'rax'
    self.assertIsNone(node.id)
    self.zk.storeNode(node)
    self.assertIsNotNone(node.id)
    self.assertIsNotNone(self.zk.client.exists(self.zk._nodePath(node.id)))
    return node
def test_custom_connection_port(self):
    n = zk.Node('0001')
    n.state = zk.BUILDING
    d = n.toDict()
    self.assertEqual(d["connection_port"], 22, "Default port not 22")
    n = zk.Node.fromDict(d, '0001')
    self.assertEqual(n.connection_port, 22, "Default port not 22")
    n.connection_port = 22022
    d = n.toDict()
    self.assertEqual(d["connection_port"], 22022,
                     "Custom ssh port not set")
def run_handler(self):
    self._setFromPoolWorker()
    node = zk.Node()
    node.state = zk.READY
    node.external_id = "test-%s" % self.request.id
    node.provider = self.provider.name
    node.launcher = self.launcher_id
    node.allocated_to = self.request.id
    node.type = self.request.node_types[0]
    self.nodeset.append(node)
    self.zk.storeNode(node)
def launch(self):
    attempts = 1
    while attempts <= self._retries:
        try:
            self._launchNode()
            break
        except openstack.exceptions.TaskManagerStopped:
            # If we lost our TaskManager session, we won't be able to
            # launch an instance, so there's no need to continue.
            raise
        except kze.SessionExpiredError:
            # If we lost our ZooKeeper session, we've lost our node lock
            # so there's no need to continue.
            raise
        except Exception as e:
            if attempts <= self._retries:
                self.log.exception(
                    "Request %s: Launch attempt %d/%d failed for node %s:",
                    self.handler.request.id, attempts, self._retries,
                    self.node.id)

            # If we created an instance, delete it.
            if self.node.external_id:
                deleting_node = zk.Node()
                deleting_node.provider = self.node.provider
                deleting_node.pool = self.node.pool
                deleting_node.type = self.node.type
                deleting_node.external_id = self.node.external_id
                deleting_node.state = zk.DELETING
                self.zk.storeNode(deleting_node)
                self.log.info(
                    "Request %s: Node %s scheduled for cleanup",
                    self.handler.request.id, deleting_node.external_id)
                self.node.external_id = None
                self.node.public_ipv4 = None
                self.node.public_ipv6 = None
                self.node.interface_ip = None
                self.zk.storeNode(self.node)

            if attempts == self._retries:
                raise
            if 'quota exceeded' in str(e).lower():
                # A quota exception is not directly recoverable so bail
                # out immediately with a specific exception.
                self.log.info("Quota exceeded, invalidating quota cache")
                self.handler.manager.invalidateQuotaCache()
                raise exceptions.QuotaException("Quota exceeded")
            attempts += 1

    self.node.state = zk.READY
    self.zk.storeNode(self.node)
    self.log.info("Node id %s is ready", self.node.id)
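# The launch loop above is an instance of a general pattern: retry on
# transient errors, clean up partial state between attempts, and re-raise
# fatal errors (lost sessions, exhausted quota) immediately. A minimal
# framework-free sketch of that pattern; all names here are hypothetical.
class FatalError(Exception):
    '''Errors for which retrying cannot help.'''


def launch_with_retries(launch, cleanup, retries):
    attempts = 1
    while True:
        try:
            return launch()
        except FatalError:
            raise  # e.g. a lost session: no retry can succeed
        except Exception:
            cleanup()  # drop any half-created instance before retrying
            if attempts == retries:
                raise  # out of attempts: surface the last error
            attempts += 1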
def cleanupLeakedResources(self):
    '''
    Delete any leaked server instances.

    Remove any servers found in this provider that are not recorded in
    the ZooKeeper data.
    '''
    deleting_nodes = {}

    for node in self._zk.nodeIterator():
        if node.state == zk.DELETING:
            if node.provider != self.provider.name:
                continue
            if node.provider not in deleting_nodes:
                deleting_nodes[node.provider] = []
            deleting_nodes[node.provider].append(node.external_id)

    for server in self.listNodes():
        meta = server.get('metadata', {})

        if 'nodepool_provider_name' not in meta:
            continue

        if meta['nodepool_provider_name'] != self.provider.name:
            # Another launcher, sharing this provider but configured
            # with a different name, owns this.
            continue

        if (self.provider.name in deleting_nodes and
                server.id in deleting_nodes[self.provider.name]):
            # Already deleting this node
            continue

        if not self._zk.getNode(meta['nodepool_node_id']):
            self.log.warning(
                "Marking for delete leaked instance %s (%s) in %s "
                "(unknown node id %s)",
                server.name, server.id, self.provider.name,
                meta['nodepool_node_id'])
            # Create an artificial node to use for deleting the server.
            node = zk.Node()
            node.external_id = server.id
            node.provider = self.provider.name
            node.state = zk.DELETING
            self._zk.storeNode(node)

    if self.provider.clean_floating_ips:
        self._client.delete_unattached_floating_ips()
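# The per-server decision in the cleanup methods reduces to a small
# predicate: a server is leaked iff it carries this provider's metadata,
# its node id is gone from ZooKeeper, and it is not already being
# deleted. A standalone sketch of that predicate; names are hypothetical.
def is_leaked(server_id, meta, provider_name, zk_has_node, deleting_ids):
    if meta.get('nodepool_provider_name') != provider_name:
        return False  # unmanaged, or owned by a differently named launcher
    if server_id in deleting_ids:
        return False  # a DELETING record already exists for it
    return not zk_has_node(meta.get('nodepool_node_id'))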
def _cleanupLeakedInstances(self):
    '''
    Delete any leaked server instances.

    Remove any servers we find in providers we know about that are not
    recorded in the ZooKeeper data.
    '''
    zk_conn = self._nodepool.getZK()

    deleting_nodes = {}
    for node in zk_conn.nodeIterator():
        if node.state == zk.DELETING:
            if node.provider not in deleting_nodes:
                deleting_nodes[node.provider] = []
            deleting_nodes[node.provider].append(node.external_id)

    for provider in self._nodepool.config.providers.values():
        manager = self._nodepool.getProviderManager(provider.name)

        for server in manager.listNodes():
            meta = server.get('metadata', {})

            if 'nodepool_provider_name' not in meta:
                continue

            if meta['nodepool_provider_name'] != provider.name:
                # Another launcher, sharing this provider but configured
                # with a different name, owns this.
                continue

            if (provider.name in deleting_nodes and
                    server.id in deleting_nodes[provider.name]):
                # Already deleting this node
                continue

            if not zk_conn.getNode(meta['nodepool_node_id']):
                self.log.warning(
                    "Marking for delete leaked instance %s (%s) in %s "
                    "(unknown node id %s)",
                    server.name, server.id, provider.name,
                    meta['nodepool_node_id'])
                # Create an artificial node to use for deleting the server.
                node = zk.Node()
                node.external_id = server.id
                node.provider = provider.name
                node.state = zk.DELETING
                zk_conn.storeNode(node)

        manager.cleanupLeakedResources()
def test_successful_launch(self):
    configfile = self.setup_config('node.yaml')
    self._setup(configfile)
    handler = self._createHandler(1)

    n1 = zk.Node()
    n1.state = zk.BUILDING
    n1.type = 'fake-label'
    self._launch(handler, n1)
    while not handler.pollLauncher():
        time.sleep(0)
    self.assertEqual(len(handler.ready_nodes), 1)
    self.assertEqual(len(handler.failed_nodes), 0)
    nodes = handler.manager.listNodes()
    self.assertEqual(nodes[0]['metadata']['groups'],
                     'fake-provider,fake-image,fake-label')
def test_successful_launch(self):
    configfile = self.setup_config('node.yaml')
    self._setup(configfile)

    n1 = zk.Node()
    n1.state = zk.BUILDING
    n1.type = 'fake-label'
    mgr = OpenStackNodeLaunchManager(self.zk, self.provider_pool,
                                     self.pmanager, 'zuul', 1)
    mgr.launch(n1)
    while not mgr.poll():
        time.sleep(0)
    self.assertEqual(len(mgr.ready_nodes), 1)
    self.assertEqual(len(mgr.failed_nodes), 0)
    nodes = mgr._provider_manager.listNodes()
    self.assertEqual(nodes[0]['metadata']['groups'],
                     'fake-provider,fake-image,fake-label')
def test_Node_toDict(self):
    o = zk.Node('123')
    o.state = zk.INIT
    o.provider = 'rax'
    o.type = 'trusty'
    o.allocated_to = '456-789'
    o.az = 'RegionOne'
    o.region = 'fake-region'
    o.public_ipv4 = '<ipv4>'
    o.private_ipv4 = '<pvt-ipv4>'
    o.public_ipv6 = '<ipv6>'
    o.host_id = 'fake-host-id'
    o.image_id = 'image-id'
    o.launcher = 'launcher-id'
    o.external_id = 'ABCD'
    o.hostname = 'xyz'
    o.comment = 'comment'
    o.hold_job = 'hold job'
    o.host_keys = ['key1', 'key2']
    o.attributes = {'executor-zone': 'vpn'}

    d = o.toDict()
    self.assertNotIn('id', d)
    self.assertEqual(d['state'], o.state)
    self.assertIn('state_time', d)
    self.assertIn('created_time', d)
    self.assertEqual(d['provider'], o.provider)
    self.assertEqual(d['type'], o.type)
    self.assertEqual(d['allocated_to'], o.allocated_to)
    self.assertEqual(d['az'], o.az)
    self.assertEqual(d['region'], o.region)
    self.assertEqual(d['public_ipv4'], o.public_ipv4)
    self.assertEqual(d['private_ipv4'], o.private_ipv4)
    self.assertEqual(d['public_ipv6'], o.public_ipv6)
    self.assertEqual(d['host_id'], o.host_id)
    self.assertEqual(d['image_id'], o.image_id)
    self.assertEqual(d['launcher'], o.launcher)
    self.assertEqual(d['external_id'], o.external_id)
    self.assertEqual(d['hostname'], o.hostname)
    self.assertEqual(d['comment'], o.comment)
    self.assertEqual(d['hold_job'], o.hold_job)
    self.assertEqual(d['host_keys'], o.host_keys)
    self.assertEqual(d['attributes'], o.attributes)
def _create_pending_request(self):
    req = zk.NodeRequest()
    req.state = zk.PENDING
    req.requestor = 'test_nodepool'
    req.node_types.append('fake-label')
    self.zk.storeNodeRequest(req)

    # Create a node that is allocated to the request, but not yet assigned
    # within the NodeRequest object
    node = zk.Node()
    node.state = zk.READY
    node.type = 'fake-label'
    node.public_ipv4 = 'fake'
    node.provider = 'fake-provider'
    node.pool = 'main'
    node.allocated_to = req.id
    self.zk.storeNode(node)

    return (req, node)
def test_node_deallocation(self):
    """Test that an allocated node with a missing request is deallocated"""
    node = zk.Node()
    node.state = zk.READY
    node.type = 'fake-label'
    node.public_ipv4 = 'fake'
    node.provider = 'fake-provider'
    node.allocated_to = "MISSING"
    self.zk.storeNode(node)

    configfile = self.setup_config('node_lost_requests.yaml')
    pool = self.useNodepool(configfile, watermark_sleep=1)
    self.useBuilder(configfile)
    pool.start()

    while True:
        node = self.zk.getNode(node.id)
        if not node.allocated_to:
            break
def registerNodeFromConfig(self, count, provider_name, pool_name,
                           static_node):
    '''
    Register a static node from the config with ZooKeeper.

    A node can be registered multiple times to support max-parallel-jobs.
    These nodes will share a hostname.

    In case there are 'building' nodes waiting for a label, those nodes
    will be updated and marked 'ready'.

    :param int count: Number of times to register this node.
    :param str provider_name: Name of the provider.
    :param str pool_name: Name of the pool owning the node.
    :param dict static_node: The node definition from the config file.
    '''
    host_keys = self.checkHost(static_node)
    waiting_nodes = self.getWaitingNodesOfType(static_node["labels"])

    for i in range(0, count):
        try:
            node = waiting_nodes.pop()
        except IndexError:
            node = zk.Node()
        node.state = zk.READY
        node.provider = provider_name
        node.pool = pool_name
        node.launcher = "static driver"
        node.type = static_node["labels"]
        node.hostname = static_node["name"]
        node.username = static_node["username"]
        node.interface_ip = static_node["name"]
        node.connection_port = static_node["connection-port"]
        node.connection_type = static_node["connection-type"]
        node.python_path = static_node["python-path"]
        nodeutils.set_node_ip(node)
        node.host_keys = host_keys
        self.zk.storeNode(node)
        self.log.debug("Registered static node %s", node.hostname)
def _waitForNodeSet(self):
    '''
    Fill node set for the request.
    '''
    needed_types = self.request.node_types
    static_nodes = []
    unavailable_nodes = []
    ready_nodes = self.zk.getReadyNodesOfTypes(needed_types)

    for ntype in needed_types:
        # First try to grab from the list of already available nodes.
        got_a_node = False
        if self.request.reuse and ntype in ready_nodes:
            for node in ready_nodes[ntype]:
                # Only interested in nodes from this provider and pool
                if node.provider != self.provider.name:
                    continue
                if node.pool != self.pool.name:
                    continue

                try:
                    self.zk.lockNode(node, blocking=False)
                except exceptions.ZKLockException:
                    # It's already locked so skip it.
                    continue
                else:
                    if self.paused:
                        self.log.debug("Unpaused request %s", self.request)
                        self.paused = False
                    self.log.debug(
                        "Locked existing node %s for request %s",
                        node.id, self.request.id)
                    got_a_node = True
                    node.allocated_to = self.request.id
                    self.zk.storeNode(node)
                    self.nodeset.append(node)
                    break

        # Could not grab an existing node, so assign a new one.
        if not got_a_node:
            # Initialize so the capacity check below is safe even when no
            # static node carries this label.
            max_concurrency = False
            for node in self.available_nodes:
                if ntype in node["labels"]:
                    max_concurrency = not self.checkConcurrency(node)
                    if max_concurrency:
                        continue
                    static_nodes.append((ntype, node))
                    break
            if max_concurrency:
                unavailable_nodes.append(ntype)

    if unavailable_nodes:
        self.log.debug("%s: static nodes %s are at capacity" %
                       (self.request.id, unavailable_nodes))
        self.zk.storeNodeRequest(self.request)
        self.zk.unlockNodeRequest(self.request)
        self.done = True
        return

    for node_type, static_node in static_nodes:
        self.log.debug("%s: Assigning static_node %s" %
                       (self.request.id, static_node))
        node = zk.Node()
        node.state = zk.READY
        node.external_id = "static-%s" % self.request.id
        node.hostname = static_node["name"]
        node.username = static_node["username"]
        node.interface_ip = static_node["name"]
        node.connection_port = static_node["ssh-port"]
        node.connection_type = "ssh"
        nodeutils.set_node_ip(node)
        node.host_keys = self.manager.nodes_keys[static_node["name"]]
        node.provider = self.provider.name
        node.pool = self.pool.name
        node.launcher = self.launcher_id
        node.allocated_to = self.request.id
        node.type = node_type
        self.nodeset.append(node)
        self.zk.storeNode(node)
def test_unlockNode_not_locked(self):
    node = zk.Node('100')
    with testtools.ExpectedException(npe.ZKLockException):
        self.zk.unlockNode(node)
def _waitForNodeSet(self):
    '''
    Fill node set for the request.

    Obtain nodes for the request, pausing all new request handling for
    this provider until the node set can be filled.

    .. note:: This code is a bit racy in its calculation of the number
        of nodes in use for quota purposes. It is possible for multiple
        launchers to be doing this calculation at the same time. Since
        we currently have no locking mechanism around the "in use"
        calculation, if we are at the edge of the quota, one of the
        launchers could attempt to launch a new node after the other
        launcher has already started doing so. This would cause an
        expected failure from the underlying library, which is ok for
        now.
    '''
    # Since this code can be called more than once for the same request,
    # we need to calculate the difference between our current node set
    # and what was requested. We cannot use set operations here since a
    # node type can appear more than once in the requested types.
    saved_types = collections.Counter(self._satisfied_types.labels())
    requested_types = collections.Counter(self.request.node_types)
    diff = requested_types - saved_types
    needed_types = list(diff.elements())

    ready_nodes = self.zk.getReadyNodesOfTypes(needed_types)

    for ntype in needed_types:
        # First try to grab from the list of already available nodes.
        got_a_node = False
        if self.request.reuse and ntype in ready_nodes:
            for node in ready_nodes[ntype]:
                # Only interested in nodes from this provider and pool
                if node.provider != self.provider.name:
                    continue
                if node.pool != self.pool.name:
                    continue
                # Check this driver's reuse requirements
                if not self.checkReusableNode(node):
                    continue
                try:
                    self.zk.lockNode(node, blocking=False)
                except exceptions.ZKLockException:
                    # It's already locked so skip it.
                    continue
                else:
                    if self.paused:
                        self.log.debug("Unpaused request %s", self.request)
                        self.paused = False
                    self.log.debug(
                        "Locked existing node %s for request %s",
                        node.id, self.request.id)
                    got_a_node = True
                    node.allocated_to = self.request.id
                    self.zk.storeNode(node)
                    self.nodeset.append(node)
                    self._satisfied_types.add(ntype, node.id)
                    # Notify driver handler about node re-use
                    self.nodeReusedNotification(node)
                    break

        # Could not grab an existing node, so launch a new one.
        if not got_a_node:
            # If we calculate that we're at capacity, pause until nodes
            # are released by Zuul and removed by the DeletedNodeWorker.
            if not self.hasRemainingQuota(ntype):
                self.log.info(
                    "Not enough quota remaining to satisfy request %s",
                    self.request.id)
                if not self.paused:
                    self.log.debug(
                        "Pausing request handling to satisfy request %s",
                        self.request.id)
                self.paused = True
                self.zk.deleteOldestUnusedNode(self.provider.name,
                                               self.pool.name)
                return

            if self.paused:
                self.log.debug("Unpaused request %s", self.request)
                self.paused = False

            node = zk.Node()
            node.state = zk.INIT
            node.type = ntype
            node.provider = self.provider.name
            node.pool = self.pool.name
            node.launcher = self.launcher_id
            node.allocated_to = self.request.id

            self.setNodeMetadata(node)

            # Note: It should be safe (i.e., no race) to lock the node
            # *after* it is stored since nodes in INIT state are not
            # locked anywhere.
            self.zk.storeNode(node)
            self.zk.lockNode(node, blocking=False)
            self.log.debug("Locked building node %s for request %s",
                           node.id, self.request.id)

            # Set state AFTER lock so that it isn't accidentally cleaned
            # up (unlocked BUILDING nodes will be deleted).
            node.state = zk.BUILDING
            self.zk.storeNode(node)

            self.nodeset.append(node)
            self._satisfied_types.add(ntype, node.id)
            self.launch(node)
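# Why collections.Counter and not sets: a request may ask for the same
# label twice, and set difference would collapse the duplicates. A small
# self-contained demonstration:
import collections

requested = collections.Counter(['small', 'small', 'large'])
saved = collections.Counter(['small', 'large'])

print(list((requested - saved).elements()))  # ['small']
print({'small', 'small', 'large'} - {'small', 'large'})  # set(): the
# second 'small' is lost, so the handler would never launch it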
def _waitForNodeSet(self):
    '''
    Fill node set for the request.

    Obtain nodes for the request, pausing all new request handling for
    this provider until the node set can be filled.

    We attempt to group the node set within the same provider
    availability zone. For this to work properly, the provider entry in
    the nodepool config must list the availability zones. Otherwise,
    new nodes will be put in random AZs at nova's whim. The exception
    being if there is an existing node in the READY state that we can
    select for this node set. Its AZ will then be used for new nodes,
    as well as any other READY nodes.

    .. note:: This code is a bit racy in its calculation of the number
        of nodes in use for quota purposes. It is possible for multiple
        launchers to be doing this calculation at the same time. Since
        we currently have no locking mechanism around the "in use"
        calculation, if we are at the edge of the quota, one of the
        launchers could attempt to launch a new node after the other
        launcher has already started doing so. This would cause an
        expected failure from the underlying library, which is ok for
        now.
    '''
    if not self.launch_manager:
        self.launch_manager = OpenStackNodeLaunchManager(
            self.zk, self.pool, self.manager,
            self.request.requestor, retries=self.provider.launch_retries)

    # Since this code can be called more than once for the same request,
    # we need to calculate the difference between our current node set
    # and what was requested. We cannot use set operations here since a
    # node type can appear more than once in the requested types.
    saved_types = collections.Counter([n.type for n in self.nodeset])
    requested_types = collections.Counter(self.request.node_types)
    diff = requested_types - saved_types
    needed_types = list(diff.elements())

    ready_nodes = self.zk.getReadyNodesOfTypes(needed_types)

    for ntype in needed_types:
        # First try to grab from the list of already available nodes.
        got_a_node = False
        if self.request.reuse and ntype in ready_nodes:
            for node in ready_nodes[ntype]:
                # Only interested in nodes from this provider and
                # pool, and within the selected AZ.
                if node.provider != self.provider.name:
                    continue
                if node.pool != self.pool.name:
                    continue
                if self.chosen_az and node.az != self.chosen_az:
                    continue

                try:
                    self.zk.lockNode(node, blocking=False)
                except exceptions.ZKLockException:
                    # It's already locked so skip it.
                    continue
                else:
                    if self.paused:
                        self.log.debug("Unpaused request %s", self.request)
                        self.paused = False
                    self.log.debug(
                        "Locked existing node %s for request %s",
                        node.id, self.request.id)
                    got_a_node = True
                    node.allocated_to = self.request.id
                    self.zk.storeNode(node)
                    self.nodeset.append(node)
                    # If we haven't already chosen an AZ, select the
                    # AZ from this ready node. This will cause new nodes
                    # to share this AZ, as well.
                    if not self.chosen_az and node.az:
                        self.chosen_az = node.az
                    break

        # Could not grab an existing node, so launch a new one.
        if not got_a_node:
            # Select grouping AZ if we didn't set AZ from a selected,
            # pre-existing node
            if not self.chosen_az:
                self.chosen_az = random.choice(
                    self.pool.azs or self.manager.getAZs())

            # If we calculate that we're at capacity, pause until nodes
            # are released by Zuul and removed by the DeletedNodeWorker.
            if not self._hasRemainingQuota(ntype):
                if not self.paused:
                    self.log.debug(
                        "Pausing request handling to satisfy request %s",
                        self.request)
                self.paused = True
                self.zk.deleteOldestUnusedNode(self.provider.name,
                                               self.pool.name)
                return

            if self.paused:
                self.log.debug("Unpaused request %s", self.request)
                self.paused = False

            node = zk.Node()
            node.state = zk.INIT
            node.type = ntype
            node.provider = self.provider.name
            node.pool = self.pool.name
            node.az = self.chosen_az
            node.cloud = self.provider.cloud_config.name
            node.region = self.provider.region_name
            node.launcher = self.launcher_id
            node.allocated_to = self.request.id

            # Note: It should be safe (i.e., no race) to lock the node
            # *after* it is stored since nodes in INIT state are not
            # locked anywhere.
            self.zk.storeNode(node)
            self.zk.lockNode(node, blocking=False)
            self.log.debug("Locked building node %s for request %s",
                           node.id, self.request.id)

            # Set state AFTER lock so that it isn't accidentally cleaned
            # up (unlocked BUILDING nodes will be deleted).
            node.state = zk.BUILDING
            self.zk.storeNode(node)

            self.nodeset.append(node)
            self.launch_manager.launch(node)
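# The AZ grouping above follows one rule: the first node in the set
# (whether reused or newly placed) fixes chosen_az, and every later node
# filters or launches against it. A toy sketch of the fallback chain used
# when no AZ has been fixed yet; the names here are hypothetical.
import random


def pick_az(chosen_az, pool_azs, cloud_azs):
    # Stick with an already-chosen AZ; otherwise prefer the AZs listed
    # in the pool config, falling back to whatever the cloud reports.
    return chosen_az or random.choice(pool_azs or cloud_azs)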