def test_node_exclusive_bug(self):
    """test_node_exclusive_bug

    If two processes with the same node exclusive attribute were scheduled
    in the same matchmaking cycle, they could be scheduled to the same
    resource, due to a caching issue. This test verifies the fix.
    """
    self.mm.initialize()

    n1 = NodeRecord.new("n1", "d1")
    self.store.add_node(n1)

    props = {"engine": "engine1"}
    r1 = ResourceRecord.new("r1", "n1", 2, properties=props)
    self.store.add_resource(r1)

    n2 = NodeRecord.new("n2", "d1")
    self.store.add_node(n2)

    props = {"engine": "engine1"}
    r2 = ResourceRecord.new("r2", "n2", 2, properties=props)
    self.store.add_resource(r2)

    xattr_1 = "port5000"
    constraints = {}
    p1 = ProcessRecord.new(None, "p1", get_process_definition(),
        ProcessState.REQUESTED, constraints=constraints,
        node_exclusive=xattr_1)
    p1key = p1.get_key()
    self.store.add_process(p1)
    self.store.enqueue_process(*p1key)

    p2 = ProcessRecord.new(None, "p2", get_process_definition(),
        ProcessState.REQUESTED, constraints=constraints,
        node_exclusive=xattr_1)
    p2key = p2.get_key()
    self.store.add_process(p2)
    self.store.enqueue_process(*p2key)

    # sneak into MM and force it to update this info from the store
    self.mm._get_queued_processes()
    self.mm._get_resource_set()

    self.mm.matchmake()

    # Ensure these processes are pending and scheduled to different nodes
    p1 = self.store.get_process(None, "p1")
    p2 = self.store.get_process(None, "p2")
    self.assertNotEqual(p1.assigned, p2.assigned)
def test_node_filo(self):
    """test_node_filo

    We prioritize shutting down the newest VMs as a workaround for the
    OOI testing strategy.
    """
    self.mm.initialize()

    n1 = NodeRecord.new("n1", "d1")
    self.store.add_node(n1)

    props = {"engine": "engine4"}
    r1 = ResourceRecord.new("r1", "n1", 2, properties=props)
    self.store.add_resource(r1)

    n2 = NodeRecord.new("n2", "d1")
    self.store.add_node(n2)

    props = {"engine": "engine4"}
    r2 = ResourceRecord.new("r2", "n2", 2, properties=props)
    self.store.add_resource(r2)

    constraints = {"engine": "engine4"}
    p1 = ProcessRecord.new(None, "p1", get_process_definition(),
        ProcessState.REQUESTED, constraints=constraints)
    p1key = p1.get_key()
    self.store.add_process(p1)
    self.store.enqueue_process(*p1key)

    # sneak into MM and force it to update this info from the store
    self.mm._get_queued_processes()
    self.mm._get_resource_set()

    self.mm.register_needs()
    self.epum_client.clear()

    self.mm.queued_processes = []

    self.mm.register_needs()
    conf = self.epum_client.reconfigures['pd_domain_engine4'][0]
    retired_nodes = conf['engine_conf']['retirable_nodes']
    assert len(retired_nodes) == 1

    # This should be the second node we started
    assert retired_nodes[0] == "n2"
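# Illustrative sketch only (not from the original source): the approximate
# shape of the reconfigure payload asserted above, reconstructed from the
# test's own lookups (reconfigures['pd_domain_engine4'][0]['engine_conf']
# ['retirable_nodes']). Any keys beyond those shown are not implied.
example_reconfigure = {
    'engine_conf': {
        # after the queue drains, the matchmaker marks the newest node
        # as retirable first (FILO)
        'retirable_nodes': ["n2"],
    },
}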
def node_state(self, node_id, domain_id, state, properties=None):
    """Handle updates about available domain nodes.

    @param node_id: unique instance identifier
    @param domain_id: domain of instance
    @param state: EPU state of instance
    @param properties: Optional properties about this instance

    This operation is the recipient of a "subscription" the PD makes to
    domain state updates. Calls to this operation are NOT RPC-style.

    This information is used for two purposes:

        1. To correlate EE agent heartbeats with a node and various deploy
           information (site, allocation, security groups, etc).

        2. To detect EEs which have been killed due to underlying death of
           a resource (VM).
    """
    if state == InstanceState.RUNNING:
        node = self.store.get_node(node_id)
        if node is None:
            node = NodeRecord.new(node_id, domain_id, properties)

            try:
                self.store.add_node(node)
            except WriteConflictError:
                # if the node record was written by someone else,
                # no big deal.
                return

            log.info("Domain %s node %s is %s", domain_id, node_id, state)

    elif state in (InstanceState.TERMINATING, InstanceState.TERMINATED):
        # reschedule processes running on node
        node = self.store.get_node(node_id)
        if node is None:
            log.warn("got state for unknown node %s in state %s",
                node_id, state)
            return
        self.evacuate_node(node)
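# Illustrative sketch only (not part of the original module): the call
# pattern a domain-state subscription is expected to use when driving
# node_state() above through a node's lifecycle. The names "core",
# "node-1", and "domain-1" are hypothetical; only the node_state()
# signature and the InstanceState values come from the code above.
def _example_node_lifecycle(core):
    # Node comes up: a NodeRecord is created for it (or the update is
    # quietly dropped on a WriteConflictError, as handled above).
    core.node_state("node-1", "domain-1", InstanceState.RUNNING,
        properties={"engine": "engine1"})

    # Node goes away: any processes assigned to it are rescheduled via
    # evacuate_node().
    core.node_state("node-1", "domain-1", InstanceState.TERMINATED)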
def test_node_exclusive(self):
    self._run_in_thread()

    n1 = NodeRecord.new("n1", "d1")
    self.store.add_node(n1)

    props = {"engine": "engine1"}
    n1_r1 = ResourceRecord.new("n1_r1", "n1", 2, properties=props)
    self.store.add_resource(n1_r1)

    n1_r2 = ResourceRecord.new("n1_r2", "n1", 2, properties=props)
    self.store.add_resource(n1_r2)

    xattr_1 = "port5000"
    constraints = {}
    p1 = ProcessRecord.new(None, "p1", get_process_definition(),
        ProcessState.REQUESTED, constraints=constraints,
        node_exclusive=xattr_1)
    p1key = p1.get_key()
    self.store.add_process(p1)
    self.store.enqueue_process(*p1key)

    # The first process should be assigned, since nothing else needs this
    # attr
    # TODO: it's possible that this could be assigned to n1_r2, but hopefully not
    self.wait_resource(n1_r1.resource_id, lambda r: list(p1key) in r.assigned)
    time.sleep(0.05)
    self.resource_client.check_process_launched(p1, n1_r1.resource_id)
    self.wait_process(p1.owner, p1.upid,
        lambda p: p.assigned == n1_r1.resource_id and
                  p.state == ProcessState.PENDING)

    p2 = ProcessRecord.new(None, "p2", get_process_definition(),
        ProcessState.REQUESTED, constraints=constraints,
        node_exclusive=xattr_1)
    p2key = p2.get_key()
    self.store.add_process(p2)
    self.store.enqueue_process(*p2key)

    # The second process should wait, since first process wants this attr
    # as well
    self.wait_process(p2.owner, p2.upid,
        lambda p: p.state == ProcessState.WAITING)

    # If we start another node, we should see that second process be
    # scheduled
    n2 = NodeRecord.new("n2", "d1")
    self.store.add_node(n2)

    props = {"engine": "engine1"}
    n2_r1 = ResourceRecord.new("n2_r1", "n2", 2, properties=props)
    self.store.add_resource(n2_r1)

    props = {"engine": "engine1"}
    n2_r2 = ResourceRecord.new("n2_r2", "n2", 2, properties=props)
    self.store.add_resource(n2_r2)

    # The second process should now be assigned
    self.wait_resource(n2_r1.resource_id, lambda r: list(p2key) in r.assigned)
    time.sleep(0.05)
    self.resource_client.check_process_launched(p2, n2_r1.resource_id)
    self.wait_process(p2.owner, p2.upid,
        lambda p: p.assigned == n2_r1.resource_id and
                  p.state == ProcessState.PENDING)

    # Now we submit another process with a different exclusive attribute
    # It should be assigned right away
    xattr_2 = "port5001"
    constraints = {}
    p3 = ProcessRecord.new(None, "p3", get_process_definition(),
        ProcessState.REQUESTED, constraints=constraints,
        node_exclusive=xattr_2)
    p3key = p3.get_key()
    self.store.add_process(p3)
    self.store.enqueue_process(*p3key)

    p3_resource = None
    for resource in [n1_r1, n1_r2, n2_r1, n2_r2]:
        try:
            self.wait_resource(resource.resource_id,
                lambda r: list(p3key) in r.assigned, timeout=0.5)
        except Exception:
            continue
        time.sleep(0.05)
        self.resource_client.check_process_launched(p3, resource.resource_id)
        self.wait_process(p3.owner, p3.upid,
            lambda p: p.assigned == resource.resource_id and
                      p.state == ProcessState.PENDING)
        p3_resource = resource

    self.assertIsNotNone(p3_resource)

    # Now submit a fourth process, which should be scheduled to a different
    # node from p3
    p4 = ProcessRecord.new(None, "p4", get_process_definition(),
        ProcessState.REQUESTED, constraints=constraints,
        node_exclusive=xattr_2)
    p4key = p4.get_key()
    self.store.add_process(p4)
    self.store.enqueue_process(*p4key)

    p4_resource = None
    for resource in [n1_r1, n1_r2, n2_r1, n2_r2]:
        try:
            self.wait_resource(resource.resource_id,
                lambda r: list(p4key) in r.assigned, timeout=0.5)
        except Exception:
            continue
        time.sleep(0.05)
        self.resource_client.check_process_launched(p4, resource.resource_id)
        self.wait_process(p4.owner, p4.upid,
            lambda p: p.assigned == resource.resource_id and
                      p.state == ProcessState.PENDING)
        p4_resource = resource

    self.assertIsNotNone(p4_resource)

    self.assertNotEqual(p3_resource.node_id, p4_resource.node_id)