def test_query_ctx_nodes_pending_but_actually_running(self):
    """
    When doing large runs, a few EC2 instances get their status changed to
    "running" a long time after having requested them (up to 15 minutes,
    compared to about 30 seconds normally). It appears that these instances
    have been booted successfully for a while, because they are reachable
    through SSH and the context broker has OK'ed them.
    Test that we detect these "pending but actually running" instances early.
    """
    launch_id = _new_id()
    # Three nodes still PENDING at the IaaS level, plus one STARTED node.
    node_records = [make_node(launch_id, states.PENDING) for i in range(3)]
    node_records.append(make_node(launch_id, states.STARTED))
    launch_record = make_launch(launch_id, states.PENDING, node_records)
    self.store.add_launch(launch_record)
    for node in node_records:
        self.store.add_node(node)

    # The context broker reports every node OK — even those the IaaS still
    # calls PENDING — and the context is complete.
    self.ctx.nodes = [_one_fake_ctx_node_ok(node['public_ip'], _new_id(),
                                            _new_id()) for node in node_records]
    self.ctx.complete = True
    self.core.query_contexts()

    # Broker confirmation should promote the launch and all nodes to RUNNING.
    launch = self.store.get_launch(launch_id)
    self.assertEqual(launch['state'], states.RUNNING)
    for node_id in launch['node_ids']:
        node = self.store.get_node(node_id)
        self.assertEqual(states.RUNNING, node['state'])
def test_query_ctx_error(self):
    """A context that completes with an error yields mixed node outcomes.

    Nodes that checked in OK become RUNNING; the node that reported an
    error becomes RUNNING_FAILED.
    """
    node_count = 3
    launch_id = _new_id()
    node_records = [
        make_node(launch_id, states.STARTED) for i in range(node_count)
    ]
    launch_record = make_launch(launch_id, states.PENDING, node_records)
    yield self.store.put_launch(launch_record)
    yield self.store.put_nodes(node_records)

    self.ctx.expected_count = len(node_records)
    self.ctx.complete = False
    self.ctx.error = False

    # all but 1 node have reported ok
    self.ctx.nodes = [
        _one_fake_ctx_node_ok(node_records[i]['public_ip'], _new_id(),
                              _new_id()) for i in range(node_count - 1)
    ]
    self.ctx.nodes.append(
        _one_fake_ctx_node_error(node_records[-1]['public_ip'], _new_id(),
                                 _new_id()))
    ok_ids = [node_records[i]['node_id'] for i in range(node_count - 1)]
    error_ids = [node_records[-1]['node_id']]

    # context is now complete, but with an error
    self.ctx.complete = True
    self.ctx.error = True
    yield self.core.query_contexts()

    self.assertTrue(self.notifier.assure_state(states.RUNNING, ok_ids))
    self.assertTrue(
        self.notifier.assure_state(states.RUNNING_FAILED, error_ids))
def test_query_ctx(self):
    """Nodes are promoted to RUNNING as the context broker reports them OK."""
    node_count = 3
    launch_id = _new_id()
    node_records = [make_node(launch_id, states.STARTED) for i in range(node_count)]
    launch_record = make_launch(launch_id, states.PENDING, node_records)
    yield self.store.put_launch(launch_record)
    yield self.store.put_nodes(node_records)

    self.ctx.expected_count = len(node_records)
    self.ctx.complete = False
    self.ctx.error = False

    #first query with no ctx nodes. zero records should be updated
    yield self.core.query_contexts()
    self.assertTrue(self.notifier.assure_record_count(0))

    # all but 1 node have reported ok
    self.ctx.nodes = [_one_fake_ctx_node_ok(node_records[i]['public_ip'],
                                            _new_id(), _new_id())
                      for i in range(node_count-1)]
    yield self.core.query_contexts()
    self.assertTrue(self.notifier.assure_state(states.RUNNING))
    self.assertEqual(len(self.notifier.nodes), node_count-1)

    #last node reports ok
    self.ctx.nodes.append(_one_fake_ctx_node_ok(node_records[-1]['public_ip'],
                                                _new_id(), _new_id()))
    self.ctx.complete = True
    yield self.core.query_contexts()
    self.assertTrue(self.notifier.assure_state(states.RUNNING))
    # only the final node produced a new record in this round
    self.assertTrue(self.notifier.assure_record_count(1))
def test_query_ctx_error(self):
    """A context that completes with an error yields mixed node outcomes.

    Synchronous-store variant: OK nodes become RUNNING, the erroring node
    becomes RUNNING_FAILED.
    """
    node_count = 3
    launch_id = _new_id()
    node_records = [make_node(launch_id, states.STARTED) for i in range(node_count)]
    launch_record = make_launch(launch_id, states.PENDING, node_records)
    self.store.add_launch(launch_record)
    for node in node_records:
        self.store.add_node(node)

    self.ctx.expected_count = len(node_records)
    self.ctx.complete = False
    self.ctx.error = False

    # all but 1 node have reported ok
    self.ctx.nodes = [_one_fake_ctx_node_ok(node_records[i]['public_ip'],
                                            _new_id(), _new_id())
                      for i in range(node_count - 1)]
    self.ctx.nodes.append(_one_fake_ctx_node_error(node_records[-1]['public_ip'],
                                                   _new_id(), _new_id()))
    ok_ids = [node_records[i]['node_id'] for i in range(node_count - 1)]
    error_ids = [node_records[-1]['node_id']]

    # context is now complete, but with an error
    self.ctx.complete = True
    self.ctx.error = True
    self.core.query_contexts()

    self.assertTrue(self.notifier.assure_state(states.RUNNING, ok_ids))
    self.assertTrue(self.notifier.assure_state(states.RUNNING_FAILED, error_ids))
def test_recovery_launch_terminating(self):
    """Recovering a TERMINATING launch destroys its live instances and
    marks everything TERMINATED."""
    launch_id = _new_id()
    doomed_iaas_ids = [_new_id(), _new_id()]

    # One node already mid-termination, one fully terminated, one running.
    node_records = [
        make_node(launch_id, states.TERMINATING,
                  iaas_id=doomed_iaas_ids[0], site='fake'),
        make_node(launch_id, states.TERMINATED),
        make_node(launch_id, states.RUNNING,
                  iaas_id=doomed_iaas_ids[1], site='fake'),
    ]
    launch_record = make_launch(launch_id, states.TERMINATING, node_records)
    yield self.store.put_launch(launch_record)
    yield self.store.put_nodes(node_records)

    yield self.core.recover()

    # Both live IaaS instances should have been destroyed, in any order.
    self.assertEqual(2, len(self.driver.destroyed))
    self.assertIn(self.driver.destroyed[0].id, doomed_iaas_ids)
    self.assertIn(self.driver.destroyed[1].id, doomed_iaas_ids)

    # Afterwards every node record is TERMINATED and so is the launch.
    terminated = yield self.store.get_nodes(state=states.TERMINATED)
    self.assertEqual(3, len(terminated))
    launch_record = yield self.store.get_launch(launch_id)
    self.assertEqual(launch_record['state'], states.TERMINATED)
def test_query_ctx_with_several_nodes_timeout(self):
    """Nodes past INSTANCE_READY_TIMEOUT without broker check-in are failed.

    Two of three nodes are inside the ready timeout and have reported OK to
    the context broker; the third is past the timeout and silent, so it must
    be marked RUNNING_FAILED while the others become RUNNING.
    """
    node_count = 3
    launch_id = _new_id()
    node_records = [make_node(launch_id, states.STARTED)
                    for i in range(node_count)]
    launch_record = make_launch(launch_id, states.PENDING, node_records)
    # BUG FIX: the original used map(), which returns a one-shot,
    # non-subscriptable iterator on Python 3; the slices below
    # (node_ids[:n-1]) would raise TypeError. Materialize a real list.
    node_ids = [node['node_id'] for node in node_records]

    ts = time.time()
    # First two nodes are 10s inside the timeout; last is 10s past it.
    for i in range(node_count - 1):
        node_records[i]['running_timestamp'] = ts - INSTANCE_READY_TIMEOUT + 10
    node_records[-1]['running_timestamp'] = ts - INSTANCE_READY_TIMEOUT - 10

    self.store.add_launch(launch_record)
    for node in node_records:
        self.store.add_node(node)

    self.ctx.expected_count = len(node_records)
    self.ctx.complete = False
    self.ctx.error = False

    # all but 1 node have reported ok
    self.ctx.nodes = [_one_fake_ctx_node_ok(node_records[i]['public_ip'],
                                            _new_id(), _new_id())
                      for i in range(node_count - 1)]

    self.core.query_contexts()

    self.assertTrue(self.notifier.assure_state(states.RUNNING,
                                               node_ids[:node_count - 1]))
    self.assertEqual(len(self.notifier.nodes), node_count)
    self.assertTrue(self.notifier.assure_state(states.RUNNING_FAILED,
                                               node_ids[node_count - 1:]))
    self.assertTrue(self.notifier.assure_record_count(1,
                                                      node_ids[node_count - 1:]))
def test_recovery_nodes_terminating(self):
    """recover() finishes destroying a node stuck in TERMINATING while the
    rest of the launch is left alone."""
    launch_id = _new_id()
    terminating_iaas_id = _new_id()

    stuck = make_node(launch_id, states.TERMINATING,
                      iaas_id=terminating_iaas_id, site='fake')
    already_gone = make_node(launch_id, states.TERMINATED)
    still_running = make_node(launch_id, states.RUNNING)
    node_records = [stuck, already_gone, still_running]

    launch_record = make_launch(launch_id, states.RUNNING, node_records)
    yield self.store.put_launch(launch_record)
    yield self.store.put_nodes(node_records)

    yield self.core.recover()

    # Only the stuck node's IaaS instance gets destroyed.
    self.assertEqual(1, len(self.driver.destroyed))
    self.assertEqual(self.driver.destroyed[0].id, terminating_iaas_id)

    # Now both the stuck node and the already-terminated one are TERMINATED.
    terminated = yield self.store.get_nodes(state=states.TERMINATED)
    self.assertEqual(2, len(terminated))
def test_query_ctx_without_valid_nodes(self):
    """A launch whose only node is TERMINATING is FAILED without a broker query."""
    # if there are no nodes < TERMINATING, no broker query should happen
    for i in range(3):
        launch_id = _new_id()
        node_records = [make_node(launch_id, states.STARTED)]
        launch_record = make_launch(launch_id, states.PENDING, node_records)
        yield self.store.put_launch(launch_record)
        yield self.store.put_nodes(node_records)

    launches = yield self.store.get_launches(state=states.PENDING)
    error_launch = launches[0]

    # mark first launch's node as TERMINATING, should prevent
    # context query and result in launch being marked FAILED
    error_launch_node = yield self.store.get_node(error_launch['node_ids'][0])
    error_launch_node['state'] = states.TERMINATING
    yield self.store.put_node(error_launch_node)
    yield self.core.query_contexts()
    # the doomed launch's context must never have been queried
    self.assertNotIn(error_launch['context']['uri'], self.ctx.queried_uris)

    launches = yield self.store.get_launches()
    for launch in launches:
        if launch['launch_id'] == error_launch['launch_id']:
            self.assertEqual(launch['state'], states.FAILED)
            expected_node_state = states.TERMINATING
        else:
            # untouched launches stay PENDING with STARTED nodes
            self.assertEqual(launch['state'], states.PENDING)
            expected_node_state = states.STARTED
        node = yield self.store.get_node(launch['node_ids'][0])
        self.assertEqual(node['state'], expected_node_state)
def test_recovery_launch_terminating(self):
    """Recovery of a TERMINATING launch destroys live instances and marks
    the launch and all nodes TERMINATED."""
    launch_id = _new_id()
    terminating_iaas_ids = [_new_id(), _new_id()]
    # one node mid-termination, one already terminated, one still running
    node_records = [
        make_node(launch_id, states.TERMINATING,
                  iaas_id=terminating_iaas_ids[0], site='fake'),
        make_node(launch_id, states.TERMINATED),
        make_node(launch_id, states.RUNNING,
                  iaas_id=terminating_iaas_ids[1], site='fake')
    ]
    launch_record = make_launch(launch_id, states.TERMINATING, node_records)
    yield self.store.put_launch(launch_record)
    yield self.store.put_nodes(node_records)

    yield self.core.recover()

    # both live IaaS instances destroyed (order unspecified)
    self.assertEqual(2, len(self.driver.destroyed))
    self.assertTrue(self.driver.destroyed[0].id in terminating_iaas_ids)
    self.assertTrue(self.driver.destroyed[1].id in terminating_iaas_ids)

    terminated = yield self.store.get_nodes(state=states.TERMINATED)
    self.assertEqual(3, len(terminated))

    launch_record = yield self.store.get_launch(launch_id)
    self.assertEqual(launch_record['state'], states.TERMINATED)
def test_record_reaper(self):
    """Terminal node records older than record_reaping_max_age are purged,
    and a launch is removed only once all of its nodes are gone."""
    launch_id1 = _new_id()
    launch_id2 = _new_id()

    now = time.time()
    # Launch 1: all three nodes are in terminal states and past max age.
    node1 = make_node(launch_id1, InstanceState.TERMINATED, caller=self.default_user,
                      state_changes=[(InstanceState.TERMINATED,
                                      now - self.record_reaping_max_age - 1)])
    node2 = make_node(launch_id1, InstanceState.FAILED, caller=self.default_user,
                      state_changes=[(InstanceState.FAILED,
                                      now - self.record_reaping_max_age - 1)])
    node3 = make_node(launch_id1, InstanceState.REJECTED, caller=self.default_user,
                      state_changes=[(InstanceState.REJECTED,
                                      now - self.record_reaping_max_age - 1)])
    nodes1 = [node1, node2, node3]
    launch1 = make_launch(launch_id1, InstanceState.RUNNING, nodes1,
                          caller=self.default_user)

    # Launch 2: one node still RUNNING (not terminal), so the launch and
    # that node must survive reaping even though both are old.
    node4 = make_node(launch_id2, InstanceState.RUNNING, caller=self.default_user,
                      state_changes=[(InstanceState.RUNNING,
                                      now - self.record_reaping_max_age - 1)])
    node5 = make_node(launch_id2, InstanceState.TERMINATED, caller=self.default_user,
                      state_changes=[(InstanceState.TERMINATED,
                                      now - self.record_reaping_max_age - 1)])
    nodes2 = [node4, node5]
    launch2 = make_launch(launch_id2, InstanceState.RUNNING, nodes2,
                          caller=self.default_user)

    self.store.add_launch(launch1)
    for node in nodes1:
        self.store.add_node(node)
    self.store.add_launch(launch2)
    for node in nodes2:
        self.store.add_node(node)

    # Wait a second for record to get written
    time.sleep(1)

    # Force a record reaping cycle
    self.provisioner.leader._force_record_reaping()

    # Check that the first launch is completely removed
    node_ids1 = map(lambda x: x['node_id'], nodes1)
    self.assertNoStoreNodeRecords(*node_ids1)
    self.assertNoStoreLaunchRecord(launch_id1)

    # Check that the second launch is still here but with only the running node
    self.assertStoreNodeRecords(InstanceState.RUNNING, node4['node_id'])
    self.assertStoreLaunchRecord(InstanceState.RUNNING, launch_id2)
def test_query_ctx_nodes_not_started(self):
    """No context broker query is made while any node is still PENDING."""
    launch_id = _new_id()

    # three PENDING nodes plus a single STARTED one
    pending_nodes = [make_node(launch_id, states.PENDING) for _ in range(3)]
    started_node = make_node(launch_id, states.STARTED)
    node_records = pending_nodes + [started_node]

    launch_record = make_launch(launch_id, states.PENDING, node_records)
    yield self.store.put_launch(launch_record)
    yield self.store.put_nodes(node_records)

    yield self.core.query_contexts()

    # ensure that no context was actually queried. See the note in
    # _query_one_context for the reason why this is important.
    self.assertEqual(len(self.ctx.queried_uris), 0)
def test_recover_launch_incomplete(self):
    """Ensures that launches in REQUESTED state are completed
    """
    launch_id = _new_id()
    doc = "<cluster><workspace><name>node</name><image>fake</image>"+\
          "<quantity>3</quantity>"+\
          "</workspace><workspace><name>running</name><image>fake"+\
          "</image><quantity>1</quantity></workspace></cluster>"
    context = {'broker_uri' : _new_id(), 'context_id' : _new_id(),
               'secret' : _new_id(), 'uri' : _new_id()}

    requested_node_ids = [_new_id(), _new_id()]

    # two nodes in the 'node' group were requested but never launched
    node_records = [make_node(launch_id, states.RUNNING,
                              site='fake', ctx_name='running'),
                    make_node(launch_id, states.REQUESTED,
                              site='fake',
                              node_id=requested_node_ids[0],
                              ctx_name='node'),
                    make_node(launch_id, states.REQUESTED,
                              site='fake',
                              node_id=requested_node_ids[1],
                              ctx_name='node'),
                    make_node(launch_id, states.RUNNING,
                              ctx_name='node')]
    launch_record = make_launch(launch_id, states.REQUESTED,
                                node_records, document=doc,
                                context=context)
    yield self.store.put_launch(launch_record)
    yield self.store.put_nodes(node_records)

    # 2 nodes are in REQUESTED state, so those should be launched
    yield self.core.recover()

    # because we rely on IaaS idempotency, we get full Node responses
    # for all nodes in the group. What really would cause this scenario
    # is successfully launching the full group but failing before records
    # could be written for the two REQUESTED nodes.
    self.assertEqual(3, len(self.driver.created))
    iaas_ids = set(node.id for node in self.driver.created)
    self.assertEqual(3, len(iaas_ids))

    for node_id in requested_node_ids:
        node = yield self.store.get_node(node_id)
        self.assertEqual(states.PENDING, node['state'])
        self.assertTrue(node['iaas_id'] in iaas_ids)

    launch = yield self.store.get_launch(launch_id)
    self.assertEqual(states.PENDING, launch['state'])
def test_query_ctx_permanent_broker_error(self):
    """ContextNotFoundError fails the whole launch and all of its nodes."""
    node_count = 3
    launch_id = _new_id()
    node_records = [make_node(launch_id, states.STARTED) for i in range(node_count)]
    node_ids = [node['node_id'] for node in node_records]
    launch_record = make_launch(launch_id, states.PENDING, node_records)
    self.store.add_launch(launch_record)
    for node in node_records:
        self.store.add_node(node)

    # broker does not know this context at all — a permanent error
    self.ctx.query_error = ContextNotFoundError()
    self.core.query_contexts()

    self.assertTrue(self.notifier.assure_state(states.RUNNING_FAILED, node_ids))
    launch = self.store.get_launch(launch_id)
    self.assertEqual(launch['state'], states.FAILED)
def test_query_unexpected_exception(self):
    """Unexpected errors raised during query() must not reach the caller."""
    launch_id = _new_id()
    node_records = [make_node(launch_id, states.STARTED)]
    launch_record = make_launch(launch_id, states.PENDING, node_records)
    yield self.store.put_launch(launch_record)
    yield self.store.put_nodes(node_records)

    # context broker lookup blows up too
    self.ctx.query_error = ValueError("bad programmer")

    # digging into internals a bit: patching one of the methods query()
    # calls to raise an exception. This will let us ensure exceptions do
    # not bubble up
    def _explode(self):
        raise KeyError("notreallyaproblem")
    self.patch(self.core, 'query_nodes', _explode)

    # ensure that exception doesn't bubble up
    yield self.core.query()
def test_query_ctx_permanent_broker_error(self):
    """ContextNotFoundError fails the whole launch and all of its nodes."""
    node_count = 3
    launch_id = _new_id()
    node_records = [
        make_node(launch_id, states.STARTED) for i in range(node_count)
    ]
    node_ids = [node['node_id'] for node in node_records]
    launch_record = make_launch(launch_id, states.PENDING, node_records)
    yield self.store.put_launch(launch_record)
    yield self.store.put_nodes(node_records)

    # broker does not know this context at all — a permanent error
    self.ctx.query_error = ContextNotFoundError()
    yield self.core.query_contexts()

    self.assertTrue(
        self.notifier.assure_state(states.RUNNING_FAILED, node_ids))
    launch = yield self.store.get_launch(launch_id)
    self.assertEqual(launch['state'], states.FAILED)
def test_query_ctx_with_one_node_timeout(self):
    """A lone node past INSTANCE_READY_TIMEOUT with no broker check-in is
    marked RUNNING_FAILED."""
    launch_id = _new_id()
    node_record = make_node(launch_id, states.STARTED)
    launch_record = make_launch(launch_id, states.PENDING, [node_record])

    # make the node look like it started running well past the timeout
    node_record['running_timestamp'] = time.time() - INSTANCE_READY_TIMEOUT - 10

    self.store.add_launch(launch_record)
    self.store.add_node(node_record)

    # broker expects one node but has heard from none
    self.ctx.expected_count = 1
    self.ctx.complete = False
    self.ctx.error = False
    self.ctx.nodes = []

    self.core.query_contexts()

    self.assertTrue(self.notifier.assure_state(states.RUNNING_FAILED))
    self.assertTrue(self.notifier.assure_record_count(1))
def test_dump_state(self):
    """dump_state() notifies about exactly the requested nodes, once each."""
    node_ids = []
    node_records = []
    for _ in range(3):
        launch_id = _new_id()
        node = make_node(launch_id, states.PENDING)
        node_ids.append(node['node_id'])
        node_records.append(node)
        launch = make_launch(launch_id, states.PENDING, [node])
        yield self.store.put_launch(launch)
        yield self.store.put_nodes([node])

    yield self.core.dump_state(node_ids[:2])

    # should have gotten notifications about the 2 nodes
    self.assertEqual(self.notifier.nodes_rec_count[node_ids[0]], 1)
    self.assertEqual(node_records[0], self.notifier.nodes[node_ids[0]])
    self.assertEqual(node_records[1], self.notifier.nodes[node_ids[1]])
    self.assertEqual(self.notifier.nodes_rec_count[node_ids[1]], 1)

    # the third node was not requested, so no notification for it
    self.assertNotIn(node_ids[2], self.notifier.nodes)
def test_query_broker_exception(self):
    """A BrokerError for one context must not block queries of the others."""
    for i in range(2):
        launch_id = _new_id()
        node_records = [make_node(launch_id, states.STARTED)]
        launch_record = make_launch(launch_id, states.PENDING, node_records)
        yield self.store.put_launch(launch_record)
        yield self.store.put_nodes(node_records)

    # no guaranteed order here so grabbing first launch from store
    # and making that one return a BrokerError during context query.
    # The goal is to ensure that one error doesn't prevent querying
    # for other contexts.
    launches = yield self.store.get_launches(state=states.PENDING)
    error_launch = launches[0]
    error_launch_ctx = error_launch['context']['uri']
    ok_node_id = launches[1]['node_ids'][0]
    ok_node = yield self.store.get_node(ok_node_id)

    self.ctx.uri_query_error[error_launch_ctx] = BrokerError("bad broker")
    self.ctx.nodes = [
        _one_fake_ctx_node_ok(ok_node['public_ip'], _new_id(), _new_id())
    ]
    self.ctx.complete = True
    yield self.core.query_contexts()

    launches = yield self.store.get_launches()
    for launch in launches:
        # both contexts were queried despite the broker error
        self.assertIn(launch['context']['uri'], self.ctx.queried_uris)
        if launch['launch_id'] == error_launch['launch_id']:
            # erroring launch is left untouched for a later retry
            self.assertEqual(launch['state'], states.PENDING)
            expected_node_state = states.STARTED
        else:
            self.assertEqual(launch['state'], states.RUNNING)
            expected_node_state = states.RUNNING
        node = yield self.store.get_node(launch['node_ids'][0])
        self.assertEqual(node['state'], expected_node_state)
def test_query_broker_exception(self):
    """A BrokerError for one context must not block queries of the others."""
    caller = "asterix"
    for i in range(2):
        launch_id = _new_id()
        node_records = [make_node(launch_id, states.STARTED)]
        launch_record = make_launch(launch_id, states.PENDING, node_records,
                                    caller=caller)
        self.store.add_launch(launch_record)
        for node in node_records:
            self.store.add_node(node)

    # no guaranteed order here so grabbing first launch from store
    # and making that one return a BrokerError during context query.
    # The goal is to ensure that one error doesn't prevent querying
    # for other contexts.
    launches = self.store.get_launches(state=states.PENDING)
    error_launch = launches[0]
    error_launch_ctx = error_launch['context']['uri']
    ok_node_id = launches[1]['node_ids'][0]
    ok_node = self.store.get_node(ok_node_id)

    self.ctx.uri_query_error[error_launch_ctx] = BrokerError("bad broker")
    self.ctx.nodes = [_one_fake_ctx_node_ok(ok_node['public_ip'], _new_id(),
                                            _new_id())]
    self.ctx.complete = True
    self.core.query_contexts()

    launches = self.store.get_launches()
    for launch in launches:
        # both contexts were queried despite the broker error
        self.assertIn(launch['context']['uri'], self.ctx.queried_uris)
        if launch['launch_id'] == error_launch['launch_id']:
            # erroring launch is left untouched for a later retry
            self.assertEqual(launch['state'], states.PENDING)
            expected_node_state = states.STARTED
        else:
            self.assertEqual(launch['state'], states.RUNNING)
            expected_node_state = states.RUNNING
        node = self.store.get_node(launch['node_ids'][0])
        self.assertEqual(node['state'], expected_node_state)
def test_mark_nodes_terminating(self):
    """mark_nodes_terminating() updates and notifies only the given nodes."""
    launch_id = _new_id()
    node_records = [make_node(launch_id, states.RUNNING) for _ in range(3)]
    launch_record = make_launch(launch_id, states.PENDING, node_records)
    yield self.store.put_launch(launch_record)
    yield self.store.put_nodes(node_records)

    # terminate the first two nodes, leave the third alone
    victims = [record['node_id'] for record in node_records[:2]]
    yield self.core.mark_nodes_terminating(victims)

    self.assertTrue(
        self.notifier.assure_state(states.TERMINATING, nodes=victims))
    self.assertNotIn(node_records[2]['node_id'], self.notifier.nodes)

    for node_id in victims:
        terminating_node = yield self.store.get_node(node_id)
        self.assertEqual(terminating_node['state'], states.TERMINATING)
def test_mark_nodes_terminating(self):
    """mark_nodes_terminating() updates and notifies only the given nodes."""
    launch_id = _new_id()
    node_records = [make_node(launch_id, states.RUNNING) for i in range(3)]
    launch_record = make_launch(launch_id, states.PENDING, node_records)
    yield self.store.put_launch(launch_record)
    yield self.store.put_nodes(node_records)

    # terminate the first two nodes; the third must be left alone
    first_two_node_ids = [
        node_records[0]['node_id'], node_records[1]['node_id']
    ]
    yield self.core.mark_nodes_terminating(first_two_node_ids)

    self.assertTrue(
        self.notifier.assure_state(states.TERMINATING,
                                   nodes=first_two_node_ids))
    self.assertNotIn(node_records[2]['node_id'], self.notifier.nodes)

    for node_id in first_two_node_ids:
        terminating_node = yield self.store.get_node(node_id)
        self.assertEqual(terminating_node['state'], states.TERMINATING)
def test_describe(self):
    """describe_nodes() returns records for the requested IDs (all nodes
    when called without arguments) and raises KeyError on unknown IDs."""
    caller = "asterix"
    node_ids = []
    for _ in range(3):
        launch_id = _new_id()
        node_records = [make_node(launch_id, states.RUNNING)]
        node_ids.append(node_records[0]['node_id'])
        launch_record = make_launch(
            launch_id, states.PENDING, node_records, caller=caller)
        self.store.add_launch(launch_record)
        for node in node_records:
            self.store.add_node(node)

    # no arguments: every node comes back
    all_nodes = self.core.describe_nodes()
    all_node_ids = [n['node_id'] for n in all_nodes]
    self.assertEqual(set(all_node_ids), set(node_ids))
    # internal version bookkeeping must not leak to callers
    self.assertFalse(any(VERSION_KEY in n for n in all_nodes))

    all_nodes = self.core.describe_nodes(node_ids)
    all_node_ids = [m['node_id'] for m in all_nodes]
    self.assertEqual(set(all_node_ids), set(node_ids))

    subset_nodes = self.core.describe_nodes(node_ids[1:])
    subset_node_ids = [o['node_id'] for o in subset_nodes]
    self.assertEqual(set(subset_node_ids), set(node_ids[1:]))

    one_node = self.core.describe_nodes([node_ids[0]])
    self.assertEqual(len(one_node), 1)
    self.assertEqual(one_node[0]['node_id'], node_ids[0])
    self.assertEqual(one_node[0]['state'], states.RUNNING)
    self.assertNotIn(VERSION_KEY, one_node[0])

    # an unknown node id raises KeyError
    try:
        self.core.describe_nodes([node_ids[0], "not-a-real-node"])
    except KeyError:
        pass
    else:
        self.fail("Expected exception for bad node_id")
def test_query_ctx_with_no_timeout(self):
    """An expired ready-timeout is ignored when the broker has already
    heard from the node (no records change)."""
    caller = "asterix"
    launch_id = _new_id()
    node_record = make_node(launch_id, states.STARTED)
    launch_record = make_launch(launch_id, states.PENDING, [node_record],
                                caller=caller)

    # the node's running timestamp is well past the ready timeout...
    node_record['running_timestamp'] = time.time() - INSTANCE_READY_TIMEOUT - 10

    self.store.add_launch(launch_record)
    self.store.add_node(node_record)

    self.ctx.expected_count = 1
    self.ctx.complete = False
    self.ctx.error = False
    # ...but the broker has heard from it, so the timeout does not apply
    self.ctx.nodes = [_one_fake_ctx_node_not_done(node_record['public_ip'],
                                                  _new_id(), _new_id())]

    self.core.query_contexts()

    self.assertTrue(self.notifier.assure_record_count(0))
def test_query_ctx(self):
    """Nodes are promoted to RUNNING as the context broker reports them OK."""
    node_count = 3
    launch_id = _new_id()
    node_records = [
        make_node(launch_id, states.STARTED) for i in range(node_count)
    ]
    launch_record = make_launch(launch_id, states.PENDING, node_records)
    yield self.store.put_launch(launch_record)
    yield self.store.put_nodes(node_records)

    self.ctx.expected_count = len(node_records)
    self.ctx.complete = False
    self.ctx.error = False

    #first query with no ctx nodes. zero records should be updated
    yield self.core.query_contexts()
    self.assertTrue(self.notifier.assure_record_count(0))

    # all but 1 node have reported ok
    self.ctx.nodes = [
        _one_fake_ctx_node_ok(node_records[i]['public_ip'], _new_id(),
                              _new_id()) for i in range(node_count - 1)
    ]
    yield self.core.query_contexts()
    self.assertTrue(self.notifier.assure_state(states.RUNNING))
    self.assertEqual(len(self.notifier.nodes), node_count - 1)

    #last node reports ok
    self.ctx.nodes.append(
        _one_fake_ctx_node_ok(node_records[-1]['public_ip'], _new_id(),
                              _new_id()))
    self.ctx.complete = True
    yield self.core.query_contexts()
    self.assertTrue(self.notifier.assure_state(states.RUNNING))
    # only the final node produced a new record in this round
    self.assertTrue(self.notifier.assure_record_count(1))
def test_recovery_nodes_terminating(self):
    """recover() finishes destroying a node stuck in TERMINATING while the
    rest of the launch is left alone."""
    launch_id = _new_id()
    terminating_iaas_id = _new_id()
    # one node mid-termination, one already terminated, one still running
    node_records = [make_node(launch_id, states.TERMINATING,
                              iaas_id=terminating_iaas_id, site='fake'),
                    make_node(launch_id, states.TERMINATED),
                    make_node(launch_id, states.RUNNING)]
    launch_record = make_launch(launch_id, states.RUNNING, node_records)
    yield self.store.put_launch(launch_record)
    yield self.store.put_nodes(node_records)

    yield self.core.recover()

    # only the TERMINATING node's IaaS instance is destroyed
    self.assertEqual(1, len(self.driver.destroyed))
    self.assertEqual(self.driver.destroyed[0].id, terminating_iaas_id)

    terminated = yield self.store.get_nodes(state=states.TERMINATED)
    self.assertEqual(2, len(terminated))
def test_query_ctx_without_valid_nodes(self):
    """A launch whose only node is TERMINATING is FAILED without a broker query."""
    # if there are no nodes < TERMINATING, no broker query should happen
    for i in range(3):
        launch_id = _new_id()
        node_records = [make_node(launch_id, states.STARTED)]
        launch_record = make_launch(launch_id, states.PENDING, node_records)
        yield self.store.put_launch(launch_record)
        yield self.store.put_nodes(node_records)

    launches = yield self.store.get_launches(state=states.PENDING)
    error_launch = launches[0]

    # mark first launch's node as TERMINATING, should prevent
    # context query and result in launch being marked FAILED
    error_launch_node = yield self.store.get_node(
        error_launch['node_ids'][0])
    error_launch_node['state'] = states.TERMINATING
    yield self.store.put_node(error_launch_node)
    yield self.core.query_contexts()
    # the doomed launch's context must never have been queried
    self.assertNotIn(error_launch['context']['uri'], self.ctx.queried_uris)

    launches = yield self.store.get_launches()
    for launch in launches:
        if launch['launch_id'] == error_launch['launch_id']:
            self.assertEqual(launch['state'], states.FAILED)
            expected_node_state = states.TERMINATING
        else:
            # untouched launches stay PENDING with STARTED nodes
            self.assertEqual(launch['state'], states.PENDING)
            expected_node_state = states.STARTED
        node = yield self.store.get_node(launch['node_ids'][0])
        self.assertEqual(node['state'], expected_node_state)
def test_recover_launch_incomplete(self):
    """Ensures that launches in REQUESTED state are completed
    """
    launch_id = _new_id()
    doc = "<cluster><workspace><name>node</name><image>fake</image>"+\
          "<quantity>3</quantity>"+\
          "</workspace><workspace><name>running</name><image>fake"+\
          "</image><quantity>1</quantity></workspace></cluster>"
    context = {
        'broker_uri': _new_id(), 'context_id': _new_id(),
        'secret': _new_id(), 'uri': _new_id()
    }

    requested_node_ids = [_new_id(), _new_id()]

    # two nodes in the 'node' group were requested but never launched
    node_records = [
        make_node(launch_id, states.RUNNING, site='fake',
                  ctx_name='running'),
        make_node(launch_id, states.REQUESTED, site='fake',
                  node_id=requested_node_ids[0], ctx_name='node'),
        make_node(launch_id, states.REQUESTED, site='fake',
                  node_id=requested_node_ids[1], ctx_name='node'),
        make_node(launch_id, states.RUNNING, ctx_name='node')
    ]
    launch_record = make_launch(launch_id, states.REQUESTED,
                                node_records, document=doc,
                                context=context)
    yield self.store.put_launch(launch_record)
    yield self.store.put_nodes(node_records)

    # 2 nodes are in REQUESTED state, so those should be launched
    yield self.core.recover()

    # because we rely on IaaS idempotency, we get full Node responses
    # for all nodes in the group. What really would cause this scenario
    # is successfully launching the full group but failing before records
    # could be written for the two REQUESTED nodes.
    self.assertEqual(3, len(self.driver.created))
    iaas_ids = set(node.id for node in self.driver.created)
    self.assertEqual(3, len(iaas_ids))

    for node_id in requested_node_ids:
        node = yield self.store.get_node(node_id)
        self.assertEqual(states.PENDING, node['state'])
        self.assertTrue(node['iaas_id'] in iaas_ids)

    launch = yield self.store.get_launch(launch_id)
    self.assertEqual(states.PENDING, launch['state'])