Example #1
0
    def test_query_ctx_nodes_pending_but_actually_running(self):
        """Detect "pending but actually running" instances early.

        On large runs some EC2 instances are reported as "running" long
        after being requested (up to ~15 minutes, versus ~30 seconds
        normally), even though they booted successfully a while ago: they
        are reachable over SSH and the context broker has OK'ed them.
        Verify such nodes are promoted to RUNNING as soon as the broker
        confirms them, without waiting for the IaaS status change.
        """
        launch_id = _new_id()
        records = [make_node(launch_id, states.PENDING) for _ in range(3)]
        records.append(make_node(launch_id, states.STARTED))
        launch = make_launch(launch_id, states.PENDING, records)
        self.store.add_launch(launch)
        for record in records:
            self.store.add_node(record)

        # every node has reported OK to the context broker
        ctx_nodes = []
        for record in records:
            ctx_nodes.append(_one_fake_ctx_node_ok(record['public_ip'],
                                                   _new_id(), _new_id()))
        self.ctx.nodes = ctx_nodes
        self.ctx.complete = True

        self.core.query_contexts()

        stored_launch = self.store.get_launch(launch_id)
        self.assertEqual(stored_launch['state'], states.RUNNING)

        for node_id in stored_launch['node_ids']:
            stored_node = self.store.get_node(node_id)
            self.assertEqual(states.RUNNING, stored_node['state'])
Example #2
0
    def test_recovery_launch_terminating(self):
        """Recovery of a TERMINATING launch destroys its live IaaS nodes.

        Both the TERMINATING and the RUNNING node carry IaaS ids and must
        be destroyed; afterwards all nodes are TERMINATED and the launch
        record itself is marked TERMINATED.
        """
        launch_id = _new_id()
        iaas_ids = [_new_id(), _new_id()]

        nodes = [
            make_node(launch_id, states.TERMINATING,
                      iaas_id=iaas_ids[0], site='fake'),
            make_node(launch_id, states.TERMINATED),
            make_node(launch_id, states.RUNNING,
                      iaas_id=iaas_ids[1], site='fake'),
        ]
        launch = make_launch(launch_id, states.TERMINATING, nodes)

        yield self.store.put_launch(launch)
        yield self.store.put_nodes(nodes)

        yield self.core.recover()

        destroyed = self.driver.destroyed
        self.assertEqual(2, len(destroyed))
        self.assertIn(destroyed[0].id, iaas_ids)
        self.assertIn(destroyed[1].id, iaas_ids)

        terminated = yield self.store.get_nodes(state=states.TERMINATED)
        self.assertEqual(3, len(terminated))

        launch = yield self.store.get_launch(launch_id)
        self.assertEqual(states.TERMINATED, launch['state'])
Example #3
0
    def test_recovery_nodes_terminating(self):
        """Recovery destroys only the node stuck in TERMINATING state."""
        launch_id = _new_id()
        doomed_iaas_id = _new_id()

        nodes = [make_node(launch_id, states.TERMINATING,
                           iaas_id=doomed_iaas_id, site='fake'),
                 make_node(launch_id, states.TERMINATED),
                 make_node(launch_id, states.RUNNING)]
        launch = make_launch(launch_id, states.RUNNING, nodes)

        yield self.store.put_launch(launch)
        yield self.store.put_nodes(nodes)

        yield self.core.recover()

        # exactly one IaaS destroy call: the TERMINATING node
        self.assertEqual(1, len(self.driver.destroyed))
        self.assertEqual(doomed_iaas_id, self.driver.destroyed[0].id)

        terminated = yield self.store.get_nodes(state=states.TERMINATED)
        self.assertEqual(2, len(terminated))
Example #4
0
    def test_recovery_launch_terminating(self):
        """Recovering a TERMINATING launch tears down its live instances."""
        launch_id = _new_id()
        live_iaas_ids = [_new_id(), _new_id()]

        nodes = [make_node(launch_id, states.TERMINATING,
                           iaas_id=live_iaas_ids[0], site='fake'),
                 make_node(launch_id, states.TERMINATED),
                 make_node(launch_id, states.RUNNING,
                           iaas_id=live_iaas_ids[1], site='fake')]

        launch = make_launch(launch_id, states.TERMINATING, nodes)

        yield self.store.put_launch(launch)
        yield self.store.put_nodes(nodes)

        yield self.core.recover()

        # both live nodes should be destroyed on the IaaS
        self.assertEqual(2, len(self.driver.destroyed))
        for destroyed in self.driver.destroyed:
            self.assertIn(destroyed.id, live_iaas_ids)

        terminated = yield self.store.get_nodes(state=states.TERMINATED)
        self.assertEqual(3, len(terminated))

        launch = yield self.store.get_launch(launch_id)
        self.assertEqual(states.TERMINATED, launch['state'])
Example #5
0
    def test_query_ctx_nodes_not_started(self):
        """No context query happens while any node is still PENDING."""
        launch_id = _new_id()
        nodes = [make_node(launch_id, states.PENDING) for _ in range(3)]
        nodes.append(make_node(launch_id, states.STARTED))
        launch = make_launch(launch_id, states.PENDING, nodes)
        yield self.store.put_launch(launch)
        yield self.store.put_nodes(nodes)

        yield self.core.query_contexts()

        # ensure that no context was actually queried. See the note in
        # _query_one_context for the reason why this is important.
        self.assertEqual(0, len(self.ctx.queried_uris))
Example #6
0
    def test_recover_launch_incomplete(self):
        """Ensures that launches in REQUESTED state are completed
        """
        launch_id = _new_id()
        doc = ("<cluster><workspace><name>node</name><image>fake</image>"
               "<quantity>3</quantity>"
               "</workspace><workspace><name>running</name><image>fake"
               "</image><quantity>1</quantity></workspace></cluster>")
        context = {'broker_uri': _new_id(), 'context_id': _new_id(),
                   'secret': _new_id(), 'uri': _new_id()}

        requested_ids = [_new_id(), _new_id()]

        nodes = [
            make_node(launch_id, states.RUNNING, site='fake',
                      ctx_name='running'),
            make_node(launch_id, states.REQUESTED, site='fake',
                      node_id=requested_ids[0], ctx_name='node'),
            make_node(launch_id, states.REQUESTED, site='fake',
                      node_id=requested_ids[1], ctx_name='node'),
            make_node(launch_id, states.RUNNING, ctx_name='node'),
        ]
        launch = make_launch(launch_id, states.REQUESTED, nodes,
                             document=doc, context=context)

        yield self.store.put_launch(launch)
        yield self.store.put_nodes(nodes)

        # 2 nodes are in REQUESTED state, so those should be launched
        yield self.core.recover()

        # because we rely on IaaS idempotency, we get full Node responses
        # for all nodes in the group. What really would cause this scenario
        # is successfully launching the full group but failing before records
        # could be written for the two REQUESTED nodes.
        self.assertEqual(3, len(self.driver.created))
        iaas_ids = set(created.id for created in self.driver.created)
        self.assertEqual(3, len(iaas_ids))

        for node_id in requested_ids:
            node = yield self.store.get_node(node_id)
            self.assertEqual(states.PENDING, node['state'])
            self.assertIn(node['iaas_id'], iaas_ids)

        launch = yield self.store.get_launch(launch_id)
        self.assertEqual(states.PENDING, launch['state'])
Example #7
0
    def test_query_ctx_nodes_not_started(self):
        """Context broker is left alone while some nodes are still PENDING."""
        launch_id = _new_id()
        records = [make_node(launch_id, states.PENDING) for _ in range(3)]
        records.append(make_node(launch_id, states.STARTED))
        launch = make_launch(launch_id, states.PENDING, records)
        yield self.store.put_launch(launch)
        yield self.store.put_nodes(records)

        yield self.core.query_contexts()

        # ensure that no context was actually queried. See the note in
        # _query_one_context for the reason why this is important.
        self.assertEqual(len(self.ctx.queried_uris), 0)
Example #8
0
    def test_query_ctx_error(self):
        """One node reporting a ctx error becomes RUNNING_FAILED; the rest RUNNING."""
        node_count = 3
        launch_id = _new_id()
        nodes = [make_node(launch_id, states.STARTED)
                 for _ in range(node_count)]
        launch = make_launch(launch_id, states.PENDING, nodes)

        self.store.add_launch(launch)
        for node in nodes:
            self.store.add_node(node)

        self.ctx.expected_count = len(nodes)
        self.ctx.complete = False
        self.ctx.error = False

        # all but 1 node have reported ok
        ctx_nodes = [_one_fake_ctx_node_ok(node['public_ip'], _new_id(),
                                           _new_id())
                     for node in nodes[:-1]]
        ctx_nodes.append(_one_fake_ctx_node_error(nodes[-1]['public_ip'],
                                                  _new_id(), _new_id()))
        self.ctx.nodes = ctx_nodes

        ok_ids = [node['node_id'] for node in nodes[:-1]]
        error_ids = [nodes[-1]['node_id']]

        self.ctx.complete = True
        self.ctx.error = True

        self.core.query_contexts()
        self.assertTrue(self.notifier.assure_state(states.RUNNING, ok_ids))
        self.assertTrue(
            self.notifier.assure_state(states.RUNNING_FAILED, error_ids))
Example #9
0
    def test_query_ctx_error(self):
        """A ctx error on one node yields RUNNING_FAILED for just that node."""
        node_count = 3
        launch_id = _new_id()
        nodes = [make_node(launch_id, states.STARTED)
                 for _ in range(node_count)]
        launch = make_launch(launch_id, states.PENDING, nodes)

        yield self.store.put_launch(launch)
        yield self.store.put_nodes(nodes)

        self.ctx.expected_count = len(nodes)
        self.ctx.complete = False
        self.ctx.error = False

        # all but 1 node have reported ok; the last reports an error
        reported = []
        for node in nodes[:-1]:
            reported.append(_one_fake_ctx_node_ok(node['public_ip'],
                                                  _new_id(), _new_id()))
        reported.append(_one_fake_ctx_node_error(nodes[-1]['public_ip'],
                                                 _new_id(), _new_id()))
        self.ctx.nodes = reported

        ok_ids = [node['node_id'] for node in nodes[:-1]]
        error_ids = [nodes[-1]['node_id']]

        self.ctx.complete = True
        self.ctx.error = True

        yield self.core.query_contexts()
        self.assertTrue(self.notifier.assure_state(states.RUNNING, ok_ids))
        self.assertTrue(
            self.notifier.assure_state(states.RUNNING_FAILED, error_ids))
Example #10
0
    def test_query_ctx_without_valid_nodes(self):
        """A launch whose only node is >= TERMINATING is failed, unqueried."""

        # if there are no nodes < TERMINATING, no broker query should happen
        for _ in range(3):
            one_launch_id = _new_id()
            one_nodes = [make_node(one_launch_id, states.STARTED)]
            one_launch = make_launch(one_launch_id, states.PENDING,
                                     one_nodes)
            yield self.store.put_launch(one_launch)
            yield self.store.put_nodes(one_nodes)

        pending = yield self.store.get_launches(state=states.PENDING)
        error_launch = pending[0]

        # mark first launch's node as TERMINATING, should prevent
        # context query and result in launch being marked FAILED
        doomed = yield self.store.get_node(error_launch['node_ids'][0])
        doomed['state'] = states.TERMINATING
        yield self.store.put_node(doomed)

        yield self.core.query_contexts()
        self.assertNotIn(error_launch['context']['uri'],
                         self.ctx.queried_uris)

        launches = yield self.store.get_launches()
        for launch in launches:
            if launch['launch_id'] == error_launch['launch_id']:
                self.assertEqual(launch['state'], states.FAILED)
                expected = states.TERMINATING
            else:
                self.assertEqual(launch['state'], states.PENDING)
                expected = states.STARTED

            node = yield self.store.get_node(launch['node_ids'][0])
            self.assertEqual(node['state'], expected)
0
    def test_query_ctx(self):
        """Nodes go RUNNING incrementally as the ctx broker reports them."""
        node_count = 3
        launch_id = _new_id()
        nodes = [make_node(launch_id, states.STARTED)
                 for _ in range(node_count)]
        launch = make_launch(launch_id, states.PENDING, nodes)

        yield self.store.put_launch(launch)
        yield self.store.put_nodes(nodes)

        self.ctx.expected_count = len(nodes)
        self.ctx.complete = False
        self.ctx.error = False

        # first query with no ctx nodes. zero records should be updated
        yield self.core.query_contexts()
        self.assertTrue(self.notifier.assure_record_count(0))

        # all but 1 node have reported ok
        self.ctx.nodes = [
            _one_fake_ctx_node_ok(node['public_ip'], _new_id(), _new_id())
            for node in nodes[:-1]]

        yield self.core.query_contexts()
        self.assertTrue(self.notifier.assure_state(states.RUNNING))
        self.assertEqual(node_count - 1, len(self.notifier.nodes))

        # last node reports ok
        self.ctx.nodes.append(
            _one_fake_ctx_node_ok(nodes[-1]['public_ip'], _new_id(),
                                  _new_id()))

        self.ctx.complete = True
        yield self.core.query_contexts()
        self.assertTrue(self.notifier.assure_state(states.RUNNING))
        self.assertTrue(self.notifier.assure_record_count(1))
Example #12
0
    def test_query_ctx_with_several_nodes_timeout(self):
        """Nodes past INSTANCE_READY_TIMEOUT without a ctx OK are failed.

        Two of three nodes are just inside the ready-timeout window and
        have reported OK to the context broker; the third is past the
        timeout and silent. The first two must become RUNNING and the
        last RUNNING_FAILED.
        """
        node_count = 3
        launch_id = _new_id()
        node_records = [make_node(launch_id, states.STARTED)
                        for i in range(node_count)]
        launch_record = make_launch(launch_id, states.PENDING,
                                    node_records)
        # Must be a real list, not map(): the result is sliced below and
        # map objects are not subscriptable on Python 3.
        node_ids = [node['node_id'] for node in node_records]

        ts = time.time()
        for i in range(node_count - 1):
            # still within the ready-timeout window
            node_records[i]['running_timestamp'] = ts - INSTANCE_READY_TIMEOUT + 10
        # this node has exceeded the ready timeout
        node_records[-1]['running_timestamp'] = ts - INSTANCE_READY_TIMEOUT - 10

        self.store.add_launch(launch_record)
        for node in node_records:
            self.store.add_node(node)

        self.ctx.expected_count = len(node_records)
        self.ctx.complete = False
        self.ctx.error = False

        # all but 1 node have reported ok
        self.ctx.nodes = [_one_fake_ctx_node_ok(node_records[i]['public_ip'],
                          _new_id(), _new_id()) for i in range(node_count - 1)]

        self.core.query_contexts()

        self.assertTrue(self.notifier.assure_state(
            states.RUNNING, node_ids[:node_count - 1]))
        self.assertEqual(len(self.notifier.nodes), node_count)
        self.assertTrue(self.notifier.assure_state(
            states.RUNNING_FAILED, node_ids[node_count - 1:]))
        self.assertTrue(self.notifier.assure_record_count(
            1, node_ids[node_count - 1:]))
Example #13
0
    def test_update_nodes_from_ctx(self):
        """update_nodes_from_context matches every ctx node to a record.

        One fake ctx node is created per node record, keyed by the same
        public IP, so the updater should report all five as updated.
        """
        launch_id = _new_id()
        nodes = [make_node(launch_id, states.STARTED) for i in range(5)]
        ctx_nodes = [_one_fake_ctx_node_ok(node['public_ip'], _new_id(),
                                           _new_id()) for node in nodes]

        # assertEqual: assertEquals is a deprecated alias (removed in 3.12)
        self.assertEqual(len(nodes),
                         len(update_nodes_from_context(nodes, ctx_nodes)))
Example #14
0
    def test_update_nodes_from_ctx_with_hostname(self):
        """Matching also works when the ctx node's hostname holds the IP.

        libcloud puts the hostname in the public_ip field, so the updater
        must match ctx nodes whose hostname (not ip) equals the node
        record's public_ip.
        """
        launch_id = _new_id()
        nodes = [make_node(launch_id, states.STARTED) for i in range(5)]
        # libcloud puts the hostname in the public_ip field
        ctx_nodes = [_one_fake_ctx_node_ok(ip=_new_id(),
                                           hostname=node['public_ip'],
                                           pubkey=_new_id())
                     for node in nodes]

        # assertEqual: assertEquals is a deprecated alias (removed in 3.12)
        self.assertEqual(len(nodes),
                         len(update_nodes_from_context(nodes, ctx_nodes)))
Example #15
0
    def test_update_nodes_from_ctx(self):
        """Every ctx node is matched to its node record by public IP."""
        launch_id = _new_id()
        nodes = [make_node(launch_id, states.STARTED) for i in range(5)]
        ctx_nodes = [
            _one_fake_ctx_node_ok(node['public_ip'], _new_id(), _new_id())
            for node in nodes
        ]

        # assertEqual: assertEquals is a deprecated alias (removed in 3.12)
        self.assertEqual(len(nodes),
                         len(update_nodes_from_context(nodes, ctx_nodes)))
Example #16
0
    def test_record_reaper(self):
        """Record reaping removes launches whose nodes are all old & dead.

        launch1's nodes are all in terminal states older than the reaping
        age, so the whole launch disappears. launch2 still has a RUNNING
        node, so it survives (with the running node) while its old
        TERMINATED node is reaped.
        """
        launch_id1 = _new_id()
        launch_id2 = _new_id()

        now = time.time()
        # hoisted: a timestamp older than record_reaping_max_age marks a
        # record as reapable
        old = now - self.record_reaping_max_age - 1
        node1 = make_node(launch_id1, InstanceState.TERMINATED,
                          caller=self.default_user,
                          state_changes=[(InstanceState.TERMINATED, old)])
        node2 = make_node(launch_id1, InstanceState.FAILED,
                          caller=self.default_user,
                          state_changes=[(InstanceState.FAILED, old)])
        node3 = make_node(launch_id1, InstanceState.REJECTED,
                          caller=self.default_user,
                          state_changes=[(InstanceState.REJECTED, old)])
        nodes1 = [node1, node2, node3]
        launch1 = make_launch(launch_id1, InstanceState.RUNNING, nodes1,
                              caller=self.default_user)

        node4 = make_node(launch_id2, InstanceState.RUNNING,
                          caller=self.default_user,
                          state_changes=[(InstanceState.RUNNING, old)])
        node5 = make_node(launch_id2, InstanceState.TERMINATED,
                          caller=self.default_user,
                          state_changes=[(InstanceState.TERMINATED, old)])
        nodes2 = [node4, node5]
        launch2 = make_launch(launch_id2, InstanceState.RUNNING, nodes2,
                              caller=self.default_user)

        self.store.add_launch(launch1)
        for node in nodes1:
            self.store.add_node(node)

        self.store.add_launch(launch2)
        for node in nodes2:
            self.store.add_node(node)

        # Wait a second for record to get written
        time.sleep(1)

        # Force a record reaping cycle
        self.provisioner.leader._force_record_reaping()

        # Check that the first launch is completely removed. Use a list
        # comprehension rather than map(): map() returns a lazy iterator
        # on Python 3, and a concrete list is clearer for unpacking.
        node_ids1 = [node['node_id'] for node in nodes1]
        self.assertNoStoreNodeRecords(*node_ids1)
        self.assertNoStoreLaunchRecord(launch_id1)

        # Check that the second launch is still here but with only the
        # running node
        self.assertStoreNodeRecords(InstanceState.RUNNING, node4['node_id'])
        self.assertStoreLaunchRecord(InstanceState.RUNNING, launch_id2)
Example #17
0
    def test_update_nodes_from_ctx_with_hostname(self):
        """Ctx nodes whose hostname carries the public IP still match."""
        launch_id = _new_id()
        nodes = [make_node(launch_id, states.STARTED) for i in range(5)]
        # libcloud puts the hostname in the public_ip field
        ctx_nodes = [
            _one_fake_ctx_node_ok(ip=_new_id(),
                                  hostname=node['public_ip'],
                                  pubkey=_new_id()) for node in nodes
        ]

        # assertEqual: assertEquals is a deprecated alias (removed in 3.12)
        self.assertEqual(len(nodes),
                         len(update_nodes_from_context(nodes, ctx_nodes)))
Example #18
0
    def test_recovery_nodes_terminating(self):
        """Recovery destroys the TERMINATING node, leaving the rest alone."""
        launch_id = _new_id()
        stuck_iaas_id = _new_id()

        nodes = [
            make_node(launch_id, states.TERMINATING,
                      iaas_id=stuck_iaas_id, site='fake'),
            make_node(launch_id, states.TERMINATED),
            make_node(launch_id, states.RUNNING),
        ]
        launch = make_launch(launch_id, states.RUNNING, nodes)

        yield self.store.put_launch(launch)
        yield self.store.put_nodes(nodes)

        yield self.core.recover()

        destroyed = self.driver.destroyed
        self.assertEqual(1, len(destroyed))
        self.assertEqual(stuck_iaas_id, destroyed[0].id)

        terminated = yield self.store.get_nodes(state=states.TERMINATED)
        self.assertEqual(2, len(terminated))
Example #19
0
    def test_query_ctx_permanent_broker_error(self):
        """ContextNotFoundError fails the launch and all of its nodes."""
        node_count = 3
        launch_id = _new_id()
        nodes = [make_node(launch_id, states.STARTED)
                 for _ in range(node_count)]
        node_ids = [node['node_id'] for node in nodes]
        launch = make_launch(launch_id, states.PENDING, nodes)
        self.store.add_launch(launch)
        for node in nodes:
            self.store.add_node(node)

        self.ctx.query_error = ContextNotFoundError()
        self.core.query_contexts()

        self.assertTrue(
            self.notifier.assure_state(states.RUNNING_FAILED, node_ids))
        stored_launch = self.store.get_launch(launch_id)
        self.assertEqual(states.FAILED, stored_launch['state'])
Example #20
0
    def test_query_unexpected_exception(self):
        """query() must swallow unexpected exceptions from its helpers."""
        launch_id = _new_id()
        nodes = [make_node(launch_id, states.STARTED)]
        launch = make_launch(launch_id, states.PENDING, nodes)
        yield self.store.put_launch(launch)
        yield self.store.put_nodes(nodes)
        self.ctx.query_error = ValueError("bad programmer")

        # digging into internals a bit: patching one of the methods query()
        # calls to raise an exception. This will let us ensure exceptions do
        # not bubble up
        def raiser(self):
            raise KeyError("notreallyaproblem")
        self.patch(self.core, 'query_nodes', raiser)

        yield self.core.query()  # ensure that exception doesn't bubble up
Example #21
0
    def test_query_ctx_permanent_broker_error(self):
        """A permanent broker error fails the launch and every node in it."""
        node_count = 3
        launch_id = _new_id()
        nodes = [make_node(launch_id, states.STARTED)
                 for _ in range(node_count)]
        node_ids = [node['node_id'] for node in nodes]
        launch = make_launch(launch_id, states.PENDING, nodes)
        yield self.store.put_launch(launch)
        yield self.store.put_nodes(nodes)

        self.ctx.query_error = ContextNotFoundError()
        yield self.core.query_contexts()

        self.assertTrue(
            self.notifier.assure_state(states.RUNNING_FAILED, node_ids))
        launch = yield self.store.get_launch(launch_id)
        self.assertEqual(states.FAILED, launch['state'])
Example #22
0
    def test_query_unexpected_exception(self):
        """Exceptions raised inside query() helpers must not propagate."""
        launch_id = _new_id()
        nodes = [make_node(launch_id, states.STARTED)]
        launch = make_launch(launch_id, states.PENDING, nodes)
        yield self.store.put_launch(launch)
        yield self.store.put_nodes(nodes)
        self.ctx.query_error = ValueError("bad programmer")

        # digging into internals a bit: patching one of the methods query()
        # calls to raise an exception. This will let us ensure exceptions do
        # not bubble up
        def _blow_up(self):
            raise KeyError("notreallyaproblem")

        self.patch(self.core, 'query_nodes', _blow_up)

        yield self.core.query()  # ensure that exception doesn't bubble up
Example #23
0
    def test_query_ctx_with_one_node_timeout(self):
        """A lone node past INSTANCE_READY_TIMEOUT becomes RUNNING_FAILED."""
        launch_id = _new_id()
        node = make_node(launch_id, states.STARTED)
        launch = make_launch(launch_id, states.PENDING, [node])

        # push the node's running timestamp past the ready timeout
        node['running_timestamp'] = time.time() - INSTANCE_READY_TIMEOUT - 10

        self.store.add_launch(launch)
        self.store.add_node(node)

        self.ctx.expected_count = 1
        self.ctx.complete = False
        self.ctx.error = False
        self.ctx.nodes = []

        self.core.query_contexts()
        self.assertTrue(self.notifier.assure_state(states.RUNNING_FAILED))
        self.assertTrue(self.notifier.assure_record_count(1))
Example #24
0
    def test_dump_state(self):
        """dump_state notifies exactly once for each requested node."""
        node_ids = []
        node_records = []
        for _ in range(3):
            launch_id = _new_id()
            nodes = [make_node(launch_id, states.PENDING)]
            node_ids.append(nodes[0]['node_id'])
            node_records.extend(nodes)
            launch = make_launch(launch_id, states.PENDING, nodes)
            yield self.store.put_launch(launch)
            yield self.store.put_nodes(nodes)

        yield self.core.dump_state(node_ids[:2])

        # should have gotten notifications about the 2 nodes
        first, second, third = node_ids
        self.assertEqual(1, self.notifier.nodes_rec_count[first])
        self.assertEqual(node_records[0], self.notifier.nodes[first])
        self.assertEqual(node_records[1], self.notifier.nodes[second])
        self.assertEqual(1, self.notifier.nodes_rec_count[second])
        self.assertNotIn(third, self.notifier.nodes)
Example #25
0
    def test_query_broker_exception(self):
        """One broker failure must not block queries for other contexts."""
        for _ in range(2):
            launch_id = _new_id()
            nodes = [make_node(launch_id, states.STARTED)]
            launch = make_launch(launch_id, states.PENDING, nodes)
            yield self.store.put_launch(launch)
            yield self.store.put_nodes(nodes)

        # no guaranteed order here so grab the first launch from the store
        # and make that one return a BrokerError during context query. The
        # goal is to ensure that one error doesn't prevent querying for
        # other contexts.
        launches = yield self.store.get_launches(state=states.PENDING)
        error_launch = launches[0]
        error_ctx_uri = error_launch['context']['uri']
        ok_node = yield self.store.get_node(launches[1]['node_ids'][0])

        self.ctx.uri_query_error[error_ctx_uri] = BrokerError("bad broker")
        self.ctx.nodes = [_one_fake_ctx_node_ok(ok_node['public_ip'],
                                                _new_id(), _new_id())]
        self.ctx.complete = True
        yield self.core.query_contexts()

        launches = yield self.store.get_launches()
        for launch in launches:
            self.assertIn(launch['context']['uri'], self.ctx.queried_uris)

            if launch['launch_id'] == error_launch['launch_id']:
                self.assertEqual(launch['state'], states.PENDING)
                expected = states.STARTED
            else:
                self.assertEqual(launch['state'], states.RUNNING)
                expected = states.RUNNING

            node = yield self.store.get_node(launch['node_ids'][0])
            self.assertEqual(node['state'], expected)
Example #26
0
    def test_query_broker_exception(self):
        """A broker error for one context leaves other contexts queryable."""
        caller = "asterix"
        for _ in range(2):
            launch_id = _new_id()
            nodes = [make_node(launch_id, states.STARTED)]
            launch = make_launch(launch_id, states.PENDING, nodes,
                                 caller=caller)
            self.store.add_launch(launch)
            for node in nodes:
                self.store.add_node(node)

        # no guaranteed order here so grab the first launch from the store
        # and make that one return a BrokerError during context query. The
        # goal is to ensure one error doesn't prevent querying the others.
        launches = self.store.get_launches(state=states.PENDING)
        error_launch = launches[0]
        error_ctx_uri = error_launch['context']['uri']
        ok_node = self.store.get_node(launches[1]['node_ids'][0])

        self.ctx.uri_query_error[error_ctx_uri] = BrokerError("bad broker")
        self.ctx.nodes = [
            _one_fake_ctx_node_ok(ok_node['public_ip'], _new_id(), _new_id())
        ]
        self.ctx.complete = True
        self.core.query_contexts()

        for launch in self.store.get_launches():
            self.assertIn(launch['context']['uri'], self.ctx.queried_uris)

            if launch['launch_id'] == error_launch['launch_id']:
                self.assertEqual(launch['state'], states.PENDING)
                expected = states.STARTED
            else:
                self.assertEqual(launch['state'], states.RUNNING)
                expected = states.RUNNING

            node = self.store.get_node(launch['node_ids'][0])
            self.assertEqual(node['state'], expected)
Example #27
0
    def test_dump_state(self):
        """Only the requested subset of nodes is dumped to the notifier."""
        node_ids = []
        node_records = []
        for _ in range(3):
            launch_id = _new_id()
            nodes = [make_node(launch_id, states.PENDING)]
            node_ids.append(nodes[0]['node_id'])
            node_records.extend(nodes)
            yield self.store.put_launch(
                make_launch(launch_id, states.PENDING, nodes))
            yield self.store.put_nodes(nodes)

        yield self.core.dump_state(node_ids[:2])

        # should have gotten notifications about the 2 nodes
        for index in (0, 1):
            node_id = node_ids[index]
            self.assertEqual(self.notifier.nodes_rec_count[node_id], 1)
            self.assertEqual(node_records[index],
                             self.notifier.nodes[node_id])
        self.assertNotIn(node_ids[2], self.notifier.nodes)
Example #28
0
    def test_mark_nodes_terminating(self):
        """mark_nodes_terminating touches only the requested nodes."""
        launch_id = _new_id()
        nodes = [make_node(launch_id, states.RUNNING) for _ in range(3)]
        launch = make_launch(launch_id, states.PENDING, nodes)

        yield self.store.put_launch(launch)
        yield self.store.put_nodes(nodes)

        target_ids = [node['node_id'] for node in nodes[:2]]
        yield self.core.mark_nodes_terminating(target_ids)

        self.assertTrue(self.notifier.assure_state(states.TERMINATING,
                                                   nodes=target_ids))
        self.assertNotIn(nodes[2]['node_id'], self.notifier.nodes)

        for node_id in target_ids:
            stored = yield self.store.get_node(node_id)
            self.assertEqual(states.TERMINATING, stored['state'])
Example #29
0
    def test_mark_nodes_terminating(self):
        """Marking two of three nodes TERMINATING leaves the third alone."""
        launch_id = _new_id()
        records = [make_node(launch_id, states.RUNNING) for _ in range(3)]
        yield self.store.put_launch(
            make_launch(launch_id, states.PENDING, records))
        yield self.store.put_nodes(records)

        doomed_ids = [records[0]['node_id'], records[1]['node_id']]
        yield self.core.mark_nodes_terminating(doomed_ids)

        self.assertTrue(
            self.notifier.assure_state(states.TERMINATING,
                                       nodes=doomed_ids))
        self.assertNotIn(records[2]['node_id'], self.notifier.nodes)

        for doomed_id in doomed_ids:
            node = yield self.store.get_node(doomed_id)
            self.assertEqual(node['state'], states.TERMINATING)
Example #30
0
    def test_describe(self):
        """describe_nodes() lists nodes, filters by id, strips VERSION_KEY,
        and raises KeyError for an unknown node id.
        """
        caller = "asterix"
        node_ids = []
        for _ in range(3):
            launch_id = _new_id()
            node_records = [make_node(launch_id, states.RUNNING)]
            node_ids.append(node_records[0]['node_id'])
            launch_record = make_launch(
                launch_id, states.PENDING,
                node_records, caller=caller)
            self.store.add_launch(launch_record)
            for node in node_records:
                self.store.add_node(node)

        # no argument: every node comes back, without internal version keys
        all_nodes = self.core.describe_nodes()
        all_node_ids = [node['node_id'] for node in all_nodes]
        self.assertEqual(set(all_node_ids), set(node_ids))
        self.assertFalse(any(VERSION_KEY in node for node in all_nodes))

        # explicitly passing the full id list gives the same result set
        all_nodes = self.core.describe_nodes(node_ids)
        all_node_ids = [node['node_id'] for node in all_nodes]
        self.assertEqual(set(all_node_ids), set(node_ids))

        # a subset of ids yields exactly that subset
        subset_nodes = self.core.describe_nodes(node_ids[1:])
        subset_node_ids = [node['node_id'] for node in subset_nodes]
        self.assertEqual(set(subset_node_ids), set(node_ids[1:]))

        # a single id yields a one-element list with the expected record
        one_node = self.core.describe_nodes([node_ids[0]])
        self.assertEqual(len(one_node), 1)
        self.assertEqual(one_node[0]['node_id'], node_ids[0])
        self.assertEqual(one_node[0]['state'], states.RUNNING)
        self.assertNotIn(VERSION_KEY, one_node[0])

        # an unknown node id raises KeyError; assertRaises replaces the
        # manual try/except/else/self.fail() dance of the original
        self.assertRaises(KeyError, self.core.describe_nodes,
                          [node_ids[0], "not-a-real-node"])
Example #31
0
    def test_query_ctx_with_no_timeout(self):
        """A node long past the ready deadline produces no record updates."""
        caller = "asterix"
        launch_id = _new_id()
        node = make_node(launch_id, states.STARTED)
        launch = make_launch(launch_id, states.PENDING, [node],
                             caller=caller)

        # back-date the node well beyond the instance-ready timeout window
        node['running_timestamp'] = time.time() - INSTANCE_READY_TIMEOUT - 10

        self.store.add_launch(launch)
        self.store.add_node(node)

        # broker reports an incomplete, error-free context...
        self.ctx.expected_count = 1
        self.ctx.complete = False
        self.ctx.error = False
        # ...whose single node is present but not yet done
        self.ctx.nodes = [_one_fake_ctx_node_not_done(
            node['public_ip'], _new_id(), _new_id())]

        self.core.query_contexts()

        # nothing should have been updated or notified
        self.assertTrue(self.notifier.assure_record_count(0))
Example #32
0
    def test_query_ctx(self):
        """Context queries move nodes to RUNNING as the broker reports them."""
        node_count = 3
        launch_id = _new_id()
        node_records = []
        for _ in range(node_count):
            node_records.append(make_node(launch_id, states.STARTED))
        launch_record = make_launch(launch_id, states.PENDING, node_records)

        yield self.store.put_launch(launch_record)
        yield self.store.put_nodes(node_records)

        self.ctx.expected_count = node_count
        self.ctx.complete = False
        self.ctx.error = False

        # no ctx nodes reported yet: zero records should be updated
        yield self.core.query_contexts()
        self.assertTrue(self.notifier.assure_record_count(0))

        # all but the last node report OK
        self.ctx.nodes = []
        for record in node_records[:-1]:
            self.ctx.nodes.append(_one_fake_ctx_node_ok(
                record['public_ip'], _new_id(), _new_id()))

        yield self.core.query_contexts()
        self.assertTrue(self.notifier.assure_state(states.RUNNING))
        self.assertEqual(len(self.notifier.nodes), node_count - 1)

        # the final node reports OK and the context completes
        last = node_records[-1]
        self.ctx.nodes.append(
            _one_fake_ctx_node_ok(last['public_ip'], _new_id(), _new_id()))
        self.ctx.complete = True

        yield self.core.query_contexts()
        self.assertTrue(self.notifier.assure_state(states.RUNNING))
        # only the one remaining record was touched on this pass
        self.assertTrue(self.notifier.assure_record_count(1))
Example #33
0
    def test_query_ctx_without_valid_nodes(self):
        """A launch with no nodes below TERMINATING skips the broker, FAILs."""
        # create three single-node launches, all PENDING with STARTED nodes
        for _ in range(3):
            launch_id = _new_id()
            node_records = [make_node(launch_id, states.STARTED)]
            launch_record = make_launch(launch_id, states.PENDING,
                                        node_records)
            yield self.store.put_launch(launch_record)
            yield self.store.put_nodes(node_records)

        pending_launches = yield self.store.get_launches(state=states.PENDING)
        error_launch = pending_launches[0]

        # push the first launch's only node to TERMINATING: its context
        # must not be queried and the launch should be marked FAILED
        error_node = yield self.store.get_node(error_launch['node_ids'][0])
        error_node['state'] = states.TERMINATING
        yield self.store.put_node(error_node)

        yield self.core.query_contexts()
        self.assertNotIn(error_launch['context']['uri'], self.ctx.queried_uris)

        launches = yield self.store.get_launches()
        for launch in launches:
            if launch['launch_id'] == error_launch['launch_id']:
                # the degraded launch failed; its node stays TERMINATING
                self.assertEqual(launch['state'], states.FAILED)
                expected_node_state = states.TERMINATING
            else:
                # the untouched launches are unaffected
                self.assertEqual(launch['state'], states.PENDING)
                expected_node_state = states.STARTED

            stored_node = yield self.store.get_node(launch['node_ids'][0])
            self.assertEqual(stored_node['state'], expected_node_state)
Example #34
0
    def test_recover_launch_incomplete(self):
        """Recovery completes a launch left in the REQUESTED state."""
        launch_id = _new_id()
        # cluster document: 3 'node' workspaces plus 1 'running' workspace
        doc = ("<cluster><workspace><name>node</name><image>fake</image>"
               "<quantity>3</quantity>"
               "</workspace><workspace><name>running</name><image>fake"
               "</image><quantity>1</quantity></workspace></cluster>")
        context = dict(broker_uri=_new_id(), context_id=_new_id(),
                       secret=_new_id(), uri=_new_id())

        requested_node_ids = [_new_id(), _new_id()]

        node_records = [
            # the 'running' workspace member is already up
            make_node(launch_id, states.RUNNING, site='fake',
                      ctx_name='running'),
            # two 'node' members are stuck in REQUESTED
            make_node(launch_id, states.REQUESTED, site='fake',
                      node_id=requested_node_ids[0], ctx_name='node'),
            make_node(launch_id, states.REQUESTED, site='fake',
                      node_id=requested_node_ids[1], ctx_name='node'),
            # one 'node' member is already running
            make_node(launch_id, states.RUNNING, ctx_name='node'),
        ]
        launch_record = make_launch(launch_id, states.REQUESTED,
                                    node_records, document=doc,
                                    context=context)

        yield self.store.put_launch(launch_record)
        yield self.store.put_nodes(node_records)

        # 2 nodes are in REQUESTED state, so those should be launched
        yield self.core.recover()

        # because we rely on IaaS idempotency, we get full Node responses
        # for all nodes in the group. What really would cause this scenario
        # is successfully launching the full group but failing before records
        # could be written for the two REQUESTED nodes.
        created = self.driver.created
        self.assertEqual(3, len(created))
        iaas_ids = set(node.id for node in created)
        self.assertEqual(3, len(iaas_ids))

        # both formerly-REQUESTED nodes moved to PENDING with real IaaS ids
        for node_id in requested_node_ids:
            node = yield self.store.get_node(node_id)
            self.assertEqual(states.PENDING, node['state'])
            self.assertIn(node['iaas_id'], iaas_ids)

        # the launch itself advanced from REQUESTED to PENDING
        launch = yield self.store.get_launch(launch_id)
        self.assertEqual(states.PENDING, launch['state'])