Example #1
0
    def setUp(self):
        """Start the CEI deployment and spawn the HA agent under test.

        Registers a process definition with the process dispatcher, builds an
        'npreserving' policy configuration with ``preserve_n`` of 0, records
        the services and processes present before the agent is spawned, then
        spawns the agent and wraps it in ``self.haa_client``.
        """
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')
        self.pd_cli = ProcessDispatcherServiceClient(to_name="process_dispatcher")

        # The definition id is generated here and passed to the dispatcher.
        self.process_definition_id = uuid4().hex
        self.process_definition_name = 'test'
        executable = {
            'module': 'ion.agents.cei.test.test_haagent',
            'class': 'TestProcess',
        }
        self.process_definition = ProcessDefinition(
            name=self.process_definition_name, executable=executable)
        self.pd_cli.create_process_definition(
            self.process_definition, self.process_definition_id)

        self.resource_id = "haagent_1234"
        self._haa_name = "high_availability_agent"
        self._haa_dashi_name = "dashi_haa_" + uuid4().hex
        self._haa_dashi_uri = get_dashi_uri_from_cfg()
        self._haa_dashi_exchange = "%s.hatests" % bootstrap.get_sys_name()

        policy = {
            'interval': 1,
            'name': 'npreserving',
            'parameters': {'preserve_n': 0},
        }
        ha_section = {
            'policy': policy,
            'process_definition_id': self.process_definition_id,
            'dashi_messaging': True,
            'dashi_exchange': self._haa_dashi_exchange,
            'dashi_name': self._haa_dashi_name,
        }
        self._haa_config = {
            'highavailability': ha_section,
            'agent': {'resource_id': self.resource_id},
        }

        # Snapshot of the services and processes that exist before the agent
        # is spawned.
        registry = self.container.resource_registry
        self._base_services, _ = registry.find_resources(
            restype="Service", name=self.process_definition_name)
        self._base_procs = self.pd_cli.list_processes()

        self.waiter = ProcessStateWaiter()
        self.waiter.start()

        self.container_client = ContainerAgentClient(
            node=self.container.node, name=self.container.name)
        self._haa_pid = self.container_client.spawn_process(
            name=self._haa_name,
            module="ion.agents.cei.high_availability_agent",
            cls="HighAvailabilityAgent",
            config=self._haa_config)

        # Resource agent client used to talk to the HA agent spawned above.
        self._haa_pyon_client = SimpleResourceAgentClient(
            self.resource_id, process=FakeProcess())
        log.info('Got haa client %s.', str(self._haa_pyon_client))

        self.haa_client = HighAvailabilityAgentClient(self._haa_pyon_client)
    def setUp(self):
        """Start the CEI deployment and register the test process definition.

        Also creates resource registry and process dispatcher clients. The
        ``ProcessStateWaiter`` stored on ``self.waiter`` is created here but
        not started.
        """
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')

        self.rr_cli = ResourceRegistryServiceClient()
        self.pd_cli = ProcessDispatcherServiceClient(node=self.container.node)

        definition = ProcessDefinition(name='test_process')
        definition.executable = {
            'module': 'ion.services.cei.test.test_process_dispatcher',
            'class': 'TestProcess',
        }
        self.process_definition = definition
        self.process_definition_id = self.pd_cli.create_process_definition(definition)

        self.waiter = ProcessStateWaiter()
    def setUp(self):
        """Start only the process dispatcher app plus one eeagent.

        The dispatcher is started with ``pd_config``; a dashi connection is
        opened using that config's URI and exchange, a ``node_state`` message
        for a node on "engine1" is sent, and an eeagent is started for the
        same node. The waiter is created but not started.
        """
        self.dashi = None
        self._start_container()
        self.container_client = ContainerAgentClient(
            node=self.container.node, name=self.container.name)
        # From here on self.container is whatever the agent client hands back.
        self.container = self.container_client._get_container_instance()

        pd_app = {
            'name': "process_dispatcher",
            'processapp': (
                "process_dispatcher",
                "ion.services.cei.process_dispatcher_service",
                "ProcessDispatcherService",
            ),
        }
        self.container.start_app(pd_app, config=pd_config)

        self.rr_cli = self.container.resource_registry

        self.pd_cli = ProcessDispatcherServiceClient(node=self.container.node)

        definition = ProcessDefinition(name='test_process')
        definition.executable = {
            'module': 'ion.services.cei.test.test_process_dispatcher',
            'class': 'TestProcess',
        }
        self.process_definition = definition
        self.process_definition_id = self.pd_cli.create_process_definition(definition)

        self._eea_pids = []
        self._tmpdirs = []

        dashi_name = uuid.uuid4().hex
        pd_section = pd_config['processdispatcher']
        self.dashi = get_dashi(
            dashi_name, pd_section['dashi_uri'], pd_section['dashi_exchange'])

        # Send a fake node_state message to PD's dashi binding.
        self.node1_id = uuid.uuid4().hex
        self._send_node_state("engine1", self.node1_id)
        self._start_eeagent(self.node1_id)

        self.waiter = ProcessStateWaiter()
Example #4
0
    def setUp(self):
        """Start the deployment, register definitions and spawn the HA agent.

        Besides the process definition, a ServiceDefinition resource with the
        same name (made unique by embedding the generated definition id) is
        created in the resource registry. The agent is spawned through
        ``_spawn_haagent`` and ``_stop_haagent`` is registered as a cleanup.
        """
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')
        self.pd_cli = ProcessDispatcherServiceClient(to_name="process_dispatcher")

        definition_id = uuid4().hex
        definition_name = 'test_haagent_%s' % definition_id
        self.process_definition_id = definition_id
        self.process_definition_name = definition_name
        executable = {
            'module': 'ion.agents.cei.test.test_haagent',
            'class': 'TestProcess',
        }
        self.process_definition = ProcessDefinition(
            name=definition_name, executable=executable)
        self.pd_cli.create_process_definition(
            self.process_definition, definition_id)

        registry = self.container.resource_registry
        service_definition = SERVICE_DEFINITION_TMPL % definition_name
        sd_fields = {"name": definition_name, "definition": service_definition}
        self.service_def_id, _ = registry.create(
            IonObject(RT.ServiceDefinition, sd_fields))

        self.resource_id = "haagent_1234"
        self._haa_name = "high_availability_agent"
        self._haa_dashi_name = "dashi_haa_" + uuid4().hex
        self._haa_dashi_uri = get_dashi_uri_from_cfg()
        self._haa_dashi_exchange = "hatests"
        self._haa_config = self._get_haagent_config()

        # Snapshot of the services and processes that exist before the agent
        # is spawned.
        self._base_services, _ = registry.find_resources(
            restype="Service", name=definition_name)
        self._base_procs = self.pd_cli.list_processes()

        self.waiter = ProcessStateWaiter()
        self.waiter.start()

        self.container_client = ContainerAgentClient(
            node=self.container.node, name=self.container.name)
        self._spawn_haagent()
        self.addCleanup(self._stop_haagent)

        self._setup_haa_client()
    def setUp(self):
        """Start only the process dispatcher app plus one eeagent.

        The dispatcher is started with ``pd_config``; the dashi connection is
        opened with the sysname read from ``CFG`` ("dashi.sysname"). A
        ``node_state`` message for a node on "engine1" is sent and an eeagent
        is started for it, with the value returned by ``_start_eeagent`` kept
        as ``self._initial_eea_pid``. The waiter is created but not started.
        """
        self.dashi = None
        self._start_container()
        from pyon.public import CFG

        self.container_client = ContainerAgentClient(
            node=self.container.node, name=self.container.name)
        # From here on self.container is whatever the agent client hands back.
        self.container = self.container_client._get_container_instance()

        pd_app = {
            'name': "process_dispatcher",
            'processapp': (
                "process_dispatcher",
                "ion.services.cei.process_dispatcher_service",
                "ProcessDispatcherService",
            ),
        }
        self.container.start_app(pd_app, config=pd_config)

        self.rr_cli = self.container.resource_registry

        self.pd_cli = ProcessDispatcherServiceClient(node=self.container.node)

        definition = ProcessDefinition(name='test_process')
        definition.executable = {
            'module': 'ion.services.cei.test.test_process_dispatcher',
            'class': 'TestProcess',
        }
        self.process_definition = definition
        self.process_definition_id = self.pd_cli.create_process_definition(definition)

        # Bookkeeping for the eeagents started by this test.
        self._eea_pids = []
        self._eea_pid_to_resource_id = {}
        self._eea_pid_to_persistence_dir = {}
        self._tmpdirs = []

        dashi_name = uuid.uuid4().hex
        pd_section = pd_config['processdispatcher']
        self.dashi = get_dashi(
            dashi_name,
            pd_section['dashi_uri'],
            pd_section['dashi_exchange'],
            sysname=CFG.get_safe("dashi.sysname"))

        # Send a fake node_state message to PD's dashi binding.
        self.node1_id = uuid.uuid4().hex
        self._send_node_state("engine1", self.node1_id)
        self._initial_eea_pid = self._start_eeagent(self.node1_id)

        self.waiter = ProcessStateWaiter()
Example #6
0
    def setUp(self):
        """Bring up the CEI deployment and launch an HA agent for the test.

        A process definition is registered with the process dispatcher, an
        'npreserving' policy configuration (``preserve_n`` of 0) is assembled,
        the pre-existing services and processes are recorded, and the agent is
        spawned and wrapped in ``self.haa_client``.
        """
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')
        self.pd_cli = ProcessDispatcherServiceClient(to_name="process_dispatcher")

        # The definition id is generated here and passed to the dispatcher.
        self.process_definition_id = uuid4().hex
        self.process_definition_name = 'test'
        test_executable = {
            'module': 'ion.agents.cei.test.test_haagent',
            'class': 'TestProcess',
        }
        self.process_definition = ProcessDefinition(
            name=self.process_definition_name, executable=test_executable)
        self.pd_cli.create_process_definition(
            self.process_definition, self.process_definition_id)

        self.resource_id = "haagent_1234"
        self._haa_name = "high_availability_agent"
        self._haa_dashi_name = "dashi_haa_" + uuid4().hex
        self._haa_dashi_uri = get_dashi_uri_from_cfg()
        self._haa_dashi_exchange = "%s.hatests" % bootstrap.get_sys_name()

        npreserving_policy = {
            'interval': 1,
            'name': 'npreserving',
            'parameters': {'preserve_n': 0},
        }
        self._haa_config = {
            'highavailability': {
                'policy': npreserving_policy,
                'process_definition_id': self.process_definition_id,
                'dashi_messaging': True,
                'dashi_exchange': self._haa_dashi_exchange,
                'dashi_name': self._haa_dashi_name,
            },
            'agent': {'resource_id': self.resource_id},
        }

        # Baseline taken before the agent is spawned.
        rr = self.container.resource_registry
        self._base_services, _ = rr.find_resources(
            restype="Service", name=self.process_definition_name)
        self._base_procs = self.pd_cli.list_processes()

        self.waiter = ProcessStateWaiter()
        self.waiter.start()

        self.container_client = ContainerAgentClient(
            node=self.container.node, name=self.container.name)
        spawn_kwargs = {
            'name': self._haa_name,
            'module': "ion.agents.cei.high_availability_agent",
            'cls': "HighAvailabilityAgent",
            'config': self._haa_config,
        }
        self._haa_pid = self.container_client.spawn_process(**spawn_kwargs)

        # Resource agent client used to talk to the HA agent spawned above.
        self._haa_pyon_client = SimpleResourceAgentClient(
            self.resource_id, process=FakeProcess())
        log.info('Got haa client %s.', str(self._haa_pyon_client))

        self.haa_client = HighAvailabilityAgentClient(self._haa_pyon_client)
    def setUp(self):
        """Start the CEI deployment and register the test process definition.

        The ``ProcessStateWaiter`` stored on ``self.waiter`` is created here
        but not started.
        """
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')

        self.pd_cli = ProcessDispatcherServiceClient(node=self.container.node)

        definition = ProcessDefinition(name='test_process')
        definition.executable = {
            'module': 'ion.services.cei.test.test_process_dispatcher',
            'class': 'TestProcess',
        }
        self.process_definition = definition
        self.process_definition_id = self.pd_cli.create_process_definition(definition)

        self.waiter = ProcessStateWaiter()
Example #8
0
    def setUp(self):
        """Start the CEI deployment and spawn the HA agent under test.

        Registers a process definition with the process dispatcher, builds an
        "npreserving" policy configuration with ``preserve_n`` of 0, records
        the services and processes present before the agent is spawned, then
        spawns the agent and wraps it in ``self.haa_client``.
        """
        self._start_container()
        self.container.start_rel_from_url("res/deploy/r2cei.yml")
        self.pd_cli = ProcessDispatcherServiceClient(to_name="process_dispatcher")

        # The definition id is generated here and passed to the dispatcher.
        self.process_definition_id = uuid4().hex
        self.process_definition_name = "test"
        executable = {
            "module": "ion.agents.cei.test.test_haagent",
            "class": "TestProcess",
        }
        self.process_definition = ProcessDefinition(
            name=self.process_definition_name, executable=executable
        )
        self.pd_cli.create_process_definition(self.process_definition, self.process_definition_id)

        self.resource_id = "haagent_1234"
        self._haa_name = "high_availability_agent"
        self._haa_dashi_name = "dashi_haa_" + uuid4().hex
        self._haa_dashi_uri = get_dashi_uri_from_cfg()
        self._haa_dashi_exchange = "%s.hatests" % bootstrap.get_sys_name()

        policy = {
            "interval": 1,
            "name": "npreserving",
            "parameters": {"preserve_n": 0},
        }
        highavailability = {
            "policy": policy,
            "process_definition_id": self.process_definition_id,
            "dashi_messaging": True,
            "dashi_exchange": self._haa_dashi_exchange,
            "dashi_name": self._haa_dashi_name,
        }
        self._haa_config = {
            "highavailability": highavailability,
            "agent": {"resource_id": self.resource_id},
        }

        # Baseline taken before the agent is spawned.
        registry = self.container.resource_registry
        self._base_services, _ = registry.find_resources(
            restype="Service", name=self.process_definition_name
        )
        self._base_procs = self.pd_cli.list_processes()

        self.waiter = ProcessStateWaiter()
        self.waiter.start()

        container = self.container
        self.container_client = ContainerAgentClient(node=container.node, name=container.name)
        self._haa_pid = self.container_client.spawn_process(
            name=self._haa_name,
            module="ion.agents.cei.high_availability_agent",
            cls="HighAvailabilityAgent",
            config=self._haa_config,
        )

        # Resource agent client used to talk to the HA agent spawned above.
        self._haa_pyon_client = SimpleResourceAgentClient(self.resource_id, process=FakeProcess())
        log.info("Got haa client %s.", str(self._haa_pyon_client))

        self.haa_client = HighAvailabilityAgentClient(self._haa_pyon_client)
    def setUp(self):
        """Start the deployment, register definitions and spawn the HA agent.

        A process definition and a ServiceDefinition resource sharing one
        name (which embeds the generated definition id) are created. The
        agent is spawned through ``_spawn_haagent`` and ``_stop_haagent`` is
        registered as a cleanup.
        """
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')
        self.pd_cli = ProcessDispatcherServiceClient(to_name="process_dispatcher")

        self.process_definition_id = uuid4().hex
        self.process_definition_name = 'test_haagent_%s' % self.process_definition_id
        name = self.process_definition_name
        self.process_definition = ProcessDefinition(
            name=name,
            executable={
                'module': 'ion.agents.cei.test.test_haagent',
                'class': 'TestProcess',
            })
        self.pd_cli.create_process_definition(
            self.process_definition, self.process_definition_id)

        rr = self.container.resource_registry
        service_definition = SERVICE_DEFINITION_TMPL % name
        sd = IonObject(
            RT.ServiceDefinition,
            {"name": name, "definition": service_definition})
        self.service_def_id, _ = rr.create(sd)

        self.resource_id = "haagent_1234"
        self._haa_name = "high_availability_agent"
        self._haa_dashi_name = "dashi_haa_" + uuid4().hex
        self._haa_dashi_uri = get_dashi_uri_from_cfg()
        self._haa_dashi_exchange = "hatests"
        self._haa_config = self._get_haagent_config()

        # Baseline taken before the agent is spawned.
        self._base_services, _ = rr.find_resources(restype="Service", name=name)
        self._base_procs = self.pd_cli.list_processes()

        self.waiter = ProcessStateWaiter()
        self.waiter.start()

        self.container_client = ContainerAgentClient(
            node=self.container.node, name=self.container.name)
        self._spawn_haagent()
        self.addCleanup(self._stop_haagent)

        self._setup_haa_client()
Example #10
0
class HighAvailabilityAgentSensorPolicyTest(IonIntegrationTestCase):
    """Integration test of the HA agent running the 'sensor' policy.

    A throwaway local HTTP server is configured as the agent's
    'trafficsentinel' endpoint; the test changes the text that server
    returns to move the ``ml`` metric across the policy's thresholds.
    """

    def _start_webserver(self, port=None):
        """ Start a webserver for testing code download
        Note: tries really hard to get a port, and if it can't use
        the suggested port, randomly picks another, and returns it

        port -- preferred port; 8008 is used when None.

        Returns the port the server was actually bound to. Raises an
        Exception if no port could be bound after 100 attempts.
        """

        class TestRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
            server_version = 'test_server'
            extensions_map = ''

            def do_GET(self):
                # Every GET is answered with the server's current response text.
                self.send_response(200)
                self.send_header("Content-type", "text/plain")
                self.send_header("Content-Length", len(self.server.response))
                self.end_headers()
                self.wfile.write(self.server.response)

            def log_message(self, format, *args):
                # swallow log messages
                pass

        class Server(HTTPServer):

            response = ''

            def serve_forever(self):
                self._serving = 1
                while self._serving:
                    self.handle_request()

            def stop(self):
                self._serving = 0

        if port is None:
            port = 8008

        # Make sure the attribute exists even when every bind attempt fails,
        # so _stop_webserver() can always be called safely.
        self._webserver = None
        for _ in range(100):
            try:
                self._webserver = Server(("localhost", port), TestRequestHandler)
            except socket.error:
                print("port %s is in use, picking another" % port)
                port = randint(8000, 10000)
            else:
                break

        if self._webserver is None:
            raise Exception("Could not bind a port for the test webserver")

        self._web_glet = gevent.spawn(self._webserver.serve_forever)
        return port

    def _stop_webserver(self):
        """Stop the test webserver, if one was started, and free its socket."""
        webserver = getattr(self, '_webserver', None)
        if webserver is None:
            return

        webserver.stop()
        # stop() only clears the serving flag; wait briefly, then kill the
        # serving greenlet.
        gevent.sleep(2)
        self._web_glet.kill()
        # Close the listening socket so the port is not left bound.
        webserver.server_close()
        self._webserver = None

    def await_ha_state(self, want_state, timeout=20):
        """Poll the HA agent until it reports ``want_state``.

        want_state -- status value to wait for.
        timeout -- number of polls; one second is slept after each
            unsuccessful poll.

        Raises an Exception if the state is not reached within the polls.
        """
        for _ in range(timeout):
            try:
                status = self.haa_client.status().result
                if status == want_state:
                    return
                num_procs = len(self.get_running_procs())
                log.debug("assert wants state %s, got state %s, with %s procs",
                          want_state, status, num_procs)
            except Exception:
                log.exception("Problem getting HA status, trying again...")

            # Sleep after every unsuccessful poll, not only after errors;
            # otherwise a plain state mismatch uses up all polls at once.
            gevent.sleep(1)

        raise Exception("Took more than %s to get to ha state %s" % (timeout, want_state))

    @needs_epu
    def setUp(self):
        """Start the deployment, the test webserver and a sensor-policy agent."""
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')
        self.pd_cli = ProcessDispatcherServiceClient(to_name="process_dispatcher")

        self.process_definition_id = uuid4().hex
        self.process_definition = ProcessDefinition(name='test', executable={
                'module': 'ion.agents.cei.test.test_haagent',
                'class': 'TestProcess'
        })

        self.pd_cli.create_process_definition(self.process_definition,
                self.process_definition_id)

        # 8919 is only a preference; use whatever port was really bound.
        http_port = self._start_webserver(port=8919)

        self.resource_id = "haagent_4567"
        self._haa_name = "high_availability_agent"
        self._haa_config = {
            'server': {
                'trafficsentinel': {
                    'host': 'localhost',
                    'port': http_port,
                    'protocol': 'http',
                    'username': '******',
                    'password': '******'
                }
            },
            'highavailability': {
                'policy': {
                    'interval': 1,
                    'name': 'sensor',
                    'parameters': {
                        'metric': 'app_attributes:ml',
                        'sample_period': 600,
                        'sample_function': 'Average',
                        'cooldown_period': 5,
                        'scale_up_threshold': 2.0,
                        'scale_up_n_processes': 1,
                        'scale_down_threshold': 1.0,
                        'scale_down_n_processes': 1,
                        'maximum_processes': 5,
                        'minimum_processes': 1,
                    }
                },
                'process_definition_id': self.process_definition_id,
                "process_dispatchers": [
                    'process_dispatcher'
                ]
            },
            'agent': {'resource_id': self.resource_id},
        }

        # Baseline taken before the agent is spawned.
        self._base_procs = self.pd_cli.list_processes()

        self.waiter = ProcessStateWaiter()
        self.waiter.start()

        self.container_client = ContainerAgentClient(node=self.container.node,
            name=self.container.name)
        self._haa_pid = self.container_client.spawn_process(name=self._haa_name,
            module="ion.agents.cei.high_availability_agent",
            cls="HighAvailabilityAgent", config=self._haa_config)

        # Resource agent client used to talk to the HA agent spawned above.
        self._haa_pyon_client = SimpleResourceAgentClient(self.resource_id, process=FakeProcess())
        log.info('Got haa client %s.', str(self._haa_pyon_client))

        self.haa_client = HighAvailabilityAgentClient(self._haa_pyon_client)

    def tearDown(self):
        """Scale the agent down to zero processes, then release everything.

        The waiter, agent process, webserver and container are released
        even when the scale-down check fails.
        """
        new_policy = {
            'metric': 'app_attributes:ml',
            'sample_period': 600,
            'sample_function': 'Average',
            'cooldown_period': 0,
            'scale_up_threshold': 2.0,
            'scale_up_n_processes': 1,
            'scale_down_threshold': 1.0,
            'scale_down_n_processes': 1,
            'maximum_processes': 0,
            'minimum_processes': 0,
        }
        try:
            self.haa_client.reconfigure_policy(new_policy)

            self.waiter.await_state_event(state=ProcessStateEnum.TERMINATED)
            self.assertEqual(len(self.get_running_procs()), 0)
        finally:
            self.waiter.stop()
            self.container.terminate_process(self._haa_pid)
            self._stop_webserver()
            self._stop_container()

    def get_running_procs(self):
        """returns a normalized set of running procs (removes the ones that
        were there at setup time)
        """
        base_pids = [proc.process_id for proc in self._base_procs]
        current = self.pd_cli.list_processes()
        current_pids = [proc.process_id for proc in current]
        print("filtering base procs %s from %s" % (base_pids, current_pids))

        return [
            cproc for cproc in current
            if cproc.process_id not in base_pids
            and cproc.process_state == ProcessStateEnum.RUNNING
        ]

    def _get_managed_upids(self):
        """Return the 'managed_upids' entry of the agent's dump() result."""
        result = self.haa_client.dump().result
        return result['managed_upids']

    def _set_response(self, response):
        """Set the text the test webserver returns for GET requests."""
        self._webserver.response = response

    def _set_ml_for_managed_procs(self, ml):
        """Serve one ``pid=<upid>&ml=<ml>`` line per managed upid.

        ml -- metric value, as the text to place after ``ml=``.
        """
        upids = self._get_managed_upids()
        self._set_response(
            "".join("pid=%s&ml=%s\n" % (upid, ml) for upid in upids))

    def test_sensor_policy(self):
        """Drive the sensor policy through scale-up, steady and scale-down."""
        status = self.haa_client.status().result
        # Ensure HA hasn't already failed
        self.assertIn(status, ('PENDING', 'READY', 'STEADY'))

        self.waiter.await_state_event(state=ProcessStateEnum.RUNNING)

        self.assertEqual(len(self.get_running_procs()), 1)

        self.await_ha_state('STEADY')

        # Set ml for each proc such that we scale up
        self._set_ml_for_managed_procs("5")

        self.waiter.await_state_event(state=ProcessStateEnum.RUNNING)

        self.assertEqual(len(self.get_running_procs()), 2)

        # Set ml so we stay steady
        self._set_ml_for_managed_procs("1.5")

        self.assertEqual(len(self.get_running_procs()), 2)

        self.await_ha_state('STEADY')

        # Set ml so we scale down
        self._set_ml_for_managed_procs("0.5")

        self.waiter.await_state_event(state=ProcessStateEnum.TERMINATED)

        self.assertEqual(len(self.get_running_procs()), 1)

        self.await_ha_state('STEADY')
class ProcessDispatcherEEAgentIntTest(ProcessDispatcherServiceIntTest):
    """Run the basic int tests again, with a different environment

    Only the process dispatcher app is started (with ``pd_config``);
    execution engine agents are spawned by the test itself after a
    ``node_state`` message for their node has been sent over dashi.
    """

    def setUp(self):
        """Start the dispatcher app, open dashi, and bring up one eeagent."""
        self.dashi = None
        self._start_container()
        self.container_client = ContainerAgentClient(node=self.container.node,
            name=self.container.name)
        self.container = self.container_client._get_container_instance()

        app = dict(name="process_dispatcher", processapp=("process_dispatcher",
                               "ion.services.cei.process_dispatcher_service",
                               "ProcessDispatcherService"))
        self.container.start_app(app, config=pd_config)

        self.rr_cli = self.container.resource_registry

        self.pd_cli = ProcessDispatcherServiceClient(node=self.container.node)

        self.process_definition = ProcessDefinition(name='test_process')
        self.process_definition.executable = {'module': 'ion.services.cei.test.test_process_dispatcher',
                                              'class': 'TestProcess'}
        self.process_definition_id = self.pd_cli.create_process_definition(self.process_definition)

        # Filled in by _start_eeagent(); emptied out again in tearDown().
        self._eea_pids = []
        self._tmpdirs = []

        self.dashi = get_dashi(uuid.uuid4().hex,
            pd_config['processdispatcher']['dashi_uri'],
            pd_config['processdispatcher']['dashi_exchange'])

        #send a fake node_state message to PD's dashi binding.
        self.node1_id = uuid.uuid4().hex
        self._send_node_state("engine1", self.node1_id)
        self._start_eeagent(self.node1_id)

        self.waiter = ProcessStateWaiter()

    def _send_node_state(self, engine_id, node_id=None):
        """Fire a RUNNING ``node_state`` message at the PD's dashi name.

        engine_id -- engine whose domain id is put in the message.
        node_id -- node to announce; a random hex id is used when omitted.
        """
        node_id = node_id or uuid.uuid4().hex
        node_state = dict(node_id=node_id, state=InstanceState.RUNNING,
            domain_id=domain_id_from_engine(engine_id))
        self.dashi.fire(get_pd_dashi_name(), "node_state", args=node_state)

    def _start_eeagent(self, node_id):
        """Spawn an eeagent for ``node_id`` with its own temp persistence dir.

        The directory and the pid are recorded for tearDown().

        Returns the pid of the spawned agent process.
        """
        persistence_dir = tempfile.mkdtemp()
        self._tmpdirs.append(persistence_dir)
        agent_config = _get_eeagent_config(node_id, persistence_dir)
        pid = self.container_client.spawn_process(name="eeagent",
            module="ion.agents.cei.execution_engine_agent",
            cls="ExecutionEngineAgent", config=agent_config)
        log.info('Agent pid=%s.', str(pid))
        self._eea_pids.append(pid)
        return pid

    def tearDown(self):
        """Terminate eeagents, remove temp dirs, stop waiter and dashi.

        Later steps still run when an earlier one raises, so a failed
        terminate or rmtree does not leak the waiter or dashi connection.
        """
        try:
            for pid in self._eea_pids:
                self.container.terminate_process(pid)
        finally:
            try:
                for d in self._tmpdirs:
                    shutil.rmtree(d)
            finally:
                self.waiter.stop()
                if self.dashi:
                    self.dashi.cancel()

    def test_requested_ee(self):
        """Schedule processes against specific execution engines."""

        # request non-default engine

        process_target = ProcessTarget(execution_engine_id="engine2")
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.ALWAYS
        process_schedule.target = process_target

        pid = self.pd_cli.create_process(self.process_definition_id)
        self.waiter.start()

        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid)

        self.waiter.await_state_event(pid, ProcessStateEnum.WAITING)

        # request unknown engine, with NEVER queuing mode. The request
        # should be rejected.
        # verifies L4-CI-CEI-RQ52

        process_target = ProcessTarget(execution_engine_id="not-a-real-ee")
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.NEVER
        process_schedule.target = process_target

        rejected_pid = self.pd_cli.create_process(self.process_definition_id)

        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=rejected_pid)

        self.waiter.await_state_event(rejected_pid, ProcessStateEnum.REJECTED)

        # now add a node and eeagent for engine2. original process should leave
        # queue and start running
        node2_id = uuid.uuid4().hex
        self._send_node_state("engine2", node2_id)
        self._start_eeagent(node2_id)

        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

        # spawn another process. it should start immediately.

        process_target = ProcessTarget(execution_engine_id="engine2")
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.NEVER
        process_schedule.target = process_target

        pid2 = self.pd_cli.create_process(self.process_definition_id)

        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid2)

        self.waiter.await_state_event(pid2, ProcessStateEnum.RUNNING)

        # one more with node exclusive

        process_target = ProcessTarget(execution_engine_id="engine2",
            node_exclusive="hats")
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.NEVER
        process_schedule.target = process_target

        pid3 = self.pd_cli.create_process(self.process_definition_id)

        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid3)

        self.waiter.await_state_event(pid3, ProcessStateEnum.RUNNING)

        # kill the processes for good, one at a time
        for running_pid in (pid, pid2, pid3):
            self.pd_cli.cancel_process(running_pid)
            self.waiter.await_state_event(running_pid, ProcessStateEnum.TERMINATED)

    def test_node_exclusive(self):
        """Processes sharing a node_exclusive tag never share a node."""

        # the node_exclusive constraint is used to ensure multiple processes
        # of the same "kind" each get a VM exclusive of each other. Other
        # processes may run on these VMs, just not processes with the same
        # node_exclusive tag. Since we cannot directly query the contents
        # of each node in this test, we prove the capability by scheduling
        # processes one by one and checking their state.

        # verifies L4-CI-CEI-RQ121
        # verifies L4-CI-CEI-RQ57

        # first off, setUp() created a single node and eeagent.
        # We schedule two processes with the same "abc" node_exclusive
        # tag. Since there is only one node, the first process should run
        # and the second should be queued.

        process_target = ProcessTarget(execution_engine_id="engine1")
        process_target.node_exclusive = "abc"
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.ALWAYS
        process_schedule.target = process_target

        pid1 = self.pd_cli.create_process(self.process_definition_id)
        self.waiter.start()

        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid1)

        self.waiter.await_state_event(pid1, ProcessStateEnum.RUNNING)

        pid2 = self.pd_cli.create_process(self.process_definition_id)
        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid2)
        self.waiter.await_state_event(pid2, ProcessStateEnum.WAITING)

        # now demonstrate that the node itself is not full by launching
        # a third process without a node_exclusive tag -- it should start
        # immediately

        process_target.node_exclusive = None
        pid3 = self.pd_cli.create_process(self.process_definition_id)
        self.pd_cli.schedule_process(self.process_definition_id,
            process_schedule, process_id=pid3)
        self.waiter.await_state_event(pid3, ProcessStateEnum.RUNNING)

        # finally, add a second node to the engine. pid2 should be started
        # since there is an exclusive "abc" node free.
        node2_id = uuid.uuid4().hex
        self._send_node_state("engine1", node2_id)
        self._start_eeagent(node2_id)
        self.waiter.await_state_event(pid2, ProcessStateEnum.RUNNING)

        # kill the processes for good, one at a time
        for running_pid in (pid1, pid2, pid3):
            self.pd_cli.cancel_process(running_pid)
            self.waiter.await_state_event(running_pid, ProcessStateEnum.TERMINATED)

    def test_code_download(self):
        """A definition without a 'url' fails; one with a file URL runs."""
        # create a process definition that has no URL; only module and class.
        process_definition_no_url = ProcessDefinition(name='test_process_nodownload')
        process_definition_no_url.executable = {'module': 'ion.my.test.process',
                'class': 'TestProcess'}
        process_definition_id_no_url = self.pd_cli.create_process_definition(process_definition_no_url)

        # create another that has a URL of the python file (this very file)
        # verifies L4-CI-CEI-RQ114
        url = "file://%s" % os.path.join(os.path.dirname(__file__), 'test_process_dispatcher.py')
        process_definition = ProcessDefinition(name='test_process_download')
        process_definition.executable = {'module': 'ion.my.test.process',
                'class': 'TestProcess', 'url': url}
        process_definition_id = self.pd_cli.create_process_definition(process_definition)

        process_target = ProcessTarget()
        process_schedule = ProcessSchedule()
        process_schedule.queueing_mode = ProcessQueueingMode.ALWAYS
        process_schedule.target = process_target

        self.waiter.start()

        # Test a module with no download fails
        pid_no_url = self.pd_cli.create_process(process_definition_id_no_url)

        self.pd_cli.schedule_process(process_definition_id_no_url,
            process_schedule, process_id=pid_no_url)

        self.waiter.await_state_event(pid_no_url, ProcessStateEnum.FAILED)

        # Test a module with a URL runs
        pid = self.pd_cli.create_process(process_definition_id)

        self.pd_cli.schedule_process(process_definition_id,
            process_schedule, process_id=pid)

        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)
class ProcessDispatcherServiceIntTest(IonIntegrationTestCase):
    """Integration tests that drive the process dispatcher through its
    service client: creating, scheduling, reading and cancelling processes.
    """

    def setUp(self):
        """Start a container with the r2cei deployment, create the service
        clients and register the test process definition.
        """
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')

        self.rr_cli = ResourceRegistryServiceClient()
        self.pd_cli = ProcessDispatcherServiceClient(node=self.container.node)

        test_executable = {
            'module': 'ion.services.cei.test.test_process_dispatcher',
            'class': 'TestProcess',
        }
        self.process_definition = ProcessDefinition(name='test_process',
                                                    executable=test_executable)
        self.process_definition_id = self.pd_cli.create_process_definition(
            self.process_definition)

        # started by the individual tests, stopped in tearDown
        self.waiter = ProcessStateWaiter()

    def tearDown(self):
        """Stop the process state waiter created in setUp."""
        self.waiter.stop()

    def test_create_schedule_cancel(self):
        """Schedule a process, talk to it, cancel it, then do it all again
        with a freshly created process id.
        """
        schedule = ProcessSchedule()
        schedule.queueing_mode = ProcessQueueingMode.ALWAYS

        first_pid = self.pd_cli.create_process(self.process_definition_id)
        self.waiter.start(first_pid)

        scheduled_pid = self.pd_cli.schedule_process(self.process_definition_id,
            schedule, configuration={}, process_id=first_pid)
        self.assertEqual(first_pid, scheduled_pid)

        # verifies L4-CI-CEI-RQ141 and L4-CI-CEI-RQ142
        self.waiter.await_state_event(first_pid, ProcessStateEnum.RUNNING)

        pd_record = self.pd_cli.read_process(first_pid)
        self.assertEqual(pd_record.process_id, first_pid)
        self.assertEqual(pd_record.process_configuration, {})
        self.assertEqual(pd_record.process_state, ProcessStateEnum.RUNNING)

        # the process must also be readable directly from the resource
        # registry (mirrored)
        # verifies L4-CI-CEI-RQ63
        # verifies L4-CI-CEI-RQ64
        rr_record = self.rr_cli.read(first_pid)
        self.assertEqual(rr_record.process_id, first_pid)

        # talk to the process to make sure it is really running
        test_client = TestClient()
        for expected_count in range(1, 6):
            self.assertEqual(expected_count, test_client.count(timeout=10))

        # verifies L4-CI-CEI-RQ147

        # kill the process and start another one
        self.pd_cli.cancel_process(first_pid)

        self.waiter.await_state_event(first_pid, ProcessStateEnum.TERMINATED)
        self.waiter.stop()

        second_pid = self.pd_cli.create_process(self.process_definition_id)
        self.waiter.start(second_pid)

        scheduled_pid = self.pd_cli.schedule_process(self.process_definition_id,
            schedule, configuration={}, process_id=second_pid)
        self.assertEqual(second_pid, scheduled_pid)
        self.assertNotEqual(first_pid, second_pid)

        self.waiter.await_state_event(second_pid, ProcessStateEnum.RUNNING)

        for expected_count in range(1, 6):
            self.assertEqual(expected_count, test_client.count(timeout=10))

        # kill the process for good
        self.pd_cli.cancel_process(second_pid)
        self.waiter.await_state_event(second_pid, ProcessStateEnum.TERMINATED)

    def test_schedule_with_config(self):
        """Schedule a process with a configuration block and check that the
        block reaches the process and is reported back by read_process.
        """
        schedule = ProcessSchedule()
        schedule.queueing_mode = ProcessQueueingMode.ALWAYS

        pid = self.pd_cli.create_process(self.process_definition_id)
        self.waiter.start(pid)

        # verifies L4-CI-CEI-RQ66

        # a random string placed in the configuration; the test later expects
        # the process to hand it back from query(), proving the configuration
        # made it to the instantiated process
        expected_reply = uuid.uuid4().hex
        process_config = {"test_response": expected_reply}

        returned_pid = self.pd_cli.schedule_process(self.process_definition_id,
            schedule, configuration=process_config, process_id=pid)
        self.assertEqual(pid, returned_pid)

        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

        query_client = TestClient()

        # verifies L4-CI-CEI-RQ139
        # the configuration block (inputs, outputs, arbitrary config) must
        # 1) reach the process and 2) be returned in process queries
        self.assertEqual(query_client.query(), expected_reply)

        record = self.pd_cli.read_process(pid)
        self.assertEqual(record.process_id, pid)
        self.assertEqual(record.process_configuration, process_config)

        # kill the process for good
        self.pd_cli.cancel_process(pid)
        self.waiter.await_state_event(pid, ProcessStateEnum.TERMINATED)

    def test_schedule_bad_config(self):
        """Scheduling with a configuration holding an IonObject must raise
        BadRequest with a message starting with "bad configuration".
        """
        schedule = ProcessSchedule()

        # a non-JSON-serializable IonObject
        unserializable = ProcessTarget()
        bad_configuration = {"bad": unserializable}

        with self.assertRaises(BadRequest) as raised:
            self.pd_cli.schedule_process(self.process_definition_id,
                schedule, configuration=bad_configuration)

        error_message = raised.exception.message
        self.assertTrue(error_message.startswith("bad configuration"))

    def test_create_invalid_definition(self):
        """Creating a process definition whose executable has only a URL
        (no module, no class) must raise BadRequest.

        verifies L4-CI-CEI-RQ137
        """
        executable = {"url": "http://somewhere.com/something.py"}
        definition = ProcessDefinition(name="test_process", executable=executable)
        # only the exception type matters here, so the context is not bound
        with self.assertRaises(BadRequest):
            self.pd_cli.create_process_definition(definition)
class ProcessDispatcherEEAgentIntTest(ProcessDispatcherServiceIntTest):
    """Run the basic integration tests again, with a different environment.

    setUp starts the process dispatcher app with ``pd_config`` and spawns an
    ExecutionEngineAgent process, instead of starting the r2cei deployment
    used by the parent class.
    """

    def setUp(self):
        """Start a container running the process dispatcher app, connect to
        it over dashi, announce one "engine1" node and spawn its eeagent.
        """
        self.dashi = None
        self._start_container()
        from pyon.public import CFG

        self.container_client = ContainerAgentClient(node=self.container.node,
            name=self.container.name)
        self.container = self.container_client._get_container_instance()

        pd_app = {
            'name': "process_dispatcher",
            'processapp': ("process_dispatcher",
                           "ion.services.cei.process_dispatcher_service",
                           "ProcessDispatcherService"),
        }
        self.container.start_app(pd_app, config=pd_config)

        self.rr_cli = self.container.resource_registry

        self.pd_cli = ProcessDispatcherServiceClient(node=self.container.node)

        test_executable = {
            'module': 'ion.services.cei.test.test_process_dispatcher',
            'class': 'TestProcess',
        }
        self.process_definition = ProcessDefinition(name='test_process',
                                                    executable=test_executable)
        self.process_definition_id = self.pd_cli.create_process_definition(
            self.process_definition)

        # bookkeeping for spawned eeagents; consumed by _kill_eeagent/tearDown
        self._eea_pids = []
        self._eea_pid_to_resource_id = {}
        self._eea_pid_to_persistence_dir = {}
        self._tmpdirs = []

        dispatcher_settings = pd_config['processdispatcher']
        self.dashi = get_dashi(uuid.uuid4().hex,
            dispatcher_settings['dashi_uri'],
            dispatcher_settings['dashi_exchange'],
            sysname=CFG.get_safe("dashi.sysname"))

        # send a fake node_state message to PD's dashi binding.
        self.node1_id = uuid.uuid4().hex
        self._send_node_state("engine1", self.node1_id)
        self._initial_eea_pid = self._start_eeagent(self.node1_id)

        self.waiter = ProcessStateWaiter()

    def _send_node_state(self, engine_id, node_id=None):
        """Fire a "node_state" dashi message reporting a RUNNING node.

        engine_id selects the domain (via domain_id_from_engine); a random
        node id is generated when node_id is not given.
        """
        if not node_id:
            node_id = uuid.uuid4().hex
        message = {
            'node_id': node_id,
            'state': InstanceState.RUNNING,
            'domain_id': domain_id_from_engine(engine_id),
        }
        self.dashi.fire(get_pd_dashi_name(), "node_state", args=message)

    def _start_eeagent(self, node_id, resource_id=None, persistence_dir=None):
        """Spawn an ExecutionEngineAgent process for node_id and return its pid.

        A temporary persistence directory (removed in tearDown) and a random
        resource id are created when the caller does not supply them. The pid
        is recorded together with its resource id and persistence directory.
        """
        if persistence_dir:
            agent_dir = persistence_dir
        else:
            agent_dir = tempfile.mkdtemp()
            self._tmpdirs.append(agent_dir)

        agent_resource_id = resource_id or uuid.uuid4().hex

        eea_config = _get_eeagent_config(node_id, agent_dir,
            resource_id=agent_resource_id)
        eea_pid = self.container_client.spawn_process(name="eeagent",
            module="ion.agents.cei.execution_engine_agent",
            cls="ExecutionEngineAgent", config=eea_config)
        log.info('Agent pid=%s.', str(eea_pid))

        self._eea_pids.append(eea_pid)
        self._eea_pid_to_resource_id[eea_pid] = agent_resource_id
        self._eea_pid_to_persistence_dir[eea_pid] = agent_dir
        return eea_pid

    def _kill_eeagent(self, pid):
        """Terminate an eeagent started by _start_eeagent and forget it.

        Fails the test if pid is not a known eeagent pid.
        """
        # assertIn reports the pid and the known pids when it fails
        self.assertIn(pid, self._eea_pids)
        self.container.terminate_process(pid)
        self._eea_pids.remove(pid)
        del self._eea_pid_to_resource_id[pid]
        del self._eea_pid_to_persistence_dir[pid]

    def tearDown(self):
        """Kill all eeagents, remove their temp dirs, stop the waiter and
        cancel the dashi connection.

        The waiter and dashi connection are released even when killing an
        agent or removing a directory raises; the first error still
        propagates.
        """
        try:
            # iterate over a copy: _kill_eeagent removes from the list
            for pid in list(self._eea_pids):
                self._kill_eeagent(pid)
            for d in self._tmpdirs:
                shutil.rmtree(d)
        finally:
            try:
                self.waiter.stop()
            finally:
                if self.dashi:
                    self.dashi.cancel()

    def test_requested_ee(self):
        """Schedule processes on explicitly requested execution engines.

        A process for the not-yet-present "engine2" waits in the queue, a
        NEVER-queued request for an unknown engine is rejected, and once an
        "engine2" node and eeagent appear the queued and subsequent
        processes run.
        """

        def schedule_on(target, queueing_mode):
            # create one process of the test definition and schedule it on
            # the given target; returns the new process id
            schedule = ProcessSchedule()
            schedule.queueing_mode = queueing_mode
            schedule.target = target
            new_pid = self.pd_cli.create_process(self.process_definition_id)
            self.pd_cli.schedule_process(self.process_definition_id,
                schedule, process_id=new_pid)
            return new_pid

        # request non-default engine; the waiter is started between creating
        # and scheduling the process, so this one is spelled out inline
        queued_target = ProcessTarget(execution_engine_id="engine2")
        queued_schedule = ProcessSchedule()
        queued_schedule.queueing_mode = ProcessQueueingMode.ALWAYS
        queued_schedule.target = queued_target

        pid = self.pd_cli.create_process(self.process_definition_id)
        self.waiter.start()

        self.pd_cli.schedule_process(self.process_definition_id,
            queued_schedule, process_id=pid)

        self.waiter.await_state_event(pid, ProcessStateEnum.WAITING)

        # request unknown engine, with NEVER queuing mode. The request
        # should be rejected.
        # verifies L4-CI-CEI-RQ52
        rejected_pid = schedule_on(
            ProcessTarget(execution_engine_id="not-a-real-ee"),
            ProcessQueueingMode.NEVER)
        self.waiter.await_state_event(rejected_pid, ProcessStateEnum.REJECTED)

        # now add a node and eeagent for engine2. original process should
        # leave the queue and start running
        node2_id = uuid.uuid4().hex
        self._send_node_state("engine2", node2_id)
        self._start_eeagent(node2_id)

        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

        # spawn another process. it should start immediately.
        pid2 = schedule_on(
            ProcessTarget(execution_engine_id="engine2"),
            ProcessQueueingMode.NEVER)
        self.waiter.await_state_event(pid2, ProcessStateEnum.RUNNING)

        # one more with node exclusive
        pid3 = schedule_on(
            ProcessTarget(execution_engine_id="engine2",
                node_exclusive="hats"),
            ProcessQueueingMode.NEVER)
        self.waiter.await_state_event(pid3, ProcessStateEnum.RUNNING)

        # kill the processes for good
        for running_pid in (pid, pid2, pid3):
            self.pd_cli.cancel_process(running_pid)
            self.waiter.await_state_event(running_pid, ProcessStateEnum.TERMINATED)

    def test_node_exclusive(self):
        """Exercise the node_exclusive scheduling constraint.

        Processes sharing a node_exclusive tag must each get a node of their
        own, while other processes may still share those nodes. The nodes'
        contents are not queried directly here; the behaviour is shown by
        scheduling processes one by one and checking their states.

        verifies L4-CI-CEI-RQ121
        verifies L4-CI-CEI-RQ57
        """
        definition_id = self.process_definition_id

        # setUp() created a single node and eeagent. Of two processes tagged
        # "abc", the first should run and the second should be queued.
        target = ProcessTarget(execution_engine_id="engine1")
        target.node_exclusive = "abc"
        schedule = ProcessSchedule()
        schedule.queueing_mode = ProcessQueueingMode.ALWAYS
        schedule.target = target

        pid1 = self.pd_cli.create_process(definition_id)
        self.waiter.start()

        self.pd_cli.schedule_process(definition_id,
            schedule, process_id=pid1)
        self.waiter.await_state_event(pid1, ProcessStateEnum.RUNNING)

        pid2 = self.pd_cli.create_process(definition_id)
        self.pd_cli.schedule_process(definition_id,
            schedule, process_id=pid2)
        self.waiter.await_state_event(pid2, ProcessStateEnum.WAITING)

        # show that the node itself is not full: a third process without a
        # node_exclusive tag should start immediately
        target.node_exclusive = None
        pid3 = self.pd_cli.create_process(definition_id)
        self.pd_cli.schedule_process(definition_id,
            schedule, process_id=pid3)
        self.waiter.await_state_event(pid3, ProcessStateEnum.RUNNING)

        # finally, add a second node to the engine. pid2 should be started
        # since there is now a node free of "abc" processes.
        second_node_id = uuid.uuid4().hex
        self._send_node_state("engine1", second_node_id)
        self._start_eeagent(second_node_id)
        self.waiter.await_state_event(pid2, ProcessStateEnum.RUNNING)

        # kill the processes for good
        for running_pid in (pid1, pid2, pid3):
            self.pd_cli.cancel_process(running_pid)
            self.waiter.await_state_event(running_pid, ProcessStateEnum.TERMINATED)

    def test_code_download(self):
        """Schedule the same module/class with and without a download URL.

        Without a 'url' entry the process is expected to end up FAILED; with
        a file:// URL of test_process_dispatcher.py (looked up next to this
        file) it is expected to reach RUNNING.
        """
        base_executable = {'module': 'ion.my.test.process',
                           'class': 'TestProcess'}

        # definition that has no URL; only module and class.
        plain_definition = ProcessDefinition(name='test_process_nodownload',
                                             executable=dict(base_executable))
        plain_definition_id = self.pd_cli.create_process_definition(plain_definition)

        # definition that additionally carries the URL of a python file
        # verifies L4-CI-CEI-RQ114
        download_url = "file://%s" % os.path.join(
            os.path.dirname(__file__), 'test_process_dispatcher.py')
        download_executable = dict(base_executable)
        download_executable['url'] = download_url
        download_definition = ProcessDefinition(name='test_process_download',
                                                executable=download_executable)
        download_definition_id = self.pd_cli.create_process_definition(download_definition)

        target = ProcessTarget()
        schedule = ProcessSchedule()
        schedule.queueing_mode = ProcessQueueingMode.ALWAYS
        schedule.target = target

        self.waiter.start()

        # a module with no download fails
        failing_pid = self.pd_cli.create_process(plain_definition_id)
        self.pd_cli.schedule_process(plain_definition_id,
            schedule, process_id=failing_pid)
        self.waiter.await_state_event(failing_pid, ProcessStateEnum.FAILED)

        # a module with a URL runs
        running_pid = self.pd_cli.create_process(download_definition_id)
        self.pd_cli.schedule_process(download_definition_id,
            schedule, process_id=running_pid)
        self.waiter.await_state_event(running_pid, ProcessStateEnum.RUNNING)

    def _add_test_process(self, restart_mode=None):
        """Create and schedule one test process listening on a unique name.

        restart_mode, when given, is set on the schedule. Returns a tuple
        (pid, client) where client is a TestClient addressed to the
        process's listen name.
        """
        schedule = ProcessSchedule()
        if restart_mode is not None:
            schedule.restart_mode = restart_mode
        new_pid = self.pd_cli.create_process(self.process_definition_id)

        listen_name = "PDtestproc_%s" % uuid.uuid4().hex
        process_config = {'process': {'listen_name': listen_name}}

        self.pd_cli.schedule_process(self.process_definition_id,
            schedule, process_id=new_pid, configuration=process_config)

        return new_pid, TestClient(to_name=listen_name)

    def test_restart(self):
        """Kill and restart the eeagent and check which processes come back.

        Thirty processes are started, ten for each restart mode. After the
        eeagent and the processes are terminated and the eeagent is started
        again, the ALWAYS and ABNORMAL processes must be RUNNING again and
        report is_restart(), while the NEVER processes must be FAILED.
        """
        self.waiter.start()

        restartable_pids = []
        nonrestartable_pids = []
        clients = {}

        # 10 processes per restart mode; the list records whether a restart
        # is expected for that mode
        batches = (
            (ProcessRestartMode.ALWAYS, restartable_pids),
            (ProcessRestartMode.ABNORMAL, restartable_pids),
            (ProcessRestartMode.NEVER, nonrestartable_pids),
        )
        for restart_mode, bucket in batches:
            for _ in range(10):
                new_pid, new_client = self._add_test_process(restart_mode)
                bucket.append(new_pid)
                clients[new_pid] = new_client

        all_pids = restartable_pids + nonrestartable_pids

        self.waiter.await_many_state_events(all_pids, ProcessStateEnum.RUNNING)

        for pid in all_pids:
            proc_client = clients[pid]
            self.assertFalse(proc_client.is_restart())
            self.assertEqual(proc_client.count(), 1)

        # now kill the whole eeagent and restart it. processes should
        # show up as FAILED in the next heartbeat.
        initial_eea_pid = self._initial_eea_pid
        resource_id = self._eea_pid_to_resource_id[initial_eea_pid]
        persistence_dir = self._eea_pid_to_persistence_dir[initial_eea_pid]
        log.debug("Restarting eeagent %s", initial_eea_pid)
        self._kill_eeagent(initial_eea_pid)

        # manually kill the processes to simulate a real container failure
        for pid in all_pids:
            self.container.terminate_process(pid)

        # same resource id and persistence dir as the agent just killed
        self._start_eeagent(self.node1_id, resource_id=resource_id,
            persistence_dir=persistence_dir)

        # wait for restartables to restart
        self.waiter.await_many_state_events(restartable_pids, ProcessStateEnum.RUNNING)

        # query the processes again. they should report being a restart
        for pid in restartable_pids:
            proc_client = clients[pid]
            self.assertTrue(proc_client.is_restart())
            self.assertEqual(proc_client.count(), 1)

        # meanwhile some procs should not have restarted
        for pid in nonrestartable_pids:
            record = self.pd_cli.read_process(pid)
            self.assertEqual(record.process_state, ProcessStateEnum.FAILED)

        # guard against extraneous events we were receiving as part of a bug:
        # processes restarting again after they were already restarted
        self.waiter.await_nothing(timeout=5)

    def test_idempotency(self):
        """Ensure schedule and cancel can each be repeated without harm."""
        schedule = ProcessSchedule()
        schedule.queueing_mode = ProcessQueueingMode.ALWAYS

        proc_name = 'myreallygoodname'
        pid = self.pd_cli.create_process(self.process_definition_id)
        self.waiter.start(pid)

        def check_process(expected_state, expect_empty_config=True):
            # read the process back and compare id, configuration and state
            record = self.pd_cli.read_process(pid)
            self.assertEqual(record.process_id, pid)
            if expect_empty_config:
                self.assertEqual(record.process_configuration, {})
            self.assertEqual(record.process_state, expected_state)

        def schedule_again():
            # schedule under the same pid and name; must hand back that pid
            returned_pid = self.pd_cli.schedule_process(self.process_definition_id,
                schedule, configuration={}, process_id=pid, name=proc_name)
            self.assertEqual(pid, returned_pid)

        # note: if we import UNSCHEDULED state into ProcessStateEnum,
        # this assertion will need to change.
        check_process(ProcessStateEnum.REQUESTED, expect_empty_config=False)

        schedule_again()
        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

        # repeating schedule is harmless
        schedule_again()
        check_process(ProcessStateEnum.RUNNING)

        self.pd_cli.cancel_process(pid)
        self.waiter.await_state_event(pid, ProcessStateEnum.TERMINATED)

        # repeating cancel is harmless
        self.pd_cli.cancel_process(pid)
        check_process(ProcessStateEnum.TERMINATED)
Example #14
0
class HighAvailabilityAgentSensorPolicyTest(IonIntegrationTestCase):
    """Integration test of the HighAvailabilityAgent running the "sensor"
    policy, with its aggregator pointed at a local test webserver.
    """

    def _start_webserver(self, port=None):
        """Start a local HTTP server in a greenlet and return its port.

        Every GET is answered with a 200 "text/plain" response whose body is
        the server's current ``response`` attribute.

        Up to 100 bind attempts are made: first on `port` (8008 when None),
        then on ports picked at random between 8000 and 10000. The server is
        stored in self._webserver and its greenlet in self._web_glet.

        Raises RuntimeError when none of the attempts could bind a port.
        """

        class TestRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
            server_version = "test_server"
            extensions_map = ""

            def do_GET(self):
                # read the body once so Content-Length always matches what is
                # written, even if the response is swapped concurrently
                body = self.server.response
                self.send_response(200)
                self.send_header("Content-type", "text/plain")
                self.send_header("Content-Length", len(body))
                self.end_headers()
                self.wfile.write(body)

            def log_message(self, format, *args):
                # swallow log messages
                pass

        class Server(HTTPServer):

            response = ""

            def serve_forever(self):
                # handle requests one at a time until stop() clears the flag
                self._serving = 1
                while self._serving:
                    self.handle_request()

            def stop(self):
                self._serving = 0

        if port is None:
            port = 8008

        server = None
        for _ in range(100):
            try:
                server = Server(("localhost", port), TestRequestHandler)
            except socket.error:
                print("port %s is in use, picking another" % port)
                port = randint(8000, 10000)
            else:
                break

        if server is None:
            # previously this fell through to an AttributeError on
            # self._webserver; fail with an explicit message instead
            raise RuntimeError("could not bind a port for the test webserver")

        self._webserver = server
        self._web_glet = gevent.spawn(server.serve_forever)
        return port

    def _stop_webserver(self):
        """Stop the test webserver, kill its greenlet and close its socket.

        Does nothing when no webserver was started.
        """
        webserver = getattr(self, "_webserver", None)
        if webserver is None:
            return

        webserver.stop()
        # the serve loop only notices the stop flag between requests; give it
        # a moment, then kill the greenlet regardless
        gevent.sleep(2)
        self._web_glet.kill()
        # release the listening socket so the port can be bound again
        webserver.server_close()

    @needs_epu
    def setUp(self):
        """Start the container, the test webserver and a HighAvailabilityAgent
        configured with the "sensor" policy, then build clients for the agent.
        """
        self._start_container()
        self.container.start_rel_from_url("res/deploy/r2cei.yml")
        self.pd_cli = ProcessDispatcherServiceClient(to_name="process_dispatcher")

        self.process_definition_id = uuid4().hex
        test_executable = {"module": "ion.agents.cei.test.test_haagent", "class": "TestProcess"}
        self.process_definition = ProcessDefinition(name="test", executable=test_executable)

        self.pd_cli.create_process_definition(self.process_definition, self.process_definition_id)

        # 8919 is only a suggestion; the port actually bound is returned
        http_port = self._start_webserver(port=8919)

        self.resource_id = "haagent_4567"
        self._haa_name = "high_availability_agent"

        sensor_policy = {
            "interval": 1,
            "name": "sensor",
            "parameters": {
                "metric": "app_attributes:ml",
                "sample_period": 600,
                "sample_function": "Average",
                "cooldown_period": 20,
                "scale_up_threshold": 2.0,
                "scale_up_n_processes": 1,
                "scale_down_threshold": 1.0,
                "scale_down_n_processes": 1,
                "maximum_processes": 5,
                "minimum_processes": 1,
            },
        }
        # the aggregator is pointed at the local test webserver
        aggregator = {
            "type": "trafficsentinel",
            "host": "localhost",
            "port": http_port,
            "protocol": "http",
            "username": "******",
            "password": "******",
        }
        self._haa_config = {
            "highavailability": {
                "policy": sensor_policy,
                "aggregator": aggregator,
                "process_definition_id": self.process_definition_id,
                "process_dispatchers": ["process_dispatcher"],
            },
            "agent": {"resource_id": self.resource_id},
        }

        # processes that exist before the agent starts; filtered out by
        # get_running_procs
        self._base_procs = self.pd_cli.list_processes()

        self.waiter = ProcessStateWaiter()
        self.waiter.start()

        self.container_client = ContainerAgentClient(node=self.container.node, name=self.container.name)
        self._haa_pid = self.container_client.spawn_process(
            name=self._haa_name,
            module="ion.agents.cei.high_availability_agent",
            cls="HighAvailabilityAgent",
            config=self._haa_config,
        )

        # Start a resource agent client to talk with the agent.
        self._haa_pyon_client = SimpleResourceAgentClient(self.resource_id, process=FakeProcess())
        log.info("Got haa client %s.", str(self._haa_pyon_client))

        self.haa_client = HighAvailabilityAgentClient(self._haa_pyon_client)

    def tearDown(self):
        """Stop the waiter, the HA agent, the test webserver and the container.

        The webserver and the container are stopped even when an earlier
        step raises; the first error still propagates.
        """
        try:
            self.waiter.stop()
            self.container.terminate_process(self._haa_pid)
        finally:
            try:
                self._stop_webserver()
            finally:
                self._stop_container()

    def get_running_procs(self):
        """Return the RUNNING processes that were not present at setup time.

        Processes whose id appears in self._base_procs are filtered out of
        the process dispatcher's current process list.
        """
        baseline_pids = [proc.process_id for proc in self._base_procs]
        current_procs = self.pd_cli.list_processes()
        current_pids = [proc.process_id for proc in current_procs]
        print("filtering base procs %s from %s" % (baseline_pids, current_pids))

        running = []
        for proc in current_procs:
            if proc.process_id not in baseline_pids and proc.process_state == ProcessStateEnum.RUNNING:
                running.append(proc)
        return running

    def _get_managed_upids(self):
        """Return the "managed_upids" entry of the HA agent's dump() result."""
        return self.haa_client.dump().result["managed_upids"]

    def _set_response(self, response):
        """Set the body the test webserver returns for subsequent GET requests."""
        self._webserver.response = response

    def test_sensor_policy(self):
        """Drive the sensor policy up and down by changing the served metric.

        Starting from one running process, an "ml" value of 5 must add a
        second process, 1.5 must leave the count unchanged and 0.5 must take
        it back to one process.
        """
        status = self.haa_client.status().result
        # Ensure HA hasn't already failed
        self.assertIn(status, ("PENDING", "READY", "STEADY"))

        self.waiter.await_state_event(state=ProcessStateEnum.RUNNING)
        self.assertEqual(len(self.get_running_procs()), 1)
        self._await_steady_status()

        # Set ml for each proc such that we scale up
        self._serve_ml_for_managed_procs("5")
        self.waiter.await_state_event(state=ProcessStateEnum.RUNNING)
        self.assertEqual(len(self.get_running_procs()), 2)

        # Set ml so we stay steady
        self._serve_ml_for_managed_procs("1.5")
        self.assertEqual(len(self.get_running_procs()), 2)
        self._await_steady_status()

        # Set ml so we scale down
        self._serve_ml_for_managed_procs("0.5")
        self.waiter.await_state_event(state=ProcessStateEnum.TERMINATED)
        self.assertEqual(len(self.get_running_procs()), 1)
        self._await_steady_status()

    def _serve_ml_for_managed_procs(self, ml):
        """Serve one "<upid>,ml=<ml>" line per process managed by the HA agent.

        ml is inserted as given, so pass it as the exact text to serve.
        """
        upids = self._get_managed_upids()
        # NOTE(review): earlier code served a literal "%s" in place of each
        # upid; confirm the aggregator expects the upid in the first column.
        lines = ["%s,ml=%s\n" % (upid, ml) for upid in upids]
        self._set_response("".join(lines))

    def _await_steady_status(self, attempts=5):
        """Poll the HA agent status once a second until it is "STEADY".

        Fails the test when the status is still different after `attempts`
        polls.
        """
        for _ in range(attempts):
            if self.haa_client.status().result == "STEADY":
                return
            gevent.sleep(1)
        self.fail("HA Service took too long to get to state STEADY")
Example #15
0
class HighAvailabilityAgentTest(IonIntegrationTestCase):
    """Integration tests for the High Availability (HA) agent.

    ``setUp`` starts a container with the ``r2cei`` deployment, registers a
    test process definition and spawns an HA agent configured with the
    ``npreserving`` policy and ``preserve_n`` set to 0.  The tests then
    reconfigure the policy and check what the process dispatcher and the
    resource registry report.
    """

    @needs_epu
    def setUp(self):
        """Start the container, register the test process and spawn the HA agent."""
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')
        self.pd_cli = ProcessDispatcherServiceClient(to_name="process_dispatcher")

        self.process_definition_id = uuid4().hex
        self.process_definition_name = 'test'
        self.process_definition = ProcessDefinition(name=self.process_definition_name, executable={
                'module': 'ion.agents.cei.test.test_haagent',
                'class': 'TestProcess'
        })
        self.pd_cli.create_process_definition(self.process_definition, self.process_definition_id)

        self.resource_id = "haagent_1234"
        self._haa_name = "high_availability_agent"
        self._haa_dashi_name = "dashi_haa_" + uuid4().hex
        self._haa_dashi_uri = get_dashi_uri_from_cfg()
        self._haa_dashi_exchange = "%s.hatests" % bootstrap.get_sys_name()
        self._haa_config = {
            'highavailability': {
                'policy': {
                    'interval': 1,
                    'name': 'npreserving',
                    'parameters': {
                        'preserve_n': 0
                    }
                },
                'process_definition_id': self.process_definition_id,
                'dashi_messaging': True,
                'dashi_exchange': self._haa_dashi_exchange,
                'dashi_name': self._haa_dashi_name
            },
            'agent': {'resource_id': self.resource_id},
        }

        # Snapshot the services and processes that exist before the agent is
        # spawned; get_new_services() and get_running_procs() filter them out.
        self._base_services, _ = self.container.resource_registry.find_resources(
                restype="Service", name=self.process_definition_name)

        self._base_procs = self.pd_cli.list_processes()

        self.waiter = ProcessStateWaiter()
        self.waiter.start()

        self.container_client = ContainerAgentClient(node=self.container.node,
            name=self.container.name)
        self._haa_pid = self.container_client.spawn_process(name=self._haa_name,
            module="ion.agents.cei.high_availability_agent",
            cls="HighAvailabilityAgent", config=self._haa_config)

        # Start a resource agent client to talk with the HA agent.
        self._haa_pyon_client = SimpleResourceAgentClient(self.resource_id, process=FakeProcess())
        log.info('Got haa client %s.', str(self._haa_pyon_client))

        self.haa_client = HighAvailabilityAgentClient(self._haa_pyon_client)

    def tearDown(self):
        """Stop the state waiter and the HA agent, then stop the container.

        The container is stopped in a ``finally`` clause so that an error
        while stopping the waiter or the agent cannot leave it running.
        """
        try:
            self.waiter.stop()
            try:
                self.container.terminate_process(self._haa_pid)
            except BadRequest:
                log.warning("Couldn't terminate HA Agent in teardown (May have been terminated by a test)")
        finally:
            self._stop_container()

    def get_running_procs(self):
        """Return the RUNNING processes that were not present at setUp time.

        The result is a list of the process objects returned by
        ``self.pd_cli.list_processes()``.
        """
        base_pids = [proc.process_id for proc in self._base_procs]
        current = self.pd_cli.list_processes()
        current_pids = [proc.process_id for proc in current]
        # Single parenthesised argument: prints the same text on Python 2.
        print("filtering base procs %s from %s" % (base_pids, current_pids))
        return [cproc for cproc in current
                if cproc.process_id not in base_pids
                and cproc.process_state == ProcessStateEnum.RUNNING]

    def get_new_services(self):
        """Return registered Service resources whose name was not seen at setUp time."""
        base_names = [service.name for service in self._base_services]
        services_registered, _ = self.container.resource_registry.find_resources(
                restype="Service", name=self.process_definition_name)
        return [cserv for cserv in services_registered if cserv.name not in base_names]

    def await_ha_state(self, want_state, timeout=10):
        """Poll the HA agent status until it equals ``want_state``.

        The status is polled up to ``timeout`` times, sleeping one second
        after each unsuccessful poll.

        Raises:
            Exception: if the wanted state was not seen within ``timeout`` polls.
        """
        for _ in range(0, timeout):
            status = self.haa_client.status().result
            if status == want_state:
                return
            gevent.sleep(1)

        raise Exception("Took more than %s to get to ha state %s" % (timeout, want_state))

    def test_features(self):
        """Scale the service 0 -> 1 -> 2 -> 1 -> 0 by reconfiguring ``preserve_n``."""
        status = self.haa_client.status().result
        # Ensure HA hasn't already failed
        assert status in ('PENDING', 'READY', 'STEADY')

        # verifies L4-CI-CEI-RQ44
        # Note: the HA agent is started in the setUp() method, with config
        # pointing to the test "service". The initial config is set to preserve
        # 0 service processes. With this reconfigure step below, we change that
        # to launch 1.

        new_policy = {'preserve_n': 1}
        self.haa_client.reconfigure_policy(new_policy)

        result = self.haa_client.dump().result
        self.assertEqual(result['policy'], new_policy)

        self.waiter.await_state_event(state=ProcessStateEnum.RUNNING)

        self.assertEqual(len(self.get_running_procs()), 1)

        # Same five one-second polls as before, but through the shared helper
        # rather than a bare ``except:`` wrapped around an assertion.
        self.await_ha_state('STEADY', timeout=5)

        # verifies L4-CI-CEI-RQ122 and L4-CI-CEI-RQ124

        new_policy = {'preserve_n': 2}
        self.haa_client.reconfigure_policy(new_policy)

        self.waiter.await_state_event(state=ProcessStateEnum.RUNNING)

        self.assertEqual(len(self.get_running_procs()), 2)

        new_policy = {'preserve_n': 1}
        self.haa_client.reconfigure_policy(new_policy)

        self.waiter.await_state_event(state=ProcessStateEnum.TERMINATED)

        self.assertEqual(len(self.get_running_procs()), 1)

        new_policy = {'preserve_n': 0}
        self.haa_client.reconfigure_policy(new_policy)

        self.waiter.await_state_event(state=ProcessStateEnum.TERMINATED)
        self.assertEqual(len(self.get_running_procs()), 0)

    def test_associations(self):
        """Check the Service resource and its ``hasProcess`` associations."""

        # Ensure that once the HA Agent starts, there is a Service object in
        # the registry
        result = self.haa_client.dump().result
        service_id = result.get('service_id')
        self.assertIsNotNone(service_id)
        service = self.container.resource_registry.read(service_id)
        self.assertIsNotNone(service)

        # Ensure that once a process is started, there is an association between
        # it and the service
        new_policy = {'preserve_n': 1}
        self.haa_client.reconfigure_policy(new_policy)
        self.waiter.await_state_event(state=ProcessStateEnum.RUNNING)
        self.assertEqual(len(self.get_running_procs()), 1)

        self.await_ha_state('STEADY')

        proc = self.get_running_procs()[0]

        processes_associated, _ = self.container.resource_registry.find_resources(
                restype="Process", name=proc.process_id)
        self.assertEqual(len(processes_associated), 1)

        has_processes = self.container.resource_registry.find_associations(
            service, "hasProcess")
        self.assertEqual(len(has_processes), 1)

        self.await_ha_state('STEADY')

        # Ensure that once we terminate that process, there are no associations
        new_policy = {'preserve_n': 0}
        self.haa_client.reconfigure_policy(new_policy)

        self.waiter.await_state_event(state=ProcessStateEnum.TERMINATED)
        self.assertEqual(len(self.get_running_procs()), 0)

        processes_associated, _ = self.container.resource_registry.find_resources(
                restype="Process", name=proc.process_id)
        self.assertEqual(len(processes_associated), 0)

        has_processes = self.container.resource_registry.find_associations(
            service, "hasProcess")
        self.assertEqual(len(has_processes), 0)

        # Ensure that once we terminate that HA Agent, the Service object is
        # cleaned up
        self.container.terminate_process(self._haa_pid)

        with self.assertRaises(NotFound):
            self.container.resource_registry.read(service_id)

    def test_dashi(self):
        """Call ``status`` and ``reconfigure_policy`` on the HA agent over dashi."""

        import dashi

        dashi_conn = dashi.DashiConnection("something", self._haa_dashi_uri,
            self._haa_dashi_exchange)

        status = dashi_conn.call(self._haa_dashi_name, "status")
        assert status in ('PENDING', 'READY', 'STEADY')

        new_policy = {'preserve_n': 0}
        dashi_conn.call(self._haa_dashi_name, "reconfigure_policy",
            new_policy=new_policy)
Example #16
0
    def setUp(self):
        """Start the container and spawn an HA agent using the ``sensor`` policy.

        The value returned by ``self._start_webserver`` is used as the port of
        the ``trafficsentinel`` aggregator in the agent configuration.
        """
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')
        self.pd_cli = ProcessDispatcherServiceClient(to_name="process_dispatcher")

        # Register the process definition the HA agent is configured with.
        self.process_definition_id = uuid4().hex
        test_executable = {
            'module': 'ion.agents.cei.test.test_haagent',
            'class': 'TestProcess',
        }
        self.process_definition = ProcessDefinition(name='test', executable=test_executable)
        self.pd_cli.create_process_definition(
            self.process_definition, self.process_definition_id)

        http_port = self._start_webserver(port=8919)

        self.resource_id = "haagent_4567"
        self._haa_name = "high_availability_agent"

        sensor_parameters = {
            'metric': 'app_attributes:ml',
            'sample_period': 600,
            'sample_function': 'Average',
            'cooldown_period': 20,
            'scale_up_threshold': 2.0,
            'scale_up_n_processes': 1,
            'scale_down_threshold': 1.0,
            'scale_down_n_processes': 1,
            'maximum_processes': 5,
            'minimum_processes': 1,
        }
        aggregator = {
            'type': 'trafficsentinel',
            'host': 'localhost',
            'port': http_port,
            'protocol': 'http',
            'username': '******',
            'password': '******',
        }
        self._haa_config = {
            'highavailability': {
                'policy': {
                    'interval': 1,
                    'name': 'sensor',
                    'parameters': sensor_parameters,
                },
                'aggregator': aggregator,
                'process_definition_id': self.process_definition_id,
                "process_dispatchers": ['process_dispatcher'],
            },
            'agent': {'resource_id': self.resource_id},
        }

        # Processes listed before the agent is spawned.
        self._base_procs = self.pd_cli.list_processes()

        self.waiter = ProcessStateWaiter()
        self.waiter.start()

        self.container_client = ContainerAgentClient(
            node=self.container.node, name=self.container.name)
        self._haa_pid = self.container_client.spawn_process(
            name=self._haa_name,
            module="ion.agents.cei.high_availability_agent",
            cls="HighAvailabilityAgent",
            config=self._haa_config)

        # Resource agent client wrapped by the HA agent client used in tests.
        self._haa_pyon_client = SimpleResourceAgentClient(
            self.resource_id, process=FakeProcess())
        log.info('Got haa client %s.', str(self._haa_pyon_client))

        self.haa_client = HighAvailabilityAgentClient(self._haa_pyon_client)
Example #17
0
class BaseHighAvailabilityAgentTest(IonIntegrationTestCase):
    """Shared fixture and helpers for High Availability (HA) agent tests.

    ``setUp`` starts a container with the ``r2cei`` deployment, registers a
    uniquely named process definition and service definition, spawns an HA
    agent with the configuration from ``_get_haagent_config`` and creates a
    client for it.
    """

    @needs_epu
    def setUp(self):
        """Start the container, register the definitions and spawn the HA agent."""
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')
        self.pd_cli = ProcessDispatcherServiceClient(
            to_name="process_dispatcher")

        self.process_definition_id = uuid4().hex
        self.process_definition_name = 'test_haagent_%s' % self.process_definition_id
        self.process_definition = ProcessDefinition(
            name=self.process_definition_name,
            executable={
                'module': 'ion.agents.cei.test.test_haagent',
                'class': 'TestProcess'
            })
        self.pd_cli.create_process_definition(self.process_definition,
                                              self.process_definition_id)

        service_definition = SERVICE_DEFINITION_TMPL % self.process_definition_name
        sd = IonObject(RT.ServiceDefinition, {
            "name": self.process_definition_name,
            "definition": service_definition
        })
        self.service_def_id, _ = self.container.resource_registry.create(sd)

        self.resource_id = "haagent_1234"
        self._haa_name = "high_availability_agent"
        self._haa_dashi_name = "dashi_haa_" + uuid4().hex
        self._haa_dashi_uri = get_dashi_uri_from_cfg()
        self._haa_dashi_exchange = "hatests"
        self._haa_config = self._get_haagent_config()

        # Snapshot the services and processes that exist before the agent is
        # spawned; get_new_services() and get_running_procs() filter them out.
        self._base_services, _ = self.container.resource_registry.find_resources(
            restype="Service", name=self.process_definition_name)

        self._base_procs = self.pd_cli.list_processes()

        self.waiter = ProcessStateWaiter()
        self.waiter.start()

        self.container_client = ContainerAgentClient(node=self.container.node,
                                                     name=self.container.name)
        self._spawn_haagent()

        self._setup_haa_client()

    def _get_haagent_config(self):
        """Return the HA agent config: ``npreserving`` policy with ``preserve_n`` 0."""
        return {
            'highavailability': {
                'policy': {
                    'interval': 1,
                    'name': 'npreserving',
                    'parameters': {
                        'preserve_n': 0
                    }
                },
                'process_definition_id': self.process_definition_id,
                'dashi_messaging': True,
                'dashi_exchange': self._haa_dashi_exchange,
                'dashi_name': self._haa_dashi_name
            },
            'agent': {
                'resource_id': self.resource_id
            },
        }

    def _setup_haa_client(self):
        """Create the resource agent client and the HA agent client wrapping it."""
        # Start a resource agent client to talk with the HA agent.
        self._haa_pyon_client = SimpleResourceAgentClient(
            self.resource_id, process=FakeProcess())

        self.haa_client = HighAvailabilityAgentClient(self._haa_pyon_client)

    def _spawn_haagent(self, policy_parameters=None):
        """Spawn the HA agent process and store its pid in ``self._haa_pid``.

        ``policy_parameters``, when given, replaces the policy parameters in
        a deep copy of ``self._haa_config``; the stored config is not modified.
        """
        config = deepcopy(self._haa_config)
        if policy_parameters is not None:
            config['highavailability']['policy'][
                'parameters'] = policy_parameters
        self._haa_pid = self.container_client.spawn_process(
            name=self._haa_name,
            module="ion.agents.cei.high_availability_agent",
            cls="HighAvailabilityAgent",
            config=config)

    def _kill_haagent(self):
        """Terminate the HA agent process spawned by ``_spawn_haagent``."""
        self.container.terminate_process(self._haa_pid)

    def tearDown(self):
        """Scale the service to zero, verify it, and release all resources.

        The verification runs first, as before, but cleanup is now inside
        ``finally`` clauses so that a failed check cannot leak the waiter,
        the HA agent, the service definition or the container.
        """
        try:
            new_policy = {'preserve_n': 0}
            self.haa_client.reconfigure_policy(new_policy)

            self.assertEqual(len(self.get_running_procs()), 0)
            self.await_ha_state('STEADY')
        finally:
            try:
                self.waiter.stop()
                try:
                    self._kill_haagent()
                except BadRequest:
                    log.warning(
                        "Couldn't terminate HA Agent in teardown (May have been terminated by a test)"
                    )
                self.container.resource_registry.delete(self.service_def_id,
                                                        del_associations=True)
            finally:
                self._stop_container()

    def get_running_procs(self):
        """Return the RUNNING processes that were not present at setUp time.

        The result is a list of the process objects returned by
        ``self.pd_cli.list_processes()``.
        """
        base_pids = [proc.process_id for proc in self._base_procs]
        current = self.pd_cli.list_processes()
        current_pids = [proc.process_id for proc in current]
        # Single parenthesised argument: prints the same text on Python 2.
        print("filtering base procs %s from %s" % (base_pids, current_pids))
        return [
            cproc for cproc in current if cproc.process_id not in base_pids
            and cproc.process_state == ProcessStateEnum.RUNNING
        ]

    def get_new_services(self):
        """Return registered Service resources whose name was not seen at setUp time."""
        base_names = [service.name for service in self._base_services]
        services_registered, _ = self.container.resource_registry.find_resources(
            restype="Service", name=self.process_definition_name)
        return [
            cserv for cserv in services_registered
            if cserv.name not in base_names
        ]

    def await_ha_state(self, want_state, timeout=20):
        """Poll the HA agent status until it equals ``want_state``.

        The status is polled up to ``timeout`` times.  One second is slept
        after every unsuccessful poll, whether the state did not match or
        the status call raised.  Previously the sleep only happened when an
        exception was raised, so a plain mismatch used up all polls at once.

        Raises:
            Exception: if the wanted state was not seen within ``timeout`` polls.
        """
        for _ in range(0, timeout):
            try:
                status = self.haa_client.status().result
                if status == want_state:
                    return
                num_procs = len(self.get_running_procs())
                log.debug(
                    "assert wants state %s, got state %s, with %s procs",
                    want_state, status, num_procs)
            except Exception:
                log.exception("Problem getting HA status, trying again...")
            gevent.sleep(1)

        raise Exception("Took more than %s to get to ha state %s" %
                        (timeout, want_state))

    def await_pyon_ha_state(self, want_state, timeout=20):
        """Poll the Service resource until its ``state`` equals ``want_state``.

        The service id is taken from the HA agent's ``dump()`` result and the
        resource is read from the container's resource registry.  One second
        is slept after every unsuccessful poll.

        Raises:
            Exception: if the wanted state was not seen within ``timeout`` polls.
        """
        for _ in range(0, timeout):
            try:
                result = self.haa_client.dump().result
                service_id = result.get('service_id')
                service = self.container.resource_registry.read(service_id)

                if service.state == want_state:
                    return
                # Pass the values as logging arguments; the old code applied
                # ``%`` to the return value of log.debug() and raised TypeError.
                log.debug("want state %s, got state %s", want_state,
                          service.state)
            except Exception:
                log.exception("Problem getting HA status, trying again...")
            gevent.sleep(1)

        raise Exception("Took more than %s to get to pyon ha state %s" %
                        (timeout, want_state))
Example #18
0
    def setUp(self):
        """Start the container and spawn an HA agent using the ``sensor`` policy.

        The value returned by ``self._start_webserver`` is used as the port of
        the ``trafficsentinel`` entry under ``server`` in the agent
        configuration.
        """
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')
        self.pd_cli = ProcessDispatcherServiceClient(to_name="process_dispatcher")

        # Register the process definition the HA agent is configured with.
        self.process_definition_id = uuid4().hex
        test_executable = {
            'module': 'ion.agents.cei.test.test_haagent',
            'class': 'TestProcess',
        }
        self.process_definition = ProcessDefinition(name='test', executable=test_executable)
        self.pd_cli.create_process_definition(
            self.process_definition, self.process_definition_id)

        http_port = self._start_webserver(port=8919)

        self.resource_id = "haagent_4567"
        self._haa_name = "high_availability_agent"

        trafficsentinel = {
            'host': 'localhost',
            'port': http_port,
            'protocol': 'http',
            'username': '******',
            'password': '******',
        }
        sensor_parameters = {
            'metric': 'app_attributes:ml',
            'sample_period': 600,
            'sample_function': 'Average',
            'cooldown_period': 5,
            'scale_up_threshold': 2.0,
            'scale_up_n_processes': 1,
            'scale_down_threshold': 1.0,
            'scale_down_n_processes': 1,
            'maximum_processes': 5,
            'minimum_processes': 1,
        }
        self._haa_config = {
            'server': {'trafficsentinel': trafficsentinel},
            'highavailability': {
                'policy': {
                    'interval': 1,
                    'name': 'sensor',
                    'parameters': sensor_parameters,
                },
                'process_definition_id': self.process_definition_id,
                "process_dispatchers": ['process_dispatcher'],
            },
            'agent': {'resource_id': self.resource_id},
        }

        # Processes listed before the agent is spawned.
        self._base_procs = self.pd_cli.list_processes()

        self.waiter = ProcessStateWaiter()
        self.waiter.start()

        self.container_client = ContainerAgentClient(
            node=self.container.node, name=self.container.name)
        self._haa_pid = self.container_client.spawn_process(
            name=self._haa_name,
            module="ion.agents.cei.high_availability_agent",
            cls="HighAvailabilityAgent",
            config=self._haa_config)

        # Resource agent client wrapped by the HA agent client used in tests.
        self._haa_pyon_client = SimpleResourceAgentClient(
            self.resource_id, process=FakeProcess())
        log.info('Got haa client %s.', str(self._haa_pyon_client))

        self.haa_client = HighAvailabilityAgentClient(self._haa_pyon_client)
Example #19
0
class HighAvailabilityAgentTest(IonIntegrationTestCase):
    """Integration tests for the High Availability (HA) agent.

    ``setUp`` starts a container with the ``r2cei`` deployment, registers a
    test process definition and spawns an HA agent configured with the
    ``npreserving`` policy and ``preserve_n`` set to 0.  The tests then
    reconfigure the policy and check what the process dispatcher and the
    resource registry report.
    """

    @needs_epu
    def setUp(self):
        """Start the container, register the test process and spawn the HA agent."""
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')
        self.pd_cli = ProcessDispatcherServiceClient(to_name="process_dispatcher")

        self.process_definition_id = uuid4().hex
        self.process_definition_name = 'test'
        self.process_definition = ProcessDefinition(name=self.process_definition_name, executable={
                'module': 'ion.agents.cei.test.test_haagent',
                'class': 'TestProcess'
        })
        self.pd_cli.create_process_definition(self.process_definition, self.process_definition_id)

        self.resource_id = "haagent_1234"
        self._haa_name = "high_availability_agent"
        self._haa_dashi_name = "dashi_haa_" + uuid4().hex
        self._haa_dashi_uri = get_dashi_uri_from_cfg()
        self._haa_dashi_exchange = "%s.hatests" % bootstrap.get_sys_name()
        self._haa_config = {
            'highavailability': {
                'policy': {
                    'interval': 1,
                    'name': 'npreserving',
                    'parameters': {
                        'preserve_n': 0
                    }
                },
                'process_definition_id': self.process_definition_id,
                'dashi_messaging': True,
                'dashi_exchange': self._haa_dashi_exchange,
                'dashi_name': self._haa_dashi_name
            },
            'agent': {'resource_id': self.resource_id},
        }

        # Snapshot the services and processes that exist before the agent is
        # spawned; get_new_services() and get_running_procs() filter them out.
        self._base_services, _ = self.container.resource_registry.find_resources(
                restype="Service", name=self.process_definition_name)

        self._base_procs = self.pd_cli.list_processes()

        self.waiter = ProcessStateWaiter()
        self.waiter.start()

        self.container_client = ContainerAgentClient(node=self.container.node,
            name=self.container.name)
        self._haa_pid = self.container_client.spawn_process(name=self._haa_name,
            module="ion.agents.cei.high_availability_agent",
            cls="HighAvailabilityAgent", config=self._haa_config)

        # Start a resource agent client to talk with the HA agent.
        self._haa_pyon_client = SimpleResourceAgentClient(self.resource_id, process=FakeProcess())
        log.info('Got haa client %s.', str(self._haa_pyon_client))

        self.haa_client = HighAvailabilityAgentClient(self._haa_pyon_client)

    def tearDown(self):
        """Stop the state waiter and the HA agent, then stop the container.

        The container is stopped in a ``finally`` clause so that an error
        while stopping the waiter or the agent cannot leave it running.
        """
        try:
            self.waiter.stop()
            try:
                self.container.terminate_process(self._haa_pid)
            except BadRequest:
                log.warning("Couldn't terminate HA Agent in teardown (May have been terminated by a test)")
        finally:
            self._stop_container()

    def get_running_procs(self):
        """Return the RUNNING processes that were not present at setUp time.

        The result is a list of the process objects returned by
        ``self.pd_cli.list_processes()``.
        """
        base_pids = [proc.process_id for proc in self._base_procs]
        current = self.pd_cli.list_processes()
        current_pids = [proc.process_id for proc in current]
        # Single parenthesised argument: prints the same text on Python 2.
        print("filtering base procs %s from %s" % (base_pids, current_pids))
        return [cproc for cproc in current
                if cproc.process_id not in base_pids
                and cproc.process_state == ProcessStateEnum.RUNNING]

    def get_new_services(self):
        """Return registered Service resources whose name was not seen at setUp time."""
        base_names = [service.name for service in self._base_services]
        services_registered, _ = self.container.resource_registry.find_resources(
                restype="Service", name=self.process_definition_name)
        return [cserv for cserv in services_registered if cserv.name not in base_names]

    def await_ha_state(self, want_state, timeout=10):
        """Poll the HA agent status until it equals ``want_state``.

        The status is polled up to ``timeout`` times, sleeping one second
        after each unsuccessful poll.

        Raises:
            Exception: if the wanted state was not seen within ``timeout`` polls.
        """
        for _ in range(0, timeout):
            status = self.haa_client.status().result
            if status == want_state:
                return
            gevent.sleep(1)

        raise Exception("Took more than %s to get to ha state %s" % (timeout, want_state))

    def test_features(self):
        """Scale the service 0 -> 1 -> 2 -> 1 -> 0 by reconfiguring ``preserve_n``."""
        status = self.haa_client.status().result
        # Ensure HA hasn't already failed
        assert status in ('PENDING', 'READY', 'STEADY')

        # verifies L4-CI-CEI-RQ44
        # Note: the HA agent is started in the setUp() method, with config
        # pointing to the test "service". The initial config is set to preserve
        # 0 service processes. With this reconfigure step below, we change that
        # to launch 1.

        new_policy = {'preserve_n': 1}
        self.haa_client.reconfigure_policy(new_policy)

        result = self.haa_client.dump().result
        self.assertEqual(result['policy'], new_policy)

        self.waiter.await_state_event(state=ProcessStateEnum.RUNNING)

        self.assertEqual(len(self.get_running_procs()), 1)

        # Same five one-second polls as before, but through the shared helper
        # rather than a bare ``except:`` wrapped around an assertion.
        self.await_ha_state('STEADY', timeout=5)

        # Ensure Service object has the correct state
        result = self.haa_client.dump().result
        service_id = result.get('service_id')
        service = self.container.resource_registry.read(service_id)
        self.assertEqual(service.state, ServiceStateEnum.STEADY)

        # verifies L4-CI-CEI-RQ122 and L4-CI-CEI-RQ124

        new_policy = {'preserve_n': 2}
        self.haa_client.reconfigure_policy(new_policy)

        self.waiter.await_state_event(state=ProcessStateEnum.RUNNING)

        self.assertEqual(len(self.get_running_procs()), 2)

        new_policy = {'preserve_n': 1}
        self.haa_client.reconfigure_policy(new_policy)

        self.waiter.await_state_event(state=ProcessStateEnum.TERMINATED)

        self.assertEqual(len(self.get_running_procs()), 1)

        new_policy = {'preserve_n': 0}
        self.haa_client.reconfigure_policy(new_policy)

        self.waiter.await_state_event(state=ProcessStateEnum.TERMINATED)
        self.assertEqual(len(self.get_running_procs()), 0)

    def test_associations(self):
        """Check the Service resource and its ``hasProcess`` associations."""

        # Ensure that once the HA Agent starts, there is a Service object in
        # the registry
        result = self.haa_client.dump().result
        service_id = result.get('service_id')
        self.assertIsNotNone(service_id)
        service = self.container.resource_registry.read(service_id)
        self.assertIsNotNone(service)

        # Ensure that once a process is started, there is an association between
        # it and the service
        new_policy = {'preserve_n': 1}
        self.haa_client.reconfigure_policy(new_policy)
        self.waiter.await_state_event(state=ProcessStateEnum.RUNNING)
        self.assertEqual(len(self.get_running_procs()), 1)

        self.await_ha_state('STEADY')

        proc = self.get_running_procs()[0]

        processes_associated, _ = self.container.resource_registry.find_resources(
                restype="Process", name=proc.process_id)
        self.assertEqual(len(processes_associated), 1)

        has_processes = self.container.resource_registry.find_associations(
            service, "hasProcess")
        self.assertEqual(len(has_processes), 1)

        self.await_ha_state('STEADY')

        # Ensure that once we terminate that process, there are no associations
        new_policy = {'preserve_n': 0}
        self.haa_client.reconfigure_policy(new_policy)

        self.waiter.await_state_event(state=ProcessStateEnum.TERMINATED)
        self.assertEqual(len(self.get_running_procs()), 0)

        processes_associated, _ = self.container.resource_registry.find_resources(
                restype="Process", name=proc.process_id)
        self.assertEqual(len(processes_associated), 0)

        has_processes = self.container.resource_registry.find_associations(
            service, "hasProcess")
        self.assertEqual(len(has_processes), 0)

        # Ensure that once we terminate that HA Agent, the Service object is
        # cleaned up
        self.container.terminate_process(self._haa_pid)

        with self.assertRaises(NotFound):
            self.container.resource_registry.read(service_id)

    def test_dashi(self):
        """Call ``status`` and ``reconfigure_policy`` on the HA agent over dashi."""

        import dashi

        dashi_conn = dashi.DashiConnection("something", self._haa_dashi_uri,
            self._haa_dashi_exchange)

        status = dashi_conn.call(self._haa_dashi_name, "status")
        assert status in ('PENDING', 'READY', 'STEADY')

        new_policy = {'preserve_n': 0}
        dashi_conn.call(self._haa_dashi_name, "reconfigure_policy",
            new_policy=new_policy)
Example #20
0
    def setUp(self):
        """Start the container and spawn an HA agent using the ``sensor`` policy.

        The value returned by ``self._start_webserver`` is used as the port of
        the ``trafficsentinel`` aggregator in the agent configuration.
        """
        self._start_container()
        self.container.start_rel_from_url("res/deploy/r2cei.yml")
        self.pd_cli = ProcessDispatcherServiceClient(to_name="process_dispatcher")

        # Register the process definition the HA agent is configured with.
        self.process_definition_id = uuid4().hex
        test_executable = {
            "module": "ion.agents.cei.test.test_haagent",
            "class": "TestProcess",
        }
        self.process_definition = ProcessDefinition(name="test", executable=test_executable)
        self.pd_cli.create_process_definition(self.process_definition, self.process_definition_id)

        http_port = self._start_webserver(port=8919)

        self.resource_id = "haagent_4567"
        self._haa_name = "high_availability_agent"

        sensor_parameters = {
            "metric": "app_attributes:ml",
            "sample_period": 600,
            "sample_function": "Average",
            "cooldown_period": 20,
            "scale_up_threshold": 2.0,
            "scale_up_n_processes": 1,
            "scale_down_threshold": 1.0,
            "scale_down_n_processes": 1,
            "maximum_processes": 5,
            "minimum_processes": 1,
        }
        aggregator = {
            "type": "trafficsentinel",
            "host": "localhost",
            "port": http_port,
            "protocol": "http",
            "username": "******",
            "password": "******",
        }
        ha_section = {
            "policy": {"interval": 1, "name": "sensor", "parameters": sensor_parameters},
            "aggregator": aggregator,
            "process_definition_id": self.process_definition_id,
            "process_dispatchers": ["process_dispatcher"],
        }
        self._haa_config = {
            "highavailability": ha_section,
            "agent": {"resource_id": self.resource_id},
        }

        # Processes listed before the agent is spawned.
        self._base_procs = self.pd_cli.list_processes()

        self.waiter = ProcessStateWaiter()
        self.waiter.start()

        self.container_client = ContainerAgentClient(node=self.container.node, name=self.container.name)
        self._haa_pid = self.container_client.spawn_process(
            name=self._haa_name,
            module="ion.agents.cei.high_availability_agent",
            cls="HighAvailabilityAgent",
            config=self._haa_config,
        )

        # Resource agent client wrapped by the HA agent client used in tests.
        self._haa_pyon_client = SimpleResourceAgentClient(self.resource_id, process=FakeProcess())
        log.info("Got haa client %s.", str(self._haa_pyon_client))

        self.haa_client = HighAvailabilityAgentClient(self._haa_pyon_client)
class ProcessDispatcherEEAgentIntTest(ProcessDispatcherServiceIntTest):
    """Run the basic int tests again, with a different environment
    """
    def setUp(self):
        """Start a container running a Process Dispatcher app plus one eeagent.

        The Process Dispatcher is started as an app configured with the
        module-level ``pd_config`` and is told about nodes through dashi
        ``node_state`` messages. One node for "engine1" and a matching
        eeagent are registered so that tests have somewhere to run.
        """
        # tearDown inspects this attribute before cancelling the connection
        self.dashi = None
        self._start_container()
        # import kept at this point, i.e. after the container was started
        from pyon.public import CFG

        node = self.container.node
        self.container_client = ContainerAgentClient(node=node,
                                                     name=self.container.name)
        self.container = self.container_client._get_container_instance()

        pd_app = {
            "name": "process_dispatcher",
            "processapp": ("process_dispatcher",
                           "ion.services.cei.process_dispatcher_service",
                           "ProcessDispatcherService"),
        }
        self.container.start_app(pd_app, config=pd_config)

        self.rr_cli = self.container.resource_registry
        self.pd_cli = ProcessDispatcherServiceClient(node=self.container.node)

        definition = ProcessDefinition(name='test_process')
        definition.executable = {
            'module': 'ion.services.cei.test.test_process_dispatcher',
            'class': 'TestProcess'
        }
        self.process_definition = definition
        self.process_definition_id = self.pd_cli.create_process_definition(
            definition)

        # bookkeeping for every eeagent spawned through _start_eeagent
        self._eea_pids = []
        self._eea_pid_to_resource_id = {}
        self._eea_pid_to_persistence_dir = {}
        self._tmpdirs = []

        pd_settings = pd_config['processdispatcher']
        self.dashi = get_dashi(
            uuid.uuid4().hex,
            pd_settings['dashi_uri'],
            pd_settings['dashi_exchange'],
            sysname=CFG.get_safe("dashi.sysname"))

        # send a fake node_state message to PD's dashi binding, then start
        # the eeagent that belongs to that node
        self.node1_id = uuid.uuid4().hex
        self._send_node_state("engine1", self.node1_id)
        self._initial_eea_pid = self._start_eeagent(self.node1_id)

        self.waiter = ProcessStateWaiter()

    def _send_node_state(self, engine_id, node_id=None):
        """Fire a RUNNING ``node_state`` message at the Process Dispatcher.

        engine_id -- engine the node belongs to; translated to a domain id
                     with ``domain_id_from_engine``
        node_id   -- id to report; a random hex uuid is used when falsy
        """
        if not node_id:
            node_id = uuid.uuid4().hex
        message = {
            "node_id": node_id,
            "state": InstanceState.RUNNING,
            "domain_id": domain_id_from_engine(engine_id),
        }
        self.dashi.fire(get_pd_dashi_name(), "node_state", args=message)

    def _start_eeagent(self, node_id, resource_id=None, persistence_dir=None):
        """Spawn an ExecutionEngineAgent for ``node_id`` and return its pid.

        resource_id     -- agent resource id; random hex uuid when falsy
        persistence_dir -- agent persistence directory; when falsy a fresh
                           temp dir is created and queued for removal in
                           tearDown

        The pid, resource id and persistence dir are recorded so the agent
        can later be killed or restarted with the same identity.
        """
        if not persistence_dir:
            persistence_dir = tempfile.mkdtemp()
            self._tmpdirs.append(persistence_dir)
        if not resource_id:
            resource_id = uuid.uuid4().hex

        eea_config = _get_eeagent_config(node_id,
                                         persistence_dir,
                                         resource_id=resource_id)
        eea_pid = self.container_client.spawn_process(
            name="eeagent",
            module="ion.agents.cei.execution_engine_agent",
            cls="ExecutionEngineAgent",
            config=eea_config)
        log.info('Agent pid=%s.', str(eea_pid))

        self._eea_pids.append(eea_pid)
        self._eea_pid_to_resource_id[eea_pid] = resource_id
        self._eea_pid_to_persistence_dir[eea_pid] = persistence_dir
        return eea_pid

    def _kill_eeagent(self, pid):
        """Terminate a tracked eeagent process and drop its bookkeeping.

        Fails the test if ``pid`` was not started through ``_start_eeagent``.
        """
        is_tracked = pid in self._eea_pids
        self.assertTrue(is_tracked)
        self.container.terminate_process(pid)
        self._eea_pids.remove(pid)
        for mapping in (self._eea_pid_to_resource_id,
                        self._eea_pid_to_persistence_dir):
            del mapping[pid]

    def tearDown(self):
        """Kill all tracked eeagents and release every test resource.

        Cleanup is chained with try/finally so that a failure while killing
        an eeagent no longer leaves temp directories on disk, the state
        waiter running or the dashi connection open for the next test.
        """
        try:
            # iterate over a copy: _kill_eeagent removes pids from the list
            for pid in list(self._eea_pids):
                self._kill_eeagent(pid)
        finally:
            try:
                for tmpdir in self._tmpdirs:
                    # best effort: a directory that is already gone must not
                    # abort the rest of the cleanup
                    shutil.rmtree(tmpdir, ignore_errors=True)
                self.waiter.stop()
            finally:
                if self.dashi:
                    self.dashi.cancel()

    def test_requested_ee(self):
        # Scheduling against an explicitly requested execution engine:
        # queueing until the engine shows up, rejection of unknown engines
        # and immediate start once the engine is available.
        definition_id = self.process_definition_id
        pd_cli = self.pd_cli
        await_state = self.waiter.await_state_event

        def build_schedule(queueing_mode, **target_fields):
            # every request gets its own target/schedule pair
            target = ProcessTarget(**target_fields)
            schedule = ProcessSchedule()
            schedule.queueing_mode = queueing_mode
            schedule.target = target
            return schedule

        # request non-default engine
        queued_schedule = build_schedule(ProcessQueueingMode.ALWAYS,
                                         execution_engine_id="engine2")
        pid = pd_cli.create_process(definition_id)
        self.waiter.start()
        pd_cli.schedule_process(definition_id, queued_schedule,
                                process_id=pid)
        await_state(pid, ProcessStateEnum.WAITING)

        # request unknown engine, with NEVER queuing mode. The request
        # should be rejected.
        # verifies L4-CI-CEI-RQ52
        unknown_schedule = build_schedule(ProcessQueueingMode.NEVER,
                                          execution_engine_id="not-a-real-ee")
        rejected_pid = pd_cli.create_process(definition_id)
        pd_cli.schedule_process(definition_id, unknown_schedule,
                                process_id=rejected_pid)
        await_state(rejected_pid, ProcessStateEnum.REJECTED)

        # now add a node and eeagent for engine2. original process should
        # leave queue and start running
        node2_id = uuid.uuid4().hex
        self._send_node_state("engine2", node2_id)
        self._start_eeagent(node2_id)
        await_state(pid, ProcessStateEnum.RUNNING)

        # spawn another process. it should start immediately.
        immediate_schedule = build_schedule(ProcessQueueingMode.NEVER,
                                            execution_engine_id="engine2")
        pid2 = pd_cli.create_process(definition_id)
        pd_cli.schedule_process(definition_id, immediate_schedule,
                                process_id=pid2)
        await_state(pid2, ProcessStateEnum.RUNNING)

        # one more with node exclusive
        exclusive_schedule = build_schedule(ProcessQueueingMode.NEVER,
                                            execution_engine_id="engine2",
                                            node_exclusive="hats")
        pid3 = pd_cli.create_process(definition_id)
        pd_cli.schedule_process(definition_id, exclusive_schedule,
                                process_id=pid3)
        await_state(pid3, ProcessStateEnum.RUNNING)

        # kill the processes for good
        for doomed_pid in (pid, pid2, pid3):
            pd_cli.cancel_process(doomed_pid)
            await_state(doomed_pid, ProcessStateEnum.TERMINATED)

    def test_node_exclusive(self):
        # The node_exclusive constraint ensures that processes of the same
        # "kind" each get a VM exclusive of each other. Other processes may
        # share those VMs, just not processes carrying the same
        # node_exclusive tag. The contents of a node cannot be queried from
        # here, so the capability is shown by scheduling processes one by
        # one and checking their state.

        # verifies L4-CI-CEI-RQ121
        # verifies L4-CI-CEI-RQ57
        definition_id = self.process_definition_id
        await_state = self.waiter.await_state_event

        # setUp() created a single node and eeagent. Two processes with the
        # same "abc" tag are scheduled: with only one node available the
        # first should run and the second should be queued.
        target = ProcessTarget(execution_engine_id="engine1")
        target.node_exclusive = "abc"
        schedule = ProcessSchedule()
        schedule.queueing_mode = ProcessQueueingMode.ALWAYS
        schedule.target = target

        def launch():
            # create and schedule one more process with the shared schedule
            new_pid = self.pd_cli.create_process(definition_id)
            self.pd_cli.schedule_process(definition_id, schedule,
                                         process_id=new_pid)
            return new_pid

        pid1 = self.pd_cli.create_process(definition_id)
        self.waiter.start()
        self.pd_cli.schedule_process(definition_id, schedule,
                                     process_id=pid1)
        await_state(pid1, ProcessStateEnum.RUNNING)

        pid2 = launch()
        await_state(pid2, ProcessStateEnum.WAITING)

        # show that the node itself is not full: a third process without a
        # node_exclusive tag should start immediately. The target object is
        # changed in place and the same schedule is reused.
        target.node_exclusive = None
        pid3 = launch()
        await_state(pid3, ProcessStateEnum.RUNNING)

        # finally, add a second node to the engine. pid2 should be started
        # since there is an exclusive "abc" node free.
        node2_id = uuid.uuid4().hex
        self._send_node_state("engine1", node2_id)
        self._start_eeagent(node2_id)
        await_state(pid2, ProcessStateEnum.RUNNING)

        # kill the processes for good
        for doomed_pid in (pid1, pid2, pid3):
            self.pd_cli.cancel_process(doomed_pid)
            await_state(doomed_pid, ProcessStateEnum.TERMINATED)

    def test_code_download(self):
        # A definition naming only module and class is expected to FAIL,
        # while the same module/class with a 'url' is expected to reach
        # RUNNING.
        module_and_class = {
            'module': 'ion.my.test.process',
            'class': 'TestProcess'
        }

        # create a process definition that has no URL; only module and class.
        definition_no_url = ProcessDefinition(name='test_process_nodownload')
        definition_no_url.executable = dict(module_and_class)
        definition_id_no_url = self.pd_cli.create_process_definition(
            definition_no_url)

        # create another that has a URL of the python file (this very file)
        # verifies L4-CI-CEI-RQ114
        this_dir = os.path.dirname(__file__)
        url = "file://%s" % os.path.join(this_dir,
                                         'test_process_dispatcher.py')
        definition_with_url = ProcessDefinition(name='test_process_download')
        definition_with_url.executable = dict(module_and_class, url=url)
        definition_id_with_url = self.pd_cli.create_process_definition(
            definition_with_url)

        target = ProcessTarget()
        schedule = ProcessSchedule()
        schedule.queueing_mode = ProcessQueueingMode.ALWAYS
        schedule.target = target

        self.waiter.start()

        # Test a module with no download fails
        pid_no_url = self.pd_cli.create_process(definition_id_no_url)
        self.pd_cli.schedule_process(definition_id_no_url,
                                     schedule,
                                     process_id=pid_no_url)
        self.waiter.await_state_event(pid_no_url, ProcessStateEnum.FAILED)

        # Test a module with a URL runs
        pid_with_url = self.pd_cli.create_process(definition_id_with_url)
        self.pd_cli.schedule_process(definition_id_with_url,
                                     schedule,
                                     process_id=pid_with_url)
        self.waiter.await_state_event(pid_with_url, ProcessStateEnum.RUNNING)

    def _add_test_process(self, restart_mode=None):
        """Create and schedule one test process with its own listen name.

        restart_mode -- value for ``ProcessSchedule.restart_mode``; the
                        schedule's own default is kept when None

        Returns a ``(pid, client)`` tuple where ``client`` is a TestClient
        addressed to the process's unique listen name.
        """
        schedule = ProcessSchedule()
        if restart_mode is not None:
            schedule.restart_mode = restart_mode
        pid = self.pd_cli.create_process(self.process_definition_id)

        # a unique name lets each process be addressed individually
        listen_name = "PDtestproc_%s" % uuid.uuid4().hex
        self.pd_cli.schedule_process(
            self.process_definition_id,
            schedule,
            process_id=pid,
            configuration={'process': {'listen_name': listen_name}})

        return pid, TestClient(to_name=listen_name)

    def test_restart(self):
        # Kill the eeagent (and its processes) and bring it back with the
        # same identity: ALWAYS/ABNORMAL processes must be restarted, NEVER
        # processes must stay FAILED.
        self.waiter.start()

        restartable_pids = []
        nonrestartable_pids = []
        clients = {}

        # 10 processes per restart mode, in this order; the second element
        # is the list that collects the resulting pids
        launch_plan = (
            (ProcessRestartMode.ALWAYS, restartable_pids),
            (ProcessRestartMode.ABNORMAL, restartable_pids),
            (ProcessRestartMode.NEVER, nonrestartable_pids),
        )
        for restart_mode, bucket in launch_plan:
            for _ in range(10):
                pid, client = self._add_test_process(restart_mode)
                bucket.append(pid)
                clients[pid] = client

        all_pids = restartable_pids + nonrestartable_pids

        self.waiter.await_many_state_events(all_pids, ProcessStateEnum.RUNNING)

        # first launch: nothing is a restart yet
        for pid in all_pids:
            self.assertFalse(clients[pid].is_restart())
            self.assertEqual(clients[pid].count(), 1)

        # now kill the whole eeagent and restart it. processes should
        # show up as FAILED in the next heartbeat.
        initial_eea_pid = self._initial_eea_pid
        resource_id = self._eea_pid_to_resource_id[initial_eea_pid]
        persistence_dir = self._eea_pid_to_persistence_dir[initial_eea_pid]
        log.debug("Restarting eeagent %s", initial_eea_pid)
        self._kill_eeagent(initial_eea_pid)

        # manually kill the processes to simulate a real container failure
        for pid in all_pids:
            self.container.terminate_process(pid)

        self._start_eeagent(self.node1_id,
                            resource_id=resource_id,
                            persistence_dir=persistence_dir)

        # wait for restartables to restart
        self.waiter.await_many_state_events(restartable_pids,
                                            ProcessStateEnum.RUNNING)

        # query the processes again. it should have restart mode config
        for pid in restartable_pids:
            self.assertTrue(clients[pid].is_restart())
            self.assertEqual(clients[pid].count(), 1)

        # meanwhile some procs should not have restarted
        for pid in nonrestartable_pids:
            state = self.pd_cli.read_process(pid).process_state
            self.assertEqual(state, ProcessStateEnum.FAILED)

        # guard against extraneous events we were receiving as part of a bug:
        # processes restarting again after they were already restarted
        self.waiter.await_nothing(timeout=5)

    def test_idempotency(self):
        # ensure every operation can be safely retried
        definition_id = self.process_definition_id
        schedule = ProcessSchedule()
        schedule.queueing_mode = ProcessQueueingMode.ALWAYS

        proc_name = 'myreallygoodname'
        pid = self.pd_cli.create_process(definition_id)
        self.waiter.start(pid)

        def schedule_it():
            # identical request every time, always for the same pid
            return self.pd_cli.schedule_process(definition_id,
                                                schedule,
                                                configuration={},
                                                process_id=pid,
                                                name=proc_name)

        def assert_process(expected_state):
            # the record must be intact and in the expected state
            record = self.pd_cli.read_process(pid)
            self.assertEqual(record.process_id, pid)
            self.assertEqual(record.process_configuration, {})
            self.assertEqual(record.process_state, expected_state)

        # note: if we import UNSCHEDULED state into ProcessStateEnum,
        # this assertion will need to change.
        created = self.pd_cli.read_process(pid)
        self.assertEqual(created.process_id, pid)
        self.assertEqual(created.process_state, ProcessStateEnum.REQUESTED)

        self.assertEqual(pid, schedule_it())
        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

        # repeating schedule is harmless
        self.assertEqual(pid, schedule_it())
        assert_process(ProcessStateEnum.RUNNING)

        self.pd_cli.cancel_process(pid)
        self.waiter.await_state_event(pid, ProcessStateEnum.TERMINATED)

        # repeating cancel is harmless
        self.pd_cli.cancel_process(pid)
        assert_process(ProcessStateEnum.TERMINATED)
class ProcessDispatcherServiceIntTest(IonIntegrationTestCase):
    def setUp(self):
        """Start the r2cei deployment and register the test process definition.

        Leaves ``rr_cli``, ``pd_cli``, ``process_definition``,
        ``process_definition_id`` and an unstarted ``waiter`` on the test
        instance.
        """
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')

        self.rr_cli = ResourceRegistryServiceClient()
        self.pd_cli = ProcessDispatcherServiceClient(node=self.container.node)

        definition = ProcessDefinition(name='test_process')
        definition.executable = {
            'module': 'ion.services.cei.test.test_process_dispatcher',
            'class': 'TestProcess'
        }
        self.process_definition = definition
        self.process_definition_id = self.pd_cli.create_process_definition(
            definition)

        # each test starts the waiter itself
        self.waiter = ProcessStateWaiter()

    def tearDown(self):
        """Stop the process state waiter created in setUp."""
        waiter = self.waiter
        waiter.stop()

    def test_create_schedule_cancel(self):
        # Full lifecycle: create, schedule, talk to the running process,
        # cancel, schedule again under the same id and cancel for good.
        definition_id = self.process_definition_id
        schedule = ProcessSchedule()
        schedule.queueing_mode = ProcessQueueingMode.ALWAYS

        proc_name = 'myreallygoodname'
        pid = self.pd_cli.create_process(definition_id)
        self.waiter.start(pid)

        scheduled_pid = self.pd_cli.schedule_process(definition_id,
                                                     schedule,
                                                     configuration={},
                                                     process_id=pid,
                                                     name=proc_name)
        self.assertEqual(pid, scheduled_pid)

        # verifies L4-CI-CEI-RQ141 and L4-CI-CEI-RQ142
        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

        record = self.pd_cli.read_process(pid)
        self.assertEqual(record.process_id, pid)
        self.assertEqual(record.process_configuration, {})
        self.assertEqual(record.process_state, ProcessStateEnum.RUNNING)

        # make sure process is readable directly from RR (mirrored)
        # verifies L4-CI-CEI-RQ63
        # verifies L4-CI-CEI-RQ64
        mirrored = self.rr_cli.read(pid)
        self.assertEqual(mirrored.process_id, pid)

        # now try communicating with the process to make sure it is really
        # running
        test_client = TestClient()
        for expected_count in range(1, 6):
            self.assertEqual(expected_count, test_client.count(timeout=10))

        # verifies L4-CI-CEI-RQ147

        # check the process name was set in container
        got_proc_name = test_client.get_process_name(pid=scheduled_pid)
        self.assertEqual(proc_name, got_proc_name)

        # kill the process and start it again
        self.pd_cli.cancel_process(pid)
        self.waiter.await_state_event(pid, ProcessStateEnum.TERMINATED)

        rescheduled_pid = self.pd_cli.schedule_process(definition_id,
                                                       schedule,
                                                       configuration={},
                                                       process_id=pid)
        self.assertEqual(pid, rescheduled_pid)

        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

        for expected_count in range(1, 6):
            self.assertEqual(expected_count, test_client.count(timeout=10))

        # kill the process for good
        self.pd_cli.cancel_process(pid)
        self.waiter.await_state_event(pid, ProcessStateEnum.TERMINATED)

    def test_schedule_with_config(self):
        # A configuration block passed at schedule time must reach the
        # process and be returned again by process queries.
        schedule = ProcessSchedule()
        schedule.queueing_mode = ProcessQueueingMode.ALWAYS

        pid = self.pd_cli.create_process(self.process_definition_id)
        self.waiter.start(pid)

        # verifies L4-CI-CEI-RQ66

        # feed in a string that the process will return -- verifies that
        # configuration actually makes it to the instantiated process
        expected_response = uuid.uuid4().hex
        configuration = {"test_response": expected_response}

        scheduled_pid = self.pd_cli.schedule_process(
            self.process_definition_id,
            schedule,
            configuration=configuration,
            process_id=pid)
        self.assertEqual(pid, scheduled_pid)

        self.waiter.await_state_event(pid, ProcessStateEnum.RUNNING)

        test_client = TestClient()

        # verifies L4-CI-CEI-RQ139
        # assure that configuration block (which can contain inputs, outputs,
        # and arbitrary config) 1) makes it to the process and 2) is returned
        # in process queries
        self.assertEqual(test_client.query(), expected_response)

        record = self.pd_cli.read_process(pid)
        self.assertEqual(record.process_id, pid)
        self.assertEqual(record.process_configuration, configuration)

        # kill the process for good
        self.pd_cli.cancel_process(pid)
        self.waiter.await_state_event(pid, ProcessStateEnum.TERMINATED)

    def test_schedule_bad_config(self):
        # A configuration that cannot be serialized must be refused with a
        # BadRequest whose message starts with "bad configuration".
        schedule = ProcessSchedule()

        # a non-JSON-serializable IonObject
        unserializable = ProcessTarget()
        bad_configuration = {"bad": unserializable}

        with self.assertRaises(BadRequest) as caught:
            self.pd_cli.schedule_process(self.process_definition_id,
                                         schedule,
                                         configuration=bad_configuration)
        message = caught.exception.message
        self.assertTrue(message.startswith("bad configuration"))

    def test_cancel_notfound(self):
        # cancelling an unknown process id must raise NotFound
        self.assertRaises(NotFound,
                          self.pd_cli.cancel_process,
                          "not-a-real-process-id")

    def test_create_invalid_definition(self):
        # create process definition missing module and class
        # verifies L4-CI-CEI-RQ137
        url_only = {"url": "http://somewhere.com/something.py"}
        definition = ProcessDefinition(name="test_process",
                                       executable=url_only)
        self.assertRaises(BadRequest,
                          self.pd_cli.create_process_definition,
                          definition)
Example #23
0
class BaseHighAvailabilityAgentTest(IonIntegrationTestCase):

    @needs_epu
    def setUp(self):
        """Start the r2cei deployment and spawn an HA agent for a test process.

        Registers a uniquely named process definition and a matching
        service definition, records the services and processes that already
        exist (used as a baseline by the helper methods), then spawns the
        HA agent and builds a client for it.
        """
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')
        self.pd_cli = ProcessDispatcherServiceClient(to_name="process_dispatcher")

        definition_id = uuid4().hex
        definition_name = 'test_haagent_%s' % definition_id
        self.process_definition_id = definition_id
        self.process_definition_name = definition_name
        executable = {
            'module': 'ion.agents.cei.test.test_haagent',
            'class': 'TestProcess'
        }
        self.process_definition = ProcessDefinition(name=definition_name,
                                                    executable=executable)
        self.pd_cli.create_process_definition(self.process_definition,
                                              definition_id)

        registry = self.container.resource_registry
        service_definition_text = SERVICE_DEFINITION_TMPL % definition_name
        service_definition = IonObject(RT.ServiceDefinition, {
            "name": definition_name,
            "definition": service_definition_text,
        })
        self.service_def_id, _ = registry.create(service_definition)

        self.resource_id = "haagent_1234"
        self._haa_name = "high_availability_agent"
        self._haa_dashi_name = "dashi_haa_" + uuid4().hex
        self._haa_dashi_uri = get_dashi_uri_from_cfg()
        self._haa_dashi_exchange = "hatests"
        self._haa_config = self._get_haagent_config()

        # baselines: whatever exists now is not attributed to the test
        self._base_services, _ = registry.find_resources(
            restype="Service", name=definition_name)
        self._base_procs = self.pd_cli.list_processes()

        self.waiter = ProcessStateWaiter()
        self.waiter.start()

        self.container_client = ContainerAgentClient(
            node=self.container.node, name=self.container.name)
        self._spawn_haagent()

        self._setup_haa_client()

    def _get_haagent_config(self):
        """Return the spawn configuration for the HA agent.

        The policy is ``npreserving`` with ``preserve_n`` 0 and a policy
        interval of 1; dashi messaging is enabled with the exchange and
        name chosen in setUp.
        """
        policy = {
            'interval': 1,
            'name': 'npreserving',
            'parameters': {'preserve_n': 0},
        }
        highavailability = {
            'policy': policy,
            'process_definition_id': self.process_definition_id,
            'dashi_messaging': True,
            'dashi_exchange': self._haa_dashi_exchange,
            'dashi_name': self._haa_dashi_name,
        }
        agent = {'resource_id': self.resource_id}
        return {'highavailability': highavailability, 'agent': agent}

    def _setup_haa_client(self):
        """Create the clients used to talk to the HA agent.

        A SimpleResourceAgentClient bound to ``self.resource_id`` is wrapped
        in a HighAvailabilityAgentClient and stored as ``self.haa_client``.
        """
        pyon_client = SimpleResourceAgentClient(self.resource_id,
                                                process=FakeProcess())
        self._haa_pyon_client = pyon_client
        self.haa_client = HighAvailabilityAgentClient(pyon_client)

    def _spawn_haagent(self, policy_parameters=None):
        """Spawn the HA agent process and remember its pid in ``_haa_pid``.

        policy_parameters -- when not None, replaces the policy parameters
                             in a copy of ``self._haa_config``; the stored
                             config itself is never modified
        """
        spawn_config = deepcopy(self._haa_config)
        if policy_parameters is not None:
            policy = spawn_config['highavailability']['policy']
            policy['parameters'] = policy_parameters

        self._haa_pid = self.container_client.spawn_process(
            name=self._haa_name,
            module="ion.agents.cei.high_availability_agent",
            cls="HighAvailabilityAgent",
            config=spawn_config)

    def _kill_haagent(self):
        """Terminate the HA agent process recorded in ``_haa_pid``."""
        haa_pid = self._haa_pid
        self.container.terminate_process(haa_pid)

    def tearDown(self):
        """Scale the HA agent down to zero processes and release everything.

        The scale-down checks can raise (for example when a test already
        terminated the HA agent, or when processes are still running).
        Cleanup therefore runs in ``finally`` blocks so the waiter, the HA
        agent, the service definition and the container are always
        released, and the original failure still propagates afterwards.
        """
        try:
            new_policy = {'preserve_n': 0}
            self.haa_client.reconfigure_policy(new_policy)

            self.assertEqual(len(self.get_running_procs()), 0)
            self.await_ha_state('STEADY')
        finally:
            try:
                self.waiter.stop()
                try:
                    self._kill_haagent()
                except BadRequest:
                    log.warning("Couldn't terminate HA Agent in teardown (May have been terminated by a test)")
                self.container.resource_registry.delete(self.service_def_id, del_associations=True)
            finally:
                # the container must go down even if the steps above failed
                self._stop_container()

    def get_running_procs(self):
        """returns a normalized list of running procs (removes the ones that
        were there at setup time)

        Only processes whose state is ProcessStateEnum.RUNNING and whose id
        was not present in ``self._base_procs`` are returned.
        """
        # a set makes the membership test below a constant-time lookup
        base_pids = set(proc.process_id for proc in self._base_procs)
        current = self.pd_cli.list_processes()
        current_pids = [proc.process_id for proc in current]
        # this helper is polled in loops, so report through the logger
        # instead of printing to stdout
        log.debug("filtering base procs %s from %s", base_pids, current_pids)
        return [proc for proc in current
                if proc.process_id not in base_pids
                and proc.process_state == ProcessStateEnum.RUNNING]

    def get_new_services(self):
        """Return registered Service resources that did not exist at setUp.

        Services are looked up by the test's process definition name and
        compared by name against ``self._base_services``.
        """
        known_names = [service.name for service in self._base_services]
        registered, _ = self.container.resource_registry.find_resources(
            restype="Service", name=self.process_definition_name)

        new_services = []
        for service in registered:
            if service.name in known_names:
                continue
            new_services.append(service)
        return new_services

    def await_ha_state(self, want_state, timeout=20):
        """Poll the HA agent status until it equals ``want_state``.

        want_state -- status value to wait for
        timeout    -- number of polls; one second is slept after every
                      unsuccessful poll, so this is roughly a duration in
                      seconds

        Raises Exception when the state was not reached in time. Errors
        while querying the agent are logged and the poll is retried.
        """
        for _ in range(timeout):
            try:
                status = self.haa_client.status().result
                if status == want_state:
                    return
                num_procs = len(self.get_running_procs())
                log.debug("assert wants state %s, got state %s, with %s procs",
                          want_state, status, num_procs)
            except Exception:
                # deliberate best effort: the agent may not be reachable yet
                log.exception("Problem getting HA status, trying again...")
            # wait after a mismatch as well as after an error; sleeping only
            # on errors would exhaust all polls almost instantly
            gevent.sleep(1)

        raise Exception("Took more than %s to get to ha state %s" % (timeout, want_state))

    def await_pyon_ha_state(self, want_state, timeout=20):
        """Poll the HA agent's Service resource until its state is ``want_state``.

        The service id is taken from the agent's ``dump()`` result and the
        Service is read from the resource registry on every poll.

        want_state -- service state to wait for
        timeout    -- number of polls; one second is slept after every
                      unsuccessful poll, so this is roughly a duration in
                      seconds

        Raises Exception when the state was not reached in time. Errors
        while querying are logged and the poll is retried.
        """
        for _ in range(timeout):
            try:
                result = self.haa_client.dump().result
                service_id = result.get('service_id')
                service = self.container.resource_registry.read(service_id)

                if service.state == want_state:
                    return
                # the arguments belong inside the logging call; applying %
                # to its return value raised TypeError on every mismatch
                log.debug("want state %s, got state %s", want_state, service.state)
            except Exception:
                # deliberate best effort: agent or service may not exist yet
                log.exception("Problem getting HA status, trying again...")
            # wait after a mismatch as well as after an error
            gevent.sleep(1)

        raise Exception("Took more than %s to get to pyon ha state %s" % (timeout, want_state))
Example #24
0
class HighAvailabilityAgentSensorPolicyTest(IonIntegrationTestCase):

    def _start_webserver(self, port=None):
        """Start a local HTTP server that answers every GET with a canned body.

        The body is whatever is currently stored in the server's
        ``response`` attribute (initially the empty string). The server is
        kept in ``self._webserver`` and runs in a greenlet kept in
        ``self._web_glet``.

        Note: tries really hard to get a port, and if it can't use
        the suggested port, randomly picks another, and returns it.

        port -- preferred port to bind on localhost (defaults to 8008)

        Returns the port the server was actually bound to.
        Raises Exception if no port could be bound after 100 attempts.
        """
        class TestRequestHandler(BaseHTTPServer.BaseHTTPRequestHandler):
            server_version = 'test_server'
            extensions_map = ''

            def do_GET(self):
                # Read the body once so Content-Length always matches what
                # is written, even if the response is swapped concurrently.
                body = self.server.response
                self.send_response(200)
                self.send_header("Content-type", "text/plain")
                self.send_header("Content-Length", len(body))
                self.end_headers()
                self.wfile.write(body)

            def log_message(self, format, *args):
                # swallow log messages
                pass

        class Server(HTTPServer):

            response = ''

            def serve_forever(self):
                self._serving = 1
                while self._serving:
                    self.handle_request()

            def stop(self):
                self._serving = 0

        if port is None:
            port = 8008

        for _ in range(0, 100):
            try:
                self._webserver = Server(("localhost", port), TestRequestHandler)
            except socket.error:
                print("port %s is in use, picking another" % port)
                port = randint(8000, 10000)
            else:
                break
        else:
            # Without this, the spawn below would fail with an unrelated
            # AttributeError on self._webserver.
            raise Exception("Could not bind the test webserver to any port")

        self._web_glet = gevent.spawn(self._webserver.serve_forever)
        return port

    def _stop_webserver(self):
        if self._webserver is not None:
            self._webserver.stop()
            gevent.sleep(2)
            self._web_glet.kill()

    @needs_epu
    def setUp(self):
        """Start the container, the canned-response webserver and an HA agent.

        The HA agent is spawned with the 'sensor' policy and a
        'trafficsentinel' aggregator pointed at the local test webserver.
        """
        self._start_container()
        self.container.start_rel_from_url('res/deploy/r2cei.yml')
        self.pd_cli = ProcessDispatcherServiceClient(to_name="process_dispatcher")

        # Register the process definition the HA agent will be configured with.
        self.process_definition_id = uuid4().hex
        executable = {
            'module': 'ion.agents.cei.test.test_haagent',
            'class': 'TestProcess'
        }
        self.process_definition = ProcessDefinition(name='test', executable=executable)
        self.pd_cli.create_process_definition(self.process_definition,
                self.process_definition_id)

        # The webserver may end up on a different port than the one requested.
        http_port = self._start_webserver(port=8919)

        self.resource_id = "haagent_4567"
        self._haa_name = "high_availability_agent"

        sensor_policy = {
            'interval': 1,
            'name': 'sensor',
            'parameters': {
                'metric': 'app_attributes:ml',
                'sample_period': 600,
                'sample_function': 'Average',
                'cooldown_period': 20,
                'scale_up_threshold': 2.0,
                'scale_up_n_processes': 1,
                'scale_down_threshold': 1.0,
                'scale_down_n_processes': 1,
                'maximum_processes': 5,
                'minimum_processes': 1,
            }
        }
        aggregator = {
            'type': 'trafficsentinel',
            'host': 'localhost',
            'port': http_port,
            'protocol': 'http',
            'username': '******',
            'password': '******'
        }
        self._haa_config = {
            'highavailability': {
                'policy': sensor_policy,
                'aggregator': aggregator,
                'process_definition_id': self.process_definition_id,
                "process_dispatchers": [
                    'process_dispatcher'
                ]
            },
            'agent': {'resource_id': self.resource_id},
        }

        # Snapshot of the processes that exist before the HA agent starts.
        self._base_procs = self.pd_cli.list_processes()

        self.waiter = ProcessStateWaiter()
        self.waiter.start()

        self.container_client = ContainerAgentClient(
            node=self.container.node, name=self.container.name)
        self._haa_pid = self.container_client.spawn_process(
            name=self._haa_name,
            module="ion.agents.cei.high_availability_agent",
            cls="HighAvailabilityAgent",
            config=self._haa_config)

        # Start a resource agent client to talk with the HA agent.
        self._haa_pyon_client = SimpleResourceAgentClient(self.resource_id, process=FakeProcess())
        log.info('Got haa client %s.', str(self._haa_pyon_client))

        self.haa_client = HighAvailabilityAgentClient(self._haa_pyon_client)


    def tearDown(self):
        """Stop the waiter, the HA agent process, the webserver and the container.

        Every step is attempted even if an earlier one raises, so a failing
        cleanup step cannot leave the webserver or the container running.
        The exception (if any) still propagates once all steps have run.
        """
        try:
            self.waiter.stop()
        finally:
            try:
                self.container.terminate_process(self._haa_pid)
            finally:
                try:
                    self._stop_webserver()
                finally:
                    self._stop_container()

    def get_running_procs(self):
        """Return the RUNNING processes that are not in ``self._base_procs``.

        Lists the processes known to the process dispatcher client, drops
        those whose process_id appears in the baseline list, and keeps
        only the ones whose state is ProcessStateEnum.RUNNING.
        """
        base_pids = [proc.process_id for proc in self._base_procs]
        current = self.pd_cli.list_processes()
        current_pids = [proc.process_id for proc in current]
        print("filtering base procs %s from %s" % (base_pids, current_pids))

        running = []
        for proc in current:
            if proc.process_id in base_pids:
                continue
            if proc.process_state == ProcessStateEnum.RUNNING:
                running.append(proc)
        return running

    def _get_managed_upids(self):
        result = self.haa_client.dump().result
        upids = result['managed_upids']
        return upids

    def _set_response(self, response):
        self._webserver.response = response

    def _await_steady(self, attempts=5):
        """Poll the HA agent until it reports 'STEADY', failing the test otherwise.

        Checks the status up to ``attempts`` times, sleeping one second
        after each miss.
        """
        for _ in range(0, attempts):
            if self.haa_client.status().result == 'STEADY':
                return
            gevent.sleep(1)
        self.fail("HA Service took too long to get to state STEADY")

    def _set_ml_for_managed_procs(self, ml):
        """Serve one '<upid>,ml=<ml>' line per process managed by the HA agent."""
        upids = self._get_managed_upids()
        self._set_response("".join("%s,ml=%s\n" % (upid, ml) for upid in upids))

    def test_sensor_policy(self):
        """Drive the sensor policy through scale-up, steady and scale-down.

        The 'ml' value served by the test webserver is moved above, between
        and below the thresholds configured in setUp, and the number of
        running processes is checked after each change.
        """
        status = self.haa_client.status().result
        # Ensure HA hasn't already failed
        self.assertIn(status, ('PENDING', 'READY', 'STEADY'))

        self.waiter.await_state_event(state=ProcessStateEnum.RUNNING)

        self.assertEqual(len(self.get_running_procs()), 1)
        self._await_steady()

        # Set ml for each proc such that we scale up
        self._set_ml_for_managed_procs('5')

        self.waiter.await_state_event(state=ProcessStateEnum.RUNNING)

        self.assertEqual(len(self.get_running_procs()), 2)

        # Set ml so we stay steady
        self._set_ml_for_managed_procs('1.5')

        self.assertEqual(len(self.get_running_procs()), 2)
        self._await_steady()

        # Set ml so we scale down
        self._set_ml_for_managed_procs('0.5')

        self.waiter.await_state_event(state=ProcessStateEnum.TERMINATED)

        self.assertEqual(len(self.get_running_procs()), 1)
        self._await_steady()