Beispiel #1
0
    def start(self):
        """Create the command subscriber and the result publisher.

        The subscriber is bound to the queue named by the "command_queue"
        entry of the PD config, falling back to "pd_command" when that entry
        is missing or empty. Its listen loop is spawned with activate=False,
        and this method waits on the subscriber's ready event before creating
        the publisher used for command replies.
        """
        configured_queue = get_safe(self._pd_core.pd_cfg, "command_queue")
        cmd_queue = configured_queue if configured_queue else "pd_command"

        subscriber = Subscriber(binding=cmd_queue, from_name=cmd_queue,
                                callback=self._receive_command)
        self.sub_cont = subscriber
        self.sub_cont_gl = spawn(subscriber.listen, activate=False)
        ready_event = subscriber.get_ready_event()
        ready_event.wait()

        self.pub_result = Publisher()
Beispiel #2
0
class ProcessDispatcherClient(object):
    """Client that publishes commands to the process dispatcher command queue.

    Every command is a dict with a "command" name, a freshly generated
    "command_id" and command specific fields. The command methods return the
    generated command id.
    """

    def __init__(self, container, to_name=None):
        """Set up the command publisher.

        Args:
            container: stored on the instance as ``self.container``.
            to_name: name of the command queue. When falsy, the config value
                "container.process.pd_command_queue" is used, which itself
                defaults to "pd_command".
        """
        self.container = container
        self._cmd_queue_name = to_name or CFG.get_safe("container.process.pd_command_queue", "pd_command")

        self.cmd_pub = Publisher(to_name=self._cmd_queue_name)

    def _publish_command(self, command, **fields):
        """Build a command dict with a new command id, publish it, return the id."""
        command_id = create_simple_unique_id()
        action_cmd = dict(command=command, command_id=command_id, **fields)
        self.cmd_pub.publish(action_cmd)
        return command_id

    # -------------------------------------------------------------------------
    # Public API (callable by process management service)

    def start_rel(self, rel_def, reply_to=None):
        """Publish a "start_rel" command for the given rel definition.

        Args:
            rel_def: rel definition sent as the "rel_def" field.
            reply_to: optional name sent as the "reply_to" field.

        Returns:
            The id of the published command.
        """
        return self._publish_command("start_rel", rel_def=rel_def, reply_to=reply_to)

    def start_rel_blocking(self, rel_def, timeout=None):
        """Publish a "start_rel" command and wait for its result.

        Args:
            rel_def: rel definition sent as the "rel_def" field.
            timeout: passed through to the waiter.

        Returns:
            Whatever the waiter's "await" method returns for this command id.
        """
        waiter = AsyncResultWaiter()
        reply_to = waiter.activate()
        command_id = self.start_rel(rel_def, reply_to)
        # "await" is a reserved keyword since Python 3.7, so the attribute
        # form "waiter.await(...)" no longer parses; look the method up by name.
        await_result = getattr(waiter, "await")
        # This could take a long time!
        return await_result(timeout=timeout, request_id=command_id)

    def schedule(self, process_id, process_definition, schedule, configuration, name):
        """Publish a "schedule" command carrying the given arguments as fields.

        Returns:
            The id of the published command.
        """
        return self._publish_command("schedule",
                                     process_id=process_id, process_definition=process_definition,
                                     schedule=schedule, configuration=configuration, name=name)

    def cancel(self, process_id):
        """Publish a "cancel" command for the given process id and return the command id."""
        return self._publish_command("cancel", process_id=process_id)

    def list(self):
        """Publish a "list" command and return the command id."""
        return self._publish_command("list")
Beispiel #3
0
    def __init__(self, container, to_name=None):
        """Set up the command publisher.

        Args:
            container: stored on the instance as ``self.container``.
            to_name: name of the command queue. When falsy, the config value
                "container.process.pd_command_queue" is used, which itself
                defaults to "pd_command".
        """
        self.container = container

        queue_name = to_name
        if not queue_name:
            queue_name = CFG.get_safe("container.process.pd_command_queue", "pd_command")
        self._cmd_queue_name = queue_name

        self.cmd_pub = Publisher(to_name=queue_name)
Beispiel #4
0
class ProcessDispatcherDecisionEngine(object):
    """ Base class for PD decision engines """

    def __init__(self, pd_core):
        """Wire the engine to the PD core and load the dispatch rules.

        Keeps references to the core's container, registry and executor,
        registers ``_leader_callback`` with the core's leader manager and
        finally calls ``_load_rules()``.

        Args:
            pd_core: PD core object providing ``container``, ``registry``,
                ``executor`` and ``leader_manager``.
        """
        self._pd_core = pd_core
        self.container = pd_core.container
        self.registry = pd_core.registry
        self.executor = pd_core.executor

        # Command subscriber and the handle of its spawned listen loop (both
        # assigned in start()), plus the flag tracking subscriber activation.
        self.sub_cont = self.sub_cont_gl = None
        self.sub_active = False

        pd_core.leader_manager.add_leader_callback(self._leader_callback)

        self._load_rules()

    def start(self):
        """Create the command subscriber and the result publisher.

        The subscriber is bound to the queue named by the "command_queue"
        entry of the PD config, falling back to "pd_command" when that entry
        is missing or empty. Its listen loop is spawned with activate=False,
        and this method waits on the subscriber's ready event before creating
        the publisher used for command replies.
        """
        configured_queue = get_safe(self._pd_core.pd_cfg, "command_queue")
        cmd_queue = configured_queue if configured_queue else "pd_command"

        subscriber = Subscriber(binding=cmd_queue, from_name=cmd_queue,
                                callback=self._receive_command)
        self.sub_cont = subscriber
        self.sub_cont_gl = spawn(subscriber.listen, activate=False)
        ready_event = subscriber.get_ready_event()
        ready_event.wait()

        self.pub_result = Publisher()

    def stop(self):
        if self.sub_cont:
            self.sub_cont.close()
            self.sub_cont_gl.join(timeout=2)
            self.sub_cont_gl.kill()
            self.sub_cont_gl = None
            self.sub_cont = None

    def _leader_callback(self, leader_info):
        if leader_info["action"] == "acquire_leader":
            def start_sub():
                if not self.registry.preconditions_true.is_set():
                    log.info("PD is leader - awaiting PD preconditions")
                    # Await preconditions
                    await_timeout = get_safe(self._pd_core.pd_cfg, "engine.await_preconditions.await_timeout")
                    precond_true = self.registry.preconditions_true.wait(timeout=await_timeout)
                    if not precond_true:
                        log.warn("PD preconditions not satisfied after timeout - continuing")

                if self._pd_core.is_leader() and self.sub_cont is not None and not self.sub_active:
                    # Are we still leader? Not activated?
                    num_msg, num_cons = self.sub_cont.get_stats()
                    log.info("PD is leader - starting to consume (%s pending commands, %s consumers)", num_msg, num_cons)
                    self.sub_cont.activate()
                    self.sub_active = True
            start_sub_gl = spawn(start_sub)
        elif leader_info["action"] == "release_leader":
            if self.sub_cont is not None and self.sub_active:
                self.sub_cont.deactivate()
                self.sub_active = False

    def _receive_command(self, command, headers, *args):
        """Subscriber callback: run the handler for a received command.

        The handler is the method named ``_cmd_<command name>``. An unknown
        command or an exception raised by the handler produces a status 400
        reply; otherwise the handler's return value is sent as the reply.

        Args:
            command: command dict with at least a "command" entry.
            headers: message headers (not used here).
            *args: additional callback arguments (not used here).
        """
        cmd_name = command["command"]
        log.info("PD execute command %s", cmd_name)

        handler = getattr(self, "_cmd_{}".format(cmd_name), None)
        if not handler:
            log.warn("Command function not found")
            self._send_command_reply(command, dict(message="ERROR"), status=400)
            return

        try:
            outcome = handler(command)
        except Exception as ex:
            log.exception("Error executing command")
            self._send_command_reply(command, dict(message=str(ex)), status=400)
        else:
            # Sent outside the try body so reply errors are not reported as command errors
            self._send_command_reply(command, outcome)

    def _send_command_reply(self, command, result=None, status=200):
        reply_to = command.get("reply_to", None)
        if not reply_to:
            return
        res_msg = AsyncResultMsg(request_id=command["command_id"], result=result, status=status)
        self.pub_result.publish(to_name=reply_to, msg=res_msg)

    # -------------------------------------------------------------------------

    def _cmd_start_rel(self, command):
        """Start all process apps listed in the rel definition of a command.

        For every entry of ``command["rel_def"]["apps"]`` the target engine is
        determined. Entries with a "processapp" triple (name, module, class)
        are spawned, either once or "replicas" times. Other entries are
        logged as unsupported and skipped.

        The replica count is capped by the "container.process.max_replicas"
        config value when that is greater than 0. A resulting count below 1 or
        above 100 is treated as invalid and replaced by 1. The first replica
        keeps the plain process name, later ones are named "<name>.<index>".

        Args:
            command: command dict with a "rel_def" entry.
        """
        log.debug("Start rel")
        rel = command["rel_def"]

        max_proc_replicas = int(CFG.get_safe("container.process.max_replicas", 0))

        for rel_app_cfg in rel["apps"]:
            app_name = rel_app_cfg["name"]
            log.debug("app definition in rel: %s", rel_app_cfg)

            # Decide where process should go
            target_engine = self._determine_target_engine(rel_app_cfg)
            log.debug("Dispatch app %s to engine %s", app_name, target_engine)

            if 'processapp' not in rel_app_cfg:
                log.warn("App file not supported")
                continue

            name, module, cls = rel_app_cfg["processapp"]
            rel_cfg = deepcopy(rel_app_cfg["config"]) if 'config' in rel_app_cfg else None

            if 'replicas' in rel_app_cfg:
                proc_replicas = int(rel_app_cfg["replicas"])
                if max_proc_replicas > 0:
                    if proc_replicas > max_proc_replicas:
                        log.info("Limiting number of proc replicas to %s from %s", max_proc_replicas, proc_replicas)
                    proc_replicas = min(proc_replicas, max_proc_replicas)
                if proc_replicas < 1 or proc_replicas > 100:
                    log.warn("Invalid number of process replicas: %s", proc_replicas)
                    proc_replicas = 1
                # range() instead of the Python-2-only xrange(); at most 100 items
                proc_names = ["%s.%s" % (name, i) if i else name for i in range(proc_replicas)]
            else:
                proc_names = [name]

            # Spawn one after the other: each container choice sees the
            # allocations registered for the processes spawned before it.
            for proc_name in proc_names:
                self._spawn_and_slot_process(rel_app_cfg, target_engine, proc_name, module, cls, rel_cfg)

    def _spawn_and_slot_process(self, rel_app_cfg, target_engine, proc_name, module, cls, config):
        """Spawn one process in a selected container, wait for it and register it as RUNNING."""
        cont_info = self._determine_target_container(rel_app_cfg, target_engine)
        container_name = self._get_cc_agent_name(cont_info)
        action_res = self._add_spawn_process_action(cc_agent=container_name, proc_name=proc_name,
                                                    module=module, cls=cls, config=config)
        proc_id = action_res.wait()
        self._slot_process(cont_info, proc_id, dict(proc_name=proc_name, state=ProcessStateEnum.RUNNING))
        return proc_id

    def _add_spawn_process_action(self, cc_agent, proc_name, module, cls, config):
        """Hand a "spawn_process" action to the executor.

        The action is the tuple (action name, result object, keyword
        arguments), where the keyword arguments are exactly the arguments of
        this method.

        Returns:
            The AsyncResult object that was passed along with the action.
        """
        pending_result = AsyncResult()
        spawn_kwargs = {
            "cc_agent": cc_agent,
            "proc_name": proc_name,
            "module": module,
            "cls": cls,
            "config": config,
        }
        self.executor.add_action(("spawn_process", pending_result, spawn_kwargs))
        return pending_result

    # -------------------------------------------------------------------------

    def _load_rules(self):
        """Read dispatch rules and default engine name from the PD config.

        Sets ``rules_cfg`` from "engine.dispatch_rules" (empty list when unset
        or empty) and ``default_engine`` from "engine.default_engine"
        ("default" when unset or empty).
        """
        pd_cfg = self._pd_core.pd_cfg

        configured_rules = get_safe(pd_cfg, "engine.dispatch_rules")
        self.rules_cfg = configured_rules if configured_rules else []

        configured_engine = get_safe(pd_cfg, "engine.default_engine")
        self.default_engine = configured_engine if configured_engine else "default"

    def _determine_target_engine(self, app_cfg):

        # Determine nominal engine
        target_engine = self.default_engine
        app_name = app_cfg["name"]
        for rule in self.rules_cfg:
            if "appname_pattern" in rule:
                if re.match(rule["appname_pattern"], app_name):
                    target_engine = rule["engine"]

        # For check if engine has actual containers
        ee_containers = self.registry.get_engine_containers()
        if not ee_containers.get(target_engine, None):
            target_engine = self.default_engine

        return target_engine

    def _determine_target_container(self, app_cfg, target_engine):
        """Select the container in which a process of the app should run.

        Candidates are the registry's containers for the target engine,
        ordered by their "ts_created" value. The "engine.dispatch_spread"
        config value (default "round_robin") selects the strategy:

        - "round_robin": the container with the fewest allocations among
          those below their "capacity.max"; on a tie the earlier created one.
        - "fill_up": the earliest created container below its "capacity.max".
        - "random": any candidate, without looking at capacity.

        A "capacity.max" of 0 (also the fallback when unset) means the
        container offers no slot.

        Args:
            app_cfg: app definition with a "name" entry.
            target_engine: engine name; determined from app_cfg when None.

        Returns:
            The selected container info.

        Raises:
            BadRequest: no containers for the engine, no open slot, or an
                unknown dispatch_spread value.
        """
        app_name = app_cfg["name"]
        if target_engine is None:
            target_engine = self._determine_target_engine(app_cfg)

        # Determine available containers
        candidates = self.registry.get_engine_containers().get(target_engine, None)
        if not candidates:
            raise BadRequest("No running containers for app {}".format(app_name))
        by_creation = sorted(candidates, key=lambda c: c["ts_created"])

        # Load balance across containers
        strategy = get_safe(self._pd_core.pd_cfg, "engine.dispatch_spread") or "round_robin"

        if strategy == "random":
            return by_creation[random.randint(0, len(by_creation)-1)]

        chosen = None
        if strategy == "round_robin":
            lowest_alloc = 9999999
            for cont in by_creation:
                if len(cont["allocation"]) >= lowest_alloc:
                    continue
                max_capacity = int(get_safe(cont["ee_info"], "capacity.max", 0))
                if max_capacity and len(cont["allocation"]) < max_capacity:
                    chosen = cont
                    lowest_alloc = len(cont["allocation"])
        elif strategy == "fill_up":
            for cont in by_creation:
                if len(cont["allocation"]) < int(get_safe(cont["ee_info"], "capacity.max", 0)):
                    chosen = cont
                    break
        else:
            raise BadRequest("Unknown dispatch_spread {}".format(strategy))

        if not chosen:
            raise BadRequest("Could not find open slot")
        return chosen

    def _get_cc_agent_name(self, cont_info):
        container_name = cont_info["cc_obj"].cc_agent
        return container_name

    def _slot_process(self, cont_info, proc_id, proc_info):
        self.registry.register_process(cont_info["container_id"], proc_id, proc_info, update=False)