Example 1
    def receiveMsg_StartEngine(self, msg, sender):
        self.logger.info("Received signal from race control to start engine.")
        self.race_control = sender
        self.cfg = msg.cfg
        cls = metrics.metrics_store_class(self.cfg)
        self.metrics_store = cls(self.cfg)
        self.metrics_store.open(ctx=msg.open_metrics_context)
        self.car, _ = load_team(self.cfg, msg.external)

        # In our startup procedure we first create all mechanics. Only if this succeeds we'll continue.
        hosts = self.cfg.opts("client", "hosts").default
        if len(hosts) == 0:
            raise exceptions.LaunchError("No target hosts are configured.")

        if msg.external:
            self.logger.info("Cluster will not be provisioned by Rally.")
            # just create one actor for this special case and run it on the coordinator node (i.e. here)
            m = self.createActor(NodeMechanicActor,
                                 targetActorRequirements={"coordinator": True})
            self.children.append(m)
            self.send(m, msg.for_nodes(ip=hosts))
        else:
            self.logger.info(
                "Cluster consisting of %s will be provisioned by Rally.",
                hosts)
            msg.hosts = hosts
            # Initialize the children array to have the right size to
            # ensure waiting for all responses
            self.children = [None] * len(nodes_by_host(to_ip_port(hosts)))
            self.send(self.createActor(Dispatcher), msg)
        self.status = "starting"
        self.received_responses = []
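
All of the variants in this section obtain their metrics store the same way: metrics.metrics_store_class(cfg) returns a class, that class is instantiated with the config, and the instance is opened with the context carried in the incoming message. Below is a minimal, self-contained sketch of such a config-driven factory; the class names and the "reporting"/"datastore.type" option are assumptions for illustration, not Rally's actual implementation.

class InMemoryMetricsStore:
    # Keeps metrics in memory only; nothing is persisted (illustrative stub).
    def __init__(self, cfg):
        self.cfg = cfg
        self.ctx = None

    def open(self, ctx=None, **meta):
        # Attach the meta-data (race id, track, challenge, ...) carried in ctx.
        self.ctx = ctx or meta


class EsMetricsStore(InMemoryMetricsStore):
    # Variant that would additionally persist metrics to Elasticsearch (stub).
    pass


def metrics_store_class(cfg):
    # Select the concrete store class based on configuration; the option name
    # is an assumption made for this sketch.
    if cfg.opts("reporting", "datastore.type") == "elasticsearch":
        return EsMetricsStore
    return InMemoryMetricsStore

Usage then mirrors the snippets above: cls = metrics_store_class(cfg); store = cls(cfg); store.open(ctx=msg.open_metrics_context).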
Example 2
    def on_start_engine(self, msg, sender):
        logger.info("Received signal from race control to start engine.")
        self.race_control = sender
        self.cfg = msg.cfg
        cls = metrics.metrics_store_class(self.cfg)
        self.metrics_store = cls(self.cfg)
        self.metrics_store.open(ctx=msg.open_metrics_context)

        # In our startup procedure we first create all mechanics. Only if this succeeds we'll continue.
        mechanics_and_start_message = []
        hosts = self.cfg.opts("client", "hosts")
        if len(hosts) == 0:
            raise exceptions.LaunchError("No target hosts are configured.")

        if msg.external:
            logger.info("Cluster will not be provisioned by Rally.")
            # just create one actor for this special case and run it on the coordinator node (i.e. here)
            m = self.createActor(NodeMechanicActor,
                                 #globalName="/rally/mechanic/worker/external",
                                 targetActorRequirements={"coordinator": True})
            self.children.append(m)
            mechanics_and_start_message.append((m, msg.for_nodes(ip=hosts)))
        else:
            logger.info("Cluster consisting of %s will be provisioned by Rally." % hosts)
            all_ips_and_ports = to_ip_port(hosts)
            all_node_ips = extract_all_node_ips(all_ips_and_ports)
            for ip_port, nodes in nodes_by_host(all_ips_and_ports).items():
                ip, port = ip_port
                if ip == "127.0.0.1":
                    m = self.createActor(NodeMechanicActor,
                                         #globalName="/rally/mechanic/worker/localhost",
                                         targetActorRequirements={"coordinator": True})
                    self.children.append(m)
                    mechanics_and_start_message.append((m, msg.for_nodes(all_node_ips, ip, port, nodes)))
                else:
                    if self.cfg.opts("system", "remote.benchmarking.supported"):
                        logger.info("Benchmarking against %s with external Rally daemon." % hosts)
                    else:
                        logger.error("User tried to benchmark against %s but no external Rally daemon has been started." % hosts)
                        raise exceptions.SystemSetupError("To benchmark remote hosts (e.g. %s) you need to start the Rally daemon "
                                                          "on each machine including this one." % ip)
                    already_running = actor.actor_system_already_running(ip=ip)
                    logger.info("Actor system on [%s] already running? [%s]" % (ip, str(already_running)))
                    if not already_running:
                        console.println("Waiting for Rally daemon on [%s] " % ip, end="", flush=True)
                    while not actor.actor_system_already_running(ip=ip):
                        console.println(".", end="", flush=True)
                        time.sleep(3)
                    if not already_running:
                        console.println(" [OK]")
                    m = self.createActor(NodeMechanicActor,
                                         #globalName="/rally/mechanic/worker/%s" % ip,
                                         targetActorRequirements={"ip": ip})
                    mechanics_and_start_message.append((m, msg.for_nodes(all_node_ips, ip, port, nodes)))
                    self.children.append(m)
        self.status = "starting"
        self.received_responses = []
        for mechanic_actor, start_message in mechanics_and_start_message:
            self.send(mechanic_actor, start_message)
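
In the remote branch above, the coordinator blocks while polling actor.actor_system_already_running(ip=ip) until the Rally daemon on the target host answers. The same pattern can be written as a small, self-contained helper; the sketch below is hedged (the name wait_until_running and its callback parameters are invented for illustration), with the readiness check passed in as a callable so the sketch does not depend on Rally's modules.

import time


def wait_until_running(is_running, on_wait_start=None, on_poll=None, poll_interval=3):
    # is_running: zero-argument callable, e.g.
    #   lambda: actor.actor_system_already_running(ip=ip)
    # Returns True if we had to wait, False if the system was already up.
    if is_running():
        return False
    if on_wait_start:
        on_wait_start()
    while not is_running():
        if on_poll:
            on_poll()
        time.sleep(poll_interval)
    return True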
Example 3
    def receiveMsg_StartNodes(self, msg, sender):
        try:
            self.host = msg.ip
            if msg.external:
                self.logger.info(
                    "Connecting to externally provisioned nodes on [%s].",
                    msg.ip)
            else:
                self.logger.info("Starting node(s) %s on [%s].", msg.node_ids,
                                 msg.ip)

            # Load node-specific configuration
            self.config = config.auto_load_local_config(
                msg.cfg,
                additional_sections=[
                    # only copy the relevant bits
                    "track",
                    "mechanic",
                    "client",
                    # allow metrics store to extract race meta-data
                    "race",
                    "source"
                ])
            # set root path (normally done by the main entry point)
            self.config.add(config.Scope.application, "node", "rally.root",
                            paths.rally_root())
            if not msg.external:
                self.config.add(config.Scope.benchmark, "provisioning",
                                "node.ip", msg.ip)
                # we need to override the port with the value that the user has specified instead of using the default value (39200)
                self.config.add(config.Scope.benchmark, "provisioning",
                                "node.http.port", msg.port)
                self.config.add(config.Scope.benchmark, "provisioning",
                                "node.ids", msg.node_ids)

            cls = metrics.metrics_store_class(self.config)
            self.metrics_store = cls(self.config)
            self.metrics_store.open(ctx=msg.open_metrics_context)
            # avoid follow-up errors in case we receive an unexpected ActorExitRequest due to an early failure in a parent actor.
            self.metrics_store.lap = 0

            self.mechanic = create(self.config, self.metrics_store,
                                   msg.all_node_ips, msg.cluster_settings,
                                   msg.sources, msg.build, msg.distribution,
                                   msg.external, msg.docker)
            nodes = self.mechanic.start_engine()
            self.running = True
            self.send(
                getattr(msg, "reply_to", sender),
                NodesStarted([NodeMetaInfo(node) for node in nodes],
                             self.metrics_store.meta_info))
        except Exception:
            self.logger.exception("Cannot process message [%s]", msg)
            # avoid "can't pickle traceback objects"
            import traceback
            ex_type, ex_value, ex_traceback = sys.exc_info()
            self.send(getattr(msg, "reply_to", sender),
                      actor.BenchmarkFailure(ex_value, traceback.format_exc()))
Example 4
    def receiveMsg_StartNodes(self, msg, sender):
        try:
            self.host = msg.ip
            if msg.external:
                self.logger.info("Connecting to externally provisioned nodes on [%s].", msg.ip)
            else:
                self.logger.info("Starting node(s) %s on [%s].", msg.node_ids, msg.ip)

            # Load node-specific configuration
            cfg = config.auto_load_local_config(
                msg.cfg,
                additional_sections=[
                    # only copy the relevant bits
                    "track",
                    "mechanic",
                    "client",
                    "telemetry",
                    # allow metrics store to extract race meta-data
                    "race",
                    "source",
                ],
            )
            # set root path (normally done by the main entry point)
            cfg.add(config.Scope.application, "node", "rally.root", paths.rally_root())
            if not msg.external:
                cfg.add(config.Scope.benchmark, "provisioning", "node.ids", msg.node_ids)

            cls = metrics.metrics_store_class(cfg)
            metrics_store = cls(cfg)
            metrics_store.open(ctx=msg.open_metrics_context)
            # avoid follow-up errors in case we receive an unexpected ActorExitRequest due to an early failure in a parent actor.

            self.mechanic = create(
                cfg,
                metrics_store,
                msg.ip,
                msg.port,
                msg.all_node_ips,
                msg.all_node_ids,
                msg.sources,
                msg.distribution,
                msg.external,
                msg.docker,
            )
            self.mechanic.start_engine()
            self.wakeupAfter(METRIC_FLUSH_INTERVAL_SECONDS)
            self.send(getattr(msg, "reply_to", sender), NodesStarted())
        except Exception:
            self.logger.exception("Cannot process message [%s]", msg)
            # avoid "can't pickle traceback objects"
            _, ex_value, _ = sys.exc_info()
            self.send(getattr(msg, "reply_to", sender), actor.BenchmarkFailure(ex_value, traceback.format_exc()))
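
Both StartNodes handlers end their except block with the same idiom: log the full exception, then extract the exception value and a formatted traceback string before replying, because traceback objects cannot be pickled and therefore cannot cross the actor boundary. A minimal sketch of that idiom follows; the BenchmarkFailure class here is a stand-in for actor.BenchmarkFailure, not Rally's own definition.

import sys
import traceback


class BenchmarkFailure:
    # Stand-in message type that carries only picklable data (strings).
    def __init__(self, message, cause):
        self.message = message
        self.cause = cause


def current_failure(description="benchmark error"):
    # Call from inside an except block: keep the exception value and the
    # formatted traceback string, never the traceback object itself.
    _, ex_value, _ = sys.exc_info()
    return BenchmarkFailure("%s: %s" % (description, ex_value), traceback.format_exc())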
Example 5
def stop(cfg):
    root_path = paths.install_root(cfg)
    node_config = provisioner.load_node_configuration(root_path)
    if node_config.build_type == "tar":
        node_launcher = launcher.ProcessLauncher(cfg)
    elif node_config.build_type == "docker":
        node_launcher = launcher.DockerLauncher(cfg)
    else:
        raise exceptions.SystemSetupError("Unknown build type [{}]".format(
            node_config.build_type))

    nodes, race_id = _load_node_file(root_path)

    cls = metrics.metrics_store_class(cfg)
    metrics_store = cls(cfg)

    race_store = metrics.race_store(cfg)
    try:
        current_race = race_store.find_by_race_id(race_id)
    except exceptions.NotFound:
        logging.getLogger(__name__).info(
            "Could not find race [%s] most likely because an in-memory metrics store is "
            "used across multiple machines. Use an Elasticsearch metrics store to persist "
            "results.", race_id)
        # we are assuming here that we use an Elasticsearch metrics store... . If we use a file race store (across
        # multiple machines) we will not be able to retrieve a race. In that case we open our in-memory metrics store
        # with settings derived from startup parameters (because we can't store system metrics persistently anyway).
        current_race = metrics.create_race(cfg, track=None, challenge=None)

    metrics_store.open(race_id=current_race.race_id,
                       race_timestamp=current_race.race_timestamp,
                       track_name=current_race.track_name,
                       challenge_name=current_race.challenge_name)

    node_launcher.stop(nodes, metrics_store)
    _delete_node_file(root_path)

    metrics_store.flush(refresh=True)
    for node in nodes:
        results = metrics.calculate_system_results(metrics_store,
                                                   node.node_name)
        current_race.add_results(results)
        metrics.results_store(cfg).store_results(current_race)

    metrics_store.close()

    # TODO: Do we need to expose this as a separate command as well?
    provisioner.cleanup(preserve=cfg.opts("mechanic", "preserve.install"),
                        install_dir=node_config.binary_path,
                        data_paths=node_config.data_paths)
Example 6
def stop(cfg):
    root_path = paths.install_root(cfg)
    node_config = provisioner.load_node_configuration(root_path)
    if node_config.build_type == "tar":
        node_launcher = launcher.ProcessLauncher(cfg)
    elif node_config.build_type == "docker":
        node_launcher = launcher.DockerLauncher(cfg)
    else:
        raise exceptions.SystemSetupError("Unknown build type [{}]".format(
            node_config.build_type))

    nodes, race_id = _load_node_file(root_path)

    cls = metrics.metrics_store_class(cfg)
    metrics_store = cls(cfg)

    race_store = metrics.race_store(cfg)
    try:
        current_race = race_store.find_by_race_id(race_id)
        metrics_store.open(race_id=current_race.race_id,
                           race_timestamp=current_race.race_timestamp,
                           track_name=current_race.track_name,
                           challenge_name=current_race.challenge_name)
    except exceptions.NotFound:
        logging.getLogger(__name__).info(
            "Could not find race [%s] and will thus not persist system metrics.",
            race_id)
        # Don't persist system metrics if we can't retrieve the race as we cannot derive the required meta-data.
        current_race = None
        metrics_store = None

    node_launcher.stop(nodes, metrics_store)
    _delete_node_file(root_path)

    if current_race:
        metrics_store.flush(refresh=True)
        for node in nodes:
            results = metrics.calculate_system_results(metrics_store,
                                                       node.node_name)
            current_race.add_results(results)
            metrics.results_store(cfg).store_results(current_race)

        metrics_store.close()

    # TODO: Do we need to expose this as a separate command as well?
    provisioner.cleanup(preserve=cfg.opts("mechanic", "preserve.install"),
                        install_dir=node_config.binary_path,
                        data_paths=node_config.data_paths)
Example 7
    def receiveMsg_StartEngine(self, msg, sender):
        self.logger.info("Received signal from race control to start engine.")
        self.race_control = sender
        self.cfg = msg.cfg
        cls = metrics.metrics_store_class(self.cfg)
        self.metrics_store = cls(self.cfg)
        self.metrics_store.open(ctx=msg.open_metrics_context)
        self.car, _ = load_team(self.cfg, msg.external)
        self.team_revision = self.cfg.opts("mechanic", "repository.revision")

        # In our startup procedure we first create all mechanics. Only if this succeeds we'll continue.
        hosts = self.cfg.opts("client", "hosts").default
        if len(hosts) == 0:
            raise exceptions.LaunchError("No target hosts are configured.")

        self.externally_provisioned = msg.external
        if self.externally_provisioned:
            self.logger.info("Cluster will not be provisioned by Rally.")
            # TODO: This needs to be handled later - we should probably disallow this entirely
            if msg.cluster_settings:
                pretty_settings = json.dumps(msg.cluster_settings, indent=2)
                warning = "Ensure that these settings are defined in elasticsearch.yml:\n\n{}\n\nIf they are absent, running this track " \
                          "will fail or lead to unexpected results.".format(pretty_settings)
                console.warn(warning, logger=self.logger)
            self.status = "nodes_started"
            self.received_responses = []
            self.on_all_nodes_started()
            self.status = "cluster_started"
        else:
            console.info("Preparing for race ...", flush=True)
            self.logger.info(
                "Cluster consisting of %s will be provisioned by Rally.",
                hosts)
            msg.hosts = hosts
            # Initialize the children array to have the right size to
            # ensure waiting for all responses
            self.children = [None] * len(nodes_by_host(to_ip_port(hosts)))
            self.send(self.createActor(Dispatcher), msg)
            self.status = "starting"
            self.received_responses = []
Example 8
    def receiveMsg_StartEngine(self, msg, sender):
        self.logger.info("Received signal from race control to start engine.")
        self.race_control = sender
        self.cfg = msg.cfg
        cls = metrics.metrics_store_class(self.cfg)
        self.metrics_store = cls(self.cfg)
        self.metrics_store.open(ctx=msg.open_metrics_context)
        name = self.cfg.opts("race", "pipeline")
        self.car, _ = load_team(self.cfg, msg.external)
        self.team_revision = self.cfg.opts("mechanic", "repository.revision")

        # In our startup procedure we first create all mechanics. Only if this succeeds we'll continue.
        hosts = self.cfg.opts("client", "hosts").default
        if len(hosts) == 0:
            raise exceptions.LaunchError("No target hosts are configured.")

        if msg.external:
            self.logger.info("Cluster will not be provisioned by Rally.")
            if msg.cluster_settings:
                pretty_settings = json.dumps(msg.cluster_settings, indent=2)
                warning = "Ensure that these settings are defined in elasticsearch.yml:\n\n{}\n\nIf they are absent, running this track " \
                          "will fail or lead to unexpected results.".format(pretty_settings)
                console.warn(warning, logger=self.logger)
            # just create one actor for this special case and run it on the coordinator node (i.e. here)
            m = self.createActor(NodeMechanicActor,
                                 targetActorRequirements={"coordinator": True})
            self.children.append(m)
            self.send(m, msg.for_nodes(ip=hosts))
        else:
            console.info("Preparing for race ...", flush=True)
            self.logger.info(
                "Cluster consisting of %s will be provisioned by Rally.",
                hosts)
            msg.hosts = hosts
            # Initialize the children array to have the right size to
            # ensure waiting for all responses
            self.children = [None] * len(nodes_by_host(to_ip_port(hosts)))
            self.send(self.createActor(Dispatcher), msg)
        self.status = "starting"
        self.received_responses = []
Example 9
    def receiveMessage(self, msg, sender):
        # at the moment, we implement all message handling blocking. This is not ideal but simple to get started with. Besides, the caller
        # needs to block anyway. The only reason we implement mechanic as an actor is to distribute them.
        # noinspection PyBroadException
        try:
            logger.debug(
                "NodeMechanicActor#receiveMessage(msg = [%s] sender = [%s])" %
                (str(type(msg)), str(sender)))
            if isinstance(msg, StartNodes):
                self.host = msg.ip
                if msg.external:
                    logger.info(
                        "Connecting to externally provisioned nodes on [%s]." %
                        msg.ip)
                else:
                    logger.info("Starting node(s) %s on [%s]." %
                                (msg.node_ids, msg.ip))

                # Load node-specific configuration
                self.config = config.auto_load_local_config(
                    msg.cfg,
                    additional_sections=[
                        # only copy the relevant bits
                        "track",
                        "mechanic",
                        "client",
                        # allow metrics store to extract race meta-data
                        "race",
                        "source"
                    ])
                # set root path (normally done by the main entry point)
                self.config.add(config.Scope.application, "node", "rally.root",
                                paths.rally_root())
                if not msg.external:
                    self.config.add(config.Scope.benchmark, "provisioning",
                                    "node.ip", msg.ip)
                    # we need to override the port with the value that the user has specified instead of using the default value (39200)
                    self.config.add(config.Scope.benchmark, "provisioning",
                                    "node.http.port", msg.port)
                    self.config.add(config.Scope.benchmark, "provisioning",
                                    "node.ids", msg.node_ids)

                cls = metrics.metrics_store_class(self.config)
                self.metrics_store = cls(self.config)
                self.metrics_store.open(ctx=msg.open_metrics_context)
                # avoid follow-up errors in case we receive an unexpected ActorExitRequest due to an early failure in a parent actor.
                self.metrics_store.lap = 0

                self.mechanic = create(self.config, self.metrics_store,
                                       msg.all_node_ips, msg.cluster_settings,
                                       msg.sources, msg.build,
                                       msg.distribution, msg.external,
                                       msg.docker)
                nodes = self.mechanic.start_engine()
                self.running = True
                self.send(
                    sender,
                    NodesStarted([NodeMetaInfo(node) for node in nodes],
                                 self.metrics_store.meta_info))
            elif isinstance(msg, ApplyMetricsMetaInfo):
                self.metrics_store.merge_meta_info(msg.meta_info)
                self.send(sender, MetricsMetaInfoApplied())
            elif isinstance(msg, ResetRelativeTime):
                logger.info(
                    "Resetting relative time of system metrics store on host [%s]."
                    % self.host)
                self.metrics_store.reset_relative_time()
            elif isinstance(msg, OnBenchmarkStart):
                self.metrics_store.lap = msg.lap
                self.mechanic.on_benchmark_start()
                self.wakeupAfter(
                    NodeMechanicActor.METRIC_FLUSH_INTERVAL_SECONDS)
                self.send(sender, BenchmarkStarted())
            elif isinstance(msg, thespian.actors.WakeupMessage):
                if self.running:
                    logger.info("Flushing system metrics store on host [%s]." %
                                self.host)
                    self.metrics_store.flush(refresh=False)
                    self.wakeupAfter(
                        NodeMechanicActor.METRIC_FLUSH_INTERVAL_SECONDS)
            elif isinstance(msg, OnBenchmarkStop):
                self.mechanic.on_benchmark_stop()
                self.metrics_store.flush(refresh=False)
                # clear metrics store data to not send duplicate system metrics data
                self.send(
                    sender,
                    BenchmarkStopped(
                        self.metrics_store.to_externalizable(clear=True)))
            elif isinstance(msg, StopNodes):
                logger.info("Stopping nodes %s." % self.mechanic.nodes)
                self.mechanic.stop_engine()
                self.send(sender,
                          NodesStopped(self.metrics_store.to_externalizable()))
                # clear all state as the mechanic might get reused later
                self.metrics_store.close()
                self.running = False
                self.config = None
                self.mechanic = None
                self.metrics_store = None
            elif isinstance(msg, thespian.actors.ActorExitRequest):
                if self.running:
                    logger.info("Stopping nodes %s (due to ActorExitRequest)" %
                                self.mechanic.nodes)
                    self.mechanic.stop_engine()
                    self.running = False
        except BaseException as e:
            self.running = False
            logger.exception("Cannot process message [%s]" % msg)
            self.send(
                sender,
                actor.BenchmarkFailure("Error on host %s" % str(self.host), e))
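
Examples 1, 3, 4, 7 and 8 rely on Thespian's ActorTypeDispatcher, which routes each incoming message to a receiveMsg_<MessageClassName> method, whereas this last variant performs the dispatch by hand with an isinstance chain inside receiveMessage. A minimal sketch of the dispatcher style, independent of Rally (the StartEngine message class below is invented for illustration):

from thespian.actors import ActorTypeDispatcher


class StartEngine:
    # Illustrative message type; Thespian dispatches on the class name.
    pass


class MechanicSketchActor(ActorTypeDispatcher):
    def receiveMsg_StartEngine(self, msg, sender):
        # Invoked for every StartEngine instance this actor receives.
        self.send(sender, "engine starting")

    def receiveUnrecognizedMessage(self, msg, sender):
        # Fallback for message types without a dedicated receiveMsg_* handler.
        pass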