def receiveMsg_StartEngine(self, msg, sender):
    """Handle the start-engine request sent by race control.

    Remembers the sender as race control, opens the metrics store with the
    context carried by ``msg``, loads the team and then either starts a single
    node mechanic (externally provisioned cluster) or hands the request to a
    dispatcher (cluster provisioned by Rally).

    :param msg: Start request; ``cfg``, ``open_metrics_context``, ``external``
        and ``for_nodes`` are read from it and ``hosts`` is set on it.
    :param sender: Address of the requesting actor.
    :raises exceptions.LaunchError: If no target hosts are configured.
    """
    self.logger.info("Received signal from race control to start engine.")
    self.race_control = sender
    self.cfg = msg.cfg

    self.metrics_store = metrics.metrics_store_class(self.cfg)(self.cfg)
    self.metrics_store.open(ctx=msg.open_metrics_context)
    self.car, _ = load_team(self.cfg, msg.external)

    # All mechanics are created first; start-up only continues if that succeeds.
    target_hosts = self.cfg.opts("client", "hosts").default
    if len(target_hosts) == 0:
        raise exceptions.LaunchError("No target hosts are configured.")

    if not msg.external:
        self.logger.info("Cluster consisting of %s will be provisioned by Rally.", target_hosts)
        msg.hosts = target_hosts
        # Size the children list up front so that we wait for every response.
        expected_children = len(nodes_by_host(to_ip_port(target_hosts)))
        self.children = [None] * expected_children
        dispatcher = self.createActor(Dispatcher)
        self.send(dispatcher, msg)
    else:
        self.logger.info("Cluster will not be provisioned by Rally.")
        # A single actor on the coordinator node (i.e. here) is enough for this special case.
        node_mechanic = self.createActor(
            NodeMechanicActor,
            targetActorRequirements={"coordinator": True},
        )
        self.children.append(node_mechanic)
        self.send(node_mechanic, msg.for_nodes(ip=target_hosts))

    self.status = "starting"
    self.received_responses = []
def on_start_engine(self, msg, sender):
    """Create one node mechanic per target host and send each its start message.

    The sender is remembered as race control and the metrics store is opened
    with the context carried by ``msg``. All mechanics are created before any
    start message is sent; only if creation succeeds for every host does the
    start-up continue.

    :param msg: Start request; ``cfg``, ``open_metrics_context``, ``external``
        and ``for_nodes`` are read from it.
    :param sender: Address of the requesting actor, stored as ``self.race_control``.
    :raises exceptions.LaunchError: If no target hosts are configured.
    :raises exceptions.SystemSetupError: If a host other than 127.0.0.1 is
        targeted but ``remote.benchmarking.supported`` is not set.
    """
    logger.info("Received signal from race control to start engine.")
    self.race_control = sender
    self.cfg = msg.cfg
    cls = metrics.metrics_store_class(self.cfg)
    self.metrics_store = cls(self.cfg)
    self.metrics_store.open(ctx=msg.open_metrics_context)

    # In our startup procedure we first create all mechanics. Only if this succeeds we'll continue.
    mechanics_and_start_message = []
    hosts = self.cfg.opts("client", "hosts")
    if not hosts:
        raise exceptions.LaunchError("No target hosts are configured.")

    if msg.external:
        logger.info("Cluster will not be provisioned by Rally.")
        # just create one actor for this special case and run it on the coordinator node (i.e. here)
        m = self.createActor(NodeMechanicActor, targetActorRequirements={"coordinator": True})
        self.children.append(m)
        mechanics_and_start_message.append((m, msg.for_nodes(ip=hosts)))
    else:
        # Logger arguments are passed lazily so that a tuple-valued argument cannot break %-formatting.
        logger.info("Cluster consisting of %s will be provisioned by Rally.", hosts)
        all_ips_and_ports = to_ip_port(hosts)
        all_node_ips = extract_all_node_ips(all_ips_and_ports)
        for ip_port, nodes in nodes_by_host(all_ips_and_ports).items():
            ip, port = ip_port
            if ip == "127.0.0.1":
                m = self.createActor(NodeMechanicActor, targetActorRequirements={"coordinator": True})
                self.children.append(m)
                mechanics_and_start_message.append((m, msg.for_nodes(all_node_ips, ip, port, nodes)))
            else:
                if self.cfg.opts("system", "remote.benchmarking.supported"):
                    logger.info("Benchmarking against %s with external Rally daemon.", hosts)
                else:
                    logger.error("User tried to benchmark against %s but no external Rally daemon has been started.", hosts)
                    raise exceptions.SystemSetupError("To benchmark remote hosts (e.g. %s) you need to start the Rally daemon "
                                                      "on each machine including this one." % ip)
                already_running = actor.actor_system_already_running(ip=ip)
                logger.info("Actor system on [%s] already running? [%s]", ip, already_running)
                if not already_running:
                    console.println("Waiting for Rally daemon on [%s] " % ip, end="", flush=True)
                # Poll unconditionally: the state is re-checked even if the first probe reported a running system.
                while not actor.actor_system_already_running(ip=ip):
                    console.println(".", end="", flush=True)
                    time.sleep(3)
                if not already_running:
                    console.println(" [OK]")
                m = self.createActor(NodeMechanicActor, targetActorRequirements={"ip": ip})
                mechanics_and_start_message.append((m, msg.for_nodes(all_node_ips, ip, port, nodes)))
                self.children.append(m)

    self.status = "starting"
    self.received_responses = []
    # Only now, with every mechanic created, are the start messages actually sent.
    for mechanic_actor, start_message in mechanics_and_start_message:
        self.send(mechanic_actor, start_message)
def receiveMsg_StartNodes(self, msg, sender):
    """Start (or connect to) the nodes on this host and report the outcome.

    On success a ``NodesStarted`` message carrying the node meta-info and the
    metrics store meta-info is sent to ``msg.reply_to`` if present, otherwise
    to ``sender``. Any ``Exception`` is logged and reported to the same
    address as an ``actor.BenchmarkFailure``.

    :param msg: Start request for this host.
    :param sender: Address of the actor that sent ``msg``.
    """
    try:
        self.host = msg.ip
        if not msg.external:
            self.logger.info("Starting node(s) %s on [%s].", msg.node_ids, msg.ip)
        else:
            self.logger.info("Connecting to externally provisioned nodes on [%s].", msg.ip)

        # Node-specific configuration: only the relevant sections are copied;
        # "race" and "source" let the metrics store extract race meta-data.
        copied_sections = ["track", "mechanic", "client", "race", "source"]
        self.config = config.auto_load_local_config(msg.cfg, additional_sections=copied_sections)
        node_cfg = self.config
        # The root path is normally set by the main entry point.
        node_cfg.add(config.Scope.application, "node", "rally.root", paths.rally_root())
        if not msg.external:
            node_cfg.add(config.Scope.benchmark, "provisioning", "node.ip", msg.ip)
            # Use the port specified by the user instead of the default value (39200).
            node_cfg.add(config.Scope.benchmark, "provisioning", "node.http.port", msg.port)
            node_cfg.add(config.Scope.benchmark, "provisioning", "node.ids", msg.node_ids)

        self.metrics_store = metrics.metrics_store_class(node_cfg)(node_cfg)
        self.metrics_store.open(ctx=msg.open_metrics_context)
        # Avoids follow-up errors if an unexpected ActorExitRequest arrives due to an early failure in a parent actor.
        self.metrics_store.lap = 0

        self.mechanic = create(
            node_cfg,
            self.metrics_store,
            msg.all_node_ips,
            msg.cluster_settings,
            msg.sources,
            msg.build,
            msg.distribution,
            msg.external,
            msg.docker,
        )
        started_nodes = self.mechanic.start_engine()
        self.running = True

        reply_target = getattr(msg, "reply_to", sender)
        node_infos = [NodeMetaInfo(started) for started in started_nodes]
        self.send(reply_target, NodesStarted(node_infos, self.metrics_store.meta_info))
    except Exception:
        self.logger.exception("Cannot process message [%s]", msg)
        # Send the formatted traceback text to avoid "can't pickle traceback objects".
        import traceback
        failure = sys.exc_info()[1]
        self.send(getattr(msg, "reply_to", sender), actor.BenchmarkFailure(failure, traceback.format_exc()))
def receiveMsg_StartNodes(self, msg, sender):
    """Start (or connect to) the nodes on this host and acknowledge the request.

    On success an empty ``NodesStarted`` message is sent to ``msg.reply_to``
    if present, otherwise to ``sender``, and a wakeup is scheduled after
    ``METRIC_FLUSH_INTERVAL_SECONDS``. Any ``Exception`` is logged and
    reported to the same address as an ``actor.BenchmarkFailure``.

    :param msg: Start request for this host.
    :param sender: Address of the actor that sent ``msg``.
    """
    try:
        self.host = msg.ip
        if not msg.external:
            self.logger.info("Starting node(s) %s on [%s].", msg.node_ids, msg.ip)
        else:
            self.logger.info("Connecting to externally provisioned nodes on [%s].", msg.ip)

        # Node-specific configuration: only the relevant sections are copied;
        # "race" and "source" let the metrics store extract race meta-data.
        copied_sections = ["track", "mechanic", "client", "telemetry", "race", "source"]
        node_cfg = config.auto_load_local_config(msg.cfg, additional_sections=copied_sections)
        # The root path is normally set by the main entry point.
        node_cfg.add(config.Scope.application, "node", "rally.root", paths.rally_root())
        if not msg.external:
            node_cfg.add(config.Scope.benchmark, "provisioning", "node.ids", msg.node_ids)

        store = metrics.metrics_store_class(node_cfg)(node_cfg)
        store.open(ctx=msg.open_metrics_context)

        self.mechanic = create(
            node_cfg,
            store,
            msg.ip,
            msg.port,
            msg.all_node_ips,
            msg.all_node_ids,
            msg.sources,
            msg.distribution,
            msg.external,
            msg.docker,
        )
        self.mechanic.start_engine()
        self.wakeupAfter(METRIC_FLUSH_INTERVAL_SECONDS)

        reply_target = getattr(msg, "reply_to", sender)
        self.send(reply_target, NodesStarted())
    except Exception:
        self.logger.exception("Cannot process message [%s]", msg)
        # Send the formatted traceback text to avoid "can't pickle traceback objects".
        failure = sys.exc_info()[1]
        self.send(getattr(msg, "reply_to", sender), actor.BenchmarkFailure(failure, traceback.format_exc()))
def stop(cfg):
    """Stop the locally installed nodes, store system results and clean up.

    The node configuration and node file are loaded from the install root,
    the nodes are stopped through the launcher matching the build type, the
    system results per node are added to the race and stored, and finally the
    installation is cleaned up.

    :param cfg: Configuration object used to resolve paths, stores and options.
    :raises exceptions.SystemSetupError: If the stored build type is neither
        ``"tar"`` nor ``"docker"``.
    """
    install_root = paths.install_root(cfg)
    node_config = provisioner.load_node_configuration(install_root)

    build_type = node_config.build_type
    if build_type not in ("tar", "docker"):
        raise exceptions.SystemSetupError("Unknown build type [{}]".format(node_config.build_type))
    if build_type == "tar":
        node_launcher = launcher.ProcessLauncher(cfg)
    else:
        node_launcher = launcher.DockerLauncher(cfg)

    nodes, race_id = _load_node_file(install_root)

    system_metrics = metrics.metrics_store_class(cfg)(cfg)
    races = metrics.race_store(cfg)
    try:
        current_race = races.find_by_race_id(race_id)
    except exceptions.NotFound:
        log = logging.getLogger(__name__)
        log.info(
            "Could not find race [%s] most likely because an in-memory metrics store is "
            "used across multiple machines. Use an Elasticsearch metrics store to persist "
            "results.",
            race_id,
        )
        # This path assumes an Elasticsearch metrics store. With a file race store (across multiple
        # machines) the race cannot be retrieved, so the in-memory metrics store is opened with settings
        # derived from startup parameters (system metrics cannot be stored persistently anyway).
        current_race = metrics.create_race(cfg, track=None, challenge=None)

    system_metrics.open(
        race_id=current_race.race_id,
        race_timestamp=current_race.race_timestamp,
        track_name=current_race.track_name,
        challenge_name=current_race.challenge_name,
    )

    node_launcher.stop(nodes, system_metrics)
    _delete_node_file(install_root)

    system_metrics.flush(refresh=True)
    # NOTE(review): results are stored once per node inside the loop - confirm against the original layout.
    for stopped_node in nodes:
        node_results = metrics.calculate_system_results(system_metrics, stopped_node.node_name)
        current_race.add_results(node_results)
        metrics.results_store(cfg).store_results(current_race)
    system_metrics.close()

    # TODO: Do we need to expose this as a separate command as well?
    provisioner.cleanup(
        preserve=cfg.opts("mechanic", "preserve.install"),
        install_dir=node_config.binary_path,
        data_paths=node_config.data_paths,
    )
def stop(cfg):
    """Stop the locally installed nodes and persist system results if possible.

    The node configuration and node file are loaded from the install root and
    the nodes are stopped through the launcher matching the build type. If the
    race can be found, the system results per node are added to it and stored;
    otherwise no metrics store is passed to the launcher and nothing is
    persisted. The installation is cleaned up in both cases.

    :param cfg: Configuration object used to resolve paths, stores and options.
    :raises exceptions.SystemSetupError: If the stored build type is neither
        ``"tar"`` nor ``"docker"``.
    """
    install_root = paths.install_root(cfg)
    node_config = provisioner.load_node_configuration(install_root)

    build_type = node_config.build_type
    if build_type == "docker":
        node_launcher = launcher.DockerLauncher(cfg)
    elif build_type == "tar":
        node_launcher = launcher.ProcessLauncher(cfg)
    else:
        raise exceptions.SystemSetupError("Unknown build type [{}]".format(node_config.build_type))

    nodes, race_id = _load_node_file(install_root)

    system_metrics = metrics.metrics_store_class(cfg)(cfg)
    races = metrics.race_store(cfg)
    try:
        current_race = races.find_by_race_id(race_id)
        system_metrics.open(
            race_id=current_race.race_id,
            race_timestamp=current_race.race_timestamp,
            track_name=current_race.track_name,
            challenge_name=current_race.challenge_name,
        )
    except exceptions.NotFound:
        log = logging.getLogger(__name__)
        log.info("Could not find race [%s] and will thus not persist system metrics.", race_id)
        # Without the race the required meta-data cannot be derived, so system metrics are not persisted.
        current_race, system_metrics = None, None

    node_launcher.stop(nodes, system_metrics)
    _delete_node_file(install_root)

    if current_race:
        system_metrics.flush(refresh=True)
        # NOTE(review): results are stored once per node inside the loop - confirm against the original layout.
        for stopped_node in nodes:
            node_results = metrics.calculate_system_results(system_metrics, stopped_node.node_name)
            current_race.add_results(node_results)
            metrics.results_store(cfg).store_results(current_race)
        system_metrics.close()

    # TODO: Do we need to expose this as a separate command as well?
    provisioner.cleanup(
        preserve=cfg.opts("mechanic", "preserve.install"),
        install_dir=node_config.binary_path,
        data_paths=node_config.data_paths,
    )
def receiveMsg_StartEngine(self, msg, sender):
    """Handle the start-engine request sent by race control.

    Remembers the sender as race control, opens the metrics store, loads the
    team and records the team revision. For an externally provisioned cluster
    no node is started: a warning about required cluster settings is printed
    if any are given and ``on_all_nodes_started`` is invoked directly.
    Otherwise the request is handed to a dispatcher.

    :param msg: Start request; ``cfg``, ``open_metrics_context``, ``external``
        and ``cluster_settings`` are read from it and ``hosts`` is set on it.
    :param sender: Address of the requesting actor.
    :raises exceptions.LaunchError: If no target hosts are configured.
    """
    self.logger.info("Received signal from race control to start engine.")
    self.race_control = sender
    self.cfg = msg.cfg

    self.metrics_store = metrics.metrics_store_class(self.cfg)(self.cfg)
    self.metrics_store.open(ctx=msg.open_metrics_context)
    self.car, _ = load_team(self.cfg, msg.external)
    self.team_revision = self.cfg.opts("mechanic", "repository.revision")

    # All mechanics are created first; start-up only continues if that succeeds.
    target_hosts = self.cfg.opts("client", "hosts").default
    if len(target_hosts) == 0:
        raise exceptions.LaunchError("No target hosts are configured.")

    self.externally_provisioned = msg.external
    if self.externally_provisioned:
        self.logger.info("Cluster will not be provisioned by Rally.")
        # TODO: This needs to be handled later - we should probably disallow this entirely
        if msg.cluster_settings:
            settings_json = json.dumps(msg.cluster_settings, indent=2)
            warning_text = (
                "Ensure that these settings are defined in elasticsearch.yml:\n\n{}\n\nIf they are absent, running this track "
                "will fail or lead to unexpected results."
            ).format(settings_json)
            console.warn(warning_text, logger=self.logger)
        self.status = "nodes_started"
        self.received_responses = []
        self.on_all_nodes_started()
        self.status = "cluster_started"
        # NOTE(review): the "starting" status below is treated as part of the provisioning path only;
        # otherwise it would overwrite "cluster_started" - confirm against the original layout.
        return

    console.info("Preparing for race ...", flush=True)
    self.logger.info("Cluster consisting of %s will be provisioned by Rally.", target_hosts)
    msg.hosts = target_hosts
    # Size the children list up front so that we wait for every response.
    expected_children = len(nodes_by_host(to_ip_port(target_hosts)))
    self.children = [None] * expected_children
    dispatcher = self.createActor(Dispatcher)
    self.send(dispatcher, msg)
    self.status = "starting"
    self.received_responses = []
def receiveMsg_StartEngine(self, msg, sender):
    """Handle the start-engine request sent by race control.

    Remembers the sender as race control, opens the metrics store, loads the
    team and records the team revision. For an externally provisioned cluster
    a warning about required cluster settings is printed if any are given and
    a single node mechanic is started on the coordinator node. Otherwise the
    request is handed to a dispatcher.

    :param msg: Start request; ``cfg``, ``open_metrics_context``, ``external``,
        ``cluster_settings`` and ``for_nodes`` are read from it and ``hosts``
        is set on it.
    :param sender: Address of the requesting actor.
    :raises exceptions.LaunchError: If no target hosts are configured.
    """
    self.logger.info("Received signal from race control to start engine.")
    self.race_control = sender
    self.cfg = msg.cfg
    cls = metrics.metrics_store_class(self.cfg)
    self.metrics_store = cls(self.cfg)
    self.metrics_store.open(ctx=msg.open_metrics_context)
    self.car, _ = load_team(self.cfg, msg.external)
    self.team_revision = self.cfg.opts("mechanic", "repository.revision")

    # In our startup procedure we first create all mechanics. Only if this succeeds we'll continue.
    hosts = self.cfg.opts("client", "hosts").default
    if not hosts:
        raise exceptions.LaunchError("No target hosts are configured.")

    if msg.external:
        self.logger.info("Cluster will not be provisioned by Rally.")
        if msg.cluster_settings:
            pretty_settings = json.dumps(msg.cluster_settings, indent=2)
            warning = (
                "Ensure that these settings are defined in elasticsearch.yml:\n\n{}\n\nIf they are absent, running this track "
                "will fail or lead to unexpected results."
            ).format(pretty_settings)
            console.warn(warning, logger=self.logger)
        # just create one actor for this special case and run it on the coordinator node (i.e. here)
        m = self.createActor(NodeMechanicActor, targetActorRequirements={"coordinator": True})
        self.children.append(m)
        self.send(m, msg.for_nodes(ip=hosts))
    else:
        console.info("Preparing for race ...", flush=True)
        self.logger.info("Cluster consisting of %s will be provisioned by Rally.", hosts)
        msg.hosts = hosts
        # Initialize the children array to have the right size to
        # ensure waiting for all responses
        self.children = [None] * len(nodes_by_host(to_ip_port(hosts)))
        self.send(self.createActor(Dispatcher), msg)

    # Both branches have sent a start request and now wait for responses.
    self.status = "starting"
    self.received_responses = []
def receiveMessage(self, msg, sender):
    """Dispatch an incoming actor message by its type and reply to the sender.

    Handles ``StartNodes``, ``ApplyMetricsMetaInfo``, ``ResetRelativeTime``,
    ``OnBenchmarkStart``, wakeup messages, ``OnBenchmarkStop``, ``StopNodes``
    and ``ActorExitRequest``; other message types are ignored. Any failure
    while handling a message marks the actor as not running, is logged and is
    reported to ``sender`` as an ``actor.BenchmarkFailure``.

    :param msg: The received message.
    :param sender: Address of the actor that sent ``msg``.
    """
    # at the moment, we implement all message handling blocking. This is not ideal but simple to get started with. Besides, the caller
    # needs to block anyway. The only reason we implement mechanic as an actor is to distribute them.
    # noinspection PyBroadException
    try:
        # Logger arguments are passed lazily so formatting cannot fail on tuple-valued arguments.
        logger.debug("NodeMechanicActor#receiveMessage(msg = [%s] sender = [%s])", type(msg), sender)
        if isinstance(msg, StartNodes):
            self.host = msg.ip
            if msg.external:
                logger.info("Connecting to externally provisioned nodes on [%s].", msg.ip)
            else:
                logger.info("Starting node(s) %s on [%s].", msg.node_ids, msg.ip)

            # Load node-specific configuration
            self.config = config.auto_load_local_config(msg.cfg, additional_sections=[
                # only copy the relevant bits
                "track", "mechanic", "client",
                # allow metrics store to extract race meta-data
                "race", "source"
            ])
            # set root path (normally done by the main entry point)
            self.config.add(config.Scope.application, "node", "rally.root", paths.rally_root())
            if not msg.external:
                self.config.add(config.Scope.benchmark, "provisioning", "node.ip", msg.ip)
                # we need to override the port with the value that the user has specified instead of using the default value (39200)
                self.config.add(config.Scope.benchmark, "provisioning", "node.http.port", msg.port)
                self.config.add(config.Scope.benchmark, "provisioning", "node.ids", msg.node_ids)

            cls = metrics.metrics_store_class(self.config)
            self.metrics_store = cls(self.config)
            self.metrics_store.open(ctx=msg.open_metrics_context)
            # avoid follow-up errors in case we receive an unexpected ActorExitRequest due to an early failure in a parent actor.
            self.metrics_store.lap = 0

            self.mechanic = create(self.config, self.metrics_store, msg.all_node_ips, msg.cluster_settings, msg.sources, msg.build,
                                   msg.distribution, msg.external, msg.docker)
            nodes = self.mechanic.start_engine()
            self.running = True
            self.send(sender, NodesStarted([NodeMetaInfo(node) for node in nodes], self.metrics_store.meta_info))
        elif isinstance(msg, ApplyMetricsMetaInfo):
            self.metrics_store.merge_meta_info(msg.meta_info)
            self.send(sender, MetricsMetaInfoApplied())
        elif isinstance(msg, ResetRelativeTime):
            logger.info("Resetting relative time of system metrics store on host [%s].", self.host)
            self.metrics_store.reset_relative_time()
        elif isinstance(msg, OnBenchmarkStart):
            self.metrics_store.lap = msg.lap
            self.mechanic.on_benchmark_start()
            self.wakeupAfter(NodeMechanicActor.METRIC_FLUSH_INTERVAL_SECONDS)
            self.send(sender, BenchmarkStarted())
        elif isinstance(msg, thespian.actors.WakeupMessage):
            # Flush periodically and re-arm the wakeup only while nodes are running.
            if self.running:
                logger.info("Flushing system metrics store on host [%s].", self.host)
                self.metrics_store.flush(refresh=False)
                self.wakeupAfter(NodeMechanicActor.METRIC_FLUSH_INTERVAL_SECONDS)
        elif isinstance(msg, OnBenchmarkStop):
            self.mechanic.on_benchmark_stop()
            self.metrics_store.flush(refresh=False)
            # clear metrics store data to not send duplicate system metrics data
            self.send(sender, BenchmarkStopped(self.metrics_store.to_externalizable(clear=True)))
        elif isinstance(msg, StopNodes):
            logger.info("Stopping nodes %s.", self.mechanic.nodes)
            self.mechanic.stop_engine()
            self.send(sender, NodesStopped(self.metrics_store.to_externalizable()))
            # clear all state as the mechanic might get reused later
            self.metrics_store.close()
            self.running = False
            self.config = None
            self.mechanic = None
            self.metrics_store = None
        elif isinstance(msg, thespian.actors.ActorExitRequest):
            if self.running:
                logger.info("Stopping nodes %s (due to ActorExitRequest)", self.mechanic.nodes)
                self.mechanic.stop_engine()
                self.running = False
    except BaseException as e:
        # Deliberately broad: every failure is reported back to the sender instead of being raised here.
        self.running = False
        logger.exception("Cannot process message [%s]", msg)
        self.send(sender, actor.BenchmarkFailure("Error on host %s" % str(self.host), e))