def race(cfg, sources=False, build=False, distribution=False, external=False, docker=False):
    logger = logging.getLogger(__name__)
    # at this point an actor system has to run and we should only join
    actor_system = actor.bootstrap_actor_system(try_join=True)
    benchmark_actor = actor_system.createActor(BenchmarkActor, targetActorRequirements={"coordinator": True})
    try:
        result = actor_system.ask(benchmark_actor, Setup(cfg, sources, build, distribution, external, docker))
        if isinstance(result, Success):
            logger.info("Benchmark has finished successfully.")
        # may happen if one of the load generators has detected that the user has cancelled the benchmark.
        elif isinstance(result, actor.BenchmarkCancelled):
            logger.info("User has cancelled the benchmark (detected by actor).")
        elif isinstance(result, actor.BenchmarkFailure):
            logger.error("A benchmark failure has occurred")
            raise exceptions.RallyError(result.message, result.cause)
        else:
            raise exceptions.RallyError("Got an unexpected result during benchmarking: [%s]." % str(result))
    except KeyboardInterrupt:
        logger.info("User has cancelled the benchmark (detected by race control).")
        # notify the coordinator so it can properly handle this state. Do it blocking so we don't have a race between this message
        # and the actor exit request.
        actor_system.ask(benchmark_actor, actor.BenchmarkCancelled())
    finally:
        logger.info("Telling benchmark actor to exit.")
        actor_system.tell(benchmark_actor, thespian.actors.ActorExitRequest())
def create(cfg, sources, distribution, build, challenge_root_path, plugins):
    revisions = _extract_revisions(cfg.opts("mechanic", "source.revision"))
    java9_home = _java9_home(cfg)
    distribution_version = cfg.opts("mechanic", "distribution.version", mandatory=False)
    supply_requirements = _supply_requirements(sources, distribution, build, plugins, revisions, distribution_version)
    build_needed = any([build for _, _, build in supply_requirements.values()])
    src_config = cfg.all_opts("source")
    suppliers = []

    if build_needed:
        gradle = cfg.opts("build", "gradle.bin")
        es_src_dir = os.path.join(_src_dir(cfg), _config_value(src_config, "elasticsearch.src.subdir"))
        builder = Builder(es_src_dir, gradle, java9_home, challenge_root_path)
    else:
        builder = None

    es_supplier_type, es_version, es_build = supply_requirements["elasticsearch"]
    if es_supplier_type == "source":
        es_src_dir = os.path.join(_src_dir(cfg), _config_value(src_config, "elasticsearch.src.subdir"))
        suppliers.append(
            ElasticsearchSourceSupplier(es_version, es_src_dir, remote_url=cfg.opts("source", "remote.repo.url"), builder=builder))
        repo = None
    else:
        es_src_dir = None
        distributions_root = os.path.join(cfg.opts("node", "root.dir"), cfg.opts("source", "distribution.dir"))
        repo = DistributionRepository(name=cfg.opts("mechanic", "distribution.repository"),
                                      distribution_config=cfg.all_opts("distributions"),
                                      version=es_version)
        suppliers.append(ElasticsearchDistributionSupplier(repo, distributions_root))

    for plugin in plugins:
        supplier_type, plugin_version, build_plugin = supply_requirements[plugin.name]
        if supplier_type == "source":
            if CorePluginSourceSupplier.can_handle(plugin):
                logger.info("Adding core plugin source supplier for [%s]." % plugin.name)
                assert es_src_dir is not None, "Cannot build core plugin %s when Elasticsearch is not built from source." % plugin.name
                suppliers.append(CorePluginSourceSupplier(plugin, es_src_dir, builder))
            elif ExternalPluginSourceSupplier.can_handle(plugin):
                logger.info("Adding external plugin source supplier for [%s]." % plugin.name)
                suppliers.append(ExternalPluginSourceSupplier(plugin, plugin_version, _src_dir(cfg, mandatory=False), src_config, builder))
            else:
                raise exceptions.RallyError("Plugin %s can neither be treated as core nor as external plugin. Requirements: %s" %
                                            (plugin.name, supply_requirements[plugin.name]))
        else:
            logger.info("Adding plugin distribution supplier for [%s]." % plugin.name)
            assert repo is not None, "Cannot benchmark plugin %s from a distribution version but Elasticsearch from sources" % plugin.name
            suppliers.append(PluginDistributionSupplier(repo, plugin))

    return CompositeSupplier(suppliers)
def main():
    check_python_version()
    log.remove_obsolete_default_log_config()
    log.install_default_log_config()
    log.configure_logging()
    console.init()

    parser = argparse.ArgumentParser(prog=PROGRAM_NAME,
                                     description=BANNER + "\n\n Rally daemon to support remote benchmarks",
                                     epilog="Find out more about Rally at %s" % console.format.link(DOC_LINK),
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('--version', action='version', version="%(prog)s " + version.version())

    subparsers = parser.add_subparsers(title="subcommands", dest="subcommand", help="")
    subparsers.required = True

    start_command = subparsers.add_parser("start", help="Starts the Rally daemon")
    restart_command = subparsers.add_parser("restart", help="Restarts the Rally daemon")
    for p in [start_command, restart_command]:
        p.add_argument("--node-ip", required=True, help="The IP of this node.")
        p.add_argument("--coordinator-ip", required=True, help="The IP of the coordinator node.")
    subparsers.add_parser("stop", help="Stops the Rally daemon")
    subparsers.add_parser("status", help="Shows the current status of the local Rally daemon")

    args = parser.parse_args()

    if args.subcommand == "start":
        start(args)
    elif args.subcommand == "stop":
        stop()
    elif args.subcommand == "status":
        status()
    elif args.subcommand == "restart":
        stop(raise_errors=False)
        start(args)
    else:
        raise exceptions.RallyError("Unknown subcommand [%s]" % args.subcommand)
def run(self, lap):
    self.metrics_store.lap = lap
    main_driver = self.actor_system.createActor(driver.Driver)
    self.cluster.on_benchmark_start()
    result = self.actor_system.ask(
        main_driver, driver.StartBenchmark(self.cfg, self.track, self.metrics_store.meta_info, self.metrics_store.lap))
    if isinstance(result, driver.BenchmarkComplete):
        logger.info("Benchmark is complete.")
        logger.info("Notifying cluster.")
        self.cluster.on_benchmark_stop()
        logger.info("Bulk adding data to metrics store.")
        self.metrics_store.bulk_add(result.metrics)
        logger.info("Flushing metrics data...")
        self.metrics_store.flush()
        logger.info("Flushing done")
    elif isinstance(result, driver.BenchmarkFailure):
        raise exceptions.RallyError(result.message, result.cause)
    else:
        raise exceptions.RallyError("Driver has returned no metrics but instead [%s]. Terminating race without result." % str(result))
def start(args):
    if actor.actor_system_already_running():
        raise exceptions.RallyError("An actor system appears to be already running.")
    # Thespian writes the following warning upon start (at least) on Mac OS X:
    #
    # WARNING:root:Unable to get address info for address 103.1.168.192.in-addr.arpa (AddressFamily.AF_INET,\
    # SocketKind.SOCK_DGRAM, 17, 0): <class 'socket.gaierror'> [Errno 8] nodename nor servname provided, or not known
    #
    # Therefore, we will not show warnings but only errors.
    logging.basicConfig(level=logging.ERROR)
    actor.bootstrap_actor_system(local_ip=args.node_ip, coordinator_ip=args.coordinator_ip)
    console.info("Successfully started actor system on node [%s] with coordinator node IP [%s]." % (args.node_ip, args.coordinator_ip))
def run(cfg):
    logger = logging.getLogger(__name__)
    name = cfg.opts("race", "pipeline")
    race_id = cfg.opts("system", "race.id")
    console.info(f"Race id is [{race_id}]", logger=logger)
    if len(name) == 0:
        # assume from-distribution pipeline if distribution.version has been specified and --pipeline cli arg not set
        if cfg.exists("mechanic", "distribution.version"):
            name = "from-distribution"
        else:
            name = "from-sources"
        logger.info("User specified no pipeline. Automatically derived pipeline [%s].", name)
        cfg.add(config.Scope.applicationOverride, "race", "pipeline", name)
    else:
        logger.info("User specified pipeline [%s].", name)

    if os.environ.get("RALLY_RUNNING_IN_DOCKER", "").upper() == "TRUE":
        # in this case only benchmarking remote Elasticsearch clusters makes sense
        if name != "benchmark-only":
            raise exceptions.SystemSetupError(
                "Only the [benchmark-only] pipeline is supported by the Rally Docker image.\n"
                "Add --pipeline=benchmark-only in your Rally arguments and try again.\n"
                "For more details read the docs for the benchmark-only pipeline in {}\n".format(doc_link("pipelines.html#benchmark-only")))

    try:
        pipeline = pipelines[name]
    except KeyError:
        raise exceptions.SystemSetupError(
            "Unknown pipeline [%s]. List the available pipelines with %s list pipelines." % (name, PROGRAM_NAME))
    try:
        pipeline(cfg)
    except exceptions.RallyError as e:
        # just pass on our own errors. It should be treated differently on top-level
        raise e
    except KeyboardInterrupt:
        logger.info("User has cancelled the benchmark.")
        raise exceptions.UserInterrupted("User has cancelled the benchmark (detected by race control).") from None
    except BaseException:
        tb = sys.exc_info()[2]
        raise exceptions.RallyError("This race ended with a fatal crash.").with_traceback(tb)
def run(cfg):
    logger = logging.getLogger(__name__)
    name = cfg.opts("race", "pipeline")
    if len(name) == 0:
        # assume from-distribution pipeline if distribution.version has been specified and --pipeline cli arg not set
        if cfg.exists("mechanic", "distribution.version"):
            name = "from-distribution"
        else:
            name = "from-sources-complete"
        logger.info("User specified no pipeline. Automatically derived pipeline [%s].", name)
        cfg.add(config.Scope.applicationOverride, "race", "pipeline", name)
    else:
        if cfg.exists("mechanic", "distribution.version") and name in ["from-sources-complete", "from-sources-skip-build", "benchmark-only"]:
            raise exceptions.SystemSetupError(
                "--distribution-version can only be used together with pipeline from-distribution, "
                "but you specified {}.\n"
                "If you intend to benchmark an externally provisioned cluster, don't specify --distribution-version otherwise\n"
                "please read the docs for from-distribution pipeline at "
                "{}/pipelines.html#from-distribution".format(name, DOC_LINK))
        logger.info("User specified pipeline [%s].", name)

    try:
        pipeline = pipelines[name]
    except KeyError:
        raise exceptions.SystemSetupError(
            "Unknown pipeline [%s]. List the available pipelines with %s list pipelines." % (name, PROGRAM_NAME))
    try:
        pipeline(cfg)
    except exceptions.RallyError as e:
        # just pass on our own errors. It should be treated differently on top-level
        raise e
    except KeyboardInterrupt:
        logger.info("User has cancelled the benchmark.")
    except BaseException:
        tb = sys.exc_info()[2]
        raise exceptions.RallyError("This race ended with a fatal crash.").with_traceback(tb)
def run(cfg):
    name = cfg.opts("system", "pipeline")
    try:
        pipeline = pipelines[name](RacingContext(cfg))
    except KeyError:
        raise exceptions.ImproperlyConfigured(
            "Unknown pipeline [%s]. You can list the available pipelines with %s list pipelines." % (name, PROGRAM_NAME))
    try:
        pipeline()
    except exceptions.RallyError as e:
        # just pass on our own errors. It should be treated differently on top-level
        raise e
    except BaseException:
        tb = sys.exc_info()[2]
        raise exceptions.RallyError("This race ended early with a fatal crash. For details please see the logs.").with_traceback(tb)
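
All three run(cfg) variants above resolve the pipeline name against a pipelines registry and turn a missing key into a user-facing error. The following self-contained sketch illustrates that registry pattern; register_pipeline, benchmark_only and run_pipeline are invented for illustration and are not Rally's actual API.

# Illustrative sketch of a name -> callable pipeline registry (not Rally's actual implementation).
pipelines = {}

def register_pipeline(name, target):
    pipelines[name] = target

def benchmark_only(cfg):
    # a pipeline is just a callable that receives the benchmark configuration
    print("benchmark-only pipeline invoked with %s" % cfg)

register_pipeline("benchmark-only", benchmark_only)

def run_pipeline(name, cfg):
    try:
        pipeline = pipelines[name]
    except KeyError:
        raise ValueError("Unknown pipeline [%s]. Available pipelines: %s" % (name, list(pipelines)))
    pipeline(cfg)

run_pipeline("benchmark-only", {"target.hosts": "127.0.0.1:9200"})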
def install(self, es_home_path, plugin_url=None):
    installer_binary_path = os.path.join(es_home_path, "bin", "elasticsearch-plugin")
    if plugin_url:
        logger.info("Installing [%s] into [%s] from [%s]" % (self.plugin_name, es_home_path, plugin_url))
        install_cmd = '%s install --batch "%s"' % (installer_binary_path, plugin_url)
    else:
        logger.info("Installing [%s] into [%s]" % (self.plugin_name, es_home_path))
        install_cmd = '%s install --batch "%s"' % (installer_binary_path, self.plugin_name)
    return_code = process.run_subprocess_with_logging(install_cmd)
    # see: https://www.elastic.co/guide/en/elasticsearch/plugins/current/_other_command_line_parameters.html
    if return_code == 0:
        logger.info("Successfully installed [%s]." % self.plugin_name)
    elif return_code == 64:
        # most likely this is an unknown plugin
        raise exceptions.SystemSetupError("Unknown plugin [%s]" % self.plugin_name)
    elif return_code == 74:
        raise exceptions.SupplyError("I/O error while trying to install [%s]" % self.plugin_name)
    else:
        raise exceptions.RallyError("Unknown error while trying to install [%s] (installer return code [%s]). Please check the logs." %
                                    (self.plugin_name, str(return_code)))
def benchmark_external(ctx):
    # TODO dm module refactoring: we can just inline prepare_benchmark_external and simplify this code a bit
    track_name = ctx.config.opts("system", "track")
    challenge_name = ctx.config.opts("benchmarks", "challenge")
    print("Racing on track [%s] and challenge [%s]" % (track_name, challenge_name))
    actors = thespian.actors.ActorSystem()
    main_driver = actors.createActor(driver.Driver)
    # TODO dm: Retrieving the metrics store here is *dirty*...
    metrics_store = ctx.mechanic._metrics_store
    ctx.cluster.on_benchmark_start()
    completed = actors.ask(main_driver, driver.StartBenchmark(ctx.config, ctx.track, metrics_store.meta_info))
    ctx.cluster.on_benchmark_stop()
    if not hasattr(completed, "metrics"):
        raise exceptions.RallyError("Driver has returned no metrics but instead [%s]. Terminating race without result." % str(completed))
    metrics_store.bulk_add(completed.metrics)
    ctx.mechanic.stop_metrics()
def list_facts(cfg):
    console.info("This is an experimental command and subject to change.")
    # provide a custom error message
    target_hosts = cfg.opts("facts", "hosts", mandatory=False)
    if not target_hosts:
        raise exceptions.SystemSetupError("Please define a target host with --target-hosts")
    if len(target_hosts) > 1:
        raise exceptions.SystemSetupError("Only one target host is supported at the moment but you provided %s" % target_hosts)
    # at this point an actor system has to run and we should only join
    actor_system = actor.bootstrap_actor_system(try_join=True)
    facts_actor = actor_system.createActor(FactsActor, targetActorRequirements={"ip": target_hosts[0]})
    result = actor_system.ask(facts_actor, GatherFacts())
    if isinstance(result, Facts):
        console.println(json.dumps(result.facts, indent=" "))
    else:
        raise exceptions.RallyError("Could not gather facts: [%s]." % str(result))
def run_async(cfg):
    console.warn("The race-async command is experimental.")
    logger = logging.getLogger(__name__)
    # We'll use a special car name for external benchmarks.
    cfg.add(config.Scope.benchmark, "mechanic", "car.names", ["external"])
    coordinator = BenchmarkCoordinator(cfg)

    try:
        coordinator.setup()
        race_driver = driver.AsyncDriver(cfg, coordinator.current_track, coordinator.current_challenge)
        distribution_flavor, distribution_version, revision = race_driver.setup()
        coordinator.on_preparation_complete(distribution_flavor, distribution_version, revision)
        new_metrics = race_driver.run()
        coordinator.on_benchmark_complete(new_metrics)
    except KeyboardInterrupt:
        logger.info("User has cancelled the benchmark.")
    except BaseException as e:
        tb = sys.exc_info()[2]
        raise exceptions.RallyError(str(e)).with_traceback(tb)
def create(cfg, metrics_store, all_node_ips, cluster_settings=None, sources=False, build=False, distribution=False,
           external=False, docker=False):
    races_root = paths.races_root(cfg)
    challenge_root_path = paths.race_root(cfg)
    node_ids = cfg.opts("provisioning", "node.ids", mandatory=False)
    repo = team.team_repo(cfg)
    # externally provisioned clusters do not support cars / plugins
    if external:
        car = None
        plugins = []
    else:
        car = team.load_car(repo, cfg.opts("mechanic", "car.names"))
        plugins = team.load_plugins(repo, cfg.opts("mechanic", "car.plugins"))

    if sources:
        try:
            src_dir = cfg.opts("source", "local.src.dir")
        except config.ConfigError:
            logger.exception("Cannot determine source directory")
            raise exceptions.SystemSetupError("You cannot benchmark Elasticsearch from sources. Did you install Gradle? Please install"
                                              " all prerequisites and reconfigure Rally with %s configure" % PROGRAM_NAME)
        remote_url = cfg.opts("source", "remote.repo.url")
        revision = cfg.opts("mechanic", "source.revision")
        gradle = cfg.opts("build", "gradle.bin")
        java_home = cfg.opts("runtime", "java.home")
        if len(plugins) > 0:
            raise exceptions.RallyError("Source builds of plugins are not supported yet. For more details, please "
                                        "check https://github.com/elastic/rally/issues/309 and upgrade Rally in case support has been "
                                        "added in the meantime.")
        s = lambda: supplier.from_sources(remote_url, src_dir, revision, gradle, java_home, challenge_root_path, build)
        p = []
        for node_id in node_ids:
            p.append(provisioner.local_provisioner(cfg, car, plugins, cluster_settings, all_node_ips, challenge_root_path, node_id))
        l = launcher.InProcessLauncher(cfg, metrics_store, races_root)
    elif distribution:
        version = cfg.opts("mechanic", "distribution.version")
        repo_name = cfg.opts("mechanic", "distribution.repository")
        distributions_root = "%s/%s" % (cfg.opts("node", "root.dir"), cfg.opts("source", "distribution.dir"))
        distribution_cfg = cfg.all_opts("distributions")

        s = lambda: supplier.from_distribution(version=version, repo_name=repo_name, distribution_config=distribution_cfg,
                                               distributions_root=distributions_root, plugins=plugins)
        p = []
        for node_id in node_ids:
            p.append(provisioner.local_provisioner(cfg, car, plugins, cluster_settings, all_node_ips, challenge_root_path, node_id))
        l = launcher.InProcessLauncher(cfg, metrics_store, races_root)
    elif external:
        if cluster_settings:
            logger.warning("Cannot apply challenge-specific cluster settings [%s] for an externally provisioned cluster. Please ensure "
                           "that the cluster settings are present or the benchmark may fail or behave unexpectedly." % cluster_settings)
        if len(plugins) > 0:
            raise exceptions.SystemSetupError("You cannot specify any plugins for externally provisioned clusters. Please remove "
                                              "\"--elasticsearch-plugins\" and try again.")
        s = lambda: None
        p = [provisioner.no_op_provisioner()]
        l = launcher.ExternalLauncher(cfg, metrics_store)
    elif docker:
        if len(plugins) > 0:
            raise exceptions.SystemSetupError("You cannot specify any plugins for Docker clusters. Please remove "
                                              "\"--elasticsearch-plugins\" and try again.")
        s = lambda: None
        p = []
        for node_id in node_ids:
            p.append(provisioner.docker_provisioner(cfg, car, cluster_settings, challenge_root_path, node_id))
        l = launcher.DockerLauncher(cfg, metrics_store)
    else:
        # It is a programmer error (and not a user error) if this function is called with wrong parameters
        raise RuntimeError("One of sources, distribution, docker or external must be True")

    return Mechanic(s, p, l)
def receiveMessage(self, msg, sender):
    try:
        logger.info("MechanicActor#receiveMessage(msg = [%s] sender = [%s])" % (str(type(msg)), str(sender)))
        if isinstance(msg, StartEngine):
            self.on_start_engine(msg, sender)
        elif isinstance(msg, NodesStarted):
            self.metrics_store.merge_meta_info(msg.system_meta_info)
            self.transition_when_all_children_responded(sender, msg, "starting", "nodes_started", self.on_all_nodes_started)
        elif isinstance(msg, MetricsMetaInfoApplied):
            self.transition_when_all_children_responded(sender, msg, "apply_meta_info", "cluster_started", self.on_cluster_started)
        elif isinstance(msg, OnBenchmarkStart):
            self.metrics_store.lap = msg.lap
            self.cluster.on_benchmark_start()
            # in the first lap, we are in state "cluster_started", after that in "benchmark_stopped"
            self.send_to_children_and_transition(sender, msg, ["cluster_started", "benchmark_stopped"], "benchmark_starting")
        elif isinstance(msg, BenchmarkStarted):
            self.transition_when_all_children_responded(sender, msg, "benchmark_starting", "benchmark_started", self.on_benchmark_started)
        elif isinstance(msg, ResetRelativeTime):
            if msg.reset_in_seconds > 0:
                self.wakeupAfter(msg.reset_in_seconds)
            else:
                self.reset_relative_time()
        elif isinstance(msg, thespian.actors.WakeupMessage):
            self.reset_relative_time()
        elif isinstance(msg, actor.BenchmarkFailure):
            self.send(self.race_control, msg)
        elif isinstance(msg, OnBenchmarkStop):
            self.send_to_children_and_transition(sender, msg, "benchmark_started", "benchmark_stopping")
        elif isinstance(msg, BenchmarkStopped):
            self.metrics_store.bulk_add(msg.system_metrics)
            self.transition_when_all_children_responded(sender, msg, "benchmark_stopping", "benchmark_stopped", self.on_benchmark_stopped)
        elif isinstance(msg, StopEngine):
            # detach from cluster and gather all system metrics
            self.cluster_launcher.stop(self.cluster)
            # we might have experienced a launch error or the user has cancelled the benchmark. Hence we need to allow to stop the
            # cluster from various states and we don't check here for a specific one.
            self.send_to_children_and_transition(sender, StopNodes(), [], "cluster_stopping")
        elif isinstance(msg, NodesStopped):
            self.metrics_store.bulk_add(msg.system_metrics)
            self.transition_when_all_children_responded(sender, msg, "cluster_stopping", "cluster_stopped", self.on_all_nodes_stopped)
        elif isinstance(msg, thespian.actors.ActorExitRequest):
            # due to early termination by race control. If it's self-initiated we already took care of the rest.
            if sender != self.myAddress:
                self.send_to_children_and_transition(self.myAddress, msg, expected_status=None, new_status="cluster_stopping")
        elif isinstance(msg, thespian.actors.ChildActorExited):
            if self.is_current_status_expected("cluster_stopping"):
                logger.info("Child actor exited while engine is stopping: [%s]" % msg)
            else:
                raise exceptions.RallyError("Child actor exited with [%s] while in status [%s]." % (msg, self.status))
        elif isinstance(msg, thespian.actors.PoisonMessage):
            # something went wrong with a child actor
            if isinstance(msg.poisonMessage, StartEngine):
                raise exceptions.LaunchError("Could not start benchmark candidate. Are Rally daemons on all targeted machines running?")
            else:
                logger.error("[%s] sent to a child actor has resulted in PoisonMessage" % str(msg.poisonMessage))
                raise exceptions.RallyError("Could not communicate with benchmark candidate (unknown reason)")
    except BaseException:
        # usually, we'll notify the sender but in case a child sent something that caused an exception we'd rather
        # have it bubble up to race control. Otherwise, we could play ping-pong with our child actor.
        recipient = self.race_control if sender in self.children else sender
        logger.exception("Cannot process message [%s]. Notifying [%s]." % (msg, recipient))
        ex_type, ex_value, ex_traceback = sys.exc_info()
        # avoid "can't pickle traceback objects"
        import traceback
        self.send(recipient, actor.BenchmarkFailure("Could not execute command (%s)" % ex_value, traceback.format_exc()))
def create(cfg, sources, distribution, build, car, plugins=None):
    logger = logging.getLogger(__name__)
    if plugins is None:
        plugins = []
    revisions = _extract_revisions(cfg.opts("mechanic", "source.revision"))
    distribution_version = cfg.opts("mechanic", "distribution.version", mandatory=False)
    supply_requirements = _supply_requirements(sources, distribution, build, plugins, revisions, distribution_version)
    build_needed = any([build for _, _, build in supply_requirements.values()])
    src_config = cfg.all_opts("source")
    suppliers = []

    if build_needed:
        java_home = _java_home(car)
        es_src_dir = os.path.join(_src_dir(cfg), _config_value(src_config, "elasticsearch.src.subdir"))
        builder = Builder(es_src_dir, java_home, paths.logs())
    else:
        builder = None

    es_supplier_type, es_version, es_build = supply_requirements["elasticsearch"]
    if es_supplier_type == "source":
        es_src_dir = os.path.join(_src_dir(cfg), _config_value(src_config, "elasticsearch.src.subdir"))
        suppliers.append(ElasticsearchSourceSupplier(es_version,
                                                     es_src_dir,
                                                     remote_url=cfg.opts("source", "remote.repo.url"),
                                                     car=car,
                                                     builder=builder))
        repo = None
    else:
        es_src_dir = None
        distributions_root = os.path.join(cfg.opts("node", "root.dir"), cfg.opts("source", "distribution.dir"))
        dist_cfg = {}
        # car / plugin defines defaults...
        dist_cfg.update(car.variables)
        for plugin in plugins:
            for k, v in plugin.variables.items():
                dist_cfg["plugin_{}_{}".format(plugin.name, k)] = v
        # ... but the user can override it in rally.ini
        dist_cfg.update(cfg.all_opts("distributions"))
        repo = DistributionRepository(name=cfg.opts("mechanic", "distribution.repository"),
                                      distribution_config=dist_cfg,
                                      version=es_version)
        suppliers.append(ElasticsearchDistributionSupplier(repo, distributions_root))

    for plugin in plugins:
        supplier_type, plugin_version, build_plugin = supply_requirements[plugin.name]
        if supplier_type == "source":
            if CorePluginSourceSupplier.can_handle(plugin):
                logger.info("Adding core plugin source supplier for [%s].", plugin.name)
                assert es_src_dir is not None, "Cannot build core plugin %s when Elasticsearch is not built from source." % plugin.name
                suppliers.append(CorePluginSourceSupplier(plugin, es_src_dir, builder))
            elif ExternalPluginSourceSupplier.can_handle(plugin):
                logger.info("Adding external plugin source supplier for [%s].", plugin.name)
                suppliers.append(ExternalPluginSourceSupplier(plugin, plugin_version, _src_dir(cfg, mandatory=False), src_config, builder))
            else:
                raise exceptions.RallyError("Plugin %s can neither be treated as core nor as external plugin. Requirements: %s" %
                                            (plugin.name, supply_requirements[plugin.name]))
        else:
            logger.info("Adding plugin distribution supplier for [%s].", plugin.name)
            assert repo is not None, "Cannot benchmark plugin %s from a distribution version but Elasticsearch from sources" % plugin.name
            suppliers.append(PluginDistributionSupplier(repo, plugin))

    return CompositeSupplier(suppliers)
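
The dist_cfg assembly above layers configuration: car and plugin variables act as defaults and user-level distribution settings override them. A tiny standalone sketch of that precedence; all names and values below are invented for illustration.

# Precedence sketch: car/plugin variables are defaults, user config wins (invented values).
car_variables = {"release.url": "https://example.org/default/{{VERSION}}.tar.gz", "runtime.jdk": "11"}
plugin_variables = {"secure": "false"}
user_overrides = {"release.url": "https://example.org/mirror/{{VERSION}}.tar.gz"}

dist_cfg = {}
dist_cfg.update(car_variables)
for k, v in plugin_variables.items():
    dist_cfg["plugin_analysis-icu_{}".format(k)] = v
dist_cfg.update(user_overrides)

# the user-provided release.url has replaced the car default; plugin defaults remain
assert dist_cfg["release.url"] == "https://example.org/mirror/{{VERSION}}.tar.gz"
assert dist_cfg["plugin_analysis-icu_secure"] == "false"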
def receiveMessage(self, msg, sender):
    try:
        logger.info("MechanicActor#receiveMessage(msg = [%s] sender = [%s])" % (str(type(msg)), str(sender)))
        if isinstance(msg, StartEngine):
            self.on_start_engine(msg, sender)
        elif isinstance(msg, NodesStarted):
            self.metrics_store.merge_meta_info(msg.system_meta_info)
            self.transition_when_all_children_responded(sender, msg, "starting", "nodes_started", self.on_all_nodes_started)
        elif isinstance(msg, MetricsMetaInfoApplied):
            self.transition_when_all_children_responded(sender, msg, "apply_meta_info", "cluster_started", self.on_cluster_started)
        elif isinstance(msg, OnBenchmarkStart):
            self.metrics_store.lap = msg.lap
            # in the first lap, we are in state "cluster_started", after that in "benchmark_stopped"
            self.send_to_children_and_transition(sender, msg, ["cluster_started", "benchmark_stopped"], "benchmark_starting")
        elif isinstance(msg, BenchmarkStarted):
            self.transition_when_all_children_responded(sender, msg, "benchmark_starting", "benchmark_started", self.on_benchmark_started)
        elif isinstance(msg, ResetRelativeTime):
            if msg.reset_in_seconds > 0:
                self.wakeupAfter(msg.reset_in_seconds)
            else:
                self.reset_relative_time()
        elif isinstance(msg, thespian.actors.WakeupMessage):
            self.reset_relative_time()
        elif isinstance(msg, actor.BenchmarkFailure):
            self.send(self.race_control, msg)
        elif isinstance(msg, OnBenchmarkStop):
            self.send_to_children_and_transition(sender, msg, "benchmark_started", "benchmark_stopping")
        elif isinstance(msg, BenchmarkStopped):
            self.metrics_store.bulk_add(msg.system_metrics)
            self.transition_when_all_children_responded(sender, msg, "benchmark_stopping", "benchmark_stopped", self.on_benchmark_stopped)
        elif isinstance(msg, StopEngine):
            # detach from cluster and gather all system metrics
            self.cluster_launcher.stop(self.cluster)
            # we might have experienced a launch error or the user has cancelled the benchmark. Hence we need to allow to stop the
            # cluster from various states and we don't check here for a specific one.
            self.send_to_children_and_transition(sender, StopNodes(), [], "cluster_stopping")
        elif isinstance(msg, NodesStopped):
            self.metrics_store.bulk_add(msg.system_metrics)
            self.transition_when_all_children_responded(sender, msg, "cluster_stopping", "cluster_stopped", self.on_all_nodes_stopped)
        elif isinstance(msg, thespian.actors.ActorExitRequest):
            # due to early termination by race control. If it's self-initiated we already took care of the rest.
            if sender != self.myAddress:
                self.send_to_children_and_transition(self.myAddress, msg, expected_status=None, new_status="cluster_stopping")
        elif isinstance(msg, thespian.actors.ChildActorExited):
            if self.is_current_status_expected(["cluster_stopping", "cluster_stopped"]):
                logger.info("Child actor exited while engine is stopping or stopped: [%s]" % msg)
            else:
                raise exceptions.RallyError("Child actor exited with [%s] while in status [%s]." % (msg, self.status))
        elif isinstance(msg, thespian.actors.PoisonMessage):
            # something went wrong with a child actor
            if isinstance(msg.poisonMessage, StartEngine):
                raise exceptions.LaunchError("Could not start benchmark candidate. Are Rally daemons on all targeted machines running?")
            else:
                logger.error("[%s] sent to a child actor has resulted in PoisonMessage" % str(msg.poisonMessage))
                raise exceptions.RallyError("Could not communicate with benchmark candidate (unknown reason)")
        else:
            logger.info("MechanicActor received unknown message [%s] (ignoring)." % (str(msg)))
    except BaseException as e:
        logger.exception("Cannot process message")
        logger.error("Failed message details: [%s]. Notifying [%s]." % (msg, self.race_control))
        self.send(self.race_control, actor.BenchmarkFailure("Error in Elasticsearch cluster coordinator", e))
def create(cfg, sources, distribution, car, plugins=None):
    logger = logging.getLogger(__name__)
    if plugins is None:
        plugins = []
    caching_enabled = cfg.opts("source", "cache", mandatory=False, default_value=True)
    revisions = _extract_revisions(cfg.opts("mechanic", "source.revision", mandatory=sources))
    distribution_version = cfg.opts("mechanic", "distribution.version", mandatory=False)
    supply_requirements = _supply_requirements(sources, distribution, plugins, revisions, distribution_version)
    build_needed = any([build for _, _, build in supply_requirements.values()])
    es_supplier_type, es_version, _ = supply_requirements["elasticsearch"]
    src_config = cfg.all_opts("source")
    suppliers = []

    target_os = cfg.opts("mechanic", "target.os", mandatory=False)
    target_arch = cfg.opts("mechanic", "target.arch", mandatory=False)
    template_renderer = TemplateRenderer(version=es_version, os_name=target_os, arch=target_arch)

    if build_needed:
        raw_build_jdk = car.mandatory_var("build.jdk")
        try:
            build_jdk = int(raw_build_jdk)
        except ValueError:
            raise exceptions.SystemSetupError(f"Car config key [build.jdk] is invalid: [{raw_build_jdk}] (must be int)")
        es_src_dir = os.path.join(_src_dir(cfg), _config_value(src_config, "elasticsearch.src.subdir"))
        builder = Builder(es_src_dir, build_jdk, paths.logs())
    else:
        builder = None

    distributions_root = os.path.join(cfg.opts("node", "root.dir"), cfg.opts("source", "distribution.dir"))
    dist_cfg = {}
    # car / plugin defines defaults...
    dist_cfg.update(car.variables)
    for plugin in plugins:
        for k, v in plugin.variables.items():
            dist_cfg["plugin_{}_{}".format(plugin.name, k)] = v
    # ... but the user can override it in rally.ini
    dist_cfg.update(cfg.all_opts("distributions"))

    if caching_enabled:
        logger.info("Enabling source artifact caching.")
        max_age_days = int(cfg.opts("source", "cache.days", mandatory=False, default_value=7))
        if max_age_days <= 0:
            raise exceptions.SystemSetupError(f"cache.days must be a positive number but is {max_age_days}")

        source_distributions_root = os.path.join(distributions_root, "src")
        _prune(source_distributions_root, max_age_days)
    else:
        logger.info("Disabling source artifact caching.")
        source_distributions_root = None

    if es_supplier_type == "source":
        es_src_dir = os.path.join(_src_dir(cfg), _config_value(src_config, "elasticsearch.src.subdir"))

        source_supplier = ElasticsearchSourceSupplier(es_version,
                                                      es_src_dir,
                                                      remote_url=cfg.opts("source", "remote.repo.url"),
                                                      car=car,
                                                      builder=builder,
                                                      template_renderer=template_renderer)

        if caching_enabled:
            es_file_resolver = ElasticsearchFileNameResolver(dist_cfg, template_renderer)
            source_supplier = CachedSourceSupplier(source_distributions_root, source_supplier, es_file_resolver)

        suppliers.append(source_supplier)
        repo = None
    else:
        es_src_dir = None
        repo = DistributionRepository(name=cfg.opts("mechanic", "distribution.repository"),
                                      distribution_config=dist_cfg,
                                      template_renderer=template_renderer)
        suppliers.append(ElasticsearchDistributionSupplier(repo, es_version, distributions_root))

    for plugin in plugins:
        supplier_type, plugin_version, _ = supply_requirements[plugin.name]

        if supplier_type == "source":
            if CorePluginSourceSupplier.can_handle(plugin):
                logger.info("Adding core plugin source supplier for [%s].", plugin.name)
                assert es_src_dir is not None, f"Cannot build core plugin {plugin.name} when Elasticsearch is not built from source."
                plugin_supplier = CorePluginSourceSupplier(plugin, es_src_dir, builder)
            elif ExternalPluginSourceSupplier.can_handle(plugin):
                logger.info("Adding external plugin source supplier for [%s].", plugin.name)
                plugin_supplier = ExternalPluginSourceSupplier(plugin, plugin_version, _src_dir(cfg, mandatory=False), src_config, builder)
            else:
                raise exceptions.RallyError("Plugin %s can neither be treated as core nor as external plugin. Requirements: %s" %
                                            (plugin.name, supply_requirements[plugin.name]))

            if caching_enabled:
                plugin_file_resolver = PluginFileNameResolver(plugin.name, plugin_version)
                plugin_supplier = CachedSourceSupplier(source_distributions_root, plugin_supplier, plugin_file_resolver)
            suppliers.append(plugin_supplier)
        else:
            logger.info("Adding plugin distribution supplier for [%s].", plugin.name)
            assert repo is not None, "Cannot benchmark plugin %s from a distribution version but Elasticsearch from sources" % plugin.name
            suppliers.append(PluginDistributionSupplier(repo, plugin))

    return CompositeSupplier(suppliers)
def run(self, lap):
    """
    Runs the provided lap of a benchmark.

    :param lap: The current lap number.
    :return: True iff the benchmark may go on. False iff the user has cancelled the benchmark.
    """
    self.metrics_store.lap = lap
    logger.info("Notifying mechanic of benchmark start.")
    # we could use #tell() here but then the ask call to driver below will fail because it returns the response that mechanic
    # sends (see http://godaddy.github.io/Thespian/doc/using.html#sec-6-6-1).
    self.actor_system.ask(self.mechanic, mechanic.OnBenchmarkStart(lap))
    logger.info("Asking driver to start benchmark.")
    main_driver = self.actor_system.createActor(driver.DriverActor,
                                                targetActorRequirements={"coordinator": True},
                                                globalName="/rally/driver/coordinator")
    try:
        result = self.actor_system.ask(main_driver, driver.StartBenchmark(self.cfg, self.race.track, self.metrics_store.meta_info, lap))
    except KeyboardInterrupt:
        logger.info("User has cancelled the benchmark.")
        self.actor_system.send(main_driver, driver.BenchmarkCancelled())
        return False
    finally:
        logger.info("Race control has received a benchmark result message. Terminating main driver actor.")
        import thespian.actors
        self.actor_system.tell(main_driver, thespian.actors.ActorExitRequest())

    if isinstance(result, driver.BenchmarkComplete):
        logger.info("Benchmark is complete.")
        logger.info("Bulk adding request metrics to metrics store.")
        self.metrics_store.bulk_add(result.metrics)
        stop_result = self.actor_system.ask(self.mechanic, mechanic.OnBenchmarkStop())
        if isinstance(stop_result, mechanic.BenchmarkStopped):
            logger.info("Bulk adding system metrics to metrics store.")
            self.metrics_store.bulk_add(stop_result.system_metrics)
        else:
            raise exceptions.RallyError("Mechanic has returned no metrics but instead [%s]. Terminating race without result." %
                                        str(stop_result))
        logger.info("Flushing metrics data...")
        self.metrics_store.flush()
        logger.info("Flushing done")
    # may happen if one of the load generators has detected that the user has cancelled the benchmark.
    elif isinstance(result, driver.BenchmarkCancelled):
        logger.info("User has cancelled the benchmark.")
        return False
    elif isinstance(result, driver.BenchmarkFailure):
        logger.info("Driver has reported a benchmark failure.")
        raise exceptions.RallyError(result.message, result.cause)
    else:
        raise exceptions.RallyError("Driver has returned no metrics but instead [%s]. Terminating race without result." % str(result))
    return True
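
The comment above relies on the Thespian distinction between ask() (blocks until the actor replies) and tell() (fire-and-forget). A minimal, Rally-independent sketch of that round trip; the EchoActor name is invented for illustration and this is not a definitive reference for the library.

# Minimal Thespian ask/tell round trip, independent of Rally's actors (assumes thespian is installed).
from thespian.actors import Actor, ActorSystem, ActorExitRequest

class EchoActor(Actor):
    def receiveMessage(self, message, sender):
        if isinstance(message, str):
            # replying to the sender is what makes ActorSystem.ask() return a value
            self.send(sender, "echo: %s" % message)

if __name__ == "__main__":
    actor_system = ActorSystem()
    try:
        echo = actor_system.createActor(EchoActor)
        print(actor_system.ask(echo, "ping", 5))     # blocks up to 5 seconds for the reply
        actor_system.tell(echo, ActorExitRequest())  # fire-and-forget shutdown of the actor
    finally:
        actor_system.shutdown()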
def size(self):
    raise exceptions.RallyError("Do not use a BulkIndexParamSource without partitioning")
def runner_for(operation_type):
    try:
        return __RUNNERS[operation_type]
    except KeyError:
        raise exceptions.RallyError("No runner available for operation type [%s]" % operation_type)
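
runner_for is a straight dictionary lookup; a registry of this shape is typically filled at import time by a registration helper. A self-contained sketch of that pattern; register_runner and bulk_runner here are illustrative and not necessarily Rally's exact API.

# Illustrative operation-type -> runner registry with the same lookup-or-fail behaviour as runner_for above.
__RUNNERS = {}

def register_runner(operation_type, runner):
    __RUNNERS[operation_type] = runner

def runner_for(operation_type):
    try:
        return __RUNNERS[operation_type]
    except KeyError:
        raise KeyError("No runner available for operation type [%s]" % operation_type)

def bulk_runner(es, params):
    # a runner receives the client and the operation parameters produced by the param source
    return {"weight": params.get("bulk-size", 1), "unit": "docs"}

register_runner("bulk", bulk_runner)
print(runner_for("bulk")(es=None, params={"bulk-size": 500}))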
def start(args):
    if actor.actor_system_already_running():
        raise exceptions.RallyError("An actor system appears to be already running.")
    actor.bootstrap_actor_system(local_ip=args.node_ip, coordinator_ip=args.coordinator_ip)
    console.info("Successfully started actor system on node [%s] with coordinator node IP [%s]." % (args.node_ip, args.coordinator_ip))
def receiveMessage(self, msg, sender):
    try:
        logger.debug("MechanicActor#receiveMessage(msg = [%s] sender = [%s])" % (str(type(msg)), str(sender)))
        if isinstance(msg, StartEngine):
            logger.info("Received signal from race control to start engine.")
            self.race_control = sender
            # In our startup procedure we first create all mechanics. Only if this succeeds do we send them their start messages.
            mechanics_and_start_message = []
            if msg.external:
                logger.info("Target node(s) will not be provisioned by Rally.")
                # just create one actor for this special case and run it on the coordinator node (i.e. here)
                m = self.createActor(LocalNodeMechanicActor,
                                     globalName="/rally/mechanic/worker/external",
                                     targetActorRequirements={"coordinator": True})
                self.mechanics.append(m)
                # we can use the original message in this case
                mechanics_and_start_message.append((m, msg))
            else:
                hosts = msg.cfg.opts("client", "hosts")
                logger.info("Target node(s) %s will be provisioned by Rally." % hosts)
                if len(hosts) == 0:
                    raise exceptions.LaunchError("No target hosts are configured.")
                for host in hosts:
                    ip = host["host"]
                    port = int(host["port"])
                    # user may specify "localhost" on the command line but the problem is that we auto-register the actor system
                    # with "ip": "127.0.0.1" so we convert this special case automatically. In all other cases the user needs to
                    # start the actor system on the other host and is aware that the parameter for the actor system and the
                    # --target-hosts parameter need to match.
                    if ip == "localhost" or ip == "127.0.0.1":
                        m = self.createActor(LocalNodeMechanicActor,
                                             globalName="/rally/mechanic/worker/localhost",
                                             targetActorRequirements={"coordinator": True})
                        self.mechanics.append(m)
                        mechanics_and_start_message.append((m, msg.with_port(port)))
                    else:
                        if msg.cfg.opts("system", "remote.benchmarking.supported"):
                            logger.info("Benchmarking against %s with external Rally daemon." % hosts)
                        else:
                            logger.error("User tried to benchmark against %s but no external Rally daemon has been started." % hosts)
                            raise exceptions.SystemSetupError("To benchmark remote hosts (e.g. %s) you need to start the Rally daemon "
                                                              "on each machine including this one." % ip)
                        already_running = actor.actor_system_already_running(ip=ip)
                        logger.info("Actor system on [%s] already running? [%s]" % (ip, str(already_running)))
                        if not already_running:
                            console.println("Waiting for Rally daemon on [%s] " % ip, end="", flush=True)
                        while not actor.actor_system_already_running(ip=ip):
                            console.println(".", end="", flush=True)
                            time.sleep(3)
                        if not already_running:
                            console.println(" [OK]")
                        m = self.createActor(RemoteNodeMechanicActor,
                                             globalName="/rally/mechanic/worker/%s" % ip,
                                             targetActorRequirements={"ip": ip})
                        mechanics_and_start_message.append((m, msg.with_port(port)))
                        self.mechanics.append(m)
            for mechanic_actor, start_message in mechanics_and_start_message:
                self.send(mechanic_actor, start_message)
        elif isinstance(msg, EngineStarted):
            self.send(self.race_control, msg)
        elif isinstance(msg, OnBenchmarkStart):
            for m in self.mechanics:
                self.send(m, msg)
        elif isinstance(msg, Success):
            self.send(self.race_control, msg)
        elif isinstance(msg, Failure):
            self.send(self.race_control, msg)
        elif isinstance(msg, OnBenchmarkStop):
            for m in self.mechanics:
                self.send(m, msg)
        elif isinstance(msg, BenchmarkStopped):
            # TODO dm: Actually we need to wait for all BenchmarkStopped messages from all our mechanic actors
            # TODO dm: We will actually duplicate cluster level metrics if each of our mechanic actors gathers these...
            self.send(self.race_control, msg)
        elif isinstance(msg, StopEngine):
            for m in self.mechanics:
                self.send(m, msg)
        elif isinstance(msg, EngineStopped):
            self.send(self.race_control, msg)
            # clear all state as the mechanic might get reused later
            for m in self.mechanics:
                self.send(m, thespian.actors.ActorExitRequest())
            self.mechanics = []
            # self terminate + slave nodes
            self.send(self.myAddress, thespian.actors.ActorExitRequest())
        elif isinstance(msg, thespian.actors.ChildActorExited):
            # TODO dm: Depending on our state model this can be fine (e.g. when it exited due to our ActorExitRequest message)
            # or it could be problematic and mean that an exception has occurred.
            pass
        elif isinstance(msg, thespian.actors.PoisonMessage):
            # something went wrong with a child actor
            if isinstance(msg.poisonMessage, StartEngine):
                raise exceptions.LaunchError("Could not start benchmark candidate. Are Rally daemons on all targeted machines running?")
            else:
                logger.error("[%s] sent to a child actor has resulted in PoisonMessage" % str(msg.poisonMessage))
                raise exceptions.RallyError("Could not communicate with benchmark candidate (unknown reason)")
    except BaseException:
        logger.exception("Cannot process message [%s]" % msg)
        # usually, we'll notify the sender but in case a child sent something that caused an exception we'd rather
        # have it bubble up to race control. Otherwise, we could play ping-pong with our child actor.
        recipient = self.race_control if sender in self.mechanics else sender
        ex_type, ex_value, ex_traceback = sys.exc_info()
        # avoid "can't pickle traceback objects"
        import traceback
        self.send(recipient, Failure("Could not execute command (%s)" % ex_value, traceback.format_exc()))
def partition(self, partition_index, total_partitions):
    raise exceptions.RallyError("Cannot partition a PartitionBulkIndexParamSource further")
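
size() and partition() above hint at a two-level param-source contract: the unpartitioned source refuses to report a size, and a per-client partition refuses to be split again. A standalone sketch of that shape; the class names, parameter format and slicing strategy are invented for illustration, not Rally's implementation.

# Sketch of the partition-once-then-iterate contract (invented names and data).
class DemoParamSource:
    def __init__(self, documents):
        self.documents = documents

    def partition(self, partition_index, total_partitions):
        # hand each client a disjoint slice of the workload
        return DemoPartition(self.documents[partition_index::total_partitions])

    def size(self):
        raise RuntimeError("Do not use DemoParamSource without partitioning")

class DemoPartition:
    def __init__(self, documents):
        self.documents = documents

    def partition(self, partition_index, total_partitions):
        raise RuntimeError("Cannot partition DemoPartition further")

    def size(self):
        return len(self.documents)

    def params(self):
        for doc in self.documents:
            yield {"body": doc}

source = DemoParamSource(["doc-%d" % i for i in range(10)])
client_0 = source.partition(0, 2)
print(client_0.size(), list(client_0.params())[:2])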
def receiveMessage(self, msg, sender):
    try:
        logger.debug("MechanicActor#receiveMessage(msg = [%s] sender = [%s])" % (str(type(msg)), str(sender)))
        if isinstance(msg, StartEngine):
            self.on_start_engine(msg, sender)
        elif isinstance(msg, NodesStarted):
            self.metrics_store.merge_meta_info(msg.system_meta_info)
            self.transition_when_all_children_responded(sender, msg, "starting", "nodes_started", self.on_all_nodes_started)
        elif isinstance(msg, MetricsMetaInfoApplied):
            self.transition_when_all_children_responded(sender, msg, "apply_meta_info", "cluster_started", self.on_cluster_started)
        elif isinstance(msg, OnBenchmarkStart):
            self.metrics_store.lap = msg.lap
            self.cluster.on_benchmark_start()
            # in the first lap, we are in state "cluster_started", after that in "benchmark_stopped"
            self.send_to_children_and_transition(sender, msg, ["cluster_started", "benchmark_stopped"], "benchmark_starting")
        elif isinstance(msg, BenchmarkStarted):
            self.transition_when_all_children_responded(sender, msg, "benchmark_starting", "benchmark_started", self.on_benchmark_started)
        elif isinstance(msg, Failure):
            self.send(self.race_control, msg)
        elif isinstance(msg, OnBenchmarkStop):
            self.send_to_children_and_transition(sender, msg, "benchmark_started", "benchmark_stopping")
        elif isinstance(msg, BenchmarkStopped):
            self.metrics_store.bulk_add(msg.system_metrics)
            self.transition_when_all_children_responded(sender, msg, "benchmark_stopping", "benchmark_stopped", self.on_benchmark_stopped)
        elif isinstance(msg, StopEngine):
            # detach from cluster and gather all system metrics
            self.cluster_launcher.stop(self.cluster)
            # we might have experienced a launch error, hence we need to allow to stop the cluster also after a launch
            self.send_to_children_and_transition(sender, StopNodes(), ["nodes_started", "benchmark_stopped"], "cluster_stopping")
        elif isinstance(msg, NodesStopped):
            self.metrics_store.bulk_add(msg.system_metrics)
            self.transition_when_all_children_responded(sender, msg, "cluster_stopping", "cluster_stopped", self.on_all_nodes_stopped)
        elif isinstance(msg, thespian.actors.ChildActorExited):
            if self.is_current_status_expected("cluster_stopping"):
                logger.info("Child actor exited while engine is stopping: [%s]" % msg)
            else:
                raise exceptions.RallyError("Child actor exited with [%s] while in status [%s]." % (msg, self.status))
        elif isinstance(msg, thespian.actors.PoisonMessage):
            # something went wrong with a child actor
            if isinstance(msg.poisonMessage, StartEngine):
                raise exceptions.LaunchError("Could not start benchmark candidate. Are Rally daemons on all targeted machines running?")
            else:
                logger.error("[%s] sent to a child actor has resulted in PoisonMessage" % str(msg.poisonMessage))
                raise exceptions.RallyError("Could not communicate with benchmark candidate (unknown reason)")
    except BaseException:
        logger.exception("Cannot process message [%s]" % msg)
        # usually, we'll notify the sender but in case a child sent something that caused an exception we'd rather
        # have it bubble up to race control. Otherwise, we could play ping-pong with our child actor.
        recipient = self.race_control if sender in self.mechanics else sender
        ex_type, ex_value, ex_traceback = sys.exc_info()
        # avoid "can't pickle traceback objects"
        import traceback
        self.send(recipient, Failure("Could not execute command (%s)" % ex_value, traceback.format_exc()))