def race(cfg, kill_running_processes=False):
    """Start a race, first making sure no other Rally process is in the way.

    :param cfg: The configuration object that is handed on to ``racecontrol.run``.
    :param kill_running_processes: If truthy, try to kill other running Rally instances before
        starting. Otherwise the race is refused as soon as another Rally process is found.
    :raises exceptions.UserInterrupted: If the user interrupts while Rally instances are terminated.
    :raises exceptions.RallyError: If other Rally processes are running and they should not be killed.
    """
    log = logging.getLogger(__name__)

    if not kill_running_processes:
        # Check-only mode: refuse to start when another Rally process is found.
        others = process.find_all_other_rally_processes()
        if others:
            pids = [other.pid for other in others]
            raise exceptions.RallyError(
                f"There are other Rally processes running on this machine (PIDs: {pids}) but only one Rally "
                "benchmark is allowed to run at the same time.\n\nYou can use --kill-running-processes flag "
                "to kill running processes automatically and allow Rally to continue to run a new benchmark. "
                "Otherwise, you need to manually kill them."
            )
    else:
        log.info("Killing running Rally processes")
        # Kill any lingering Rally processes before attempting to continue - the actor system
        # needs to be a singleton on this machine.
        # noinspection PyBroadException
        try:
            process.kill_running_rally_instances()
        except KeyboardInterrupt:
            raise exceptions.UserInterrupted(
                "User has cancelled the benchmark whilst terminating Rally instances."
            ) from None
        except BaseException:
            # Best effort only: log the problem and carry on with the race.
            log.exception(
                "Could not terminate potentially running Rally instances correctly. Attempting to go on anyway."
            )

    with_actor_system(racecontrol.run, cfg)
def race(cfg, sources=False, distribution=False, external=False, docker=False):
    """Run a benchmark by delegating to a coordinating ``BenchmarkActor``.

    Joins the actor system, creates the benchmark actor, sends it a ``Setup`` message and
    blocks until a result arrives. The actor is always told to exit afterwards.

    :param cfg: The configuration object passed on in the ``Setup`` message.
    :param sources: Passed on unchanged in the ``Setup`` message.
    :param distribution: Passed on unchanged in the ``Setup`` message.
    :param external: Passed on unchanged in the ``Setup`` message.
    :param docker: Passed on unchanged in the ``Setup`` message.
    :raises exceptions.RallyError: On a benchmark failure or an unexpected result type.
    :raises exceptions.UserInterrupted: If the user interrupts while waiting for the result.
    """
    log = logging.getLogger(__name__)
    # An actor system has to be running at this point, so we only ever join it.
    system = actor.bootstrap_actor_system(try_join=True)
    coordinator = system.createActor(
        BenchmarkActor,
        targetActorRequirements={"coordinator": True},
    )
    try:
        setup_request = Setup(cfg, sources, distribution, external, docker)
        outcome = system.ask(coordinator, setup_request)

        if isinstance(outcome, Success):
            log.info("Benchmark has finished successfully.")
        elif isinstance(outcome, actor.BenchmarkCancelled):
            # May happen if one of the load generators has detected that the user has
            # cancelled the benchmark.
            log.info("User has cancelled the benchmark (detected by actor).")
        elif isinstance(outcome, actor.BenchmarkFailure):
            log.error("A benchmark failure has occurred")
            raise exceptions.RallyError(outcome.message, outcome.cause)
        else:
            unexpected = str(outcome)
            raise exceptions.RallyError(
                "Got an unexpected result during benchmarking: [%s]." % unexpected
            )
    except KeyboardInterrupt:
        log.info("User has cancelled the benchmark (detected by race control).")
        # Notify the coordinator so it can properly handle this state. This is done blocking
        # so there is no race between this message and the actor exit request below.
        system.ask(coordinator, actor.BenchmarkCancelled())
        raise exceptions.UserInterrupted(
            "User has cancelled the benchmark (detected by race control)."
        ) from None
    finally:
        log.info("Telling benchmark actor to exit.")
        exit_request = thespian.actors.ActorExitRequest()
        system.tell(coordinator, exit_request)
def run(cfg):
    """Determine the pipeline for this race and execute it.

    If no pipeline name is configured, one is derived: ``from-distribution`` when
    ``mechanic/distribution.version`` exists in the configuration, ``from-sources`` otherwise.
    The derived name is written back to the configuration as an application override.

    :param cfg: The configuration object; read via ``opts``/``exists`` and updated via ``add``.
    :raises exceptions.SystemSetupError: If the pipeline name is unknown, or if the environment
        variable ``RALLY_RUNNING_IN_DOCKER`` is "true" (case-insensitive) and the pipeline is
        not ``benchmark-only``.
    :raises exceptions.UserInterrupted: If the pipeline is interrupted with ``KeyboardInterrupt``.
    :raises exceptions.RallyError: Passed through unchanged if raised by the pipeline; any other
        exception raised by the pipeline is converted into a ``RallyError`` ("fatal crash").
    """
    logger = logging.getLogger(__name__)
    name = cfg.opts("race", "pipeline")
    race_id = cfg.opts("system", "race.id")
    console.info(f"Race id is [{race_id}]", logger=logger)

    # Truthiness instead of len(): also treats a missing (None) value as "not specified"
    # rather than failing with a TypeError.
    if not name:
        # assume from-distribution pipeline if distribution.version has been specified
        # and --pipeline cli arg not set
        if cfg.exists("mechanic", "distribution.version"):
            name = "from-distribution"
        else:
            name = "from-sources"
        logger.info("User specified no pipeline. Automatically derived pipeline [%s].", name)
        cfg.add(config.Scope.applicationOverride, "race", "pipeline", name)
    else:
        logger.info("User specified pipeline [%s].", name)

    if os.environ.get("RALLY_RUNNING_IN_DOCKER", "").upper() == "TRUE":
        # in this case only benchmarking remote Elasticsearch clusters makes sense
        if name != "benchmark-only":
            raise exceptions.SystemSetupError(
                "Only the [benchmark-only] pipeline is supported by the Rally Docker image.\n"
                "Add --pipeline=benchmark-only in your Rally arguments and try again.\n"
                "For more details read the docs for the benchmark-only pipeline in {}\n".format(
                    doc_link("pipelines.html#benchmark-only")
                )
            )

    try:
        pipeline = pipelines[name]
    except KeyError:
        # The KeyError is an implementation detail of the lookup; suppress it so that only the
        # user-facing error is reported (consistent with the other translated errors here).
        raise exceptions.SystemSetupError(
            "Unknown pipeline [%s]. List the available pipelines with %s list pipelines."
            % (name, PROGRAM_NAME)
        ) from None

    try:
        pipeline(cfg)
    except exceptions.RallyError:
        # just pass on our own errors. It should be treated differently on top-level
        raise
    except KeyboardInterrupt:
        logger.info("User has cancelled the benchmark.")
        raise exceptions.UserInterrupted(
            "User has cancelled the benchmark (detected by race control)."
        ) from None
    except BaseException:
        # Re-raise as a RallyError but keep the traceback of the original failure.
        tb = sys.exc_info()[2]
        raise exceptions.RallyError("This race ended with a fatal crash.").with_traceback(tb)
def with_actor_system(runnable, cfg):
    """Call ``runnable(cfg)`` with a bootstrapped actor system and clean up afterwards.

    An already running actor system is joined. Otherwise a new one is bootstrapped and shut
    down again after ``runnable`` has finished (successfully or not). If bootstrapping fails
    with ``InvalidActorAddress`` or with the "Unable to determine valid external socket
    address." error, an offline actor system is used instead.

    :param runnable: A callable that is invoked with ``cfg`` as its only argument.
    :param cfg: The configuration object. ``system/remote.benchmarking.supported`` is set on it
        when the regular bootstrap succeeds.
    :raises exceptions.UserInterrupted: If the user interrupts bootstrapping, or interrupts the
        shutdown of the internal actor system before it has completed.
    """
    logger = logging.getLogger(__name__)
    already_running = actor.actor_system_already_running()
    logger.info("Actor system already running locally? [%s]", str(already_running))
    try:
        actors = actor.bootstrap_actor_system(
            try_join=already_running, prefer_local_only=not already_running
        )
        # We can only support remote benchmarks if we have a dedicated daemon that is not
        # only bound to 127.0.0.1
        cfg.add(
            config.Scope.application, "system", "remote.benchmarking.supported", already_running
        )
    # This happens when the admin process could not be started, e.g. because it could not
    # open a socket.
    except thespian.actors.InvalidActorAddress:
        logger.info("Falling back to offline actor system.")
        actor.use_offline_actor_system()
        actors = actor.bootstrap_actor_system(try_join=True)
    except KeyboardInterrupt:
        raise exceptions.UserInterrupted(
            "User has cancelled the benchmark (detected whilst bootstrapping actor system)."
        ) from None
    except Exception as e:
        logger.exception("Could not bootstrap actor system.")
        if str(e) == "Unable to determine valid external socket address.":
            console.warn(
                "Could not determine a socket address. Are you running without any network? "
                "Switching to degraded mode.",
                logger=logger,
            )
            logger.info("Falling back to offline actor system.")
            actor.use_offline_actor_system()
            actors = actor.bootstrap_actor_system(try_join=True)
        else:
            raise

    try:
        runnable(cfg)
    finally:
        # We only shutdown the actor system if it was not already running before
        if not already_running:
            shutdown_complete = False
            times_interrupted = 0
            # The user may interrupt the shutdown once and we retry; the second interrupt
            # ends the loop.
            while not shutdown_complete and times_interrupted < 2:
                try:
                    # give some time for any outstanding messages to be delivered to the
                    # actor system
                    time.sleep(3)
                    logger.info("Attempting to shutdown internal actor system.")
                    actors.shutdown()
                    # note that this check will only evaluate to True for a TCP-based actor
                    # system.
                    timeout = 15
                    still_running = actor.actor_system_already_running()
                    while still_running and timeout > 0:
                        logger.info("Actor system is still running. Waiting...")
                        time.sleep(1)
                        timeout -= 1
                        still_running = actor.actor_system_already_running()
                    # Decide on the observed state rather than on the remaining timeout:
                    # the actor system may have stopped exactly on the last poll, in which
                    # case the timeout is exhausted although the shutdown has succeeded.
                    if not still_running:
                        shutdown_complete = True
                        logger.info("Shutdown completed.")
                    else:
                        logger.warning("Shutdown timed out. Actor system is still running.")
                        break
                except KeyboardInterrupt:
                    times_interrupted += 1
                    logger.warning("User interrupted shutdown of internal actor system.")
                    console.info(
                        "Please wait a moment for Rally's internal components to shutdown."
                    )

            if not shutdown_complete and times_interrupted > 0:
                logger.warning(
                    "Terminating after user has interrupted actor system shutdown explicitly for [%d] times.",
                    times_interrupted,
                )
                console.println("")
                console.warn("Terminating now at the risk of leaving child processes behind.")
                console.println("")
                console.warn("The next race may fail due to an unclean shutdown.")
                console.println("")
                console.println(SKULL)
                console.println("")
                raise exceptions.UserInterrupted(
                    f"User has cancelled the benchmark (shutdown not complete as user interrupted "
                    f"{times_interrupted} times)."
                ) from None
            elif not shutdown_complete:
                console.warn(
                    "Could not terminate all internal processes within timeout. "
                    "Please check and force-terminate all Rally processes."
                )