def _start_process(self, env, node_name, binary_path):
    """
    Launches an Elasticsearch node as a child process and waits until startup has been signalled.

    :param env: Environment variables for the child process.
    :param node_name: Name of the node. Used in log messages and passed on to the output reader thread.
    :param binary_path: Installation directory. It becomes the current working directory so that the relative
                        command ``bin/elasticsearch`` resolves.
    :return: The ``subprocess.Popen`` handle of the started node.
    :raises exceptions.LaunchError: If Rally runs as root, if the process has already terminated or if startup
                                    has not been signalled within ``PROCESS_WAIT_TIMEOUT_SECONDS``.
    """
    if os.geteuid() == 0:
        raise exceptions.LaunchError("Cannot launch Elasticsearch as root. Please run Rally as a non-root user.")
    os.chdir(binary_path)
    startup_event = threading.Event()
    cmd = ["bin/elasticsearch"]
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, stdin=subprocess.DEVNULL, env=env)
    # The reader thread must never keep the interpreter alive, hence it is a daemon thread.
    t = threading.Thread(target=self._read_output, args=(node_name, process, startup_event), daemon=True)
    t.start()
    if startup_event.wait(timeout=InProcessLauncher.PROCESS_WAIT_TIMEOUT_SECONDS):
        # Has the process terminated? ``poll()`` returns ``None`` only while the process is still running;
        # an exit code of 0 means "terminated" as well, so we must not rely on truthiness here.
        exit_code = process.poll()
        if exit_code is not None:
            msg = "Node [%s] has terminated with exit code [%s]." % (node_name, str(exit_code))
            self.logger.error(msg)
            raise exceptions.LaunchError(msg)
        self.logger.info("Started node [%s] with PID [%s]", node_name, process.pid)
        return process

    msg = "Could not start node [%s] within timeout period of [%s] seconds." % (
        node_name, InProcessLauncher.PROCESS_WAIT_TIMEOUT_SECONDS)
    # check if the process has terminated already
    exit_code = process.poll()
    if exit_code is not None:
        msg += " The process has already terminated with exit code [%s]." % str(exit_code)
    else:
        msg += " The process seems to be still running with PID [%s]." % process.pid
    self.logger.error(msg)
    raise exceptions.LaunchError(msg)
def _start_process(self, cmd, env, node_name):
    """
    Launches a node with the provided command line and waits until startup has been signalled.

    :param cmd: The command (as accepted by ``subprocess.Popen``) that starts the node.
    :param env: Environment variables for the child process.
    :param node_name: Name of the node. Used in log messages and passed on to the output reader thread.
    :return: The ``subprocess.Popen`` handle of the started node.
    :raises exceptions.LaunchError: If Rally runs as root or if startup has not been signalled within
                                    ``PROCESS_WAIT_TIMEOUT_SECONDS``.
    """
    if os.geteuid() == 0:
        raise exceptions.LaunchError("Cannot launch Elasticsearch as root. Please run Rally as a non-root user.")
    # The command is started from within the installation directory.
    install_dir = self.cfg.opts("provisioning", "local.binary.path")
    os.chdir(install_dir)
    startup_event = threading.Event()
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, stdin=subprocess.DEVNULL, env=env)
    # The reader thread must never keep the interpreter alive, hence it is a daemon thread.
    t = threading.Thread(target=self._read_output, args=(node_name, process, startup_event), daemon=True)
    t.start()
    if startup_event.wait(timeout=InProcessLauncher.PROCESS_WAIT_TIMEOUT_SECONDS):
        logger.info("Started node=%s with pid=%s", node_name, process.pid)
        return process

    log_dir = self.cfg.opts("system", "log.dir")
    msg = "Could not start node '%s' within timeout period of %s seconds." % (
        node_name, InProcessLauncher.PROCESS_WAIT_TIMEOUT_SECONDS)
    logger.error(msg)
    raise exceptions.LaunchError("%s Please check the logs in '%s' for more details." % (msg, log_dir))
def _start(process, node_name):
    """
    Waits until an already launched node process has signalled startup.

    :param process: The ``subprocess.Popen``-like handle of the node process.
    :param node_name: Name of the node. Used in log messages and passed on to the ``StartupWatcher``.
    :return: ``process`` once startup has been signalled and the process is still running.
    :raises exceptions.LaunchError: If the process has already terminated or if startup has not been signalled
                                    within ``PROCESS_WAIT_TIMEOUT_SECONDS``.
    """
    log = logging.getLogger(__name__)
    startup_event = threading.Event()
    watcher = StartupWatcher(node_name, process, startup_event)
    # The watcher thread must never keep the interpreter alive, hence it is a daemon thread.
    t = threading.Thread(target=watcher.watch, daemon=True)
    t.start()
    if startup_event.wait(timeout=InProcessLauncher.PROCESS_WAIT_TIMEOUT_SECONDS):
        # Has the process terminated? ``poll()`` returns ``None`` only while the process is still running;
        # an exit code of 0 means "terminated" as well, so we must not rely on truthiness here.
        exit_code = process.poll()
        if exit_code is not None:
            msg = "Node [%s] has terminated with exit code [%s]." % (node_name, str(exit_code))
            log.error(msg)
            raise exceptions.LaunchError(msg)
        log.info("Started node [%s] with PID [%s].", node_name, process.pid)
        return process

    msg = "Could not start node [%s] within timeout period of [%s] seconds." % (
        node_name, InProcessLauncher.PROCESS_WAIT_TIMEOUT_SECONDS)
    # check if the process has terminated already
    exit_code = process.poll()
    if exit_code is not None:
        msg += " The process has already terminated with exit code [%s]." % str(exit_code)
    else:
        msg += " The process seems to be still running with PID [%s]." % process.pid
    log.error(msg)
    raise exceptions.LaunchError(msg)
def _start_process(binary_path, env):
    """
    Starts Elasticsearch from ``binary_path`` with the options ``-d -p pid`` and returns its process id.

    :param binary_path: Installation directory. It becomes the current working directory.
    :param env: Environment that is passed on to the subprocess helper.
    :return: The process id as read from the pid file.
    :raises exceptions.LaunchError: If Rally runs as root (POSIX only) or if the startup command exits with a
                                    non-zero exit code.
    """
    running_as_root = os.name == "posix" and os.geteuid() == 0
    if running_as_root:
        raise exceptions.LaunchError("Cannot launch Elasticsearch as root. Please run Rally as a non-root user.")
    os.chdir(binary_path)
    es_binary = io.escape_path(os.path.join(".", "bin", "elasticsearch"))
    command_line = " ".join([es_binary, "-d", "-p", "pid"])
    exit_code = ProcessLauncher._run_subprocess(command_line=command_line, env=env)
    if exit_code != 0:
        msg = "Daemon startup failed with exit code [{}]".format(exit_code)
        logging.error(msg)
        raise exceptions.LaunchError(msg)
    # the pid file is written relative to the installation directory that we have changed to above
    pid_file = io.escape_path(os.path.join(".", "pid"))
    return wait_for_pidfile(pid_file)
def _start_process(binary_path, env):
    """
    Starts Elasticsearch from ``binary_path`` with the options ``-d -p pid`` and returns its process id.

    :param binary_path: Installation directory. It becomes the current working directory.
    :param env: Environment that is passed on to the subprocess helper.
    :return: The process id as read from the pid file.
    :raises exceptions.LaunchError: If Rally runs as root or if the startup command exits with a non-zero exit code.
    """
    if os.geteuid() == 0:
        raise exceptions.LaunchError("Cannot launch Elasticsearch as root. Please run Rally as a non-root user.")
    os.chdir(binary_path)
    command_line = " ".join(["bin/elasticsearch", "-d", "-p", "pid"])
    exit_code = process.run_subprocess_with_logging(command_line=command_line, env=env)
    if exit_code != 0:
        msg = "Daemon startup failed with exit code[{}]".format(exit_code)
        logging.error(msg)
        raise exceptions.LaunchError(msg)
    # the pid file is written relative to the installation directory that we have changed to above
    return wait_for_pidfile("./pid")
def start(self, car, binary, data_paths):
    """
    Starts the cluster via ``docker-compose`` and attaches telemetry once the REST API responds.

    :param car: Not used by this implementation.
    :param binary: Path to the docker-compose file. It is stored as ``self.binary_path``.
    :param data_paths: Not used by this implementation.
    :return: A representation of the launched cluster.
    :raises exceptions.LaunchError: If the REST API does not become available.
    """
    self.binary_path = binary

    hosts = self.cfg.opts("client", "hosts")
    client_options = self.cfg.opts("client", "options")
    es = self.client_factory(hosts, client_options).create()

    # Cannot enable custom telemetry devices here.
    #
    # Be aware that some of the meta-data are taken from the host system, not the container (e.g. number of
    # CPU cores) so if the Docker container constrains these, the metrics are actually wrong.
    devices = [
        telemetry.EnvironmentInfo(es, self.metrics_store),
        telemetry.NodeStats(es, self.metrics_store),
        telemetry.IndexStats(es, self.metrics_store),
        telemetry.DiskIo(self.metrics_store),
        telemetry.CpuUsage(self.metrics_store)
    ]
    t = telemetry.Telemetry(devices=devices)
    c = cluster.Cluster(hosts, [], t)

    self._start_process(cmd="docker-compose -f %s up" % self.binary_path, node_name="rally0")
    logger.info("Docker container has successfully started. Checking if REST API is available.")
    if not wait_for_rest_layer(es):
        logger.error("REST API layer is not yet available. Forcefully terminating cluster.")
        self.stop(c)
        raise exceptions.LaunchError("Elasticsearch REST API layer is not available. Forcefully terminated cluster.")

    logger.info("REST API is available. Attaching telemetry devices to cluster.")
    t.attach_to_cluster(c)
    logger.info("Telemetry devices are now attached to the cluster.")
    return c
def start(self, car, binary, data_paths):
    """
    Starts all nodes of the cluster on this machine and attaches telemetry once the REST API responds.

    :param car: The car to use. ``car.nodes`` determines how many nodes are started.
    :param binary: Passed on to ``_start_node`` for every node.
    :param data_paths: Data paths that are handed to the ``IndexSize`` telemetry device.
    :return: A representation of the launched cluster.
    :raises exceptions.LaunchError: If the REST API does not become available.
    """
    hosts = self.cfg.opts("client", "hosts")
    client_options = self.cfg.opts("client", "options")
    es = client.EsClientFactory(hosts, client_options).create()

    # we're very specific which nodes we kill as there is potentially also an Elasticsearch based metrics store
    # running on this machine
    node_prefix = self.cfg.opts("provisioning", "node.name.prefix")
    process.kill_running_es_instances(node_prefix)

    logger.info("Starting a cluster based on car [%s] with [%d] nodes." % (car, car.nodes))

    # TODO dm: Get rid of these...
    enabled_devices = self.cfg.opts("mechanic", "telemetry.devices")

    # TODO dm: Once we do distributed launching, MergeParts and IndexSize need to be done per node not per cluster
    cluster_telemetry = [
        telemetry.MergeParts(self.metrics_store, self.node_log_dir),
        telemetry.EnvironmentInfo(es, self.metrics_store),
        telemetry.NodeStats(es, self.metrics_store),
        telemetry.IndexStats(es, self.metrics_store),
        telemetry.IndexSize(data_paths, self.metrics_store)
    ]
    t = telemetry.Telemetry(enabled_devices, devices=cluster_telemetry)

    started_nodes = [self._start_node(node, car, es, binary) for node in range(car.nodes)]
    c = cluster.Cluster(hosts, started_nodes, t)
    logger.info("All cluster nodes have successfully started. Checking if REST API is available.")
    if not wait_for_rest_layer(es):
        logger.error("REST API layer is not yet available. Forcefully terminating cluster.")
        self.stop(c)
        raise exceptions.LaunchError("Elasticsearch REST API layer is not available. Forcefully terminated cluster.")

    logger.info("REST API is available. Attaching telemetry devices to cluster.")
    t.attach_to_cluster(c)
    logger.info("Telemetry devices are now attached to the cluster.")
    return c
def receiveMsg_StartEngine(self, msg, sender):
    """
    Handles the request from race control to start the engine.

    Opens the metrics store, loads the team and then either creates a single node mechanic (externally
    provisioned cluster) or hands the message over to a ``Dispatcher`` actor.

    :param msg: The start message. Provides ``cfg``, ``open_metrics_context`` and ``external``.
    :param sender: The sending actor. It is remembered as race control.
    :raises exceptions.LaunchError: If no target hosts are configured.
    """
    self.logger.info("Received signal from race control to start engine.")
    self.race_control = sender
    self.cfg = msg.cfg

    store_class = metrics.metrics_store_class(self.cfg)
    self.metrics_store = store_class(self.cfg)
    self.metrics_store.open(ctx=msg.open_metrics_context)
    self.car, _ = load_team(self.cfg, msg.external)

    # In our startup procedure we first create all mechanics. Only if this succeeds we'll continue.
    hosts = self.cfg.opts("client", "hosts").default
    if len(hosts) == 0:
        raise exceptions.LaunchError("No target hosts are configured.")

    if msg.external:
        self.logger.info("Cluster will not be provisioned by Rally.")
        # just create one actor for this special case and run it on the coordinator node (i.e. here)
        mechanic = self.createActor(NodeMechanicActor, targetActorRequirements={"coordinator": True})
        self.children.append(mechanic)
        self.send(mechanic, msg.for_nodes(ip=hosts))
    else:
        self.logger.info("Cluster consisting of %s will be provisioned by Rally.", hosts)
        msg.hosts = hosts
        # Initialize the children array to have the right size to ensure waiting for all responses
        expected_children = len(nodes_by_host(to_ip_port(hosts)))
        self.children = [None] * expected_children
        self.send(self.createActor(Dispatcher), msg)

    self.status = "starting"
    self.received_responses = []
def _do_wait(self, expected_cluster_status):
    """
    Polls the cluster health API up to ten times until the expected status is reached without relocating shards.

    :param expected_cluster_status: The cluster status to wait for.
    :return: A tuple of (reached cluster status, number of relocating shards).
    :raises exceptions.LaunchError: If the expected status has not been reached after all attempts.
    """
    reached_cluster_status = None
    for _ in range(10):
        try:
            health = self.client.cluster.health(wait_for_status=expected_cluster_status,
                                                wait_for_relocating_shards=0,
                                                timeout="3s")
        except (socket.timeout, elasticsearch.exceptions.ConnectionError, elasticsearch.exceptions.TransportError):
            # NOTE(review): a failed request is retried immediately, i.e. without the 0.5 seconds pause that is
            # applied below - confirm that this is intended.
            continue

        reached_cluster_status = health["status"]
        relocating_shards = health["relocating_shards"]
        logger.info("GOT: %s" % str(health))
        logger.info("ALLOC:\n%s" % self.client.cat.allocation(v=True))
        logger.info("RECOVERY:\n%s" % self.client.cat.recovery(v=True))
        logger.info("SHARDS:\n%s" % self.client.cat.shards(v=True))
        if reached_cluster_status == expected_cluster_status and relocating_shards == 0:
            return reached_cluster_status, relocating_shards
        time.sleep(0.5)

    msg = "Cluster did not reach status [%s]. Last reached status: [%s]" % (expected_cluster_status, reached_cluster_status)
    logger.error(msg)
    raise exceptions.LaunchError(msg)
def start(self):
    """
    Creates the cluster representation and attaches telemetry once the REST API responds.

    :return: A representation of the launched cluster.
    :raises exceptions.LaunchError: If the REST API does not become available.
    """
    hosts = self.cfg.opts("client", "hosts")
    client_options = self.cfg.opts("client", "options")
    es = self.client_factory(hosts, client_options).create()

    devices = [
        telemetry.ClusterMetaDataInfo(es),
        telemetry.ClusterEnvironmentInfo(es, self.metrics_store),
        telemetry.NodeStats(es, self.metrics_store),
        telemetry.IndexStats(es, self.metrics_store)
    ]
    t = telemetry.Telemetry(devices=devices)

    # The list of nodes will be populated by ClusterMetaDataInfo, so no need to do it here
    c = cluster.Cluster(hosts, [], t)
    logger.info("All cluster nodes have successfully started. Checking if REST API is available.")
    if not wait_for_rest_layer(es, max_attempts=20):
        # Just stop the cluster here and raise. The caller is responsible for terminating individual nodes.
        logger.error("REST API layer is not yet available. Forcefully terminating cluster.")
        self.stop(c)
        raise exceptions.LaunchError("Elasticsearch REST API layer is not available. Forcefully terminated cluster.")

    logger.info("REST API is available. Attaching telemetry devices to cluster.")
    t.attach_to_cluster(c)
    logger.info("Telemetry devices are now attached to the cluster.")
    return c
def receiveMsg_StartEngine(self, msg, sender):
    """
    Handles the request from race control to start the engine.

    For an externally provisioned cluster the status is advanced directly to ``cluster_started``. Otherwise the
    message is handed over to a ``Dispatcher`` actor and the status becomes ``starting``.

    :param msg: The start message. Provides ``cfg`` and ``external``.
    :param sender: The sending actor. It is remembered as race control.
    :raises exceptions.LaunchError: If no target hosts are configured.
    """
    self.logger.info("Received signal from race control to start engine.")
    self.race_control = sender
    self.cfg = msg.cfg
    self.car, _ = load_team(self.cfg, msg.external)
    # TODO: This is implicitly set by #load_team() - can we gather this elsewhere?
    self.team_revision = self.cfg.opts("mechanic", "repository.revision")

    # In our startup procedure we first create all mechanics. Only if this succeeds we'll continue.
    hosts = self.cfg.opts("client", "hosts").default
    if len(hosts) == 0:
        raise exceptions.LaunchError("No target hosts are configured.")

    self.externally_provisioned = msg.external
    if not self.externally_provisioned:
        console.info("Preparing for race ...", flush=True)
        self.logger.info("Cluster consisting of %s will be provisioned by Rally.", hosts)
        msg.hosts = hosts
        # Initialize the children array to have the right size to ensure waiting for all responses
        expected_children = len(nodes_by_host(to_ip_port(hosts)))
        self.children = [None] * expected_children
        self.send(self.createActor(Dispatcher), msg)
        self.status = "starting"
        self.received_responses = []
    else:
        self.logger.info("Cluster will not be provisioned by Rally.")
        self.status = "nodes_started"
        self.received_responses = []
        self.on_all_nodes_started()
        self.status = "cluster_started"
def on_start_engine(self, msg, sender):
    """
    Handles the request from race control to start the engine.

    Opens the metrics store, creates one node mechanic actor per target host and finally sends the respective
    start message to each of them. For remote hosts this method blocks until an actor system is reachable there.

    :param msg: The start message. Provides ``cfg``, ``open_metrics_context``, ``external`` and ``for_nodes()``.
    :param sender: The sending actor. It is remembered as race control.
    :raises exceptions.LaunchError: If no target hosts are configured.
    :raises exceptions.SystemSetupError: If a remote host is targeted but remote benchmarking is not supported.
    """
    logger.info("Received signal from race control to start engine.")
    self.race_control = sender
    self.cfg = msg.cfg
    store_class = metrics.metrics_store_class(self.cfg)
    self.metrics_store = store_class(self.cfg)
    self.metrics_store.open(ctx=msg.open_metrics_context)

    # In our startup procedure we first create all mechanics. Only if this succeeds we'll continue.
    mechanics_and_start_message = []
    hosts = self.cfg.opts("client", "hosts")
    if len(hosts) == 0:
        raise exceptions.LaunchError("No target hosts are configured.")

    if msg.external:
        logger.info("Cluster will not be provisioned by Rally.")
        # just create one actor for this special case and run it on the coordinator node (i.e. here)
        mechanic = self.createActor(NodeMechanicActor, targetActorRequirements={"coordinator": True})
        self.children.append(mechanic)
        mechanics_and_start_message.append((mechanic, msg.for_nodes(ip=hosts)))
    else:
        logger.info("Cluster consisting of %s will be provisioned by Rally." % hosts)
        all_ips_and_ports = to_ip_port(hosts)
        all_node_ips = extract_all_node_ips(all_ips_and_ports)
        for (ip, port), nodes in nodes_by_host(all_ips_and_ports).items():
            if ip == "127.0.0.1":
                mechanic = self.createActor(NodeMechanicActor, targetActorRequirements={"coordinator": True})
                self.children.append(mechanic)
                mechanics_and_start_message.append((mechanic, msg.for_nodes(all_node_ips, ip, port, nodes)))
                continue

            if not self.cfg.opts("system", "remote.benchmarking.supported"):
                logger.error("User tried to benchmark against %s but no external Rally daemon has been started." % hosts)
                raise exceptions.SystemSetupError("To benchmark remote hosts (e.g. %s) you need to start the Rally daemon "
                                                  "on each machine including this one." % ip)
            logger.info("Benchmarking against %s with external Rally daemon." % hosts)

            already_running = actor.actor_system_already_running(ip=ip)
            logger.info("Actor system on [%s] already running? [%s]" % (ip, str(already_running)))
            # progress is only printed if we actually have to wait for the remote daemon
            if not already_running:
                console.println("Waiting for Rally daemon on [%s] " % ip, end="", flush=True)
            while not actor.actor_system_already_running(ip=ip):
                console.println(".", end="", flush=True)
                time.sleep(3)
            if not already_running:
                console.println(" [OK]")
            mechanic = self.createActor(NodeMechanicActor, targetActorRequirements={"ip": ip})
            mechanics_and_start_message.append((mechanic, msg.for_nodes(all_node_ips, ip, port, nodes)))
            self.children.append(mechanic)

    self.status = "starting"
    self.received_responses = []
    for mechanic_actor, start_message in mechanics_and_start_message:
        self.send(mechanic_actor, start_message)
def _start_process(self, cmd, env, node_name, binary_path):
    """
    Launches a node with the provided command line and waits until startup has been signalled.

    :param cmd: The command (as accepted by ``subprocess.Popen``) that starts the node.
    :param env: Environment variables for the child process.
    :param node_name: Name of the node. Used in log messages and passed on to the output reader thread.
    :param binary_path: Installation directory. It becomes the current working directory.
    :return: The ``subprocess.Popen`` handle of the started node.
    :raises exceptions.LaunchError: If Rally runs as root or if startup has not been signalled within
                                    ``PROCESS_WAIT_TIMEOUT_SECONDS``.
    """
    if os.geteuid() == 0:
        raise exceptions.LaunchError("Cannot launch Elasticsearch as root. Please run Rally as a non-root user.")
    os.chdir(binary_path)
    startup_event = threading.Event()
    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, stdin=subprocess.DEVNULL, env=env)
    # The reader thread must never keep the interpreter alive, hence it is a daemon thread.
    t = threading.Thread(target=self._read_output, args=(node_name, process, startup_event), daemon=True)
    t.start()
    if startup_event.wait(timeout=InProcessLauncher.PROCESS_WAIT_TIMEOUT_SECONDS):
        logger.info("Started node [%s] with PID [%s]", node_name, process.pid)
        return process

    msg = "Could not start node [%s] within timeout period of [%s] seconds." % (
        node_name, InProcessLauncher.PROCESS_WAIT_TIMEOUT_SECONDS)
    logger.error(msg)
    raise exceptions.LaunchError(msg)
def cmd_line_opt(self, distribution_version, key):
    """
    Looks up a command line option for the best matching known distribution version.

    :param distribution_version: The version of the distribution that should be started.
    :param key: The key of the command line option to look up.
    :return: The entry for ``key`` of the best matching version in ``ES_CMD_LINE_OPTS_PER_VERSION``.
    :raises exceptions.LaunchError: If no version matches ``distribution_version``.
    """
    opts_per_version = InProcessLauncher.ES_CMD_LINE_OPTS_PER_VERSION
    best_version = versions.best_match(opts_per_version.keys(), distribution_version)
    if not best_version:
        raise exceptions.LaunchError("Cannot start cluster. Unsupported distribution version %s. "
                                     "Please raise a bug at %s." %
                                     (distribution_version, console.format.link("https://github.com/elastic/rally")))
    return opts_per_version[best_version][key]
def start(self):
    """
    Performs final startup tasks.

    Precondition: All cluster nodes have been started.
    Postcondition: The cluster is ready to receive HTTP requests or a ``LaunchError`` is raised.

    :return: A representation of the launched cluster.
    """
    enabled_devices = self.cfg.opts("mechanic", "telemetry.devices")
    telemetry_params = self.cfg.opts("mechanic", "telemetry.params")
    all_hosts = self.cfg.opts("client", "hosts").all_hosts
    default_hosts = self.cfg.opts("client", "hosts").default
    preserve = self.cfg.opts("mechanic", "preserve.install")
    skip_rest_api_check = self.cfg.opts("mechanic", "skip.rest.api.check")

    # one client per configured cluster, keyed by cluster name
    es = {}
    for cluster_name, cluster_hosts in all_hosts.items():
        all_client_options = self.cfg.opts("client", "options").all_client_options
        cluster_client_options = dict(all_client_options[cluster_name])
        # Use retries to avoid aborts on long living connections for telemetry devices
        cluster_client_options["retry-on-timeout"] = True
        es[cluster_name] = self.client_factory(cluster_hosts, cluster_client_options).create()

    es_default = es["default"]

    devices = [
        telemetry.NodeStats(telemetry_params, es, self.metrics_store),
        telemetry.ClusterMetaDataInfo(es_default),
        telemetry.ClusterEnvironmentInfo(es_default, self.metrics_store),
        telemetry.JvmStatsSummary(es_default, self.metrics_store),
        telemetry.IndexStats(es_default, self.metrics_store),
        telemetry.MlBucketProcessingTime(es_default, self.metrics_store),
        telemetry.CcrStats(telemetry_params, es, self.metrics_store),
        telemetry.RecoveryStats(telemetry_params, es, self.metrics_store)
    ]
    t = telemetry.Telemetry(enabled_devices, devices=devices)

    # The list of nodes will be populated by ClusterMetaDataInfo, so no need to do it here
    c = cluster.Cluster(default_hosts, [], t, preserve)

    if skip_rest_api_check:
        self.logger.info("Skipping REST API check and attaching telemetry devices to cluster.")
        t.attach_to_cluster(c)
        self.logger.info("Telemetry devices are now attached to the cluster.")
        return c

    self.logger.info("All cluster nodes have successfully started. Checking if REST API is available.")
    if not wait_for_rest_layer(es_default, max_attempts=40):
        # Just stop the cluster here and raise. The caller is responsible for terminating individual nodes.
        self.logger.error("REST API layer is not yet available. Forcefully terminating cluster.")
        self.stop(c)
        raise exceptions.LaunchError("Elasticsearch REST API layer is not available. Forcefully terminated cluster.")

    self.logger.info("REST API is available. Attaching telemetry devices to cluster.")
    t.attach_to_cluster(c)
    self.logger.info("Telemetry devices are now attached to the cluster.")
    return c
def _start_process(self, binary_path):
    """
    Runs ``docker-compose ... up -d`` for ``binary_path`` and waits until the container is healthy.

    :param binary_path: Passed to the helpers that build the compose command and determine the container id.
    :raises exceptions.LaunchError: If the compose command exits with a non-zero exit code.
    """
    compose_cmd = _get_docker_compose_cmd(binary_path, "up -d")
    exit_code = process.run_subprocess_with_logging(compose_cmd)
    if exit_code != 0:
        msg = "Docker daemon startup failed with exit code[{}]".format(exit_code)
        logging.error(msg)
        raise exceptions.LaunchError(msg)

    _wait_for_healthy_running_container(_get_container_id(binary_path))
def _wait_for_healthy_running_container(container_id, timeout=60):
    """
    Polls ``docker ps`` until the given container is reported as running and healthy.

    :param container_id: The id of the container to wait for.
    :param timeout: Maximum time to wait in seconds.
    :raises exceptions.LaunchError: If the container is not running and healthy within ``timeout`` seconds.
    """
    cmd = 'docker ps -a --filter "id={}" --filter "status=running" --filter "health=healthy" -q'.format(container_id)
    deadline = _time() + timeout
    while _time() < deadline:
        raw_output = subprocess.check_output(shlex.split(cmd))
        # "-q" only prints container ids, so any remaining output means that the filters have matched
        matching_containers = raw_output.decode("utf-8").rstrip()
        if len(matching_containers) > 0:
            return
        time.sleep(0.5)

    msg = "No healthy running container after {} seconds!".format(timeout)
    logging.error(msg)
    raise exceptions.LaunchError(msg)
def wait_for_pidfile(pidfilename, timeout=60):
    """
    Waits until the pid file is available and returns the process id that it contains.

    :param pidfilename: Path to the pid file.
    :param timeout: Maximum time to wait in seconds.
    :return: The process id (as ``int``) that has been read from the pid file.
    :raises exceptions.LaunchError: If no process id could be read within ``timeout`` seconds.
    """
    endtime = _time() + timeout
    while _time() < endtime:
        try:
            with open(pidfilename, "rb") as f:
                content = f.read()
        except FileNotFoundError:
            content = b""
        # The file may already exist but still be empty because the process has not written its pid yet.
        # Treat this like a missing file and retry instead of failing in ``int()``.
        if content.strip():
            return int(content)
        time.sleep(0.5)

    msg = "pid file not available after {} seconds!".format(timeout)
    logging.error(msg)
    raise exceptions.LaunchError(msg)
def _wait_for_healthy_running_container(self, container_id, timeout):
    """
    Polls ``docker ps`` until the given container is reported as running and healthy.

    :param container_id: The id of the container to wait for.
    :param timeout: Maximum time to wait; it is compared against the split time of a stop watch from ``self.clock``.
    :raises exceptions.LaunchError: If the container is not running and healthy within the timeout.
    """
    cmd = 'docker ps -a --filter "id={}" --filter "status=running" --filter "health=healthy" -q'.format(container_id)
    watch = self.clock.stop_watch()
    watch.start()
    while watch.split_time() < timeout:
        # "-q" only prints container ids, so any output means that the filters have matched
        matching_containers = process.run_subprocess_with_output(cmd)
        if len(matching_containers) > 0:
            return
        time.sleep(0.5)

    msg = "No healthy running container after {} seconds!".format(timeout)
    logging.error(msg)
    raise exceptions.LaunchError(msg)
def wait_for_pidfile(pidfilename, timeout=60, clock=time.Clock):
    """
    Waits until the pid file is available and returns the process id that it contains.

    :param pidfilename: Path to the pid file.
    :param timeout: Maximum time to wait; it is compared against the split time of the stop watch.
    :param clock: Provides ``stop_watch()`` to measure the elapsed time.
    :return: The process id (as ``int``) that has been read from the pid file.
    :raises exceptions.LaunchError: If no process id could be read within the timeout.
    """
    stop_watch = clock.stop_watch()
    stop_watch.start()
    while stop_watch.split_time() < timeout:
        try:
            with open(pidfilename, "rb") as f:
                content = f.read()
        except FileNotFoundError:
            content = b""
        # The file may already exist but still be empty because the process has not written its pid yet.
        # Treat this like a missing file and retry instead of failing in ``int()``.
        if content.strip():
            return int(content)
        time.sleep(0.5)

    msg = "pid file not available after {} seconds!".format(timeout)
    logging.error(msg)
    raise exceptions.LaunchError(msg)
def _start_process(self, env, node_name, binary_path):
    """
    Launches ``bin/elasticsearch`` from ``binary_path`` and delegates waiting for startup to ``_start``.

    :param env: Environment variables for the child process.
    :param node_name: Name of the node; passed on to ``_start``.
    :param binary_path: Installation directory. It becomes the current working directory.
    :return: Whatever ``_start`` returns for the launched process.
    :raises exceptions.LaunchError: If Rally runs as root.
    """
    if os.geteuid() == 0:
        raise exceptions.LaunchError("Cannot launch Elasticsearch as root. Please run Rally as a non-root user.")
    os.chdir(binary_path)
    node_process = subprocess.Popen(["bin/elasticsearch"],
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.STDOUT,
                                    stdin=subprocess.DEVNULL,
                                    env=env)
    return _start(node_process, node_name)
def _start_process(self, cmd, node_name, log_dir):
    """
    Launches the provided command line and waits until startup has been signalled.

    :param cmd: The command line as a single string. It is split with ``shlex.split``.
    :param node_name: Name of the node. Used in log messages and passed on to the output reader thread.
    :param log_dir: Log directory; only mentioned in the error message.
    :return: The ``subprocess.Popen`` handle of the started process.
    :raises exceptions.LaunchError: If startup has not been signalled within
                                    ``DockerLauncher.PROCESS_WAIT_TIMEOUT_SECONDS``.
    """
    startup_event = threading.Event()
    p = subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, stdin=subprocess.DEVNULL)
    # The reader thread must never keep the interpreter alive, hence it is a daemon thread.
    t = threading.Thread(target=self._read_output, args=(node_name, p, startup_event), daemon=True)
    t.start()
    # Use the same constant for waiting and for the error message so the reported timeout is the effective one.
    timeout = DockerLauncher.PROCESS_WAIT_TIMEOUT_SECONDS
    if startup_event.wait(timeout=timeout):
        self.logger.info("Started node=%s with pid=%s", node_name, p.pid)
        return p

    msg = "Could not start node '%s' within timeout period of %s seconds." % (node_name, timeout)
    self.logger.error(msg)
    raise exceptions.LaunchError("%s Please check the logs in '%s' for more details." % (msg, log_dir))
def wait_for_rest_layer(es, max_attempts=20):
    """
    Waits until the REST API of the cluster responds to an info request.

    :param es: Elasticsearch client that is used to issue the info request.
    :param max_attempts: Maximum number of attempts. There is a pause of one second after each failed attempt.
    :return: ``True`` if the REST API has responded, ``False`` if all attempts have failed.
    :raises exceptions.LaunchError: If the connection error indicates an SSL protocol mismatch.
    :raises elasticsearch.TransportError: For any transport error with a status code other than 503 or 401.
    """
    # imported once up front instead of on every loop iteration
    import elasticsearch

    for _ in range(max_attempts):
        try:
            es.info()
            return True
        except elasticsearch.ConnectionError as e:
            if "SSL: UNKNOWN_PROTOCOL" in str(e):
                # retrying cannot help here, so fail immediately and keep the original error as cause
                raise exceptions.LaunchError("Could not connect to cluster via https. Is this a https endpoint?", e) from e
            time.sleep(1)
        except elasticsearch.TransportError as e:
            # 503 and 401 are retried (presumably the node is still initializing - TODO confirm); everything
            # else is unexpected and re-raised with its original traceback.
            if e.status_code not in (503, 401):
                raise
            time.sleep(1)
    return False
def receiveMsg_StartEngine(self, msg, sender):
    """
    Handles the request from race control to start the engine.

    Opens the metrics store, loads the team and then either creates a single node mechanic (externally
    provisioned cluster) or hands the message over to a ``Dispatcher`` actor.

    :param msg: The start message. Provides ``cfg``, ``open_metrics_context``, ``external`` and ``cluster_settings``.
    :param sender: The sending actor. It is remembered as race control.
    :raises exceptions.LaunchError: If no target hosts are configured.
    """
    self.logger.info("Received signal from race control to start engine.")
    self.race_control = sender
    self.cfg = msg.cfg

    store_class = metrics.metrics_store_class(self.cfg)
    self.metrics_store = store_class(self.cfg)
    self.metrics_store.open(ctx=msg.open_metrics_context)
    # NOTE(review): the value is never used; the lookup is kept because it may verify that the option is
    # set - confirm and remove if not needed.
    name = self.cfg.opts("race", "pipeline")
    self.car, _ = load_team(self.cfg, msg.external)
    self.team_revision = self.cfg.opts("mechanic", "repository.revision")

    # In our startup procedure we first create all mechanics. Only if this succeeds we'll continue.
    hosts = self.cfg.opts("client", "hosts").default
    if len(hosts) == 0:
        raise exceptions.LaunchError("No target hosts are configured.")

    if msg.external:
        self.logger.info("Cluster will not be provisioned by Rally.")
        if msg.cluster_settings:
            pretty_settings = json.dumps(msg.cluster_settings, indent=2)
            warning = ("Ensure that these settings are defined in elasticsearch.yml:\n\n{}\n\nIf they are absent, "
                       "running this track will fail or lead to unexpected results.").format(pretty_settings)
            console.warn(warning, logger=self.logger)
        # just create one actor for this special case and run it on the coordinator node (i.e. here)
        mechanic = self.createActor(NodeMechanicActor, targetActorRequirements={"coordinator": True})
        self.children.append(mechanic)
        self.send(mechanic, msg.for_nodes(ip=hosts))
    else:
        console.info("Preparing for race ...", flush=True)
        self.logger.info("Cluster consisting of %s will be provisioned by Rally.", hosts)
        msg.hosts = hosts
        # Initialize the children array to have the right size to ensure waiting for all responses
        expected_children = len(nodes_by_host(to_ip_port(hosts)))
        self.children = [None] * expected_children
        self.send(self.createActor(Dispatcher), msg)

    self.status = "starting"
    self.received_responses = []
def receiveMsg_StartEngine(self, msg, sender):
    """
    Handles the request from race control to start the engine.

    For an externally provisioned cluster the status is advanced directly to ``cluster_started``. Otherwise the
    message is handed over to a ``Dispatcher`` actor and the status becomes ``starting``.

    :param msg: The start message. Provides ``cfg``, ``open_metrics_context``, ``external`` and ``cluster_settings``.
    :param sender: The sending actor. It is remembered as race control.
    :raises exceptions.LaunchError: If no target hosts are configured.
    """
    self.logger.info("Received signal from race control to start engine.")
    self.race_control = sender
    self.cfg = msg.cfg

    store_class = metrics.metrics_store_class(self.cfg)
    self.metrics_store = store_class(self.cfg)
    self.metrics_store.open(ctx=msg.open_metrics_context)
    self.car, _ = load_team(self.cfg, msg.external)
    self.team_revision = self.cfg.opts("mechanic", "repository.revision")

    # In our startup procedure we first create all mechanics. Only if this succeeds we'll continue.
    hosts = self.cfg.opts("client", "hosts").default
    if len(hosts) == 0:
        raise exceptions.LaunchError("No target hosts are configured.")

    self.externally_provisioned = msg.external
    if not self.externally_provisioned:
        console.info("Preparing for race ...", flush=True)
        self.logger.info("Cluster consisting of %s will be provisioned by Rally.", hosts)
        msg.hosts = hosts
        # Initialize the children array to have the right size to ensure waiting for all responses
        expected_children = len(nodes_by_host(to_ip_port(hosts)))
        self.children = [None] * expected_children
        self.send(self.createActor(Dispatcher), msg)
        self.status = "starting"
        self.received_responses = []
    else:
        self.logger.info("Cluster will not be provisioned by Rally.")
        # TODO: This needs to be handled later - we should probably disallow this entirely
        if msg.cluster_settings:
            pretty_settings = json.dumps(msg.cluster_settings, indent=2)
            warning = ("Ensure that these settings are defined in elasticsearch.yml:\n\n{}\n\nIf they are absent, "
                       "running this track will fail or lead to unexpected results.").format(pretty_settings)
            console.warn(warning, logger=self.logger)
        self.status = "nodes_started"
        self.received_responses = []
        self.on_all_nodes_started()
        self.status = "cluster_started"
def receiveMessage(self, msg, sender):
    """
    Dispatches an incoming actor message based on its type.

    Any exception that is raised while handling the message is logged and reported as a ``Failure``.

    :param msg: The received message.
    :param sender: The actor that has sent the message.
    """
    try:
        logger.debug("MechanicActor#receiveMessage(msg = [%s] sender = [%s])" % (str(type(msg)), str(sender)))
        if isinstance(msg, StartEngine):
            self.on_start_engine(msg, sender)
        elif isinstance(msg, NodesStarted):
            self.metrics_store.merge_meta_info(msg.system_meta_info)
            self.transition_when_all_children_responded(sender, msg, "starting", "nodes_started",
                                                        self.on_all_nodes_started)
        elif isinstance(msg, MetricsMetaInfoApplied):
            self.transition_when_all_children_responded(sender, msg, "apply_meta_info", "cluster_started",
                                                        self.on_cluster_started)
        elif isinstance(msg, OnBenchmarkStart):
            self.metrics_store.lap = msg.lap
            self.cluster.on_benchmark_start()
            # in the first lap, we are in state "cluster_started", after that in "benchmark_stopped"
            self.send_to_children_and_transition(sender, msg, ["cluster_started", "benchmark_stopped"],
                                                 "benchmark_starting")
        elif isinstance(msg, BenchmarkStarted):
            self.transition_when_all_children_responded(sender, msg, "benchmark_starting", "benchmark_started",
                                                        self.on_benchmark_started)
        elif isinstance(msg, Failure):
            self.send(self.race_control, msg)
        elif isinstance(msg, OnBenchmarkStop):
            self.send_to_children_and_transition(sender, msg, "benchmark_started", "benchmark_stopping")
        elif isinstance(msg, BenchmarkStopped):
            self.metrics_store.bulk_add(msg.system_metrics)
            self.transition_when_all_children_responded(sender, msg, "benchmark_stopping", "benchmark_stopped",
                                                        self.on_benchmark_stopped)
        elif isinstance(msg, StopEngine):
            # detach from cluster and gather all system metrics
            self.cluster_launcher.stop(self.cluster)
            # we might have experienced a launch error, hence we need to allow to stop the cluster also after a launch
            self.send_to_children_and_transition(sender, StopNodes(), ["nodes_started", "benchmark_stopped"],
                                                 "cluster_stopping")
        elif isinstance(msg, NodesStopped):
            self.metrics_store.bulk_add(msg.system_metrics)
            self.transition_when_all_children_responded(sender, msg, "cluster_stopping", "cluster_stopped",
                                                        self.on_all_nodes_stopped)
        elif isinstance(msg, thespian.actors.ChildActorExited):
            if not self.is_current_status_expected("cluster_stopping"):
                raise exceptions.RallyError("Child actor exited with [%s] while in status [%s]." % (msg, self.status))
            logger.info("Child actor exited while engine is stopping: [%s]" % msg)
        elif isinstance(msg, thespian.actors.PoisonMessage):
            # something went wrong with a child actor
            if isinstance(msg.poisonMessage, StartEngine):
                raise exceptions.LaunchError("Could not start benchmark candidate. "
                                             "Are Rally daemons on all targeted machines running?")
            logger.error("[%s] sent to a child actor has resulted in PoisonMessage" % str(msg.poisonMessage))
            raise exceptions.RallyError("Could not communicate with benchmark candidate (unknown reason)")
    except BaseException:
        logger.exception("Cannot process message [%s]" % msg)
        # usually, we'll notify the sender but in case a child sent something that caused an exception we'd rather
        # have it bubble up to race control. Otherwise, we could play ping-pong with our child actor.
        recipient = self.race_control if sender in self.mechanics else sender
        ex_value = sys.exc_info()[1]
        # avoid "can't pickle traceback objects"
        import traceback
        self.send(recipient, Failure("Could not execute command (%s)" % ex_value, traceback.format_exc()))
def receiveMessage(self, msg, sender):
    """
    Dispatches an incoming actor message based on its type.

    Any exception that is raised while handling the message is logged and reported as a ``BenchmarkFailure``.

    :param msg: The received message.
    :param sender: The actor that has sent the message.
    """
    try:
        logger.info("MechanicActor#receiveMessage(msg = [%s] sender = [%s])" % (str(type(msg)), str(sender)))
        if isinstance(msg, StartEngine):
            self.on_start_engine(msg, sender)
        elif isinstance(msg, NodesStarted):
            self.metrics_store.merge_meta_info(msg.system_meta_info)
            self.transition_when_all_children_responded(sender, msg, "starting", "nodes_started",
                                                        self.on_all_nodes_started)
        elif isinstance(msg, MetricsMetaInfoApplied):
            self.transition_when_all_children_responded(sender, msg, "apply_meta_info", "cluster_started",
                                                        self.on_cluster_started)
        elif isinstance(msg, OnBenchmarkStart):
            self.metrics_store.lap = msg.lap
            self.cluster.on_benchmark_start()
            # in the first lap, we are in state "cluster_started", after that in "benchmark_stopped"
            self.send_to_children_and_transition(sender, msg, ["cluster_started", "benchmark_stopped"],
                                                 "benchmark_starting")
        elif isinstance(msg, BenchmarkStarted):
            self.transition_when_all_children_responded(sender, msg, "benchmark_starting", "benchmark_started",
                                                        self.on_benchmark_started)
        elif isinstance(msg, ResetRelativeTime):
            if msg.reset_in_seconds > 0:
                # the actual reset happens when the wakeup message arrives
                self.wakeupAfter(msg.reset_in_seconds)
            else:
                self.reset_relative_time()
        elif isinstance(msg, thespian.actors.WakeupMessage):
            self.reset_relative_time()
        elif isinstance(msg, actor.BenchmarkFailure):
            self.send(self.race_control, msg)
        elif isinstance(msg, OnBenchmarkStop):
            self.send_to_children_and_transition(sender, msg, "benchmark_started", "benchmark_stopping")
        elif isinstance(msg, BenchmarkStopped):
            self.metrics_store.bulk_add(msg.system_metrics)
            self.transition_when_all_children_responded(sender, msg, "benchmark_stopping", "benchmark_stopped",
                                                        self.on_benchmark_stopped)
        elif isinstance(msg, StopEngine):
            # detach from cluster and gather all system metrics
            self.cluster_launcher.stop(self.cluster)
            # we might have experienced a launch error or the user has cancelled the benchmark. Hence we need to
            # allow to stop the cluster from various states and we don't check here for a specific one.
            self.send_to_children_and_transition(sender, StopNodes(), [], "cluster_stopping")
        elif isinstance(msg, NodesStopped):
            self.metrics_store.bulk_add(msg.system_metrics)
            self.transition_when_all_children_responded(sender, msg, "cluster_stopping", "cluster_stopped",
                                                        self.on_all_nodes_stopped)
        elif isinstance(msg, thespian.actors.ActorExitRequest):
            # due to early termination by race control. If it's self-initiated we already took care of the rest.
            if sender != self.myAddress:
                self.send_to_children_and_transition(self.myAddress, msg, expected_status=None,
                                                     new_status="cluster_stopping")
        elif isinstance(msg, thespian.actors.ChildActorExited):
            if not self.is_current_status_expected("cluster_stopping"):
                raise exceptions.RallyError("Child actor exited with [%s] while in status [%s]." % (msg, self.status))
            logger.info("Child actor exited while engine is stopping: [%s]" % msg)
        elif isinstance(msg, thespian.actors.PoisonMessage):
            # something went wrong with a child actor
            if isinstance(msg.poisonMessage, StartEngine):
                raise exceptions.LaunchError("Could not start benchmark candidate. "
                                             "Are Rally daemons on all targeted machines running?")
            logger.error("[%s] sent to a child actor has resulted in PoisonMessage" % str(msg.poisonMessage))
            raise exceptions.RallyError("Could not communicate with benchmark candidate (unknown reason)")
    except BaseException:
        # usually, we'll notify the sender but in case a child sent something that caused an exception we'd rather
        # have it bubble up to race control. Otherwise, we could play ping-pong with our child actor.
        recipient = self.race_control if sender in self.children else sender
        logger.exception("Cannot process message [%s]. Notifying [%s]." % (msg, recipient))
        ex_value = sys.exc_info()[1]
        # avoid "can't pickle traceback objects"
        import traceback
        self.send(recipient, actor.BenchmarkFailure("Could not execute command (%s)" % ex_value, traceback.format_exc()))
def receiveMessage(self, msg, sender):
    """
    Dispatches a single message that has been delivered to this actor.

    The message type decides what happens: the message is either handed to the children via
    ``send_to_children_and_transition``, counted as a child's response via
    ``transition_when_all_children_responded``, or forwarded to ``self.race_control``. Unknown message
    types are logged and ignored.

    Nothing propagates out of this method: any exception raised while processing the message is logged
    and reported to ``self.race_control`` as ``actor.BenchmarkFailure``.

    :param msg: The received message.
    :param sender: Address of the actor that sent ``msg``.
    """
    try:
        # Log arguments are passed lazily on purpose: the logging framework formats them itself, so a message
        # that cannot be rendered can never prevent us from processing it (or from reporting a failure below).
        logger.info("MechanicActor#receiveMessage(msg = [%s] sender = [%s])", type(msg), sender)
        if isinstance(msg, StartEngine):
            self.on_start_engine(msg, sender)
        elif isinstance(msg, NodesStarted):
            self.metrics_store.merge_meta_info(msg.system_meta_info)
            self.transition_when_all_children_responded(
                sender, msg, "starting", "nodes_started", self.on_all_nodes_started)
        elif isinstance(msg, MetricsMetaInfoApplied):
            self.transition_when_all_children_responded(
                sender, msg, "apply_meta_info", "cluster_started", self.on_cluster_started)
        elif isinstance(msg, OnBenchmarkStart):
            self.metrics_store.lap = msg.lap
            # in the first lap, we are in state "cluster_started", after that in "benchmark_stopped"
            self.send_to_children_and_transition(
                sender, msg, ["cluster_started", "benchmark_stopped"], "benchmark_starting")
        elif isinstance(msg, BenchmarkStarted):
            self.transition_when_all_children_responded(
                sender, msg, "benchmark_starting", "benchmark_started", self.on_benchmark_started)
        elif isinstance(msg, ResetRelativeTime):
            # a positive delay is handled asynchronously: we get a WakeupMessage (see next branch) once it is over
            if msg.reset_in_seconds > 0:
                self.wakeupAfter(msg.reset_in_seconds)
            else:
                self.reset_relative_time()
        elif isinstance(msg, thespian.actors.WakeupMessage):
            self.reset_relative_time()
        elif isinstance(msg, actor.BenchmarkFailure):
            self.send(self.race_control, msg)
        elif isinstance(msg, OnBenchmarkStop):
            self.send_to_children_and_transition(sender, msg, "benchmark_started", "benchmark_stopping")
        elif isinstance(msg, BenchmarkStopped):
            self.metrics_store.bulk_add(msg.system_metrics)
            self.transition_when_all_children_responded(
                sender, msg, "benchmark_stopping", "benchmark_stopped", self.on_benchmark_stopped)
        elif isinstance(msg, StopEngine):
            # detach from cluster and gather all system metrics
            self.cluster_launcher.stop(self.cluster)
            # we might have experienced a launch error or the user has cancelled the benchmark. Hence we need to
            # allow to stop the cluster from various states and we don't check here for a specific one.
            self.send_to_children_and_transition(sender, StopNodes(), [], "cluster_stopping")
        elif isinstance(msg, NodesStopped):
            self.metrics_store.bulk_add(msg.system_metrics)
            self.transition_when_all_children_responded(
                sender, msg, "cluster_stopping", "cluster_stopped", self.on_all_nodes_stopped)
        elif isinstance(msg, thespian.actors.ActorExitRequest):
            # due to early termination by race control. If it's self-initiated we already took care of the rest.
            if sender != self.myAddress:
                self.send_to_children_and_transition(
                    self.myAddress, msg, expected_status=None, new_status="cluster_stopping")
        elif isinstance(msg, thespian.actors.ChildActorExited):
            if self.is_current_status_expected(["cluster_stopping", "cluster_stopped"]):
                logger.info("Child actor exited while engine is stopping or stopped: [%s]", msg)
            else:
                raise exceptions.RallyError(
                    "Child actor exited with [%s] while in status [%s]." % (msg, self.status))
        elif isinstance(msg, thespian.actors.PoisonMessage):
            # something went wrong with a child actor
            if isinstance(msg.poisonMessage, StartEngine):
                raise exceptions.LaunchError(
                    "Could not start benchmark candidate. Are Rally daemons on all targeted machines running?")
            logger.error("[%s] sent to a child actor has resulted in PoisonMessage", msg.poisonMessage)
            raise exceptions.RallyError("Could not communicate with benchmark candidate (unknown reason)")
        else:
            logger.info("MechanicActor received unknown message [%s] (ignoring).", msg)
    except BaseException as e:
        # Deliberately broad: whatever went wrong, race control has to hear about it.
        logger.exception("Cannot process message")
        logger.error("Failed message details: [%s]. Notifying [%s].", msg, self.race_control)
        self.send(self.race_control, actor.BenchmarkFailure("Error in Elasticsearch cluster coordinator", e))
def receiveMessage(self, msg, sender):
    """
    Dispatches a single message that has been delivered to this actor.

    * ``StartEngine``: remembers the sender as race control, creates one worker actor per target (a single
      one if the target is not provisioned by Rally) and then sends each worker its start message.
    * ``OnBenchmarkStart``, ``OnBenchmarkStop``, ``StopEngine``: sent to every actor in ``self.mechanics``.
    * ``EngineStarted``, ``Success``, ``Failure``, ``BenchmarkStopped``: forwarded to ``self.race_control``.
    * ``EngineStopped``: forwarded to race control; afterwards all workers and this actor are asked to exit.

    Nothing propagates out of this method: any exception raised while processing the message is logged
    and reported as ``Failure`` (see the exception handler for the choice of recipient).

    :param msg: The received message.
    :param sender: Address of the actor that sent ``msg``.
    """
    try:
        # Log arguments are passed lazily on purpose: the logging framework formats them itself, so a message
        # that cannot be rendered can never prevent us from processing it (or from reporting a failure below).
        logger.debug("MechanicActor#receiveMessage(msg = [%s] sender = [%s])", type(msg), sender)
        if isinstance(msg, StartEngine):
            logger.info("Received signal from race control to start engine.")
            self.race_control = sender
            # In our startup procedure we first create all mechanics. Only if this succeeds for every target,
            # the start messages are sent (see the loop at the end of this branch).
            mechanics_and_start_message = []

            if msg.external:
                logger.info("Target node(s) will not be provisioned by Rally.")
                # just create one actor for this special case and run it on the coordinator node (i.e. here)
                m = self.createActor(LocalNodeMechanicActor,
                                     globalName="/rally/mechanic/worker/external",
                                     targetActorRequirements={"coordinator": True})
                self.mechanics.append(m)
                # we can use the original message in this case
                mechanics_and_start_message.append((m, msg))
            else:
                hosts = msg.cfg.opts("client", "hosts")
                logger.info("Target node(s) %s will be provisioned by Rally.", hosts)
                # truthiness also covers a missing (None) host list, not only an empty one
                if not hosts:
                    raise exceptions.LaunchError("No target hosts are configured.")
                for host in hosts:
                    ip = host["host"]
                    port = int(host["port"])
                    # user may specify "localhost" on the command line but the problem is that we auto-register
                    # the actor system with "ip": "127.0.0.1" so we convert this special case automatically. In
                    # all other cases the user needs to start the actor system on the other host and is aware
                    # that the parameter for the actor system and the --target-hosts parameter need to match.
                    if ip in ("localhost", "127.0.0.1"):
                        m = self.createActor(LocalNodeMechanicActor,
                                             globalName="/rally/mechanic/worker/localhost",
                                             targetActorRequirements={"coordinator": True})
                        self.mechanics.append(m)
                        mechanics_and_start_message.append((m, msg.with_port(port)))
                    else:
                        if msg.cfg.opts("system", "remote.benchmarking.supported"):
                            logger.info("Benchmarking against %s with external Rally daemon.", hosts)
                        else:
                            logger.error(
                                "User tried to benchmark against %s but no external Rally daemon has been started.",
                                hosts)
                            raise exceptions.SystemSetupError(
                                "To benchmark remote hosts (e.g. %s) you need to start the Rally daemon "
                                "on each machine including this one." % ip)
                        already_running = actor.actor_system_already_running(ip=ip)
                        logger.info("Actor system on [%s] already running? [%s]", ip, already_running)
                        # The progress output is only shown if we actually had to wait; the polling loop itself
                        # always re-checks the remote actor system.
                        if not already_running:
                            console.println("Waiting for Rally daemon on [%s] " % ip, end="", flush=True)
                        while not actor.actor_system_already_running(ip=ip):
                            console.println(".", end="", flush=True)
                            time.sleep(3)
                        if not already_running:
                            console.println(" [OK]")
                        m = self.createActor(RemoteNodeMechanicActor,
                                             globalName="/rally/mechanic/worker/%s" % ip,
                                             targetActorRequirements={"ip": ip})
                        mechanics_and_start_message.append((m, msg.with_port(port)))
                        self.mechanics.append(m)
            for mechanic_actor, start_message in mechanics_and_start_message:
                self.send(mechanic_actor, start_message)
        elif isinstance(msg, EngineStarted):
            self.send(self.race_control, msg)
        elif isinstance(msg, OnBenchmarkStart):
            for m in self.mechanics:
                self.send(m, msg)
        elif isinstance(msg, Success):
            self.send(self.race_control, msg)
        elif isinstance(msg, Failure):
            self.send(self.race_control, msg)
        elif isinstance(msg, OnBenchmarkStop):
            for m in self.mechanics:
                self.send(m, msg)
        elif isinstance(msg, BenchmarkStopped):
            # TODO dm: Actually we need to wait for all BenchmarkStopped messages from all our mechanic actors
            # TODO dm: We will actually duplicate cluster level metrics if each of our mechanic actors gathers these...
            self.send(self.race_control, msg)
        elif isinstance(msg, StopEngine):
            for m in self.mechanics:
                self.send(m, msg)
        elif isinstance(msg, EngineStopped):
            self.send(self.race_control, msg)
            # clear all state as the mechanic might get reused later
            for m in self.mechanics:
                self.send(m, thespian.actors.ActorExitRequest())
            self.mechanics = []
            # self terminate + slave nodes
            self.send(self.myAddress, thespian.actors.ActorExitRequest())
        elif isinstance(msg, thespian.actors.ChildActorExited):
            # TODO dm: Depending on our state model this can be fine (e.g. when it exited due to our
            # ActorExitRequest message) or it could be problematic and mean that an exception has occurred.
            pass
        elif isinstance(msg, thespian.actors.PoisonMessage):
            # something went wrong with a child actor
            if isinstance(msg.poisonMessage, StartEngine):
                raise exceptions.LaunchError(
                    "Could not start benchmark candidate. Are Rally daemons on all targeted machines running?")
            logger.error("[%s] sent to a child actor has resulted in PoisonMessage", msg.poisonMessage)
            raise exceptions.RallyError("Could not communicate with benchmark candidate (unknown reason)")
    except BaseException as e:
        # Deliberately broad: whatever went wrong, somebody has to hear about it.
        logger.exception("Cannot process message [%s]", msg)
        # usually, we'll notify the sender but in case a child sent something that caused an exception we'd rather
        # have it bubble up to race control. Otherwise, we could play ping-pong with our child actor.
        recipient = self.race_control if sender in self.mechanics else sender
        # avoid "can't pickle traceback objects": only the formatted traceback text is sent
        import traceback
        self.send(recipient, Failure("Could not execute command (%s)" % e, traceback.format_exc()))