Example 1
 def _start_process(self, env, node_name, binary_path):
     if os.geteuid() == 0:
         raise exceptions.LaunchError("Cannot launch Elasticsearch as root. Please run Rally as a non-root user.")
     os.chdir(binary_path)
     startup_event = threading.Event()
     cmd = ["bin/elasticsearch"]
     process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, stdin=subprocess.DEVNULL, env=env)
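     # read the node's output on a daemon thread; _read_output is expected to set startup_event once startup has been detected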
     t = threading.Thread(target=self._read_output, args=(node_name, process, startup_event))
     t.setDaemon(True)
     t.start()
     if startup_event.wait(timeout=InProcessLauncher.PROCESS_WAIT_TIMEOUT_SECONDS):
         process.poll()
         # has the process terminated?
         if process.returncode:
             msg = "Node [%s] has terminated with exit code [%s]." % (node_name, str(process.returncode))
             self.logger.error(msg)
             raise exceptions.LaunchError(msg)
         else:
             self.logger.info("Started node [%s] with PID [%s]", node_name, process.pid)
             return process
     else:
         msg = "Could not start node [%s] within timeout period of [%s] seconds." % (
             node_name, InProcessLauncher.PROCESS_WAIT_TIMEOUT_SECONDS)
         # check if the process has terminated already
         process.poll()
         if process.returncode:
             msg += " The process has already terminated with exit code [%s]." % str(process.returncode)
         else:
             msg += " The process seems to be still running with PID [%s]." % process.pid
         self.logger.error(msg)
         raise exceptions.LaunchError(msg)
Example 2
 def _start_process(self, cmd, env, node_name):
     if os.geteuid() == 0:
         raise exceptions.LaunchError(
             "Cannot launch Elasticsearch as root. Please run Rally as a non-root user."
         )
     install_dir = self.cfg.opts("provisioning", "local.binary.path")
     os.chdir(install_dir)
     startup_event = threading.Event()
     process = subprocess.Popen(cmd,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.STDOUT,
                                stdin=subprocess.DEVNULL,
                                env=env)
     t = threading.Thread(target=self._read_output,
                          args=(node_name, process, startup_event))
     t.setDaemon(True)
     t.start()
     if startup_event.wait(
             timeout=InProcessLauncher.PROCESS_WAIT_TIMEOUT_SECONDS):
         logger.info("Started node=%s with pid=%s" %
                     (node_name, process.pid))
         return process
     else:
         log_dir = self.cfg.opts("system", "log.dir")
         msg = "Could not start node '%s' within timeout period of %s seconds." % (
             node_name, InProcessLauncher.PROCESS_WAIT_TIMEOUT_SECONDS)
         logger.error(msg)
         raise exceptions.LaunchError(
             "%s Please check the logs in '%s' for more details." %
             (msg, log_dir))
Example 3
def _start(process, node_name):
    log = logging.getLogger(__name__)
    startup_event = threading.Event()
    watcher = StartupWatcher(node_name, process, startup_event)
    t = threading.Thread(target=watcher.watch)
    t.setDaemon(True)
    t.start()
    if startup_event.wait(timeout=InProcessLauncher.PROCESS_WAIT_TIMEOUT_SECONDS):
        process.poll()
        # has the process terminated?
        if process.returncode:
            msg = "Node [%s] has terminated with exit code [%s]." % (node_name, str(process.returncode))
            log.error(msg)
            raise exceptions.LaunchError(msg)
        else:
            log.info("Started node [%s] with PID [%s].", node_name, process.pid)
            return process
    else:
        msg = "Could not start node [%s] within timeout period of [%s] seconds." % (
            node_name, InProcessLauncher.PROCESS_WAIT_TIMEOUT_SECONDS)
        # check if the process has terminated already
        process.poll()
        if process.returncode:
            msg += " The process has already terminated with exit code [%s]." % str(process.returncode)
        else:
            msg += " The process seems to be still running with PID [%s]." % process.pid
        log.error(msg)
        raise exceptions.LaunchError(msg)
Example 4
    def _start_process(binary_path, env):
        if os.name == "posix" and os.geteuid() == 0:
            raise exceptions.LaunchError("Cannot launch Elasticsearch as root. Please run Rally as a non-root user.")
        os.chdir(binary_path)
        cmd = [io.escape_path(os.path.join(".", "bin", "elasticsearch"))]
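        # "-d" runs Elasticsearch as a daemon and "-p pid" makes it write its process id to a file named "pid"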
        cmd.extend(["-d", "-p", "pid"])
        ret = ProcessLauncher._run_subprocess(command_line=" ".join(cmd), env=env)
        if ret != 0:
            msg = "Daemon startup failed with exit code [{}]".format(ret)
            logging.error(msg)
            raise exceptions.LaunchError(msg)

        return wait_for_pidfile(io.escape_path(os.path.join(".", "pid")))
Example 5
    def _start_process(binary_path, env):
        if os.geteuid() == 0:
            raise exceptions.LaunchError("Cannot launch Elasticsearch as root. Please run Rally as a non-root user.")
        os.chdir(binary_path)
        cmd = ["bin/elasticsearch"]
        cmd.extend(["-d", "-p", "pid"])
        ret = process.run_subprocess_with_logging(command_line=" ".join(cmd), env=env)
        if ret != 0:
            msg = "Daemon startup failed with exit code[{}]".format(ret)
            logging.error(msg)
            raise exceptions.LaunchError(msg)

        return wait_for_pidfile("./pid")
Example 6
    def start(self, car, binary, data_paths):
        self.binary_path = binary

        hosts = self.cfg.opts("client", "hosts")
        client_options = self.cfg.opts("client", "options")
        es = self.client_factory(hosts, client_options).create()

        # Cannot enable custom telemetry devices here
        t = telemetry.Telemetry(devices=[
            # Be aware that some of the meta-data is taken from the host system, not the container (e.g. the number of CPU cores), so if the
            # Docker container constrains these, the metrics are actually wrong.
            telemetry.EnvironmentInfo(es, self.metrics_store),
            telemetry.NodeStats(es, self.metrics_store),
            telemetry.IndexStats(es, self.metrics_store),
            telemetry.DiskIo(self.metrics_store),
            telemetry.CpuUsage(self.metrics_store)
        ])

        c = cluster.Cluster(hosts, [], t)
        self._start_process(cmd="docker-compose -f %s up" % self.binary_path, node_name="rally0")
        logger.info("Docker container has successfully started. Checking if REST API is available.")
        if wait_for_rest_layer(es):
            logger.info("REST API is available. Attaching telemetry devices to cluster.")
            t.attach_to_cluster(c)
            logger.info("Telemetry devices are now attached to the cluster.")
        else:
            logger.error("REST API layer is not yet available. Forcefully terminating cluster.")
            self.stop(c)
            raise exceptions.LaunchError("Elasticsearch REST API layer is not available. Forcefully terminated cluster.")
        return c
Example 7
    def start(self, car, binary, data_paths):
        hosts = self.cfg.opts("client", "hosts")
        client_options = self.cfg.opts("client", "options")
        es = client.EsClientFactory(hosts, client_options).create()

        # we're very specific which nodes we kill as there is potentially also an Elasticsearch based metrics store running on this machine
        node_prefix = self.cfg.opts("provisioning", "node.name.prefix")
        process.kill_running_es_instances(node_prefix)

        logger.info("Starting a cluster based on car [%s] with [%d] nodes." % (car, car.nodes))

        # TODO dm: Get rid of these...
        enabled_devices = self.cfg.opts("mechanic", "telemetry.devices")

        cluster_telemetry = [
            # TODO dm: Once we do distributed launching, this needs to be done per node not per cluster
            telemetry.MergeParts(self.metrics_store, self.node_log_dir),
            telemetry.EnvironmentInfo(es, self.metrics_store),
            telemetry.NodeStats(es, self.metrics_store),
            telemetry.IndexStats(es, self.metrics_store),
            # TODO dm: Once we do distributed launching, this needs to be done per node not per cluster
            telemetry.IndexSize(data_paths, self.metrics_store)
        ]
        t = telemetry.Telemetry(enabled_devices, devices=cluster_telemetry)
        c = cluster.Cluster(hosts, [self._start_node(node, car, es, binary) for node in range(car.nodes)], t)
        logger.info("All cluster nodes have successfully started. Checking if REST API is available.")
        if wait_for_rest_layer(es):
            logger.info("REST API is available. Attaching telemetry devices to cluster.")
            t.attach_to_cluster(c)
            logger.info("Telemetry devices are now attached to the cluster.")
        else:
            logger.error("REST API layer is not yet available. Forcefully terminating cluster.")
            self.stop(c)
            raise exceptions.LaunchError("Elasticsearch REST API layer is not available. Forcefully terminated cluster.")
        return c
Example 8
    def receiveMsg_StartEngine(self, msg, sender):
        self.logger.info("Received signal from race control to start engine.")
        self.race_control = sender
        self.cfg = msg.cfg
        cls = metrics.metrics_store_class(self.cfg)
        self.metrics_store = cls(self.cfg)
        self.metrics_store.open(ctx=msg.open_metrics_context)
        self.car, _ = load_team(self.cfg, msg.external)

        # In our startup procedure we first create all mechanics. Only if this succeeds we'll continue.
        hosts = self.cfg.opts("client", "hosts").default
        if len(hosts) == 0:
            raise exceptions.LaunchError("No target hosts are configured.")

        if msg.external:
            self.logger.info("Cluster will not be provisioned by Rally.")
            # just create one actor for this special case and run it on the coordinator node (i.e. here)
            m = self.createActor(NodeMechanicActor,
                                 targetActorRequirements={"coordinator": True})
            self.children.append(m)
            self.send(m, msg.for_nodes(ip=hosts))
        else:
            self.logger.info(
                "Cluster consisting of %s will be provisioned by Rally.",
                hosts)
            msg.hosts = hosts
            # Initialize the children array to have the right size to
            # ensure waiting for all responses
            self.children = [None] * len(nodes_by_host(to_ip_port(hosts)))
            self.send(self.createActor(Dispatcher), msg)
        self.status = "starting"
        self.received_responses = []
Example 9
 def _do_wait(self, expected_cluster_status):
     reached_cluster_status = None
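     # poll the cluster health API up to 10 times; connection problems are treated as transient and retried on the next attempt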
     for attempt in range(10):
         try:
             result = self.client.cluster.health(
                 wait_for_status=expected_cluster_status,
                 wait_for_relocating_shards=0,
                 timeout="3s")
         except (socket.timeout, elasticsearch.exceptions.ConnectionError,
                 elasticsearch.exceptions.TransportError):
             pass
         else:
             reached_cluster_status = result["status"]
             relocating_shards = result["relocating_shards"]
             logger.info("GOT: %s" % str(result))
             logger.info("ALLOC:\n%s" % self.client.cat.allocation(v=True))
             logger.info("RECOVERY:\n%s" % self.client.cat.recovery(v=True))
             logger.info("SHARDS:\n%s" % self.client.cat.shards(v=True))
             if reached_cluster_status == expected_cluster_status and relocating_shards == 0:
                 return reached_cluster_status, relocating_shards
             else:
                 time.sleep(0.5)
     msg = "Cluster did not reach status [%s]. Last reached status: [%s]" % (
         expected_cluster_status, reached_cluster_status)
     logger.error(msg)
     raise exceptions.LaunchError(msg)
Example 10
    def start(self):
        hosts = self.cfg.opts("client", "hosts")
        client_options = self.cfg.opts("client", "options")
        es = self.client_factory(hosts, client_options).create()

        t = telemetry.Telemetry(devices=[
            telemetry.ClusterMetaDataInfo(es),
            telemetry.ClusterEnvironmentInfo(es, self.metrics_store),
            telemetry.NodeStats(es, self.metrics_store),
            telemetry.IndexStats(es, self.metrics_store)
        ])

        # The list of nodes will be populated by ClusterMetaDataInfo, so no need to do it here
        c = cluster.Cluster(hosts, [], t)
        logger.info(
            "All cluster nodes have successfully started. Checking if REST API is available."
        )
        if wait_for_rest_layer(es, max_attempts=20):
            logger.info(
                "REST API is available. Attaching telemetry devices to cluster."
            )
            t.attach_to_cluster(c)
            logger.info("Telemetry devices are now attached to the cluster.")
        else:
            # Just stop the cluster here and raise. The caller is responsible for terminating individual nodes.
            logger.error(
                "REST API layer is not yet available. Forcefully terminating cluster."
            )
            self.stop(c)
            raise exceptions.LaunchError(
                "Elasticsearch REST API layer is not available. Forcefully terminated cluster."
            )

        return c
Example 11
    def receiveMsg_StartEngine(self, msg, sender):
        self.logger.info("Received signal from race control to start engine.")
        self.race_control = sender
        self.cfg = msg.cfg
        self.car, _ = load_team(self.cfg, msg.external)
        # TODO: This is implicitly set by #load_team() - can we gather this elsewhere?
        self.team_revision = self.cfg.opts("mechanic", "repository.revision")

        # In our startup procedure we first create all mechanics. Only if this succeeds we'll continue.
        hosts = self.cfg.opts("client", "hosts").default
        if len(hosts) == 0:
            raise exceptions.LaunchError("No target hosts are configured.")

        self.externally_provisioned = msg.external
        if self.externally_provisioned:
            self.logger.info("Cluster will not be provisioned by Rally.")
            self.status = "nodes_started"
            self.received_responses = []
            self.on_all_nodes_started()
            self.status = "cluster_started"
        else:
            console.info("Preparing for race ...", flush=True)
            self.logger.info("Cluster consisting of %s will be provisioned by Rally.", hosts)
            msg.hosts = hosts
            # Initialize the children array to have the right size to
            # ensure waiting for all responses
            self.children = [None] * len(nodes_by_host(to_ip_port(hosts)))
            self.send(self.createActor(Dispatcher), msg)
            self.status = "starting"
            self.received_responses = []
Example 12
    def on_start_engine(self, msg, sender):
        logger.info("Received signal from race control to start engine.")
        self.race_control = sender
        self.cfg = msg.cfg
        cls = metrics.metrics_store_class(self.cfg)
        self.metrics_store = cls(self.cfg)
        self.metrics_store.open(ctx=msg.open_metrics_context)

        # In our startup procedure we first create all mechanics. Only if this succeeds we'll continue.
        mechanics_and_start_message = []
        hosts = self.cfg.opts("client", "hosts")
        if len(hosts) == 0:
            raise exceptions.LaunchError("No target hosts are configured.")

        if msg.external:
            logger.info("Cluster will not be provisioned by Rally.")
            # just create one actor for this special case and run it on the coordinator node (i.e. here)
            m = self.createActor(NodeMechanicActor,
                                 #globalName="/rally/mechanic/worker/external",
                                 targetActorRequirements={"coordinator": True})
            self.children.append(m)
            mechanics_and_start_message.append((m, msg.for_nodes(ip=hosts)))
        else:
            logger.info("Cluster consisting of %s will be provisioned by Rally." % hosts)
            all_ips_and_ports = to_ip_port(hosts)
            all_node_ips = extract_all_node_ips(all_ips_and_ports)
            for ip_port, nodes in nodes_by_host(all_ips_and_ports).items():
                ip, port = ip_port
                if ip == "127.0.0.1":
                    m = self.createActor(NodeMechanicActor,
                                         #globalName="/rally/mechanic/worker/localhost",
                                         targetActorRequirements={"coordinator": True})
                    self.children.append(m)
                    mechanics_and_start_message.append((m, msg.for_nodes(all_node_ips, ip, port, nodes)))
                else:
                    if self.cfg.opts("system", "remote.benchmarking.supported"):
                        logger.info("Benchmarking against %s with external Rally daemon." % hosts)
                    else:
                        logger.error("User tried to benchmark against %s but no external Rally daemon has been started." % hosts)
                        raise exceptions.SystemSetupError("To benchmark remote hosts (e.g. %s) you need to start the Rally daemon "
                                                          "on each machine including this one." % ip)
                    already_running = actor.actor_system_already_running(ip=ip)
                    logger.info("Actor system on [%s] already running? [%s]" % (ip, str(already_running)))
                    if not already_running:
                        console.println("Waiting for Rally daemon on [%s] " % ip, end="", flush=True)
                    while not actor.actor_system_already_running(ip=ip):
                        console.println(".", end="", flush=True)
                        time.sleep(3)
                    if not already_running:
                        console.println(" [OK]")
                    m = self.createActor(NodeMechanicActor,
                                         #globalName="/rally/mechanic/worker/%s" % ip,
                                         targetActorRequirements={"ip": ip})
                    mechanics_and_start_message.append((m, msg.for_nodes(all_node_ips, ip, port, nodes)))
                    self.children.append(m)
        self.status = "starting"
        self.received_responses = []
        for mechanic_actor, start_message in mechanics_and_start_message:
            self.send(mechanic_actor, start_message)
Example 13
 def _start_process(self, cmd, env, node_name, binary_path):
     if os.geteuid() == 0:
         raise exceptions.LaunchError("Cannot launch Elasticsearch as root. Please run Rally as a non-root user.")
     os.chdir(binary_path)
     startup_event = threading.Event()
     process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, stdin=subprocess.DEVNULL, env=env)
     t = threading.Thread(target=self._read_output, args=(node_name, process, startup_event))
     t.setDaemon(True)
     t.start()
     if startup_event.wait(timeout=InProcessLauncher.PROCESS_WAIT_TIMEOUT_SECONDS):
         logger.info("Started node [%s] with PID [%s]" % (node_name, process.pid))
         return process
     else:
         msg = "Could not start node [%s] within timeout period of [%s] seconds." % (
             node_name, InProcessLauncher.PROCESS_WAIT_TIMEOUT_SECONDS)
         logger.error(msg)
         raise exceptions.LaunchError(msg)
Example 14
 def cmd_line_opt(self, distribution_version, key):
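     # pick the set of ES command line options that best matches the target distribution version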
     best_version = versions.best_match(InProcessLauncher.ES_CMD_LINE_OPTS_PER_VERSION.keys(), distribution_version)
     if best_version:
         return InProcessLauncher.ES_CMD_LINE_OPTS_PER_VERSION[best_version][key]
     else:
         raise exceptions.LaunchError("Cannot start cluster. Unsupported distribution version %s. "
                                      "Please raise a bug at %s." %
                                      (distribution_version, console.format.link("https://github.com/elastic/rally")))
Example 15
    def start(self):
        """
        Performs final startup tasks.

        Precondition: All cluster nodes have been started.
        Postcondition: The cluster is ready to receive HTTP requests or a ``LaunchError`` is raised.

        :return: A representation of the launched cluster.
        """
        enabled_devices = self.cfg.opts("mechanic", "telemetry.devices")
        telemetry_params = self.cfg.opts("mechanic", "telemetry.params")
        all_hosts = self.cfg.opts("client", "hosts").all_hosts
        default_hosts = self.cfg.opts("client", "hosts").default
        preserve = self.cfg.opts("mechanic", "preserve.install")
        skip_rest_api_check = self.cfg.opts("mechanic", "skip.rest.api.check")

        es = {}
        for cluster_name, cluster_hosts in all_hosts.items():
            all_client_options = self.cfg.opts("client", "options").all_client_options
            cluster_client_options = dict(all_client_options[cluster_name])
            # Use retries to avoid aborts on long-lived connections for telemetry devices
            cluster_client_options["retry-on-timeout"] = True
            es[cluster_name] = self.client_factory(cluster_hosts, cluster_client_options).create()

        es_default = es["default"]

        t = telemetry.Telemetry(enabled_devices, devices=[
            telemetry.NodeStats(telemetry_params, es, self.metrics_store),
            telemetry.ClusterMetaDataInfo(es_default),
            telemetry.ClusterEnvironmentInfo(es_default, self.metrics_store),
            telemetry.JvmStatsSummary(es_default, self.metrics_store),
            telemetry.IndexStats(es_default, self.metrics_store),
            telemetry.MlBucketProcessingTime(es_default, self.metrics_store),
            telemetry.CcrStats(telemetry_params, es, self.metrics_store),
            telemetry.RecoveryStats(telemetry_params, es, self.metrics_store)
        ])

        # The list of nodes will be populated by ClusterMetaDataInfo, so no need to do it here
        c = cluster.Cluster(default_hosts, [], t, preserve)

        if skip_rest_api_check:
            self.logger.info("Skipping REST API check and attaching telemetry devices to cluster.")
            t.attach_to_cluster(c)
            self.logger.info("Telemetry devices are now attached to the cluster.")
        else:
            self.logger.info("All cluster nodes have successfully started. Checking if REST API is available.")
            if wait_for_rest_layer(es_default, max_attempts=40):
                self.logger.info("REST API is available. Attaching telemetry devices to cluster.")
                t.attach_to_cluster(c)
                self.logger.info("Telemetry devices are now attached to the cluster.")
            else:
                # Just stop the cluster here and raise. The caller is responsible for terminating individual nodes.
                self.logger.error("REST API layer is not yet available. Forcefully terminating cluster.")
                self.stop(c)
                raise exceptions.LaunchError(
                    "Elasticsearch REST API layer is not available. Forcefully terminated cluster.")
        return c
Example 16
    def _start_process(self, binary_path):
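        # "up -d" starts the services defined in the compose file in detached mode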
        compose_cmd = _get_docker_compose_cmd(binary_path, "up -d")

        ret = process.run_subprocess_with_logging(compose_cmd)
        if ret != 0:
            msg = "Docker daemon startup failed with exit code[{}]".format(ret)
            logging.error(msg)
            raise exceptions.LaunchError(msg)

        container_id = _get_container_id(binary_path)
        _wait_for_healthy_running_container(container_id)
Example 17
def _wait_for_healthy_running_container(container_id, timeout=60):
    cmd = 'docker ps -a --filter "id={}" --filter "status=running" --filter "health=healthy" -q'.format(container_id)
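    # -q prints only the container id, so any non-empty output means the container is both running and healthy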
    endtime = _time() + timeout
    while _time() < endtime:
        output = subprocess.check_output(shlex.split(cmd))
        containers = output.decode("utf-8").rstrip()
        if len(containers) > 0:
            return
        time.sleep(0.5)
    msg = "No healthy running container after {} seconds!".format(timeout)
    logging.error(msg)
    raise exceptions.LaunchError(msg)
Example 18
def wait_for_pidfile(pidfilename, timeout=60):
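    # poll for the PID file which the daemonized Elasticsearch process writes on startup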
    endtime = _time() + timeout
    while _time() < endtime:
        try:
            with open(pidfilename, "rb") as f:
                return int(f.read())
        except FileNotFoundError:
            time.sleep(0.5)

    msg = "pid file not available after {} seconds!".format(timeout)
    logging.error(msg)
    raise exceptions.LaunchError(msg)
Example 19
 def _wait_for_healthy_running_container(self, container_id, timeout):
     cmd = 'docker ps -a --filter "id={}" --filter "status=running" --filter "health=healthy" -q'.format(container_id)
     stop_watch = self.clock.stop_watch()
     stop_watch.start()
     while stop_watch.split_time() < timeout:
         containers = process.run_subprocess_with_output(cmd)
         if len(containers) > 0:
             return
         time.sleep(0.5)
     msg = "No healthy running container after {} seconds!".format(timeout)
     logging.error(msg)
     raise exceptions.LaunchError(msg)
Example 20
def wait_for_pidfile(pidfilename, timeout=60, clock=time.Clock):
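    # the clock is injectable (defaulting to the wall clock) so that callers, e.g. tests, can control how time is measured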
    stop_watch = clock.stop_watch()
    stop_watch.start()
    while stop_watch.split_time() < timeout:
        try:
            with open(pidfilename, "rb") as f:
                return int(f.read())
        except FileNotFoundError:
            time.sleep(0.5)

    msg = "pid file not available after {} seconds!".format(timeout)
    logging.error(msg)
    raise exceptions.LaunchError(msg)
Example 21
 def _start_process(self, env, node_name, binary_path):
     if os.geteuid() == 0:
         raise exceptions.LaunchError(
             "Cannot launch Elasticsearch as root. Please run Rally as a non-root user."
         )
     os.chdir(binary_path)
     cmd = ["bin/elasticsearch"]
     return _start(
         subprocess.Popen(cmd,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT,
                          stdin=subprocess.DEVNULL,
                          env=env), node_name)
Example 22
 def _start_process(self, cmd, node_name, log_dir):
     startup_event = threading.Event()
     p = subprocess.Popen(shlex.split(cmd), stdout=subprocess.PIPE, stderr=subprocess.STDOUT, stdin=subprocess.DEVNULL)
     t = threading.Thread(target=self._read_output, args=(node_name, p, startup_event))
     t.setDaemon(True)
     t.start()
     if startup_event.wait(timeout=DockerLauncher.PROCESS_WAIT_TIMEOUT_SECONDS):
         self.logger.info("Started node=%s with pid=%s", node_name, p.pid)
         return p
     else:
         msg = "Could not start node '%s' within timeout period of %s seconds." % (
             node_name, DockerLauncher.PROCESS_WAIT_TIMEOUT_SECONDS)
         self.logger.error(msg)
         raise exceptions.LaunchError("%s Please check the logs in '%s' for more details." % (msg, log_dir))
Example 23
def wait_for_rest_layer(es, max_attempts=20):
    for attempt in range(max_attempts):
        import elasticsearch
        try:
            es.info()
            return True
        except elasticsearch.ConnectionError as e:
            if "SSL: UNKNOWN_PROTOCOL" in str(e):
                raise exceptions.LaunchError("Could not connect to cluster via https. Is this a https endpoint?", e)
            else:
                time.sleep(1)
        except elasticsearch.TransportError as e:
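            # 503 and 401 responses are treated as transient; the node is probably not fully initialized yet, so wait and retry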
            if e.status_code == 503:
                time.sleep(1)
            elif e.status_code == 401:
                time.sleep(1)
            else:
                raise e
    return False
Example 24
    def receiveMsg_StartEngine(self, msg, sender):
        self.logger.info("Received signal from race control to start engine.")
        self.race_control = sender
        self.cfg = msg.cfg
        cls = metrics.metrics_store_class(self.cfg)
        self.metrics_store = cls(self.cfg)
        self.metrics_store.open(ctx=msg.open_metrics_context)
        name = self.cfg.opts("race", "pipeline")
        self.car, _ = load_team(self.cfg, msg.external)
        self.team_revision = self.cfg.opts("mechanic", "repository.revision")

        # In our startup procedure we first create all mechanics. Only if this succeeds we'll continue.
        hosts = self.cfg.opts("client", "hosts").default
        if len(hosts) == 0:
            raise exceptions.LaunchError("No target hosts are configured.")

        if msg.external:
            self.logger.info("Cluster will not be provisioned by Rally.")
            if msg.cluster_settings:
                pretty_settings = json.dumps(msg.cluster_settings, indent=2)
                warning = "Ensure that these settings are defined in elasticsearch.yml:\n\n{}\n\nIf they are absent, running this track " \
                          "will fail or lead to unexpected results.".format(pretty_settings)
                console.warn(warning, logger=self.logger)
            # just create one actor for this special case and run it on the coordinator node (i.e. here)
            m = self.createActor(NodeMechanicActor,
                                 targetActorRequirements={"coordinator": True})
            self.children.append(m)
            self.send(m, msg.for_nodes(ip=hosts))
        else:
            console.info("Preparing for race ...", flush=True)
            self.logger.info(
                "Cluster consisting of %s will be provisioned by Rally.",
                hosts)
            msg.hosts = hosts
            # Initialize the children array to have the right size to
            # ensure waiting for all responses
            self.children = [None] * len(nodes_by_host(to_ip_port(hosts)))
            self.send(self.createActor(Dispatcher), msg)
        self.status = "starting"
        self.received_responses = []
Example 25
    def receiveMsg_StartEngine(self, msg, sender):
        self.logger.info("Received signal from race control to start engine.")
        self.race_control = sender
        self.cfg = msg.cfg
        cls = metrics.metrics_store_class(self.cfg)
        self.metrics_store = cls(self.cfg)
        self.metrics_store.open(ctx=msg.open_metrics_context)
        self.car, _ = load_team(self.cfg, msg.external)
        self.team_revision = self.cfg.opts("mechanic", "repository.revision")

        # In our startup procedure we first create all mechanics. Only if this succeeds we'll continue.
        hosts = self.cfg.opts("client", "hosts").default
        if len(hosts) == 0:
            raise exceptions.LaunchError("No target hosts are configured.")

        self.externally_provisioned = msg.external
        if self.externally_provisioned:
            self.logger.info("Cluster will not be provisioned by Rally.")
            # TODO: This needs to be handled later - we should probably disallow this entirely
            if msg.cluster_settings:
                pretty_settings = json.dumps(msg.cluster_settings, indent=2)
                warning = "Ensure that these settings are defined in elasticsearch.yml:\n\n{}\n\nIf they are absent, running this track " \
                          "will fail or lead to unexpected results.".format(pretty_settings)
                console.warn(warning, logger=self.logger)
            self.status = "nodes_started"
            self.received_responses = []
            self.on_all_nodes_started()
            self.status = "cluster_started"
        else:
            console.info("Preparing for race ...", flush=True)
            self.logger.info(
                "Cluster consisting of %s will be provisioned by Rally.",
                hosts)
            msg.hosts = hosts
            # Initialize the children array to have the right size to
            # ensure waiting for all responses
            self.children = [None] * len(nodes_by_host(to_ip_port(hosts)))
            self.send(self.createActor(Dispatcher), msg)
            self.status = "starting"
            self.received_responses = []
Example 26
 def receiveMessage(self, msg, sender):
     try:
         logger.debug(
             "MechanicActor#receiveMessage(msg = [%s] sender = [%s])" %
             (str(type(msg)), str(sender)))
         if isinstance(msg, StartEngine):
             self.on_start_engine(msg, sender)
         elif isinstance(msg, NodesStarted):
             self.metrics_store.merge_meta_info(msg.system_meta_info)
             self.transition_when_all_children_responded(
                 sender, msg, "starting", "nodes_started",
                 self.on_all_nodes_started)
         elif isinstance(msg, MetricsMetaInfoApplied):
             self.transition_when_all_children_responded(
                 sender, msg, "apply_meta_info", "cluster_started",
                 self.on_cluster_started)
         elif isinstance(msg, OnBenchmarkStart):
             self.metrics_store.lap = msg.lap
             self.cluster.on_benchmark_start()
             # in the first lap, we are in state "cluster_started", after that in "benchmark_stopped"
             self.send_to_children_and_transition(
                 sender, msg, ["cluster_started", "benchmark_stopped"],
                 "benchmark_starting")
         elif isinstance(msg, BenchmarkStarted):
             self.transition_when_all_children_responded(
                 sender, msg, "benchmark_starting", "benchmark_started",
                 self.on_benchmark_started)
         elif isinstance(msg, Failure):
             self.send(self.race_control, msg)
         elif isinstance(msg, OnBenchmarkStop):
             self.send_to_children_and_transition(sender, msg,
                                                  "benchmark_started",
                                                  "benchmark_stopping")
         elif isinstance(msg, BenchmarkStopped):
             self.metrics_store.bulk_add(msg.system_metrics)
             self.transition_when_all_children_responded(
                 sender, msg, "benchmark_stopping", "benchmark_stopped",
                 self.on_benchmark_stopped)
         elif isinstance(msg, StopEngine):
             # detach from cluster and gather all system metrics
             self.cluster_launcher.stop(self.cluster)
             # we might have experienced a launch error, hence we also need to allow stopping the cluster after a failed launch
             self.send_to_children_and_transition(
                 sender, StopNodes(),
                 ["nodes_started", "benchmark_stopped"], "cluster_stopping")
         elif isinstance(msg, NodesStopped):
             self.metrics_store.bulk_add(msg.system_metrics)
             self.transition_when_all_children_responded(
                 sender, msg, "cluster_stopping", "cluster_stopped",
                 self.on_all_nodes_stopped)
         elif isinstance(msg, thespian.actors.ChildActorExited):
             if self.is_current_status_expected("cluster_stopping"):
                 logger.info(
                     "Child actor exited while engine is stopping: [%s]" %
                     msg)
             else:
                 raise exceptions.RallyError(
                     "Child actor exited with [%s] while in status [%s]." %
                     (msg, self.status))
         elif isinstance(msg, thespian.actors.PoisonMessage):
             # something went wrong with a child actor
             if isinstance(msg.poisonMessage, StartEngine):
                 raise exceptions.LaunchError(
                     "Could not start benchmark candidate. Are Rally daemons on all targeted machines running?"
                 )
             else:
                 logger.error(
                     "[%s] sent to a child actor has resulted in PoisonMessage"
                     % str(msg.poisonMessage))
                 raise exceptions.RallyError(
                     "Could not communicate with benchmark candidate (unknown reason)"
                 )
     except BaseException:
         logger.exception("Cannot process message [%s]" % msg)
         # usually, we'll notify the sender but in case a child sent something that caused an exception we'd rather
         # have it bubble up to race control. Otherwise, we could play ping-pong with our child actor.
         recipient = self.race_control if sender in self.mechanics else sender
         ex_type, ex_value, ex_traceback = sys.exc_info()
         # avoid "can't pickle traceback objects"
         import traceback
         self.send(
             recipient,
             Failure("Could not execute command (%s)" % ex_value,
                     traceback.format_exc()))
Example 27
 def receiveMessage(self, msg, sender):
     try:
         logger.info(
             "MechanicActor#receiveMessage(msg = [%s] sender = [%s])" %
             (str(type(msg)), str(sender)))
         if isinstance(msg, StartEngine):
             self.on_start_engine(msg, sender)
         elif isinstance(msg, NodesStarted):
             self.metrics_store.merge_meta_info(msg.system_meta_info)
             self.transition_when_all_children_responded(
                 sender, msg, "starting", "nodes_started",
                 self.on_all_nodes_started)
         elif isinstance(msg, MetricsMetaInfoApplied):
             self.transition_when_all_children_responded(
                 sender, msg, "apply_meta_info", "cluster_started",
                 self.on_cluster_started)
         elif isinstance(msg, OnBenchmarkStart):
             self.metrics_store.lap = msg.lap
             self.cluster.on_benchmark_start()
             # in the first lap, we are in state "cluster_started", after that in "benchmark_stopped"
             self.send_to_children_and_transition(
                 sender, msg, ["cluster_started", "benchmark_stopped"],
                 "benchmark_starting")
         elif isinstance(msg, BenchmarkStarted):
             self.transition_when_all_children_responded(
                 sender, msg, "benchmark_starting", "benchmark_started",
                 self.on_benchmark_started)
         elif isinstance(msg, ResetRelativeTime):
             if msg.reset_in_seconds > 0:
                 self.wakeupAfter(msg.reset_in_seconds)
             else:
                 self.reset_relative_time()
         elif isinstance(msg, thespian.actors.WakeupMessage):
             self.reset_relative_time()
         elif isinstance(msg, actor.BenchmarkFailure):
             self.send(self.race_control, msg)
         elif isinstance(msg, OnBenchmarkStop):
             self.send_to_children_and_transition(sender, msg,
                                                  "benchmark_started",
                                                  "benchmark_stopping")
         elif isinstance(msg, BenchmarkStopped):
             self.metrics_store.bulk_add(msg.system_metrics)
             self.transition_when_all_children_responded(
                 sender, msg, "benchmark_stopping", "benchmark_stopped",
                 self.on_benchmark_stopped)
         elif isinstance(msg, StopEngine):
             # detach from cluster and gather all system metrics
             self.cluster_launcher.stop(self.cluster)
             # we might have experienced a launch error or the user has cancelled the benchmark. Hence we need to allow stopping the
             # cluster from various states and we don't check for a specific one here.
             self.send_to_children_and_transition(sender, StopNodes(), [],
                                                  "cluster_stopping")
         elif isinstance(msg, NodesStopped):
             self.metrics_store.bulk_add(msg.system_metrics)
             self.transition_when_all_children_responded(
                 sender, msg, "cluster_stopping", "cluster_stopped",
                 self.on_all_nodes_stopped)
         elif isinstance(msg, thespian.actors.ActorExitRequest):
             # due to early termination by race control. If it's self-initiated we already took care of the rest.
             if sender != self.myAddress:
                 self.send_to_children_and_transition(
                     self.myAddress,
                     msg,
                     expected_status=None,
                     new_status="cluster_stopping")
         elif isinstance(msg, thespian.actors.ChildActorExited):
             if self.is_current_status_expected("cluster_stopping"):
                 logger.info(
                     "Child actor exited while engine is stopping: [%s]" %
                     msg)
             else:
                 raise exceptions.RallyError(
                     "Child actor exited with [%s] while in status [%s]." %
                     (msg, self.status))
         elif isinstance(msg, thespian.actors.PoisonMessage):
             # something went wrong with a child actor
             if isinstance(msg.poisonMessage, StartEngine):
                 raise exceptions.LaunchError(
                     "Could not start benchmark candidate. Are Rally daemons on all targeted machines running?"
                 )
             else:
                 logger.error(
                     "[%s] sent to a child actor has resulted in PoisonMessage"
                     % str(msg.poisonMessage))
                 raise exceptions.RallyError(
                     "Could not communicate with benchmark candidate (unknown reason)"
                 )
     except BaseException:
         # usually, we'll notify the sender but in case a child sent something that caused an exception we'd rather
         # have it bubble up to race control. Otherwise, we could play ping-pong with our child actor.
         recipient = self.race_control if sender in self.children else sender
         logger.exception("Cannot process message [%s]. Notifying [%s]." %
                          (msg, recipient))
         ex_type, ex_value, ex_traceback = sys.exc_info()
         # avoid "can't pickle traceback objects"
         import traceback
         self.send(
             recipient,
             actor.BenchmarkFailure(
                 "Could not execute command (%s)" % ex_value,
                 traceback.format_exc()))
Example 28
 def receiveMessage(self, msg, sender):
     try:
         logger.info(
             "MechanicActor#receiveMessage(msg = [%s] sender = [%s])" %
             (str(type(msg)), str(sender)))
         if isinstance(msg, StartEngine):
             self.on_start_engine(msg, sender)
         elif isinstance(msg, NodesStarted):
             self.metrics_store.merge_meta_info(msg.system_meta_info)
             self.transition_when_all_children_responded(
                 sender, msg, "starting", "nodes_started",
                 self.on_all_nodes_started)
         elif isinstance(msg, MetricsMetaInfoApplied):
             self.transition_when_all_children_responded(
                 sender, msg, "apply_meta_info", "cluster_started",
                 self.on_cluster_started)
         elif isinstance(msg, OnBenchmarkStart):
             self.metrics_store.lap = msg.lap
             # in the first lap, we are in state "cluster_started", after that in "benchmark_stopped"
             self.send_to_children_and_transition(
                 sender, msg, ["cluster_started", "benchmark_stopped"],
                 "benchmark_starting")
         elif isinstance(msg, BenchmarkStarted):
             self.transition_when_all_children_responded(
                 sender, msg, "benchmark_starting", "benchmark_started",
                 self.on_benchmark_started)
         elif isinstance(msg, ResetRelativeTime):
             if msg.reset_in_seconds > 0:
                 self.wakeupAfter(msg.reset_in_seconds)
             else:
                 self.reset_relative_time()
         elif isinstance(msg, thespian.actors.WakeupMessage):
             self.reset_relative_time()
         elif isinstance(msg, actor.BenchmarkFailure):
             self.send(self.race_control, msg)
         elif isinstance(msg, OnBenchmarkStop):
             self.send_to_children_and_transition(sender, msg,
                                                  "benchmark_started",
                                                  "benchmark_stopping")
         elif isinstance(msg, BenchmarkStopped):
             self.metrics_store.bulk_add(msg.system_metrics)
             self.transition_when_all_children_responded(
                 sender, msg, "benchmark_stopping", "benchmark_stopped",
                 self.on_benchmark_stopped)
         elif isinstance(msg, StopEngine):
             # detach from cluster and gather all system metrics
             self.cluster_launcher.stop(self.cluster)
             # we might have experienced a launch error or the user has cancelled the benchmark. Hence we need to allow stopping the
             # cluster from various states and we don't check for a specific one here.
             self.send_to_children_and_transition(sender, StopNodes(), [],
                                                  "cluster_stopping")
         elif isinstance(msg, NodesStopped):
             self.metrics_store.bulk_add(msg.system_metrics)
             self.transition_when_all_children_responded(
                 sender, msg, "cluster_stopping", "cluster_stopped",
                 self.on_all_nodes_stopped)
         elif isinstance(msg, thespian.actors.ActorExitRequest):
             # due to early termination by race control. If it's self-initiated we already took care of the rest.
             if sender != self.myAddress:
                 self.send_to_children_and_transition(
                     self.myAddress,
                     msg,
                     expected_status=None,
                     new_status="cluster_stopping")
         elif isinstance(msg, thespian.actors.ChildActorExited):
             if self.is_current_status_expected(
                 ["cluster_stopping", "cluster_stopped"]):
                 logger.info(
                     "Child actor exited while engine is stopping or stopped: [%s]"
                     % msg)
             else:
                 raise exceptions.RallyError(
                     "Child actor exited with [%s] while in status [%s]." %
                     (msg, self.status))
         elif isinstance(msg, thespian.actors.PoisonMessage):
             # something went wrong with a child actor
             if isinstance(msg.poisonMessage, StartEngine):
                 raise exceptions.LaunchError(
                     "Could not start benchmark candidate. Are Rally daemons on all targeted machines running?"
                 )
             else:
                 logger.error(
                     "[%s] sent to a child actor has resulted in PoisonMessage"
                     % str(msg.poisonMessage))
                 raise exceptions.RallyError(
                     "Could not communicate with benchmark candidate (unknown reason)"
                 )
         else:
             logger.info(
                 "MechanicActor received unknown message [%s] (ignoring)." %
                 (str(msg)))
     except BaseException as e:
         logger.exception("Cannot process message")
         logger.error("Failed message details: [%s]. Notifying [%s]." %
                      (msg, self.race_control))
         self.send(
             self.race_control,
             actor.BenchmarkFailure(
                 "Error in Elasticsearch cluster coordinator", e))
Example 29
    def receiveMessage(self, msg, sender):
        try:
            logger.debug(
                "MechanicActor#receiveMessage(msg = [%s] sender = [%s])" %
                (str(type(msg)), str(sender)))
            if isinstance(msg, StartEngine):
                logger.info(
                    "Received signal from race control to start engine.")
                self.race_control = sender
                # In our startup procedure we first create all mechanics. Only if this succeeds we'll continue.
                mechanics_and_start_message = []

                if msg.external:
                    logger.info(
                        "Target node(s) will not be provisioned by Rally.")
                    # just create one actor for this special case and run it on the coordinator node (i.e. here)
                    m = self.createActor(
                        LocalNodeMechanicActor,
                        globalName="/rally/mechanic/worker/external",
                        targetActorRequirements={"coordinator": True})
                    self.mechanics.append(m)
                    # we can use the original message in this case
                    mechanics_and_start_message.append((m, msg))
                else:
                    hosts = msg.cfg.opts("client", "hosts")
                    logger.info(
                        "Target node(s) %s will be provisioned by Rally." %
                        hosts)
                    if len(hosts) == 0:
                        raise exceptions.LaunchError(
                            "No target hosts are configured.")
                    for host in hosts:
                        ip = host["host"]
                        port = int(host["port"])
                        # user may specify "localhost" on the command line but the problem is that we auto-register the actor system
                        # with "ip": "127.0.0.1" so we convert this special case automatically. In all other cases the user needs to
                        # start the actor system on the other host and is aware that the parameter for the actor system and the
                        # --target-hosts parameter need to match.
                        if ip == "localhost" or ip == "127.0.0.1":
                            m = self.createActor(
                                LocalNodeMechanicActor,
                                globalName="/rally/mechanic/worker/localhost",
                                targetActorRequirements={"coordinator": True})
                            self.mechanics.append(m)
                            mechanics_and_start_message.append(
                                (m, msg.with_port(port)))
                        else:
                            if msg.cfg.opts("system",
                                            "remote.benchmarking.supported"):
                                logger.info(
                                    "Benchmarking against %s with external Rally daemon."
                                    % hosts)
                            else:
                                logger.error(
                                    "User tried to benchmark against %s but no external Rally daemon has been started."
                                    % hosts)
                                raise exceptions.SystemSetupError(
                                    "To benchmark remote hosts (e.g. %s) you need to start the Rally daemon "
                                    "on each machine including this one." % ip)
                            already_running = actor.actor_system_already_running(
                                ip=ip)
                            logger.info(
                                "Actor system on [%s] already running? [%s]" %
                                (ip, str(already_running)))
                            if not already_running:
                                console.println(
                                    "Waiting for Rally daemon on [%s] " % ip,
                                    end="",
                                    flush=True)
                            while not actor.actor_system_already_running(
                                    ip=ip):
                                console.println(".", end="", flush=True)
                                time.sleep(3)
                            if not already_running:
                                console.println(" [OK]")
                            m = self.createActor(
                                RemoteNodeMechanicActor,
                                globalName="/rally/mechanic/worker/%s" % ip,
                                targetActorRequirements={"ip": ip})
                            mechanics_and_start_message.append(
                                (m, msg.with_port(port)))
                            self.mechanics.append(m)
                for mechanic_actor, start_message in mechanics_and_start_message:
                    self.send(mechanic_actor, start_message)
            elif isinstance(msg, EngineStarted):
                self.send(self.race_control, msg)
            elif isinstance(msg, OnBenchmarkStart):
                for m in self.mechanics:
                    self.send(m, msg)
            elif isinstance(msg, Success):
                self.send(self.race_control, msg)
            elif isinstance(msg, Failure):
                self.send(self.race_control, msg)
            elif isinstance(msg, OnBenchmarkStop):
                for m in self.mechanics:
                    self.send(m, msg)
            elif isinstance(msg, BenchmarkStopped):
                # TODO dm: Actually we need to wait for all BenchmarkStopped messages from all our mechanic actors
                # TODO dm: We will actually duplicate cluster level metrics if each of our mechanic actors gathers these...
                self.send(self.race_control, msg)
            elif isinstance(msg, StopEngine):
                for m in self.mechanics:
                    self.send(m, msg)
            elif isinstance(msg, EngineStopped):
                self.send(self.race_control, msg)
                # clear all state as the mechanic might get reused later
                for m in self.mechanics:
                    self.send(m, thespian.actors.ActorExitRequest())
                self.mechanics = []
                # self terminate + slave nodes
                self.send(self.myAddress, thespian.actors.ActorExitRequest())
            elif isinstance(msg, thespian.actors.ChildActorExited):
                # TODO dm: Depending on our state model this can be fine (e.g. when it exited due to our ActorExitRequest message)
                # or it could be problematic and mean that an exception has occurred.
                pass
            elif isinstance(msg, thespian.actors.PoisonMessage):
                # something went wrong with a child actor
                if isinstance(msg.poisonMessage, StartEngine):
                    raise exceptions.LaunchError(
                        "Could not start benchmark candidate. Are Rally daemons on all targeted machines running?"
                    )
                else:
                    logger.error(
                        "[%s] sent to a child actor has resulted in PoisonMessage"
                        % str(msg.poisonMessage))
                    raise exceptions.RallyError(
                        "Could not communicate with benchmark candidate (unknown reason)"
                    )
        except BaseException:
            logger.exception("Cannot process message [%s]" % msg)
            # usually, we'll notify the sender but in case a child sent something that caused an exception we'd rather
            # have it bubble up to race control. Otherwise, we could play ping-pong with our child actor.
            recipient = self.race_control if sender in self.mechanics else sender
            ex_type, ex_value, ex_traceback = sys.exc_info()
            # avoid "can't pickle traceback objects"
            import traceback
            self.send(
                recipient,
                Failure("Could not execute command (%s)" % ex_value,
                        traceback.format_exc()))