Esempio n. 1
0
 def stop(self, nodes, metrics_store):
     """
     Stop all given nodes that run in Docker on this host.

     For every node, in order: node metadata is registered with the metrics store, telemetry is detached
     while the node is still running, the command built by ``self._docker_compose(..., "down")`` is executed,
     telemetry is detached once more for the stopped node and finally system metrics are stored.

     :param nodes: The nodes to stop; each provides ``node_name``, ``host_name``, ``binary_path`` and ``telemetry``.
     :param metrics_store: The metrics store that receives node metadata and system metrics.
     """
     node_count = len(nodes)
     self.logger.info("Shutting down [%d] nodes running in Docker on this host.", node_count)
     for current in nodes:
         name = current.node_name
         node_telemetry = current.telemetry
         self.logger.info("Stopping node [%s].", name)
         telemetry.add_metadata_for_node(metrics_store, name, current.host_name)
         # first detach pass happens while the container is still up
         node_telemetry.detach_from_node(current, running=True)
         compose_down = self._docker_compose(current.binary_path, "down")
         process.run_subprocess_with_logging(compose_down)
         # second detach pass happens after the "down" command has returned
         node_telemetry.detach_from_node(current, running=False)
         node_telemetry.store_system_metrics(current, metrics_store)
Esempio n. 2
0
    def stop(self, nodes, metrics_store):
        """
        Detach telemetry from the given nodes and, unless ``self.keep_running`` is set, terminate their processes.

        A process is first asked to terminate and given 10 seconds to exit; if it has not exited by then it is
        killed. Processes that cannot be found are logged as warnings and skipped.

        :param nodes: The nodes to stop; each provides ``pid``, ``node_name``, ``host_name`` and ``telemetry``.
        :param metrics_store: Receives node metadata and system metrics. If it is falsy, both steps are skipped.
        :return: A list of the nodes whose process was terminated or killed by this call.
        """

        def warn_missing(pid, name):
            self.logger.warning("No process found with PID [%s] for node [%s].", pid, name)

        if self.keep_running:
            summary = "Keeping [%d] nodes on this host running."
        else:
            summary = "Shutting down [%d] nodes on this host."
        self.logger.info(summary, len(nodes))

        terminated = []
        for node in nodes:
            name = node.node_name
            if metrics_store:
                telemetry.add_metadata_for_node(metrics_store, name, node.host_name)

            # The handle is only kept if both the lookup and the "running" detach got past NoSuchProcess.
            proc = None
            try:
                candidate = psutil.Process(pid=node.pid)
                node.telemetry.detach_from_node(node, running=True)
                proc = candidate
            except psutil.NoSuchProcess:
                warn_missing(node.pid, name)

            if not self.keep_running:
                if proc:
                    timer = self._clock.stop_watch()
                    timer.start()
                    try:
                        proc.terminate()
                        proc.wait(10.0)
                        terminated.append(node)
                    except psutil.NoSuchProcess:
                        warn_missing(proc.pid, name)
                    except psutil.TimeoutExpired:
                        # the process ignored the polite request within the grace period -> kill -9
                        self.logger.info("kill -KILL node [%s]", name)
                        try:
                            proc.kill()
                            terminated.append(node)
                        except psutil.NoSuchProcess:
                            warn_missing(proc.pid, name)
                    self.logger.info("Done shutting down node [%s] in [%.1f] s.", name, timer.split_time())

                node.telemetry.detach_from_node(node, running=False)
            # System metrics are stored even when the node keeps running
            # (telemetry devices may derive system metrics while the node is running).
            if metrics_store:
                node.telemetry.store_system_metrics(node, metrics_store)
        return terminated
Esempio n. 3
0
 def start(self, node_configurations):
     nodes = []
     for node_configuration in node_configurations:
         node_name = node_configuration.node_name
         host_name = node_configuration.ip
         binary_path = node_configuration.binary_path
         self.binary_paths[node_name] = binary_path
         self._start_process(binary_path)
         node_telemetry = [
             # Don't attach any telemetry devices for now but keep the infrastructure in place
         ]
         t = telemetry.Telemetry(devices=node_telemetry)
         telemetry.add_metadata_for_node(self.metrics_store, node_name,
                                         host_name)
         node = cluster.Node(0, host_name, node_name, t)
         t.attach_to_node(node)
         nodes.append(node)
     return nodes
Esempio n. 4
0
    def _start_node(self, node_configuration, node_count_on_host):
        """
        Start a single node process and attach its telemetry devices.

        :param node_configuration: Provides ``ip``, ``node_name``, ``car``, ``binary_path``, ``data_paths``
                                   and ``node_root_path`` of the node to start.
        :param node_count_on_host: Passed on to the ``DiskIo`` telemetry device.
        :return: The started ``cluster.Node`` with telemetry attached.
        """
        host = node_configuration.ip
        name = node_configuration.node_name
        car = node_configuration.car
        binary = node_configuration.binary_path
        data_dirs = node_configuration.data_paths
        # all file-based telemetry devices of this node share one directory below the node root
        telemetry_dir = os.path.join(node_configuration.node_root_path, "telemetry")

        java_major_version, java_home = java_resolver.java_home(car, self.cfg)

        telemetry.add_metadata_for_node(self.metrics_store, name, host)

        self.logger.info("Starting node [%s] based on car [%s].", name, car)

        enabled = self.cfg.opts("telemetry", "devices")
        params = self.cfg.opts("telemetry", "params")

        # devices are created in a fixed order; which of them are active is decided by ``enabled``
        flight_recorder = telemetry.FlightRecorder(params, telemetry_dir, java_major_version)
        jit_compiler = telemetry.JitCompiler(telemetry_dir)
        gc = telemetry.Gc(telemetry_dir, java_major_version)
        heapdump = telemetry.Heapdump(telemetry_dir)
        disk_io = telemetry.DiskIo(self.metrics_store, node_count_on_host, telemetry_dir, name)
        index_size = telemetry.IndexSize(data_dirs, self.metrics_store)
        startup_time = telemetry.StartupTime(self.metrics_store)
        devices = [flight_recorder, jit_compiler, gc, heapdump, disk_io, index_size, startup_time]

        node_telemetry = telemetry.Telemetry(enabled, devices=devices)
        # the environment is prepared with the telemetry instance before the process is launched
        env = self._prepare_env(car, name, java_home, node_telemetry)
        node_telemetry.on_pre_node_start(name)
        pid = self._start_process(binary, env)
        started_node = cluster.Node(pid, host, name, node_telemetry)

        self.logger.info("Attaching telemetry devices to node [%s].", name)
        node_telemetry.attach_to_node(started_node)

        return started_node
Esempio n. 5
0
 def start(self, node_configurations):
     """
     Start one process per node configuration and return the resulting nodes.

     The binary path of every node is remembered in ``self.binary_paths`` under the node's name before
     the process is started via ``self._start_process``. Each node gets a ``Telemetry`` instance that
     contains only the ``DiskIo`` device.

     :param node_configurations: Configurations providing ``node_name``, ``ip``, ``binary_path`` and ``node_root_path``.
     :return: A list with one ``cluster.Node`` per configuration, in input order.
     """
     started = []
     for configuration in node_configurations:
         name = configuration.node_name
         host = configuration.ip
         path = configuration.binary_path
         telemetry_dir = os.path.join(configuration.node_root_path, "telemetry")
         self.binary_paths[name] = path
         self._start_process(path)
         # only support a subset of telemetry for Docker hosts
         # (specifically, we do not allow users to enable any devices)
         disk_io = telemetry.DiskIo(self.metrics_store, len(node_configurations), telemetry_dir, name)
         node_telemetry = telemetry.Telemetry(devices=[disk_io])
         telemetry.add_metadata_for_node(self.metrics_store, name, host)
         # NOTE(review): 0 is passed as the first Node argument, presumably because no PID is tracked here - confirm
         started_node = cluster.Node(0, host, name, node_telemetry)
         started.append(started_node)
     return started
Esempio n. 6
0
    def stop(self, nodes, metrics_store):
        """
        Detach telemetry from the given nodes and, unless ``self.keep_running`` is set, shut their processes down.

        Each process is first sent SIGTERM and given 10 seconds to exit; if it is still alive afterwards it is
        killed. A process that has already disappeared is logged as a warning and does not abort the shutdown
        of the remaining nodes.

        :param nodes: The nodes to stop; each provides ``pid``, ``node_name``, ``host_name`` and ``telemetry``.
        :param metrics_store: The metrics store that receives node metadata and system metrics.
        """
        if self.keep_running:
            self.logger.info("Keeping [%d] nodes on this host running.",
                             len(nodes))
        else:
            self.logger.info("Shutting down [%d] nodes on this host.",
                             len(nodes))
        for node in nodes:
            node_name = node.node_name
            telemetry.add_metadata_for_node(metrics_store, node_name,
                                            node.host_name)

            try:
                proc = psutil.Process(pid=node.pid)
            except psutil.NoSuchProcess:
                # psutil raises when the PID does not exist; without this guard a single
                # dead node would prevent all remaining nodes from being stopped.
                self.logger.warning(
                    "No process found with PID [%s] for node [%s]",
                    node.pid, node_name)
                proc = None

            if proc is not None:
                # only detach in "running" mode when there actually is a process
                node.telemetry.detach_from_node(node, running=True)
            if not self.keep_running:
                stop_watch = self._clock.stop_watch()
                stop_watch.start()
                if proc is not None:
                    try:
                        os.kill(proc.pid, signal.SIGTERM)
                        proc.wait(10.0)
                    except (ProcessLookupError, psutil.NoSuchProcess):
                        # os.kill reports a missing process as ProcessLookupError,
                        # psutil reports it as NoSuchProcess; treat both alike.
                        self.logger.warning(
                            "No process found with PID [%s] for node [%s]",
                            proc.pid, node_name)
                    except psutil.TimeoutExpired:
                        self.logger.info("kill -KILL node [%s]", node_name)
                        try:
                            # kill -9
                            proc.kill()
                        except (ProcessLookupError, psutil.NoSuchProcess):
                            # the process exited between the timeout and the kill
                            self.logger.warning(
                                "No process found with PID [%s] for node [%s]",
                                proc.pid, node_name)
                node.telemetry.detach_from_node(node, running=False)
                node.telemetry.store_system_metrics(node, metrics_store)
                self.logger.info("Done shutting down node [%s] in [%.1f] s.",
                                 node_name, stop_watch.split_time())