def test_stores_index_size_for_data_paths(self, run_subprocess, metrics_store_node_count, get_size):
    """The IndexSize device sums the size of all data paths and stores one final byte count."""
    get_size.side_effect = [2048, 16384]
    metrics_store = metrics.EsMetricsStore(create_config())
    index_size = telemetry.IndexSize(["/var/elasticsearch/data/1", "/var/elasticsearch/data/2"], metrics_store)
    tele = telemetry.Telemetry(enabled_devices=[], devices=[index_size])
    node = cluster.Node(process=None, host_name="localhost", node_name="rally-node-0", telemetry=tele)

    # drive the full node life-cycle
    tele.attach_to_node(node)
    tele.on_benchmark_start()
    tele.on_benchmark_stop()
    tele.detach_from_node(node, running=True)
    tele.detach_from_node(node, running=False)

    # 2048 + 16384 == 18432 bytes over both data paths
    metrics_store_node_count.assert_has_calls([
        mock.call("rally-node-0", "final_index_size_bytes", 18432, "byte")
    ])
    # one index file listing per data path
    run_subprocess.assert_has_calls([
        mock.call("find /var/elasticsearch/data/1 -ls", header="index files:"),
        mock.call("find /var/elasticsearch/data/2 -ls", header="index files:")
    ])
def test_stores_node_level_metrics_on_attach(self, cpu_model, physical_cpu_cores, logical_cpu_cores, os_version, os_name, metrics_store_add_meta_info):
    """NodeEnvironmentInfo records OS and CPU metadata at node scope when attached."""
    cpu_model.return_value = "Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz"
    physical_cpu_cores.return_value = 4
    logical_cpu_cores.return_value = 8
    os_version.return_value = "4.2.0-18-generic"
    os_name.return_value = "Linux"

    metrics_store = metrics.EsMetricsStore(create_config())
    node = cluster.Node(None, "io", "rally0", None)
    env_device = telemetry.NodeEnvironmentInfo(metrics_store)
    env_device.attach_to_node(node)

    expected_calls = [
        mock.call(metrics.MetaInfoScope.node, "rally0", "os_name", "Linux"),
        mock.call(metrics.MetaInfoScope.node, "rally0", "os_version", "4.2.0-18-generic"),
        mock.call(metrics.MetaInfoScope.node, "rally0", "cpu_logical_cores", 8),
        mock.call(metrics.MetaInfoScope.node, "rally0", "cpu_physical_cores", 4),
        mock.call(metrics.MetaInfoScope.node, "rally0", "cpu_model", "Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz"),
        mock.call(metrics.MetaInfoScope.node, "rally0", "node_name", "rally0"),
        mock.call(metrics.MetaInfoScope.node, "rally0", "host_name", "io"),
    ]
    metrics_store_add_meta_info.assert_has_calls(expected_calls)
def _start_node(self, node, car, es, binary_path):
    """Start a single cluster node and attach its telemetry devices.

    :param node: Node index used to derive the node name.
    :param car: Car (node configuration) to launch with.
    :param es: Elasticsearch client used by API-based telemetry devices.
    :param binary_path: Path to the Elasticsearch binary to launch.
    :return: A ``cluster.Node`` wrapping the started process with telemetry attached.
    """
    node_name = self._node_name(node)
    host_name = socket.gethostname()
    enabled_devices = self.cfg.opts("mechanic", "telemetry.devices")
    node_telemetry = [
        telemetry.FlightRecorder(self.node_telemetry_dir),
        telemetry.JitCompiler(self.node_telemetry_dir),
        telemetry.Gc(self.node_telemetry_dir),
        telemetry.PerfStat(self.node_telemetry_dir),
        telemetry.DiskIo(self.metrics_store),
        telemetry.CpuUsage(self.metrics_store),
        telemetry.EnvironmentInfo(es, self.metrics_store),
    ]
    t = telemetry.Telemetry(enabled_devices, devices=node_telemetry)
    env = self._prepare_env(car, node_name, t)
    cmd = self.prepare_cmd(car, node_name)
    process = self._start_process(cmd, env, node_name, binary_path)
    node = cluster.Node(process, host_name, node_name, t)
    # Lazy %-style logging args instead of eager "%" formatting: the message is
    # only rendered if the log level is enabled.
    logger.info("Cluster node [%s] has successfully started. Attaching telemetry devices to node.", node_name)
    t.attach_to_node(node)
    logger.info("Telemetry devices are now attached to node [%s].", node_name)
    return node
def _start_node(self, node_configuration, node_count_on_host):
    """Start one node from its configuration and attach telemetry devices."""
    host_name = node_configuration.ip
    node_name = node_configuration.node_name
    car = node_configuration.car
    binary_path = node_configuration.binary_path
    data_paths = node_configuration.data_paths
    node_telemetry_dir = "%s/telemetry" % node_configuration.node_root_path

    java_major_version, java_home = java_resolver.java_home(car, self.cfg)

    self.logger.info("Starting node [%s] based on car [%s].", node_name, car)

    enabled_devices = self.cfg.opts("mechanic", "telemetry.devices")
    # NOTE(review): read but not passed to any device below — confirm whether
    # device params are intentionally unused here.
    telemetry_params = self.cfg.opts("mechanic", "telemetry.params")
    node_telemetry = [
        telemetry.DiskIo(self.metrics_store, node_count_on_host, node_telemetry_dir, node_name),
        telemetry.NodeEnvironmentInfo(self.metrics_store),
        telemetry.IndexSize(data_paths, self.metrics_store),
        telemetry.MergeParts(self.metrics_store, node_configuration.log_path),
        telemetry.StartupTime(self.metrics_store),
    ]
    tele = telemetry.Telemetry(enabled_devices, devices=node_telemetry)
    env = self._prepare_env(car, node_name, java_home, tele)
    tele.on_pre_node_start(node_name)
    node_pid = self._start_process(binary_path, env)
    node = cluster.Node(node_pid, host_name, node_name, tele)
    self.logger.info("Attaching telemetry devices to node [%s].", node_name)
    tele.attach_to_node(node)
    return node
def _start_node(self, node_configuration, node_count_on_host):
    """Start one node from its configuration and attach telemetry devices."""
    host_name = node_configuration.ip
    node_name = node_configuration.node_name
    binary_path = node_configuration.binary_path
    data_paths = node_configuration.data_paths
    node_telemetry_dir = os.path.join(node_configuration.node_root_path, "telemetry")

    java_major_version, java_home = java_resolver.java_home(node_configuration.car_runtime_jdks, self.cfg)

    self.logger.info("Starting node [%s].", node_name)

    enabled_devices = self.cfg.opts("telemetry", "devices")
    telemetry_params = self.cfg.opts("telemetry", "params")
    node_telemetry = [
        telemetry.FlightRecorder(telemetry_params, node_telemetry_dir, java_major_version),
        telemetry.JitCompiler(node_telemetry_dir),
        telemetry.Gc(node_telemetry_dir, java_major_version),
        telemetry.Heapdump(node_telemetry_dir),
        telemetry.DiskIo(node_count_on_host),
        telemetry.IndexSize(data_paths),
        telemetry.StartupTime(),
    ]
    tele = telemetry.Telemetry(enabled_devices, devices=node_telemetry)
    # TODO #822: Remove reference to car's environment
    env = self._prepare_env(node_configuration.car_env, node_name, java_home, tele)
    tele.on_pre_node_start(node_name)
    node_pid = self._start_process(binary_path, env)
    self.logger.info("Successfully started node [%s] with PID [%s].", node_name, node_pid)
    node = cluster.Node(node_pid, binary_path, host_name, node_name, tele)
    self.logger.info("Attaching telemetry devices to node [%s].", node_name)
    tele.attach_to_node(node)
    return node
def test_store_calculated_metrics(self, listdir_mock, open_mock, metrics_store_put_value, metrics_store_put_count):
    """MergeParts sums merge times and doc counts per merge category from the node log."""
    # NOTE(review): the fixture's line breaks were reconstructed (one log record
    # per line) — the parser is expected to match records line by line; confirm.
    log_file = '''
INFO: System starting up
INFO: 100 msec to merge doc values [500 docs]
INFO: Something unrelated
INFO: 250 msec to merge doc values [1350 docs]
INFO: System shutting down
'''
    listdir_mock.return_value = [open_mock]
    open_mock.side_effect = [
        mock.mock_open(read_data=log_file).return_value
    ]
    metrics_store = metrics.EsMetricsStore(self.cfg)
    node = cluster.Node(None, "io", "rally0", None)
    device = telemetry.MergeParts(metrics_store, node_log_dir="/var/log")
    device.attach_to_node(node)
    device.on_benchmark_stop()

    # 100 + 250 msec merged, 500 + 1350 docs merged
    metrics_store_put_value.assert_called_with("rally0", "merge_parts_total_time_doc_values", 350, "ms")
    metrics_store_put_count.assert_called_with("rally0", "merge_parts_total_docs_doc_values", 1850)
def _start_node(self, node_configuration, node_count_on_host, java_major_version):
    """Start one node from its configuration and attach telemetry devices."""
    host_name = node_configuration.ip
    node_name = node_configuration.node_name
    car = node_configuration.car
    binary_path = node_configuration.binary_path
    data_paths = node_configuration.data_paths
    node_telemetry_dir = "%s/telemetry" % node_configuration.node_root_path

    self.logger.info("Starting node [%s] based on car [%s].", node_name, car)

    enabled_devices = self.cfg.opts("mechanic", "telemetry.devices")
    telemetry_params = self.cfg.opts("mechanic", "telemetry.params")
    node_telemetry = [
        telemetry.FlightRecorder(telemetry_params, node_telemetry_dir, java_major_version),
        telemetry.JitCompiler(node_telemetry_dir),
        telemetry.Gc(node_telemetry_dir, java_major_version),
        telemetry.PerfStat(node_telemetry_dir),
        telemetry.DiskIo(self.metrics_store, node_count_on_host),
        telemetry.CpuUsage(self.metrics_store),
        telemetry.NodeEnvironmentInfo(self.metrics_store),
        telemetry.IndexSize(data_paths, self.metrics_store),
        telemetry.MergeParts(self.metrics_store, node_configuration.log_path),
        telemetry.StartupTime(self.metrics_store),
    ]
    tele = telemetry.Telemetry(enabled_devices, devices=node_telemetry)
    env = self._prepare_env(car, node_name, tele)
    tele.on_pre_node_start(node_name)
    process = self._start_process(env, node_name, binary_path)
    node = cluster.Node(process, host_name, node_name, tele)
    self.logger.info("Node [%s] has successfully started. Attaching telemetry devices.", node_name)
    tele.attach_to_node(node)
    self.logger.info("Telemetry devices are now attached to node [%s].", node_name)
    return node
def _start_node(self, host, node, es):
    """Start a node via docker-compose and return it with a reduced telemetry set."""
    node_name = self._node_name(node)
    process = self._start_process(cmd="docker-compose -f %s up" % self.binary_path, node_name=node_name)
    # only support a subset of telemetry for Docker hosts (specifically, we do not allow users to enable any devices)
    node_telemetry = [
        telemetry.DiskIo(self.metrics_store),
        telemetry.CpuUsage(self.metrics_store),
        telemetry.EnvironmentInfo(es, self.metrics_store)
    ]
    tele = telemetry.Telemetry(devices=node_telemetry)
    return cluster.Node(process, host["host"], node_name, tele)
def test_stops_container_successfully(self, run_subprocess_with_logging, add_metadata_for_node):
    """DockerLauncher.stop records node metadata and tears down the compose stack."""
    cfg = config.Config()
    metrics_store = None
    docker_launcher = launcher.DockerLauncher(cfg)

    nodes = [cluster.Node(0, "/bin", "127.0.0.1", "testnode", telemetry.Telemetry())]
    docker_launcher.stop(nodes, metrics_store=metrics_store)

    add_metadata_for_node.assert_called_once_with(metrics_store, "testnode", "127.0.0.1")
    run_subprocess_with_logging.assert_called_once_with("docker-compose -f /bin/docker-compose.yml down")
def test_store_nothing_if_no_metrics_present(self, listdir_mock, open_mock, metrics_store_put_value, metrics_store_put_count):
    """MergeParts must not emit any metrics when the node log contains no merge records."""
    listdir_mock.return_value = [open_mock]
    open_mock.side_effect = [
        mock.mock_open(read_data="no data to parse").return_value
    ]
    metrics_store = metrics.EsMetricsStore(self.cfg)
    node = cluster.Node(None, "io", "rally0", None)
    # Construct MergeParts as (metrics_store, node_log_dir), consistent with
    # test_store_calculated_metrics; previously this test passed
    # (self.cfg, metrics_store), which does not match that constructor.
    merge_parts_device = telemetry.MergeParts(metrics_store, node_log_dir="/var/log")
    merge_parts_device.attach_to_node(node)
    merge_parts_device.on_benchmark_stop()

    metrics_store_put_value.assert_not_called()
    metrics_store_put_count.assert_not_called()
def test_store_calculated_metrics(self, metrics_store_put_value, stop_watch):
    """StartupTime stores the elapsed startup duration measured by its timer."""
    stop_watch.total_time.return_value = 2
    metrics_store = metrics.EsMetricsStore(create_config())
    node = cluster.Node(None, "io", "rally0", None)
    device = telemetry.StartupTime(metrics_store)
    # replace with mock
    device.timer = stop_watch

    device.on_pre_node_start(node.node_name)
    # ... nodes starts up ...
    device.attach_to_node(node)

    metrics_store_put_value.assert_called_with("rally0", "node_startup_time", 2, "s")
def test_daemon_stop_with_already_terminated_process(self):
    """stop() skips nodes whose process has already terminated (pid=-1)."""
    cfg = config.Config()
    cfg.add(config.Scope.application, "node", "root.dir", "test")
    cfg.add(config.Scope.application, "telemetry", "devices", [])
    cfg.add(config.Scope.application, "telemetry", "params", None)
    cfg.add(config.Scope.application, "system", "env.name", "test")

    metrics_store = get_metrics_store(cfg)
    process_launcher = launcher.ProcessLauncher(cfg)
    nodes = [
        cluster.Node(pid=-1, binary_path="/bin", host_name="localhost", node_name="rally-0", telemetry=telemetry.Telemetry())
    ]
    stopped_nodes = process_launcher.stop(nodes, metrics_store)

    # no nodes should have been stopped (they were already stopped)
    assert stopped_nodes == []
def start(self, node_configurations):
    """Start one Docker-based node per configuration and return the attached nodes."""
    nodes = []
    for node_configuration in node_configurations:
        node_name = node_configuration.node_name
        host_name = node_configuration.ip
        binary_path = node_configuration.binary_path
        self.logger.info("Starting node [%s] in Docker.", node_name)
        self._start_process(binary_path)
        node_telemetry = [
            # Don't attach any telemetry devices for now but keep the infrastructure in place
        ]
        tele = telemetry.Telemetry(devices=node_telemetry)
        node = cluster.Node(0, binary_path, host_name, node_name, tele)
        tele.attach_to_node(node)
        nodes.append(node)
    return nodes
def start(self, node_configurations):
    """Start one Docker-based node per configuration with a reduced telemetry set."""
    nodes = []
    for node_configuration in node_configurations:
        node_name = node_configuration.node_name
        host_name = node_configuration.ip
        binary_path = node_configuration.binary_path
        self.binary_paths[node_name] = binary_path
        self._start_process(binary_path)
        # only support a subset of telemetry for Docker hosts
        # (specifically, we do not allow users to enable any devices)
        node_telemetry = [
            telemetry.DiskIo(self.metrics_store, len(node_configurations)),
            telemetry.NodeEnvironmentInfo(self.metrics_store)
        ]
        tele = telemetry.Telemetry(devices=node_telemetry)
        nodes.append(cluster.Node(0, host_name, node_name, tele))
    return nodes
def start(self, node_configurations):
    """Start one Docker-based node per configuration and record node metadata."""
    nodes = []
    for node_configuration in node_configurations:
        node_name = node_configuration.node_name
        host_name = node_configuration.ip
        binary_path = node_configuration.binary_path
        self.binary_paths[node_name] = binary_path
        self._start_process(binary_path)
        node_telemetry = [
            # Don't attach any telemetry devices for now but keep the infrastructure in place
        ]
        tele = telemetry.Telemetry(devices=node_telemetry)
        telemetry.add_metadata_for_node(self.metrics_store, node_name, host_name)
        node = cluster.Node(0, host_name, node_name, tele)
        tele.attach_to_node(node)
        nodes.append(node)
    return nodes
def test_stores_nothing_if_no_data_path(self, run_subprocess, metrics_store_cluster_count, get_size):
    """IndexSize must neither measure nor report anything when no data paths are configured."""
    get_size.return_value = 2048

    metrics_store = metrics.EsMetricsStore(create_config())
    index_size = telemetry.IndexSize(data_paths=[], metrics_store=metrics_store)
    tele = telemetry.Telemetry(devices=[index_size])
    node = cluster.Node(process=None, host_name="localhost", node_name="rally-node-0", telemetry=tele)

    tele.attach_to_node(node)
    tele.on_benchmark_start()
    tele.on_benchmark_stop()
    tele.detach_from_node(node, running=True)
    tele.detach_from_node(node, running=False)

    run_subprocess.assert_not_called()
    metrics_store_cluster_count.assert_not_called()
    get_size.assert_not_called()
def _start_node(self, node_configuration, node_count_on_host):
    """Start one node from its configuration and attach telemetry devices."""
    host_name = node_configuration.ip
    node_name = node_configuration.node_name
    car = node_configuration.car
    binary_path = node_configuration.binary_path
    data_paths = node_configuration.data_paths
    node_telemetry_dir = os.path.join(node_configuration.node_root_path, "telemetry")

    java_major_version, java_home = java_resolver.java_home(car, self.cfg)

    telemetry.add_metadata_for_node(self.metrics_store, node_name, host_name)

    self.logger.info("Starting node [%s] based on car [%s].", node_name, car)

    enabled_devices = self.cfg.opts("telemetry", "devices")
    telemetry_params = self.cfg.opts("telemetry", "params")
    node_telemetry = [
        telemetry.FlightRecorder(telemetry_params, node_telemetry_dir, java_major_version),
        telemetry.JitCompiler(node_telemetry_dir),
        telemetry.Gc(node_telemetry_dir, java_major_version),
        telemetry.Heapdump(node_telemetry_dir),
        telemetry.DiskIo(self.metrics_store, node_count_on_host, node_telemetry_dir, node_name),
        telemetry.IndexSize(data_paths, self.metrics_store),
        telemetry.StartupTime(self.metrics_store),
    ]
    tele = telemetry.Telemetry(enabled_devices, devices=node_telemetry)
    env = self._prepare_env(car, node_name, java_home, tele)
    tele.on_pre_node_start(node_name)
    node_pid = self._start_process(binary_path, env)
    node = cluster.Node(node_pid, host_name, node_name, tele)
    self.logger.info("Attaching telemetry devices to node [%s].", node_name)
    tele.attach_to_node(node)
    return node
def start(self, node_configurations):
    """Start one Docker-based node per configuration with DiskIo telemetry only."""
    nodes = []
    for node_configuration in node_configurations:
        node_name = node_configuration.node_name
        host_name = node_configuration.ip
        binary_path = node_configuration.binary_path
        node_telemetry_dir = os.path.join(node_configuration.node_root_path, "telemetry")
        self.binary_paths[node_name] = binary_path
        self._start_process(binary_path)
        # only support a subset of telemetry for Docker hosts
        # (specifically, we do not allow users to enable any devices)
        node_telemetry = [
            telemetry.DiskIo(self.metrics_store, len(node_configurations), node_telemetry_dir, node_name),
        ]
        tele = telemetry.Telemetry(devices=node_telemetry)
        telemetry.add_metadata_for_node(self.metrics_store, node_name, host_name)
        nodes.append(cluster.Node(0, host_name, node_name, tele))
    return nodes
def _start_node(self, node, car, es):
    """Start a single cluster node and attach its telemetry devices."""
    node_name = self._node_name(node)
    host_name = socket.gethostname()
    node_telemetry = [
        telemetry.FlightRecorder(self.cfg, self.metrics_store),
        telemetry.JitCompiler(self.cfg, self.metrics_store),
        telemetry.Gc(self.cfg, self.metrics_store),
        telemetry.PerfStat(self.cfg, self.metrics_store),
        telemetry.DiskIo(self.cfg, self.metrics_store),
        telemetry.CpuUsage(self.cfg, self.metrics_store),
        telemetry.EnvironmentInfo(self.cfg, es, self.metrics_store),
    ]
    tele = telemetry.Telemetry(self.cfg, devices=node_telemetry)
    env = self._prepare_env(car, node_name, tele)
    cmd = self.prepare_cmd(car, node_name)
    proc = self._start_process(cmd, env, node_name)
    node = cluster.Node(proc, host_name, node_name, tele)
    tele.attach_to_node(node)
    return node
def test_enriches_cluster_nodes_for_elasticsearch_1_x(self):
    """ClusterMetaDataInfo enriches cluster and node metadata from the 1.x node APIs."""
    nodes_stats = {
        "nodes": {
            "FCFjozkeTiOpN-SI88YEcg": {
                "name": "rally0",
                "host": "127.0.0.1",
                "fs": {
                    "data": [
                        {"mount": "/usr/local/var/elasticsearch/data1", "type": "hfs"},
                        {"mount": "/usr/local/var/elasticsearch/data2", "type": "ntfs"},
                    ]
                }
            }
        }
    }
    nodes_info = {
        "nodes": {
            "FCFjozkeTiOpN-SI88YEcg": {
                "name": "rally0",
                "host": "127.0.0.1",
                "ip": "127.0.0.1",
                "os": {
                    "name": "Mac OS X",
                    "version": "10.11.4",
                    "available_processors": 8,
                    "mem": {"total_in_bytes": 17179869184}
                },
                "jvm": {"version": "1.8.0_74", "vm_vendor": "Oracle Corporation"}
            }
        }
    }
    cluster_info = {
        "version": {
            "build_hash": "c730b59357f8ebc555286794dcd90b3411f517c9",
            "number": "1.7.5"
        }
    }
    client = Client(nodes=SubClient(stats=nodes_stats, info=nodes_info), info=cluster_info)
    tele = telemetry.Telemetry(devices=[telemetry.ClusterMetaDataInfo(client)])
    the_cluster = cluster.Cluster(
        hosts=[{"host": "localhost", "port": 39200}],
        nodes=[cluster.Node(process=None, host_name="local", node_name="rally0", telemetry=None)],
        telemetry=tele)

    tele.attach_to_cluster(the_cluster)

    self.assertEqual("1.7.5", the_cluster.distribution_version)
    self.assertEqual("c730b59357f8ebc555286794dcd90b3411f517c9", the_cluster.source_revision)
    self.assertEqual(1, len(the_cluster.nodes))
    n = the_cluster.nodes[0]
    self.assertEqual("127.0.0.1", n.ip)
    self.assertEqual("Mac OS X", n.os["name"])
    self.assertEqual("10.11.4", n.os["version"])
    self.assertEqual("Oracle Corporation", n.jvm["vendor"])
    self.assertEqual("1.8.0_74", n.jvm["version"])
    self.assertEqual(8, n.cpu["available_processors"])
    # 1.x does not expose allocated processors
    self.assertIsNone(n.cpu["allocated_processors"])
    self.assertEqual(17179869184, n.memory["total_bytes"])
    self.assertEqual(2, len(n.fs))
    self.assertEqual("/usr/local/var/elasticsearch/data1", n.fs[0]["mount"])
    self.assertEqual("hfs", n.fs[0]["type"])
    self.assertEqual("unknown", n.fs[0]["spins"])
    self.assertEqual("/usr/local/var/elasticsearch/data2", n.fs[1]["mount"])
    self.assertEqual("ntfs", n.fs[1]["type"])
    self.assertEqual("unknown", n.fs[1]["spins"])
def test_enriches_cluster_nodes_for_elasticsearch_after_1_x(self):
    """ClusterMetaDataInfo enriches cluster and node metadata (incl. plugins) from post-1.x APIs."""
    nodes_stats = {
        "nodes": {
            "FCFjozkeTiOpN-SI88YEcg": {
                "name": "rally0",
                "host": "127.0.0.1",
                "os": {
                    "mem": {"total_in_bytes": 17179869184}
                },
                "fs": {
                    "data": [
                        {"mount": "/usr/local/var/elasticsearch/data1", "type": "hfs"},
                        {"mount": "/usr/local/var/elasticsearch/data2", "type": "ntfs"},
                    ]
                }
            }
        }
    }
    nodes_info = {
        "nodes": {
            "FCFjozkeTiOpN-SI88YEcg": {
                "name": "rally0",
                "host": "127.0.0.1",
                "ip": "127.0.0.1",
                "os": {
                    "name": "Mac OS X",
                    "version": "10.11.4",
                    "available_processors": 8,
                    "allocated_processors": 4
                },
                "jvm": {"version": "1.8.0_74", "vm_vendor": "Oracle Corporation"},
                "plugins": [
                    {
                        "name": "analysis-icu",
                        "version": "5.0.0",
                        "description": "The ICU Analysis plugin integrates Lucene ICU module ...",
                        "classname": "org.elasticsearch.plugin.analysis.icu.AnalysisICUPlugin",
                        "has_native_controller": False
                    },
                    {
                        "name": "ingest-geoip",
                        "version": "5.0.0",
                        "description": "Ingest processor that uses looksup geo data ...",
                        "classname": "org.elasticsearch.ingest.geoip.IngestGeoIpPlugin",
                        "has_native_controller": False
                    },
                    {
                        "name": "ingest-user-agent",
                        "version": "5.0.0",
                        "description": "Ingest processor that extracts information from a user agent",
                        "classname": "org.elasticsearch.ingest.useragent.IngestUserAgentPlugin",
                        "has_native_controller": False
                    }
                ]
            }
        }
    }
    cluster_info = {
        "version": {
            "build_hash": "253032b",
            "number": "5.0.0"
        }
    }
    client = Client(nodes=SubClient(stats=nodes_stats, info=nodes_info), info=cluster_info)
    tele = telemetry.Telemetry(devices=[telemetry.ClusterMetaDataInfo(client)])
    the_cluster = cluster.Cluster(
        hosts=[{"host": "localhost", "port": 39200}],
        nodes=[cluster.Node(process=None, host_name="local", node_name="rally0", telemetry=None)],
        telemetry=tele)

    tele.attach_to_cluster(the_cluster)

    self.assertEqual("5.0.0", the_cluster.distribution_version)
    self.assertEqual("253032b", the_cluster.source_revision)
    self.assertEqual(1, len(the_cluster.nodes))
    n = the_cluster.nodes[0]
    self.assertEqual("127.0.0.1", n.ip)
    self.assertEqual("Mac OS X", n.os["name"])
    self.assertEqual("10.11.4", n.os["version"])
    self.assertEqual("Oracle Corporation", n.jvm["vendor"])
    self.assertEqual("1.8.0_74", n.jvm["version"])
    self.assertEqual(8, n.cpu["available_processors"])
    self.assertEqual(4, n.cpu["allocated_processors"])
    self.assertEqual(17179869184, n.memory["total_bytes"])
    self.assertEqual(2, len(n.fs))
    self.assertEqual("/usr/local/var/elasticsearch/data1", n.fs[0]["mount"])
    self.assertEqual("hfs", n.fs[0]["type"])
    self.assertEqual("unknown", n.fs[0]["spins"])
    self.assertEqual("/usr/local/var/elasticsearch/data2", n.fs[1]["mount"])
    self.assertEqual("ntfs", n.fs[1]["type"])
    self.assertEqual("unknown", n.fs[1]["spins"])
    self.assertEqual(["analysis-icu", "ingest-geoip", "ingest-user-agent"], n.plugins)