def getCommonTestMessages(self):
    """Return a shuffled batch of representative messages used by the publish tests."""
    batch = [
        wagglemsg.Message(
            timestamp=time.time_ns(),
            name="test",
            value=1234,
            meta={},
        ),
        wagglemsg.Message(
            timestamp=time.time_ns(),
            name="e",
            value=2.71828,
            meta={"user": "******"},
        ),
        wagglemsg.Message(
            timestamp=time.time_ns(),
            name="replace.app.meta.with.sys.meta",
            value="should replace meta with app and sys meta",
            meta={
                "vsn": "Z123",
                "job": "sure",
                "task": "ok",
            },
        ),
    ]
    # randomize ordering so tests don't depend on message order
    shuffle(batch)
    return batch
def add_system_metrics_gps(args, messages):
    """Add GPS system metrics.

    Reads one TPV (position) report and one SKY (satellite) report from gpsd
    and appends the corresponding `sys.gps.*` messages.

    Args:
        args: all program arguments
        messages: the message queue to append metric to
    """
    timestamp = time.time_ns()
    logging.info("collecting system metrics (GPS)")
    # only the main (nxcore) host owns the GPS device
    if "nxcore" not in args.waggle_host_id:
        logging.warning("skipping GPS publish for non-main host (%s)", args.waggle_host_id)
        return
    try:
        tpv_report = False
        sat_report = False
        # NOTE(review): the client is never explicitly closed; presumably it is
        # released when this function returns — confirm against GPSDClient docs
        gpsclient = GPSDClient(host=args.gpsd_host, port=args.gpsd_port)
        for result in gpsclient.dict_stream(convert_datetime=False):
            # look for a GPS position report
            if not tpv_report and result["class"] == "TPV":
                tpv_report = True
                for vkey in ["lat", "lon", "alt", "epx", "epy", "epv", "mode"]:
                    value = result.get(vkey)
                    # compare against None (not truthiness) so legitimate zero
                    # readings — e.g. lat/lon of exactly 0.0 or mode 0 — are
                    # still published; a plain `if value:` dropped them
                    if value is not None:
                        messages.append(
                            message.Message(
                                name="sys.gps.{name}".format(name=vkey),
                                value=float(value),
                                timestamp=timestamp,
                                meta={},
                            )
                        )
                    else:
                        logging.info("gps (%s) not found. skipping...", vkey)
            # report satellite info; use .get so a SKY report without a
            # "satellites" field doesn't raise and abort the collection
            if not sat_report and result["class"] == "SKY" and result.get("satellites"):
                sat_report = True
                # loop over the satellites and count number being used
                used_sats = len([x for x in result["satellites"] if x["used"]])
                messages.append(
                    message.Message(
                        name="sys.gps.satellites",
                        value=int(used_sats),
                        timestamp=timestamp,
                        meta={},
                    )
                )
            # stop streaming once both report kinds have been captured
            if sat_report and tpv_report:
                break
    except Exception:
        logging.exception("failed to get gps system metrics")
def convert_to_upload_message(msg: wagglemsg.Message, upload_publish_name: str) -> wagglemsg.Message:
    """Return a copy of *msg* renamed to the upload topic, with its value
    replaced by the derived upload URL."""
    # TODO(sean) be careful on ownership here, in case this is mutated
    upload_url = upload_url_for_message(msg)
    return wagglemsg.Message(
        name=upload_publish_name,
        value=upload_url,
        timestamp=msg.timestamp,
        meta=msg.meta,
    )
def getPublishTestCases(self):
    """Build (app_uid, input messages, expected messages) for plugin publish tests."""
    # TODO(sean) should we fuzz test this to try lot's of different arguments
    messages = self.getCommonTestMessages()
    app_uid = str(uuid4())
    app_meta = {
        "job": f"sage-{randint(1, 1000000)}",
        "task": f"testing-{randint(1, 1000000)}",
        "host": f"{randint(1, 1000000)}.ws-nxcore",
        "plugin": f"plugin-test:{randtag()}",
        "vsn": "should be replaced",
    }
    self.updateAppMetaCache(app_uid, app_meta)
    # we expect the same messages, but with the app and sys meta tagged.
    # NOTE(sean) the order of meta is important. we should expect:
    # 1. sys meta overrides msg meta and app meta
    # 2. app meta overrides msg meta
    want_messages = []
    for msg in messages:
        merged = dict(msg.meta)
        merged.update(app_meta)
        merged.update(self.service.system_meta)
        want_messages.append(
            wagglemsg.Message(
                name=msg.name,
                value=msg.value,
                timestamp=msg.timestamp,
                meta=merged,
            )
        )
    return app_uid, messages, want_messages
def add_system_metrics(args, messages):
    """Append node exporter samples (mapped via prom2waggle), then collect the
    platform-specific extras (tegra, jetson clocks, NVMe, GPS)."""
    ts = time.time_ns()
    logging.info("collecting system metrics from %s", args.metrics_url)
    raw_text = get_node_exporter_metrics(args.metrics_url)
    for family in text_string_to_metric_families(raw_text):
        for sample in family.samples:
            # only samples with a known waggle name mapping are published
            if sample.name not in prom2waggle:
                continue
            messages.append(
                message.Message(
                    name=prom2waggle[sample.name],
                    value=sample.value,
                    timestamp=ts,
                    meta=sample.labels,
                )
            )
    add_system_metrics_tegra(args, messages)
    add_system_metrics_jetson_clocks(args, messages)
    add_system_metrics_nvme(args, messages)
    add_system_metrics_gps(args, messages)
def add_system_metrics_nvme(args, messages):
    """Add system metrics for an optional NVMe drive (/dev/nvme0)

    Args:
        args: all program arguments
        messages: the message queue to append metric to
    """
    timestamp = time.time_ns()
    logging.info("collecting system metrics (NVMe)")
    nvmeroot = "/dev/nvme0"
    # renamed from `type`, which shadowed the builtin
    thermal_type = "nvme-therm"
    zone = "none"
    try:
        if Path(nvmeroot).exists():
            # reuse nvmeroot rather than repeating the device path literal
            nvmedev = Device(nvmeroot)
            messages.append(
                message.Message(
                    name="sys.thermal",
                    value=float(nvmedev.temperature),
                    timestamp=timestamp,
                    meta={"type": thermal_type, "zone": zone},
                )
            )
        else:
            logging.info("nvme (%s) not found. skipping...", nvmeroot)
    except Exception:
        logging.exception("failed to get nvme system metrics")
def testPublishUpload(self):
    """Regression test: a file uploaded by a plugin is republished to beehive
    as a single "upload" message whose value is the derived storage URL and
    whose meta merges the upload meta, app meta, and system meta."""
    # TODO(sean) clean up! added as a regression test for now.
    app_uid = str(uuid4())
    tag = randtag()
    # app meta cached for this plugin instance; "vsn" is expected to be
    # overridden by system meta in the published message
    app_meta = {
        "job": f"sage-{randint(1, 1000000)}",
        "task": f"testing-{randint(1, 1000000)}",
        "host": f"{randint(1, 1000000)}.ws-nxcore",
        "plugin": f"plugin-test:{tag}",
        "vsn": "should be replaced",
    }
    self.updateAppMetaCache(app_uid, app_meta)
    timestamp = time.time_ns()
    filename = "hello.txt"
    # write a small file and upload it through the plugin client
    with TemporaryDirectory() as dir:
        file = Path(dir, filename)
        file.write_text("hello")
        with get_plugin(app_uid) as plugin:
            plugin.upload_file(file, meta={"user": "******"}, timestamp=timestamp)
    job = app_meta["job"]
    task = app_meta["task"]
    node = self.service.system_meta["node"]
    # NOTE(review): the "(unknown)" suffix in the expected URL looks like it
    # should be the uploaded filename — confirm against the URL builder
    self.assertMessages("to-beehive", [
        wagglemsg.Message(
            name="upload",
            value= f"https://storage.sagecontinuum.org/api/v1/data/{job}/sage-{task}-{tag}/{node}/{timestamp}-(unknown)",
            timestamp=timestamp,
            meta={
                "user": "******",
                "filename": "hello.txt",
                **app_meta,
                **self.service.system_meta,
            })
    ])
    # exactly one message: accepted, published to node and to beehive
    self.assertMetrics({
        "wes_data_service_messages_total": 1,
        "wes_data_service_messages_rejected_total": 0,
        "wes_data_service_messages_published_node_total": 1,
        "wes_data_service_messages_published_beehive_total": 1,
    })
def main(args):
    """Publish one user-specified message to the local RabbitMQ "data.topic" exchange.

    Args:
        args: parsed CLI arguments (topic, type, value, meta,
            rabbitmq_host, rabbitmq_port)

    Raises:
        Exception: if args.type is not a recognized type name or a meta
            entry does not contain "=".
    """
    # coerce the raw CLI string into the requested message value type
    if args.type in ["int", "i"]:
        v = int(args.value)
    elif args.type in ["float", "f"]:
        v = float(args.value)
    elif args.type in ["string", "str"]:
        v = str(args.value)
    else:
        raise Exception(f'Wrong type detected: {args.type}')

    meta = {}
    for m in args.meta:
        # split on the FIRST '=' only so values may themselves contain '='
        # (an unbounded split silently truncated such values)
        key, sep, value = m.partition('=')
        if not sep:
            raise Exception(f'Failed to parse meta {args.meta}: no "=" in {m}')
        meta[key] = value
    # fixed identity meta expected by the data pipeline
    meta.update({
        "node": "plugin",
        "vsn": "W000",
    })

    msg = wagglemsg.Message(
        name=args.topic,
        value=v,
        timestamp=get_timestamp(),
        meta=meta,
    )

    params = pika.ConnectionParameters(
        host=args.rabbitmq_host,
        port=args.rabbitmq_port,
        credentials=pika.PlainCredentials("plugin", "plugin"),
        retry_delay=60,
        socket_timeout=10.0,
    )
    conn = pika.BlockingConnection(params)
    ch = conn.channel()
    ch.basic_publish(
        "data.topic",
        args.topic,
        wagglemsg.dump(msg),
        properties=pika.BasicProperties(
            delivery_mode=2,  # persistent
            user_id="plugin",
        ),
    )
    print("message published")
    ch.close()
    conn.close()
def add_uptime_metrics(args, messages):
    """Append the node uptime (seconds) as a sys.uptime message."""
    logging.info("collecting uptime metrics")
    ts = time.time_ns()
    try:
        seconds_up = get_uptime_seconds()
        messages.append(
            message.Message(
                name="sys.uptime",
                value=seconds_up,
                timestamp=ts,
                meta={},
            )
        )
    except FileNotFoundError:
        # uptime source is missing on this host; warn but carry on
        logging.warning("could not access /host/proc/uptime")
    except Exception:
        logging.exception("failed to get uptime")
def getSystemPublishTestCases(self):
    """Build (input messages, expected messages) for system publisher tests."""
    messages = self.getCommonTestMessages()
    # we expect the same messages, but for system publishers we only want
    # sys meta tagged.
    # NOTE(sean) the order of meta is important. we should expect:
    # 1. sys meta overrides msg meta
    want_messages = []
    for msg in messages:
        combined = dict(msg.meta)
        combined.update(self.service.system_meta)
        want_messages.append(
            wagglemsg.Message(
                name=msg.name,
                value=msg.value,
                timestamp=msg.timestamp,
                meta=combined,
            )
        )
    return messages, want_messages
def add_version_metrics(args, messages):
    """Append the host OS version read from /host/etc/waggle_version_os."""
    logging.info("collecting version metrics")
    ts = time.time_ns()
    version_file = Path("/host/etc/waggle_version_os")
    try:
        os_version = version_file.read_text().strip()
        messages.append(
            message.Message(
                name="sys.version.os",
                value=os_version,
                timestamp=ts,
                meta={},
            )
        )
        logging.info("added os version")
    except FileNotFoundError:
        # version file is optional; absence is not an error
        logging.info("os version not found. skipping...")
    except Exception:
        logging.exception("failed to get os version")
def add_provision_metrics(args, messages):
    """Append the factory provision date if the provision log is complete.

    Args:
        args: all program arguments
        messages: the message queue to append metric to
    """
    logging.info("collecting system provision metrics")
    timestamp = time.time_ns()
    try:
        # check the last line is a complete factory provision log.
        # use [-1] (not [1]) so a single-line file — where rsplit returns a
        # one-element list — doesn't raise IndexError
        lastline = Path("/host/etc/waggle/factory_provision").read_text().strip().rsplit("\n", 1)[-1]
        if "Factory Provisioning Finish" in lastline:
            # drop the trailing ":<message>" portion, keeping the date prefix
            date = lastline.rsplit(":", 1)[0]
            messages.append(
                message.Message(
                    name="sys.provision.factory_date",
                    value=date,
                    timestamp=timestamp,
                    meta={},
                )
            )
            logging.info("added factory provision date")
    except FileNotFoundError:
        logging.info("factory provision not found, skipping...")
    except Exception:
        logging.exception("failed to get factory provision")
def main():
    """Continuously publish synthetic sensor/uptime messages to RabbitMQ.

    Connects using the CLI/env-configured credentials (TLS if a CA cert is
    given) and publishes one env.temperature.gen message plus three
    sys.uptime messages for distinct node ids, once per second, forever.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--rabbitmq_host", default=getenv("RABBITMQ_HOST", "localhost"))
    parser.add_argument("--rabbitmq_port", default=getenv("RABBITMQ_PORT", "5672"), type=int)
    parser.add_argument("--rabbitmq_username", default=getenv("RABBITMQ_USERNAME", ""))
    parser.add_argument("--rabbitmq_password", default=getenv("RABBITMQ_PASSWORD", ""))
    parser.add_argument("--rabbitmq_cacertfile", default=getenv("RABBITMQ_CACERTFILE", ""))
    parser.add_argument("--rabbitmq_certfile", default=getenv("RABBITMQ_CERTFILE", ""))
    parser.add_argument("--rabbitmq_keyfile", default=getenv("RABBITMQ_KEYFILE", ""))
    parser.add_argument("--rabbitmq_exchange", default=getenv("RABBITMQ_EXCHANGE", "waggle.msg"))
    args = parser.parse_args()

    if args.rabbitmq_username != "":
        credentials = pika.PlainCredentials(args.rabbitmq_username, args.rabbitmq_password)
    else:
        credentials = pika.credentials.ExternalCredentials()

    if args.rabbitmq_cacertfile != "":
        context = ssl.create_default_context(cafile=args.rabbitmq_cacertfile)
        # HACK this allows the host and baked in host to be configured independently
        context.check_hostname = False
        if args.rabbitmq_certfile != "":
            context.load_cert_chain(args.rabbitmq_certfile, args.rabbitmq_keyfile)
        ssl_options = pika.SSLOptions(context, args.rabbitmq_host)
    else:
        ssl_options = None

    params = pika.ConnectionParameters(
        host=args.rabbitmq_host,
        port=args.rabbitmq_port,
        credentials=credentials,
        ssl_options=ssl_options,
        retry_delay=60,
        socket_timeout=10.0,
    )
    conn = pika.BlockingConnection(params)
    ch = conn.channel()

    def publish(node, msg):
        # every message is published the same way: dumped and sent with a
        # user id derived from the originating node
        ch.basic_publish(
            # honor --rabbitmq_exchange; the flag was previously parsed but
            # ignored in favor of a hard-coded "waggle.msg" (same default)
            args.rabbitmq_exchange,
            routing_key="",
            properties=pika.BasicProperties(user_id=f"node-{node}"),
            body=message.dump(msg),
        )

    while True:
        publish("0000000000000001", message.Message(
            name="env.temperature.gen",
            timestamp=time.time_ns(),
            value=random.uniform(0.0, 5.0),
            meta={"node": "0000000000000001", "plugin": "metsense:1.0.2"},
        ))
        publish("0000000000000001", message.Message(
            name="sys.uptime",
            timestamp=time.time_ns(),
            value=time.time(),
            meta={"node": "0000000000000001", "plugin": "status:1.0.0"},
        ))
        publish("0000000000000002", message.Message(
            name="sys.uptime",
            timestamp=time.time_ns(),
            value=time.time() + 1.4,
            meta={"node": "0000000000000002", "plugin": "status:1.0.0"},
        ))
        publish("0000000000000003", message.Message(
            name="sys.uptime",
            timestamp=time.time_ns(),
            value=time.time() + 2.3,
            meta={"node": "0000000000000003", "plugin": "status:1.0.0"},
        ))
        time.sleep(1)
def add_system_metrics_tegra(args, messages):
    """Add system metrics gathered by the `tegrastats` subprocess

    Args:
        args: all program arguments
        messages: the message queue to append metric to
    """
    timestamp = time.time_ns()
    logging.info("collecting system metrics (tegra)")
    tegradata = None
    try:
        with subprocess.Popen(["tegrastats"], stdout=subprocess.PIPE) as process:
            # wait for 10 seconds to get tegrastats info
            # NOTE(review): Popen.__exit__ waits for the child to terminate;
            # confirm tegrastats is stopped so exit doesn't block — TODO confirm
            pollresults = select([process.stdout], [], [], 10)[0]
            if pollresults:
                output = pollresults[0].readline()
                if output:
                    # a single stats sample line from tegrastats
                    tegradata = output.strip().decode()
        if tegradata:
            # populate CPU frequency percentages
            ## ex. CPU [25%@652,15%@806,16%@880,31%@902,19%@960,38%@960]
            CPU_RE = re.compile(r"CPU \[(.*?)\]")
            cpudata = CPU_RE.search(tegradata)
            if cpudata:
                for idx, cpu_str in enumerate(cpudata.group(1).split(",")):
                    # "off" marks a powered-down core; no metric for it
                    if "off" == cpu_str:
                        continue
                    messages.append(
                        message.Message(
                            name="sys.freq.cpu_perc",
                            value=__val_freq(cpu_str)["perc"],
                            timestamp=timestamp,
                            meta={"cpu": str(idx)},
                        )
                    )
            # populate the GPU, EMC (external memory controller),
            # APE (audio processing engine), etc. freqency percentages
            ## ex. EMC_FREQ 1%@1600 GR3D_FREQ 0%@114 APE 150
            VALS_RE = re.compile(r"\b([A-Z0-9_]+) ([0-9%@]+)(?=[^/])\b")
            for name, val in re.findall(VALS_RE, tegradata):
                # strip the "_FREQ" suffix, e.g. "EMC_FREQ" -> "EMC"
                name = name.split("_")[0] if "FREQ" in name else name
                hz_data = __val_freq(val)
                # normalize to GPU names
                if name.lower() == "gr3d":
                    name = "gpu"
                if hz_data.get("perc", None) is not None:
                    messages.append(
                        message.Message(
                            name="sys.freq.{name}_perc".format(name=name.lower()),
                            value=hz_data["perc"],
                            timestamp=timestamp,
                            meta={},
                        )
                    )
                # ONLY for APE do we report current frequency as it can't
                # be found more accurate elsewhere
                if name == "APE" and hz_data.get("freq"):
                    messages.append(
                        message.Message(
                            name="sys.freq.{name}".format(name=name.lower()),
                            value=hz_data["freq"],
                            timestamp=timestamp,
                            meta={},
                        )
                    )
            # populate Wattage data (milliwatts)
            ## ex. VDD_IN 5071/4811 VDD_CPU_GPU_CV 1315/1066 VDD_SOC 1116/1116
            WATT_RE = re.compile(r"\b(\w+) ([0-9.]+)\/([0-9.]+)\b")
            for name, current, avg in re.findall(WATT_RE, tegradata):
                # first number is the current draw; the average (second) is unused.
                # NOTE(review): int() raises on values containing '.', which the
                # regex permits; such a line would abort via the except below —
                # confirm samples are always integral milliwatts
                messages.append(
                    message.Message(
                        name="sys.power",
                        value=int(current),
                        timestamp=timestamp,
                        meta={"name": name.lower()},
                    )
                )
        else:
            logging.info("tegrastats did not return any data. skipping...")
    except Exception:
        logging.exception("failed to get tegra system metrics")
def add_system_metrics_jetson_clocks(args, messages):
    """Add Jetson specific GPU and EMC frequency information to system metrics

    Args:
        args: all program arguments
        messages: the message queue to append metric to
    """
    timestamp = time.time_ns()
    logging.info("collecting system metrics (Jetson Clocks)")
    pdata = []
    try:
        with subprocess.Popen(["jetson_clocks", "--show"], stdout=subprocess.PIPE) as process:
            # wait for 10 seconds to get jetson_clocks info
            t_end = time.time() + 10
            while time.time() < t_end:
                # poll up to 2s per line so the overall deadline is honored
                pollresults = select([process.stdout], [], [], 2)[0]
                if pollresults:
                    output = pollresults[0].readline()
                    if output:
                        pdata.append(output.strip().decode())
                    else:
                        # empty read means EOF: jetson_clocks has finished
                        break
        if pdata:
            # populate the GPU and EMC min, max, current frequency
            GPU_RE = re.compile(r"GPU MinFreq=(\d+) MaxFreq=(\d+) CurrentFreq=(\d+)")
            EMC_RE = re.compile(r"EMC MinFreq=(\d+) MaxFreq=(\d+) CurrentFreq=(\d+)")
            for line in pdata:
                gpudata = GPU_RE.search(line)
                emcdata = EMC_RE.search(line)
                # name stays "" for lines matching neither pattern; those are skipped
                name = ""
                if gpudata:
                    name = "gpu"
                    freqdata = gpudata
                elif emcdata:
                    name = "emc"
                    freqdata = emcdata
                if name:
                    # capture groups: 1=min, 2=max, 3=current frequency
                    messages.append(
                        message.Message(
                            name="sys.freq.{name}_min".format(name=name.lower()),
                            value=int(freqdata.group(1)),
                            timestamp=timestamp,
                            meta={},
                        )
                    )
                    messages.append(
                        message.Message(
                            name="sys.freq.{name}_max".format(name=name.lower()),
                            value=int(freqdata.group(2)),
                            timestamp=timestamp,
                            meta={},
                        )
                    )
                    messages.append(
                        message.Message(
                            name="sys.freq.{name}".format(name=name.lower()),
                            value=int(freqdata.group(3)),
                            timestamp=timestamp,
                            meta={},
                        )
                    )
        else:
            logging.info("jetson_clocks did not return any data. skipping...")
    except Exception:
        logging.exception("failed to get jetson clock system metrics")