async def setUp(self) -> None:
    self.mc_server = mock.MagicMock()
    self.mc_server.sensors = SensorSet()
    self.mc_server.orig_sensors = SensorSet()

    self.mc_server.sensors.add(Sensor(
        str, 'products', '',
        default='["product1", "product2"]',
        initial_status=Sensor.Status.NOMINAL))
    self.mc_server.sensors.add(Sensor(
        str, 'gui-urls', '',
        default=json.dumps(ROOT_GUI_URLS),
        initial_status=Sensor.Status.NOMINAL))
    self.mc_server.orig_sensors.add(Sensor(
        str, 'product1.gui-urls', '',
        default=json.dumps(PRODUCT1_GUI_URLS),
        initial_status=Sensor.Status.NOMINAL))
    self.mc_server.orig_sensors.add(Sensor(
        str, 'product2.cal.1.gui-urls', '',
        default=json.dumps(PRODUCT2_CAL_GUI_URLS),
        initial_status=Sensor.Status.NOMINAL))
    self.mc_server.orig_sensors.add(Sensor(
        str, 'product2.ingest.1.gui-urls', '',
        default=json.dumps(PRODUCT2_CAL_GUI_URLS),
        initial_status=Sensor.Status.UNKNOWN))

    for sensor in self.mc_server.orig_sensors.values():
        if self.haproxy_bind is not None and sensor.name.endswith('.gui-urls'):
            new_value = web.rewrite_gui_urls(EXTERNAL_URL, sensor)
            new_sensor = Sensor(sensor.stype, sensor.name, sensor.description, sensor.units)
            new_sensor.set_value(new_value, timestamp=sensor.timestamp, status=sensor.status)
            self.mc_server.sensors.add(new_sensor)
        else:
            self.mc_server.sensors.add(sensor)

    self.app = web.make_app(self.mc_server, self.haproxy_bind)
    self.server = TestServer(self.app)
    self.client = TestClient(self.server)
    await self.client.start_server()
    self.addCleanup(self.client.close)

    self.mc_server.add_interface_changed_callback.assert_called_once()
    self.dirty_set = self.mc_server.add_interface_changed_callback.mock_calls[0][1][0]
    self.dirty_set()
    await self.advance(1)
def __init__(self, sensor: aiokatcp.Sensor,
             value_metric: _LabelWrapper,
             status_metric: _LabelWrapper,
             label_values: Iterable[str]) -> None:
    self._sensor = sensor
    self._old_value = 0.0
    self._old_timestamp: Optional[float] = None
    self._value_metric_root = value_metric
    self._status_metric_root = status_metric
    self._value_metric: _Metric = value_metric.labels(*label_values)
    self._status_metric: _Metric = status_metric.labels(*label_values)
    self._label_values = tuple(label_values)
    sensor.attach(self)
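# Usage sketch (illustrative only): the constructor above is assumed to belong to
# an observer class, here called PrometheusObserver, and the metric/label names
# below are hypothetical. The _LabelWrapper arguments are typically
# prometheus_client metrics whose .labels(...) call selects the child metric that
# mirrors this particular sensor.
from prometheus_client import Gauge

value_gauge = Gauge('example_sensor', 'Mirror of the katcp sensor value', ['subarray'])
status_gauge = Gauge('example_sensor_status', 'katcp status of the sensor', ['subarray'])
example_sensor = aiokatcp.Sensor(float, 'example-sensor', 'Example sensor')
observer = PrometheusObserver(example_sensor, value_gauge, status_gauge, ['array_1'])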
def __init__(self, host: str, port: int, loop: asyncio.AbstractEventLoop,
             endpoints: List[Endpoint], flag_interface: Optional[str], flags_ibv: bool,
             chunk_store: katdal.chunkstore.ChunkStore,
             chunk_params: spead_write.ChunkParams,
             telstate: katsdptelstate.TelescopeState,
             input_name: str, output_name: str, rename_src: Mapping[str, str],
             s3_endpoint_url: Optional[str], max_workers: int, buffer_dumps: int) -> None:
    super().__init__(host, port, loop=loop)

    self._chunk_store = chunk_store
    self._telstate = telstate
    # track the status of each capture block we have seen to date
    self._capture_block_state = {}   # type: Dict[str, State]
    self._input_name = input_name
    self._output_name = output_name
    # rechunker group for each CBID
    self._flag_streams = {}          # type: Dict[str, RechunkerGroup]
    self._executor = ThreadPoolExecutor(max_workers=max_workers)

    self.sensors.add(Sensor(
        Status, "status", "The current status of the flag writer process."))
    self.sensors.add(Sensor(
        str, "capture-block-state",
        "JSON dict with the state of each capture block seen in this session.",
        default='{}', initial_status=Sensor.Status.NOMINAL))
    for sensor in spead_write.io_sensors():
        self.sensors.add(sensor)
    self.sensors.add(spead_write.device_status_sensor())

    telstate_input = telstate.view(input_name)
    in_chunks = spead_write.chunks_from_telstate(telstate_input)
    DATA_LOST = 1 << FLAG_NAMES.index('data_lost')
    self._arrays = [
        spead_write.make_array('flags', in_chunks, DATA_LOST, np.uint8, chunk_params)
    ]
    dump_size = sum(array.nbytes for array in self._arrays)
    self._executor_queue_space = QueueSpace(buffer_dumps * dump_size, loop=self.loop)

    spead_write.write_telstate(telstate, input_name, output_name, rename_src, s3_endpoint_url)

    rx = spead_write.make_receiver(
        endpoints, self._arrays,
        katsdpservices.get_interface_address(flag_interface), flags_ibv)
    self._writer = FlagWriter(self.sensors, rx, self)
    self._capture_task = loop.create_task(self._do_capture())
async def test_add_sensor(self) -> None:
    self.server.sensors.add(Sensor(int, 'another', 'another sensor', '', 234))
    # Rather than having the server send an interface-changed inform, we invoke
    # it directly on the client so that we don't need to worry about timing.
    changed = aiokatcp.Message.inform('interface-changed', b'sensor-list')
    self.client.handle_inform(changed)
    await self.client.wait_synced()
    self._check_sensors()
def __init__(self, logical_task, sdp_controller, subarray_product, capture_block_id):
    # Turn .status into a property that updates a sensor
    self._status = None
    super().__init__(logical_task)
    self.sdp_controller = sdp_controller
    self.subarray_product = subarray_product
    self.capture_block_id = capture_block_id     # Only useful for batch tasks
    self.logger = logging.LoggerAdapter(logger, dict(child_task=self.name))
    if capture_block_id is None:
        self.name = logical_task.name
    else:
        self.name = '.'.join([capture_block_id, logical_task.name])
    self.gui_urls = []
    # dict of exposed KATCP sensors. This excludes the state sensors, which
    # are present even when the process is not running.
    self.sensors = {}
    # Capture block names for CBs that haven't terminated on this node yet.
    # Names are used rather than the objects to reduce the number of cyclic
    # references.
    self._capture_blocks = set()
    # Event set to true whenever _capture_blocks is empty
    self._capture_blocks_empty = asyncio.Event()
    self._capture_blocks_empty.set()
    self._state_sensor = Sensor(
        scheduler.TaskState, self.name + '.state',
        "State of the state machine", "",
        default=self.state, initial_status=Sensor.Status.NOMINAL)
    self._mesos_state_sensor = Sensor(
        str, self.name + '.mesos-state', 'Mesos-reported task state', '')
    if self.logical_node.metadata_katcp_sensors:
        # Note: these sensors are added to the subarray product and not self
        # so that they don't get removed when the task dies. The sensors
        # themselves are created unconditionally because it avoids having to
        # make all the updates conditional.
        self.subarray_product.add_sensor(self._state_sensor)
        self.subarray_product.add_sensor(self._mesos_state_sensor)
    self.katcp_connection = None
    self.consul_services = []
    self.capture_block_state_observer = None
    self.device_status_observer = None
def __init__(self, host: str, port: int, loop: asyncio.AbstractEventLoop,
             endpoints: List[Endpoint], interface: Optional[str], ibv: bool,
             chunk_store: katdal.chunkstore.ChunkStore,
             chunk_params: spead_write.ChunkParams,
             telstate: katsdptelstate.TelescopeState,
             input_name: str, output_name: str, rename_src: Mapping[str, str],
             s3_endpoint_url: Optional[str], max_workers: int, buffer_dumps: int) -> None:
    super().__init__(host, port, loop=loop)
    self._endpoints = endpoints
    self._interface_address = katsdpservices.get_interface_address(interface)
    self._ibv = ibv
    self._chunk_store = chunk_store
    self._input_name = input_name
    self._output_name = output_name
    self._telstate = telstate
    self._rx = None              # type: Optional[spead2.recv.asyncio.Stream]
    self._max_workers = max_workers

    telstate_input = telstate.view(input_name)
    in_chunks = spead_write.chunks_from_telstate(telstate_input)
    DATA_LOST = 1 << FLAG_NAMES.index('data_lost')
    self._arrays = [
        spead_write.make_array('correlator_data', in_chunks, 0, np.complex64, chunk_params),
        spead_write.make_array('flags', in_chunks, DATA_LOST, np.uint8, chunk_params),
        spead_write.make_array('weights', in_chunks, 0, np.uint8, chunk_params),
        spead_write.make_array('weights_channel', in_chunks[:2], 0, np.float32, chunk_params)
    ]
    dump_size = sum(array.nbytes for array in self._arrays)
    self._buffer_size = buffer_dumps * dump_size

    spead_write.write_telstate(telstate, input_name, output_name, rename_src, s3_endpoint_url)

    self._capture_task = None    # type: Optional[asyncio.Task]
    self._n_substreams = len(in_chunks[1])

    self.sensors.add(Sensor(
        Status, 'status', 'The current status of the capture process',
        default=Status.IDLE, initial_status=Sensor.Status.NOMINAL,
        status_func=_status_status))
    for sensor in spead_write.io_sensors():
        self.sensors.add(sensor)
    self.sensors.add(spead_write.device_status_sensor())
def _add_sensors(sensors: SensorSet) -> None:
    sensors.add(Sensor(int, 'int-sensor', 'Integer sensor', 'frogs'))
    sensors.add(Sensor(float, 'float-sensor', 'Float sensor',
                       default=3.0, initial_status=Sensor.Status.NOMINAL))
    sensors.add(Sensor(float, 'histogram-sensor', 'Float sensor used for histogram'))
    sensors.add(Sensor(str, 'str-sensor', 'String sensor',
                       default='hello', initial_status=Sensor.Status.ERROR))
    sensors.add(Sensor(bytes, 'bytes-sensor', 'Raw bytes sensor'))
    sensors.add(Sensor(bool, 'bool-sensor', 'Boolean sensor'))
    sensors.add(Sensor(Address, 'address-sensor', 'Address sensor'))
    sensors.add(Sensor(MyEnum, 'enum-sensor', 'Enum sensor'))
    sensors['enum-sensor'].set_value(MyEnum.NO, timestamp=123456789)
async def resolve(self, resolver, graph, image_path=None):
    await super().resolve(resolver, graph, image_path)

    self.gui_urls = gui_urls = []
    for entry in self.logical_node.gui_urls:
        gui_urls.append({})
        for key, value in entry.items():
            if isinstance(value, str):
                gui_urls[-1][key] = value.format(self)
            else:
                gui_urls[-1][key] = value
    if gui_urls:
        gui_urls_sensor = Sensor(str, self.name + '.gui-urls', 'URLs for GUIs')
        gui_urls_sensor.set_value(json.dumps(gui_urls))
        self._add_sensor(gui_urls_sensor)

    for key, value in self.ports.items():
        endpoint_sensor = Sensor(
            aiokatcp.Address,
            '{}.{}'.format(self.name, key), 'IP endpoint for {}'.format(key))
        try:
            addrinfo = await asyncio.get_event_loop().getaddrinfo(self.host, value)
            host, port = addrinfo[0][4][:2]
            endpoint_sensor.set_value(aiokatcp.Address(ipaddress.ip_address(host), port))
        except socket.gaierror as error:
            self.logger.warning('Could not resolve %s: %s', self.host, error)
            endpoint_sensor.set_value(aiokatcp.Address(ipaddress.IPv4Address('0.0.0.0')),
                                      status=Sensor.Status.FAILURE)
        self._add_sensor(endpoint_sensor)

    # Provide info about which container this is for logspout to collect.
    labels = {
        'task': self.logical_node.name,
        'task_type': self.logical_node.task_type,
        'task_id': self.taskinfo.task_id.value,
        'subarray_product_id': self.subarray_product_id
    }
    if self.capture_block_id is not None:
        labels['capture_block_id'] = self.capture_block_id
    self.taskinfo.container.docker.setdefault('parameters', []).extend([
        {'key': 'label',
         'value': 'za.ac.kat.sdp.katsdpcontroller.{}={}'.format(key, value)}
        for (key, value) in labels.items()])

    # Set extra fields for SDP services to log to logstash
    if self.logical_node.katsdpservices_logging and 'KATSDP_LOG_GELF_ADDRESS' in os.environ:
        extras = {
            **json.loads(os.environ.get('KATSDP_LOG_GELF_EXTRA', '{}')),
            **labels,
            'docker.image': self.taskinfo.container.docker.image
        }
        env = {
            'KATSDP_LOG_GELF_ADDRESS': os.environ['KATSDP_LOG_GELF_ADDRESS'],
            'KATSDP_LOG_GELF_EXTRA': json.dumps(extras),
            'KATSDP_LOG_GELF_LOCALNAME': self.host,
            'LOGSPOUT': 'ignore'
        }
        self.taskinfo.command.environment.setdefault('variables', []).extend([
            {'name': key, 'value': value} for (key, value) in env.items()
        ])

    # Apply overrides to taskinfo given by the user
    overrides = resolver.service_overrides.get(
        self.logical_node.name, product_config.ServiceOverride()).taskinfo
    if overrides:
        self.logger.warning('Applying overrides to taskinfo of %s', self.name)
        self.taskinfo = Dict(product_config.override(self.taskinfo.to_dict(), overrides))

    # Add some useful sensors
    if self.logical_node.metadata_katcp_sensors:
        self._add_sensor(Sensor(
            str, self.name + '.version', "Image of executing container.", "",
            default=self.taskinfo.container.docker.image,
            initial_status=Sensor.Status.NOMINAL))
    self.sdp_controller.mass_inform('interface-changed', 'sensor-list')
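# Illustrative example (purely hypothetical values) of a logical_node.gui_urls
# entry consumed by resolve() above. String values are formatted against the
# physical task via value.format(self), so placeholders such as '{0.host}' and
# '{0.ports[...]}' are filled in at resolve time.
EXAMPLE_GUI_URLS = [{
    'title': 'Dashboard',
    'description': 'Example dashboard for {0.name}',
    'href': 'http://{0.host}:{0.ports[http]}/'
}]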
def test_add_sensor(self) -> None:
    # A non-Prometheus sensor, just to check that this doesn't break anything
    self.sensors.add(Sensor(int, 'another', 'another sensor', ''))
    self.sensors.add(Sensor(float, 'dynamic-sensor', 'dynamic sensor', ''))
    self.sensors['dynamic-sensor'].value = 345.0
    self._check_prom('test_dynamic_sensor', 345.0)
async def test_replace_sensor(self) -> None:
    self.server.sensors.add(Sensor(bool, 'int-sensor', 'Replaced by bool'))
    changed = aiokatcp.Message.inform('interface-changed', b'sensor-list')
    self.client.handle_inform(changed)
    await self.client.wait_synced()
    self._check_sensors()
def make_sensor(*args, **kwargs) -> Sensor:
    kwargs['auto_strategy'] = SensorSampler.Strategy.EVENT_RATE
    kwargs['auto_strategy_parameters'] = (0.05, 10.0)
    return Sensor(*args, **kwargs)
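# Example use of the helper above (sensor name and description are illustrative).
# EVENT_RATE with parameters (0.05, 10.0) means clients using the "auto" strategy
# see the sensor when it changes, but no more than once every 0.05 s and at
# least once every 10 s.
input_bytes = make_sensor(int, 'input-bytes-total',
                          'Number of payload bytes received in this session',
                          initial_status=Sensor.Status.NOMINAL)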
def device_status_sensor() -> Sensor:
    """Create a sensor to track device status"""
    return Sensor(DeviceStatus, 'device-status', 'Health sensor',
                  default=DeviceStatus.OK, initial_status=Sensor.Status.NOMINAL,
                  status_func=_device_status_status)
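# A minimal sketch of what the status_func above could look like; the actual
# _device_status_status implementation and the DeviceStatus members used here
# (OK, DEGRADED) are assumptions.
def _device_status_status(value: DeviceStatus) -> Sensor.Status:
    """Map a DeviceStatus value onto a katcp sensor status."""
    if value == DeviceStatus.OK:
        return Sensor.Status.NOMINAL
    elif value == DeviceStatus.DEGRADED:
        return Sensor.Status.WARN
    else:
        return Sensor.Status.ERROR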
class MetaWriterServer(DeviceServer):
    VERSION = "sdp-meta-writer-0.1"
    BUILD_STATE = "katsdpmetawriter-" + __version__

    def __init__(self, host, port, botocore_dict, rdb_path, telstate):
        self._botocore_dict = botocore_dict
        self._async_tasks = deque()
        self._rdb_path = rdb_path
        self._telstate = telstate

        self._build_state_sensor = Sensor(str, "build-state", "SDP Controller build state.")
        self._device_status_sensor = Sensor(
            DeviceStatus, "status", "The current status of the meta writer process")
        self._last_write_stream_sensor = Sensor(
            str, "last-write-stream", "The stream name of the last meta data dump.")
        self._last_write_cbid_sensor = Sensor(
            str, "last-write-cbid", "The capture block ID of the last meta data dump.")
        self._key_failures_sensor = Sensor(
            int, "key-failures",
            "Count of the number of failures to write a desired key to the RDB dump. "
            "(prometheus: counter)")
        self._last_transfer_rate = Sensor(
            float, "last-transfer-rate",
            "Rate of last data transfer to S3 endpoint in Bps. (prometheus: gauge)")
        self._last_dump_duration = Sensor(
            float, "last-dump-duration",
            "Time taken to write the last dump to disk. (prometheus: gauge)", "s")

        super().__init__(host, port)

        self._build_state_sensor.set_value(self.BUILD_STATE)
        self.sensors.add(self._build_state_sensor)
        self._device_status_sensor.set_value(DeviceStatus.IDLE)
        self.sensors.add(self._device_status_sensor)
        self.sensors.add(self._last_write_stream_sensor)
        self.sensors.add(self._last_write_cbid_sensor)
        self.sensors.add(self._last_transfer_rate)
        self.sensors.add(self._last_dump_duration)
        self._key_failures_sensor.set_value(0)
        self.sensors.add(self._key_failures_sensor)

    def _fail_if_busy(self):
        """Raise a FailReply if there are too many asynchronous operations in progress."""
        busy_tasks = 0
        for task in self._async_tasks:
            if not task.done():
                busy_tasks += 1
        if busy_tasks >= MAX_ASYNC_TASKS:
            raise FailReply(
                ('Meta-data writer has too many operations in progress (max {}). '
                 'Please wait for one to complete first.').format(MAX_ASYNC_TASKS))

    def _clear_async_task(self, future):
        """Clear the specified async task.

        Parameters
        ----------
        future : :class:`asyncio.Future`
            The task expected to be present in :attr:`_async_tasks`.
        """
        try:
            self._async_tasks.remove(future)
        except ValueError:
            pass
        if not self._async_tasks:
            self._device_status_sensor.set_value(DeviceStatus.IDLE)

    async def _write_meta(self, ctx, capture_block_id, stream_name, lite=True):
        """Write meta-data extracted from the current telstate object to a binary
        dump and place this in the currently connected S3 bucket for storage.
        """
        additional_name = "full." if not lite else ""
        dump_folder = os.path.join(self._rdb_path, capture_block_id)
        os.makedirs(dump_folder, exist_ok=True)
        basename = "{}_{}.{}rdb".format(capture_block_id, stream_name, additional_name)
        dump_filename = os.path.join(dump_folder, basename + '.uploading')

        st = timer()
        # Generate the local RDB dump and write it into S3 - note that
        # capture_block_id is used as the bucket name for storing meta-data
        # regardless of the stream selected. The full capture_block_stream_name
        # is used as the bucket for payload data for the particular stream.
        (rate_b, key_errors) = await _write_rdb(
            ctx, self._telstate, dump_filename, capture_block_id, stream_name,
            self._botocore_dict, basename, lite)
        et = timer()

        sensor_timestamp = time.time()
        self._last_write_stream_sensor.set_value(stream_name, timestamp=sensor_timestamp)
        self._last_write_cbid_sensor.set_value(capture_block_id, timestamp=sensor_timestamp)
        self._last_dump_duration.set_value(et - st, timestamp=sensor_timestamp)
        if key_errors > 0:
            self._key_failures_sensor.set_value(
                self._key_failures_sensor.value + key_errors, Sensor.Status.ERROR)

        if not rate_b:
            try:
                trawler_filename = os.path.join(dump_folder, basename)
                # Rename the file so that the trawler process can attempt the
                # S3 upload at a later date.
                os.rename(dump_filename, trawler_filename)
            except FileNotFoundError:
                msg = ("Failed to store RDB dump, and couldn't find file to rename. "
                       "This error cannot be recovered from.")
                logger.error(msg)
                raise FailReply(msg)
        else:
            logger.info("RDB file written to bucket %s with key %s",
                        capture_block_id, os.path.basename(dump_filename))
            try:
                os.remove(dump_filename)
            except Exception as e:
                # It won't interfere with the trawler, so we just continue.
                logger.warning("Failed to remove transferred RDB file %s. (%s)",
                               dump_filename, e)
        return rate_b

    async def write_meta(self, ctx, capture_block_id, streams, lite=True):
        """Implementation of request_write_meta."""
        rate_per_stream = {}
        for stream in streams:
            task = asyncio.ensure_future(
                self._write_meta(ctx, capture_block_id, stream, lite))
            self._device_status_sensor.set_value(DeviceStatus.QUEUED)
            # We risk queue depth expansion at this point, but we are really
            # only checking to prevent outrageous failures.
            self._async_tasks.append(task)
            try:
                rate_b = await task
            finally:
                self._clear_async_task(task)
            rate_per_stream[stream] = rate_b

        dump_folder = os.path.join(self._rdb_path, capture_block_id)
        if not lite and os.path.exists(dump_folder):
            # We treat writing the streams for a full meta dump as the
            # completion of meta-data for that particular capture block id
            # (assuming at least one stream was written).
            touch_file = os.path.join(dump_folder, "complete")
            pathlib.Path(touch_file).touch(exist_ok=True)
        return rate_per_stream

    async def request_write_meta(self, ctx, capture_block_id: str, lite: bool = True,
                                 stream_name: str = None) -> None:
        """Write a dump of a subset of the currently active telescope state to
        disk and optionally archive it to the preconfigured S3 endpoint.

        The precise subset is controlled through the selection of
        capture_block_id, stream_name and the lite boolean. The method may take
        some time, so it is run asynchronously.

        Parameters
        ----------
        capture_block_id : string
            The capture block id generated by the master controller to identify
            a specific data capture. Typically this will be an integer
            representing the start time of the observation in epoch seconds
            (+/- to allow for uniqueness if required).
        lite : bool, optional
            If True then a very limited subset of telstate keys are written to
            the dump, otherwise a 'full' dump is produced. Currently 'full' is
            the entire telescope state database, but in the future it may be
            restricted to meta-data relevant only to the chosen
            capture_block_id and stream_name.
        stream_name : string, optional
            The specific stream name to use in extracting stream-specific
            meta-data (e.g. sdp_l0). If no stream is specified, all sdp.vis
            streams with attached writers will be saved individually.
        """
        self._fail_if_busy()
        if not stream_name:
            streams = await self._telstate.get('sdp_archived_streams')
            if not streams:
                raise FailReply(
                    "No stream specified, and cannot determine available streams from telstate.")
            streams = [
                stream for stream in streams
                if await self._telstate.view(stream).get('stream_type') == 'sdp.vis'
            ]
        else:
            streams = [stream_name]
        ctx.inform(
            ("Starting write of {} metadata for CB: {} and Streams: {} to S3. "
             "This may take a minute or two...").format(
                 "lightweight" if lite else "full", capture_block_id, streams))
        rate_per_stream = await self.write_meta(ctx, capture_block_id, streams, lite)

        peak_rate = 0
        dump_type_name = "Lightweight" if lite else "Full dump"
        for stream, rate_b in rate_per_stream.items():
            if not rate_b:
                ctx.inform("{} meta-data for CB: {}_{} written to local disk only".format(
                    dump_type_name, capture_block_id, stream))
            else:
                ctx.inform("{} meta-data for CB: {}_{} written to S3 @ {:.2f}MBps".format(
                    dump_type_name, capture_block_id, stream, rate_b / 1e6))
                peak_rate = max(peak_rate, rate_b)
        if peak_rate > 0:
            self._last_transfer_rate.set_value(peak_rate)
def __init__(self, args: argparse.Namespace, loop: asyncio.AbstractEventLoop) -> None:
    CaptureServer.__init__(self, args, loop)
    aiokatcp.DeviceServer.__init__(self, args.host, args.port, loop=loop)
    sensors = [
        Sensor(int, "input-heaps-total",
               "Number of payload heaps received from CBF in this session "
               "(prometheus: counter)",
               initial_status=Sensor.Status.NOMINAL),
        Sensor(int, "input-bytes-total",
               "Number of payload bytes received from CBF in this session "
               "(prometheus: counter)",
               initial_status=Sensor.Status.NOMINAL),
        Sensor(int, "input-missing-heaps-total",
               "Number of heaps we expected but never saw "
               "(prometheus: counter)",
               initial_status=Sensor.Status.NOMINAL,
               status_func=_warn_if_positive),
        Sensor(int, "input-too-old-heaps-total",
               "Number of heaps rejected because they arrived too late "
               "(prometheus: counter)",
               initial_status=Sensor.Status.NOMINAL,
               status_func=_warn_if_positive),
        Sensor(int, "input-incomplete-heaps-total",
               "Number of heaps rejected due to missing packets "
               "(prometheus: counter)",
               initial_status=Sensor.Status.NOMINAL,
               status_func=_warn_if_positive),
        Sensor(int, "input-metadata-heaps-total",
               "Number of heaps that do not contain data "
               "(prometheus: counter)",
               initial_status=Sensor.Status.NOMINAL),
        Sensor(int, "input-bad-timestamp-heaps-total",
               "Number of heaps rejected due to bad timestamp "
               "(prometheus: counter)",
               initial_status=Sensor.Status.NOMINAL,
               status_func=_warn_if_positive),
        Sensor(int, "input-bad-channel-heaps-total",
               "Number of heaps rejected due to bad channel offset "
               "(prometheus: counter)",
               initial_status=Sensor.Status.NOMINAL,
               status_func=_warn_if_positive),
        Sensor(int, "input-bad-length-heaps-total",
               "Number of heaps rejected due to bad payload length "
               "(prometheus: counter)",
               initial_status=Sensor.Status.NOMINAL,
               status_func=_warn_if_positive),
        Sensor(int, "input-packets-total",
               "Total number of packets received (prometheus: counter)",
               initial_status=Sensor.Status.NOMINAL),
        Sensor(int, "input-batches-total",
               "Number of batches of packets processed (prometheus: counter)",
               initial_status=Sensor.Status.NOMINAL),
        Sensor(int, "input-max-batch",
               "Maximum number of packets processed in a batch (prometheus: gauge)",
               initial_status=Sensor.Status.NOMINAL)
    ]
    for sensor in sensors:
        self.sensors.add(sensor)
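# Plausible sketch of the status_func referenced above; the real _warn_if_positive
# may differ, but the intent is to flag any non-zero error counter as a warning.
def _warn_if_positive(value: int) -> Sensor.Status:
    return Sensor.Status.WARN if value > 0 else Sensor.Status.NOMINAL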