async def delete_orphaned_disks(
    compute_client: aiogoogle.GoogleComputeClient,
    zones: List[str],
    inst_coll_manager: InstanceCollectionManager,
    namespace: str,
):
    """Delete disks labelled with ``namespace`` that look orphaned.

    Every disk listed under ``/zones/{zone}/disks`` for each zone in
    ``zones`` is examined, and a disk is deleted when any of these holds:

    * ``inst_coll_manager`` has no instance named by the disk's
      ``instance-name`` label;
    * the disk has no attach timestamp and was created more than 60 minutes
      ago;
    * the disk has a detach timestamp more than 5 minutes old.

    A 404 response from the delete call is ignored; any other failure of
    the delete call is logged and the sweep moves on to the next disk.

    Args:
        compute_client: Client used to list and delete disks.
        zones: Zone names to scan.
        inst_coll_manager: Looked up by instance name to decide whether the
            disk's owning instance is still known.
        namespace: Value of the ``namespace`` label to filter disks on.

    Raises:
        asyncio.CancelledError: Re-raised if the delete call is cancelled.
    """
    log.info('deleting orphaned disks')

    params = {'filter': f'(labels.namespace = {namespace})'}

    for zone in zones:
        async for disk in await compute_client.list(f'/zones/{zone}/disks', params=params):
            disk_name = disk['name']
            instance_name = disk['labels']['instance-name']
            instance = inst_coll_manager.get_instance(instance_name)

            # Any of these may be None when the field is absent from the
            # listing (assumes parse_timestamp_msecs maps None to None, as
            # the `is None` checks below rely on).
            creation_timestamp_msecs = parse_timestamp_msecs(disk.get('creationTimestamp'))
            last_attach_timestamp_msecs = parse_timestamp_msecs(disk.get('lastAttachTimestamp'))
            last_detach_timestamp_msecs = parse_timestamp_msecs(disk.get('lastDetachTimestamp'))

            now_msecs = time_msecs()

            # log.error rather than log.exception: no exception is being
            # handled here, so there is no traceback to attach.
            if instance is None:
                log.error(f'deleting disk {disk_name} from instance that no longer exists')
            elif (
                last_attach_timestamp_msecs is None
                # Without a creation time the disk's age is unknown; skip it
                # instead of failing the whole sweep on `int - None`.
                and creation_timestamp_msecs is not None
                and now_msecs - creation_timestamp_msecs > 60 * 60 * 1000
            ):
                log.error(f'deleting disk {disk_name} that has not attached within 60 minutes')
            elif (
                last_detach_timestamp_msecs is not None
                and now_msecs - last_detach_timestamp_msecs > 5 * 60 * 1000
            ):
                log.error(f'deleting detached disk {disk_name} that has not been cleaned up within 5 minutes')
            else:
                continue

            try:
                await compute_client.delete_disk(f'/zones/{zone}/disks/{disk_name}')
            except asyncio.CancelledError:
                raise
            except Exception as e:
                # A 404 means the disk is already gone, which is the goal.
                if isinstance(e, aiohttp.ClientResponseError) and e.status == 404:  # pylint: disable=no-member
                    continue
                log.exception(f'error while deleting orphaned disk {disk_name}')
async def delete_orphaned_disks(self):
    """Delete disks labelled with ``DEFAULT_NAMESPACE`` that look orphaned.

    Every disk listed under ``/zones/{zone}/disks`` for each zone in
    ``self.zone_monitor.zones`` is examined, and a disk is deleted when any
    of these holds:

    * ``self.inst_coll_manager`` has no instance named by the disk's
      ``instance-name`` label;
    * the disk has no attach timestamp and was created more than 60 minutes
      ago;
    * the disk has a detach timestamp more than 5 minutes old.

    An ``aiohttp.ClientResponseError`` with status 404 from the delete call
    is ignored; other ``ClientResponseError`` failures are logged and the
    sweep continues. Exceptions of any other type propagate to the caller.
    """
    log.info('deleting orphaned disks')

    params = {'filter': f'(labels.namespace = {DEFAULT_NAMESPACE})'}

    for zone in self.zone_monitor.zones:
        async for disk in await self.compute_client.list(f'/zones/{zone}/disks', params=params):
            disk_name = disk['name']
            instance_name = disk['labels']['instance-name']
            instance = self.inst_coll_manager.get_instance(instance_name)

            # Any of these may be None when the field is absent from the
            # listing (assumes parse_timestamp_msecs maps None to None, as
            # the `is None` checks below rely on).
            creation_timestamp_msecs = parse_timestamp_msecs(disk.get('creationTimestamp'))
            last_attach_timestamp_msecs = parse_timestamp_msecs(disk.get('lastAttachTimestamp'))
            last_detach_timestamp_msecs = parse_timestamp_msecs(disk.get('lastDetachTimestamp'))

            now_msecs = time_msecs()

            # log.error rather than log.exception: no exception is being
            # handled here, so there is no traceback to attach.
            if instance is None:
                log.error(f'deleting disk {disk_name} from instance that no longer exists')
            elif (
                last_attach_timestamp_msecs is None
                # Without a creation time the disk's age is unknown; skip it
                # instead of failing the whole sweep on `int - None`.
                and creation_timestamp_msecs is not None
                and now_msecs - creation_timestamp_msecs > 60 * 60 * 1000
            ):
                log.error(f'deleting disk {disk_name} that has not attached within 60 minutes')
            elif (
                last_detach_timestamp_msecs is not None
                and now_msecs - last_detach_timestamp_msecs > 5 * 60 * 1000
            ):
                log.error(f'deleting detached disk {disk_name} that has not been cleaned up within 5 minutes')
            else:
                continue

            try:
                await self.compute_client.delete_disk(f'/zones/{zone}/disks/{disk_name}')
            except aiohttp.ClientResponseError as e:
                # A 404 means the disk is already gone, which is the goal.
                if e.status == 404:
                    continue
                log.exception(f'error while deleting orphaned disk {disk_name}')
async def monitor_disks(app):
    """Rebuild the ``DISK_SIZES_GB`` metric from the current disk listing.

    Lists the disks matching ``(labels.batch = 1)`` in every zone of
    ``app['zones']`` using ``app['compute_client']``, classifies each disk
    by which of its timestamps are present, and records its size in GB
    under a ``DiskLabels(zone, namespace, state)`` label set. The metric is
    cleared only after all zones have been listed, then repopulated with
    one observation per disk.

    States, decided in this order:

    * ``creating`` - no creation timestamp;
    * ``created``  - created but no attach timestamp;
    * ``attached`` - attach timestamp but no detach timestamp;
    * ``detached`` - both attach and detach timestamps present.

    Args:
        app: Mapping providing ``'compute_client'`` and ``'zones'``.
    """
    log.info('monitoring disks')

    compute_client: aiogoogle.GoogleComputeClient = app['compute_client']

    disk_counts = defaultdict(list)

    for zone in app['zones']:
        async for disk in await compute_client.list(f'/zones/{zone}/disks', params={'filter': '(labels.batch = 1)'}):
            namespace = disk['labels']['namespace']
            size_gb = int(disk['sizeGb'])

            creation_timestamp_msecs = parse_timestamp_msecs(disk.get('creationTimestamp'))
            last_attach_timestamp_msecs = parse_timestamp_msecs(disk.get('lastAttachTimestamp'))
            last_detach_timestamp_msecs = parse_timestamp_msecs(disk.get('lastDetachTimestamp'))

            # The four cases are exhaustive, so no "unknown" fallback is
            # needed: each branch only runs when the earlier checks failed.
            if creation_timestamp_msecs is None:
                state = 'creating'
            elif last_attach_timestamp_msecs is None:
                state = 'created'
            elif last_detach_timestamp_msecs is None:
                state = 'attached'
            else:
                state = 'detached'

            disk_labels = DiskLabels(zone=zone, namespace=namespace, state=state)
            disk_counts[disk_labels].append(size_gb)

    # Clear only after listing succeeded for every zone, so a failure
    # part-way through the listing leaves the previous values in place.
    DISK_SIZES_GB.clear()
    for labels, sizes in disk_counts.items():
        for size in sizes:
            DISK_SIZES_GB.labels(**labels._asdict()).observe(size)
async def handle_event(self, event):
    """Process a single activity-log event describing a compute instance.

    Events are dropped (with a log line) when they carry no
    ``protoPayload``, concern a resource type other than ``gce_instance``,
    carry no ``operation``, or name a machine that does not start with
    ``self.machine_name_prefix``.

    Otherwise the payload's ``methodName`` selects the action:

    * ``v1.compute.instances.insert`` - on completion, push the outcome
      (anything but severity ``ERROR`` counts as success) for the
      resource's zone into ``self.zone_monitor.zone_success_rate``;
    * ``compute.instances.preempted`` - ``self.handle_preempt_event``;
    * ``v1.compute.instances.delete`` - ``self.handle_call_delete_event``
      when the operation starts, ``self.handle_delete_done_event`` when it
      completes.

    Args:
        event: Decoded log entry (a dict).
    """
    payload = event.get('protoPayload')
    if payload is None:
        log.warning(f'event has no payload {json.dumps(event)}')
        return

    timestamp_msecs = parse_timestamp_msecs(event['timestamp'])

    resource_type = event['resource']['type']
    if resource_type != 'gce_instance':
        log.warning(f'unknown event resource type {resource_type} {json.dumps(event)}')
        return

    operation = event.get('operation')
    if operation is None:
        # occurs when deleting a worker that does not exist
        log.info(f'received an event with no operation {json.dumps(event)}')
        return

    # An operation flagged 'first' is the start of a call; anything else is
    # treated as its completion.
    event_type = 'STARTED' if operation.get('first', False) else 'COMPLETED'
    event_subtype = payload['methodName']
    resource = event['resource']
    name = parse_resource_name(payload['resourceName'])['name']

    log.info(f'event {resource_type} {event_type} {event_subtype} {name}')

    if not name.startswith(self.machine_name_prefix):
        log.warning(f'event for unknown machine {name}')
        return

    if event_subtype == 'v1.compute.instances.insert':
        if event_type == 'COMPLETED':
            severity = event['severity']
            operation_id = operation['id']
            self.zone_monitor.zone_success_rate.push(
                resource['labels']['zone'], operation_id, severity != 'ERROR')
        return

    instance = self.inst_coll_manager.get_instance(name)
    if not instance:
        # Nothing to act on without a tracked instance; only complain when
        # the database has never heard of it either.
        record = await self.db.select_and_fetchone(
            'SELECT name FROM instances WHERE name = %s;', (name, ))
        if not record:
            log.error(f'event for unknown instance {name}: {json.dumps(event)}')
        return

    if event_subtype == 'compute.instances.preempted':
        log.info(f'event handler: handle preempt {instance}')
        await self.handle_preempt_event(instance, timestamp_msecs)
        return

    if event_subtype != 'v1.compute.instances.delete':
        return

    # event_type is always either 'COMPLETED' or 'STARTED' at this point.
    if event_type == 'COMPLETED':
        log.info(f'event handler: delete {instance} done')
        await self.handle_delete_done_event(instance, timestamp_msecs)
    else:
        log.info(f'event handler: handle call delete {instance}')
        await self.handle_call_delete_event(instance, timestamp_msecs)
async def handle_activity_log_event(
    event: Dict[str, Any],
    db: Database,
    inst_coll_manager: InstanceCollectionManager,
    zone_success_rate: ZoneSuccessRate,
    machine_name_prefix: str,
):
    """Process a single activity-log event describing a compute instance.

    Events are dropped (with a log line) when they carry no
    ``protoPayload``, concern a resource type other than ``gce_instance``,
    carry no ``operation``, or name a machine that does not start with
    ``machine_name_prefix``.

    Otherwise the payload's ``methodName`` selects the action:

    * ``v1.compute.instances.insert`` - on completion, push the outcome
      (anything but severity ``ERROR`` counts as success) for the
      resource's zone into ``zone_success_rate``;
    * ``compute.instances.preempted`` - ask the instance's collection to
      delete it with reason ``'preempted'``;
    * ``v1.compute.instances.delete`` - mark the instance deleted when the
      operation starts, remove it from its collection when it completes.

    Args:
        event: Decoded log entry.
        db: Queried for the instance name when the manager does not know it.
        inst_coll_manager: Source of tracked instances, looked up by name.
        zone_success_rate: Receives the outcome of completed inserts.
        machine_name_prefix: Prefix a machine name must have to be handled.
    """
    payload = event.get('protoPayload')
    if payload is None:
        log.warning(f'event has no payload {json.dumps(event)}')
        return

    timestamp_msecs = parse_timestamp_msecs(event['timestamp'])

    resource_type = event['resource']['type']
    if resource_type != 'gce_instance':
        log.warning(f'unknown event resource type {resource_type} {json.dumps(event)}')
        return

    operation = event.get('operation')
    if operation is None:
        # occurs when deleting a worker that does not exist
        log.info(f'received an event with no operation {json.dumps(event)}')
        return

    # An operation flagged 'first' is the start of a call; anything else is
    # treated as its completion.
    event_type = 'STARTED' if operation.get('first', False) else 'COMPLETED'
    event_subtype = payload['methodName']
    resource = event['resource']
    name = parse_resource_name(payload['resourceName'])['name']

    log.info(f'event {resource_type} {event_type} {event_subtype} {name}')

    if not name.startswith(machine_name_prefix):
        log.warning(f'event for unknown machine {name}')
        return

    if event_subtype == 'v1.compute.instances.insert':
        if event_type == 'COMPLETED':
            severity = event['severity']
            operation_id = operation['id']
            zone_success_rate.push(resource['labels']['zone'], operation_id, severity != 'ERROR')
        return

    instance = inst_coll_manager.get_instance(name)
    if not instance:
        # Nothing to act on without a tracked instance; only complain when
        # the database has never heard of it either.
        record = await db.select_and_fetchone(
            'SELECT name FROM instances WHERE name = %s;', (name, ))
        if not record:
            log.error(f'event for unknown instance {name}: {json.dumps(event)}')
        return

    if event_subtype == 'compute.instances.preempted':
        await instance.inst_coll.call_delete_instance(instance, 'preempted', timestamp=timestamp_msecs)
        return

    if event_subtype != 'v1.compute.instances.delete':
        return

    # event_type is always either 'COMPLETED' or 'STARTED' at this point.
    if event_type == 'COMPLETED':
        await instance.inst_coll.remove_instance(instance, 'deleted', timestamp_msecs)
    else:
        await instance.mark_deleted('deleted', timestamp_msecs)