Esempio n. 1
0
async def delete_orphaned_disks(
    compute_client: aiogoogle.GoogleComputeClient,
    zones: List[str],
    inst_coll_manager: InstanceCollectionManager,
    namespace: str,
):
    """Delete disks labelled with ``namespace`` that look orphaned.

    Every disk listed in each of ``zones`` (filtered on the ``namespace``
    label) is deleted when one of the following holds:

    * ``inst_coll_manager`` has no instance for the disk's
      ``instance-name`` label;
    * the disk has never been attached and was created more than
      60 minutes ago;
    * the disk was detached more than 5 minutes ago.

    Deletion is best effort: a 404 response is ignored, any other failure
    is logged with its traceback and the sweep moves on to the next disk.
    ``asyncio.CancelledError`` is always propagated.

    Args:
        compute_client: Client used to list and delete disks.
        zones: Zone names to sweep.
        inst_coll_manager: Used to look up instances by name.
        namespace: Value of the ``namespace`` label to filter disks on.
    """
    log.info('deleting orphaned disks')

    max_unattached_msecs = 60 * 60 * 1000
    max_detached_msecs = 5 * 60 * 1000

    params = {'filter': f'(labels.namespace = {namespace})'}

    for zone in zones:
        async for disk in await compute_client.list(f'/zones/{zone}/disks',
                                                    params=params):
            disk_name = disk['name']
            instance_name = disk['labels']['instance-name']
            instance = inst_coll_manager.get_instance(instance_name)

            creation_timestamp_msecs = parse_timestamp_msecs(
                disk.get('creationTimestamp'))
            last_attach_timestamp_msecs = parse_timestamp_msecs(
                disk.get('lastAttachTimestamp'))
            last_detach_timestamp_msecs = parse_timestamp_msecs(
                disk.get('lastDetachTimestamp'))

            now_msecs = time_msecs()
            # These messages are emitted outside any exception handler, so
            # log.error is used (same level as log.exception, but without a
            # meaningless "NoneType: None" traceback).
            if instance is None:
                log.error(
                    f'deleting disk {disk_name} from instance that no longer exists'
                )
            elif (last_attach_timestamp_msecs is None
                  # NOTE(review): presumably parse_timestamp_msecs returns
                  # None for a missing field; without this guard the
                  # subtraction below would raise TypeError.
                  and creation_timestamp_msecs is not None
                  and now_msecs - creation_timestamp_msecs > max_unattached_msecs):
                log.error(
                    f'deleting disk {disk_name} that has not attached within 60 minutes'
                )
            elif (last_detach_timestamp_msecs is not None
                  and now_msecs - last_detach_timestamp_msecs > max_detached_msecs):
                log.error(
                    f'deleting detached disk {disk_name} that has not been cleaned up within 5 minutes'
                )
            else:
                continue

            try:
                await compute_client.delete_disk(
                    f'/zones/{zone}/disks/{disk_name}')
            except asyncio.CancelledError:
                raise
            except Exception as e:
                # A 404 is not reported (presumably the disk is already gone).
                if isinstance(e,
                              aiohttp.ClientResponseError) and e.status == 404:  # pylint: disable=no-member
                    continue
                log.exception(
                    f'error while deleting orphaned disk {disk_name}')
Esempio n. 2
0
    async def delete_orphaned_disks(self):
        """Delete disks in ``DEFAULT_NAMESPACE`` that look orphaned.

        Every disk listed in each zone of ``self.zone_monitor.zones``
        (filtered on the ``namespace`` label) is deleted when one of the
        following holds:

        * ``self.inst_coll_manager`` has no instance for the disk's
          ``instance-name`` label;
        * the disk has never been attached and was created more than
          60 minutes ago;
        * the disk was detached more than 5 minutes ago.

        An ``aiohttp.ClientResponseError`` raised by the delete call is
        ignored when its status is 404 and logged with its traceback
        otherwise; any other exception propagates to the caller.
        """
        log.info('deleting orphaned disks')

        max_unattached_msecs = 60 * 60 * 1000
        max_detached_msecs = 5 * 60 * 1000

        params = {'filter': f'(labels.namespace = {DEFAULT_NAMESPACE})'}

        for zone in self.zone_monitor.zones:
            async for disk in await self.compute_client.list(
                    f'/zones/{zone}/disks', params=params):
                disk_name = disk['name']
                instance_name = disk['labels']['instance-name']
                instance = self.inst_coll_manager.get_instance(instance_name)

                creation_timestamp_msecs = parse_timestamp_msecs(
                    disk.get('creationTimestamp'))
                last_attach_timestamp_msecs = parse_timestamp_msecs(
                    disk.get('lastAttachTimestamp'))
                last_detach_timestamp_msecs = parse_timestamp_msecs(
                    disk.get('lastDetachTimestamp'))

                now_msecs = time_msecs()
                # These messages are emitted outside any exception handler,
                # so log.error is used (same level as log.exception, but
                # without a meaningless "NoneType: None" traceback).
                if instance is None:
                    log.error(
                        f'deleting disk {disk_name} from instance that no longer exists'
                    )
                elif (last_attach_timestamp_msecs is None
                      # NOTE(review): presumably parse_timestamp_msecs
                      # returns None for a missing field; without this guard
                      # the subtraction below would raise TypeError.
                      and creation_timestamp_msecs is not None
                      and now_msecs - creation_timestamp_msecs > max_unattached_msecs):
                    log.error(
                        f'deleting disk {disk_name} that has not attached within 60 minutes'
                    )
                elif (last_detach_timestamp_msecs is not None
                      and now_msecs - last_detach_timestamp_msecs > max_detached_msecs):
                    log.error(
                        f'deleting detached disk {disk_name} that has not been cleaned up within 5 minutes'
                    )
                else:
                    continue

                try:
                    await self.compute_client.delete_disk(
                        f'/zones/{zone}/disks/{disk_name}')
                except aiohttp.ClientResponseError as e:
                    # A 404 is not reported (presumably the disk is already
                    # gone).
                    if e.status == 404:
                        continue
                    log.exception(
                        f'error while deleting orphaned disk {disk_name}')
Esempio n. 3
0
async def monitor_disks(app):
    """Refresh the ``DISK_SIZES_GB`` metric from the disks currently listed.

    Lists every disk matching the ``labels.batch = 1`` filter in each zone of
    ``app['zones']``, classifies it by lifecycle state and records its size
    (``sizeGb``) under the ``(zone, namespace, state)`` label set.

    The state is derived from which timestamps the disk carries:

    * no creation timestamp            -> ``'creating'``
    * created, never attached          -> ``'created'``
    * attached, never detached         -> ``'attached'``
    * attached and detached at least once -> ``'detached'``

    Args:
        app: Mapping providing ``'compute_client'`` and ``'zones'``.
    """
    log.info('monitoring disks')
    compute_client: aiogoogle.GoogleComputeClient = app['compute_client']

    disk_counts = defaultdict(list)

    for zone in app['zones']:
        async for disk in await compute_client.list(f'/zones/{zone}/disks',
                                                    params={
                                                        'filter':
                                                        '(labels.batch = 1)'
                                                    }):
            namespace = disk['labels']['namespace']
            size_gb = int(disk['sizeGb'])

            creation_timestamp_msecs = parse_timestamp_msecs(
                disk.get('creationTimestamp'))
            last_attach_timestamp_msecs = parse_timestamp_msecs(
                disk.get('lastAttachTimestamp'))
            last_detach_timestamp_msecs = parse_timestamp_msecs(
                disk.get('lastDetachTimestamp'))

            # The four cases below are exhaustive, so no 'unknown' fallback
            # is needed: once the attach timestamp is known to be present,
            # the detach timestamp is either missing or not.
            if creation_timestamp_msecs is None:
                state = 'creating'
            elif last_attach_timestamp_msecs is None:
                state = 'created'
            elif last_detach_timestamp_msecs is None:
                state = 'attached'
            else:
                state = 'detached'

            disk_labels = DiskLabels(zone=zone,
                                     namespace=namespace,
                                     state=state)
            disk_counts[disk_labels].append(size_gb)

    # The metric is only cleared after every zone has been listed, so a
    # failure while listing leaves the previously recorded values in place.
    DISK_SIZES_GB.clear()
    for labels, sizes in disk_counts.items():
        for size in sizes:
            DISK_SIZES_GB.labels(**labels._asdict()).observe(size)
Esempio n. 4
0
    async def handle_event(self, event):
        """Dispatch a single event dict to the matching handler.

        The event is dropped (with a log line) when it has no
        ``protoPayload``, is not for a ``gce_instance`` resource, has no
        ``operation``, or names a machine that does not start with
        ``self.machine_name_prefix``.

        Otherwise:

        * a completed ``v1.compute.instances.insert`` pushes its outcome
          (success unless the severity is ``ERROR``) to the zone success
          rate tracker;
        * ``compute.instances.preempted`` is forwarded to
          ``handle_preempt_event``;
        * ``v1.compute.instances.delete`` is forwarded to
          ``handle_call_delete_event`` when the operation started and to
          ``handle_delete_done_event`` when it completed.

        Events for instances not known to ``self.inst_coll_manager`` are
        ignored; an error is logged if the database has no record of the
        name either.
        """
        payload = event.get('protoPayload')
        if payload is None:
            log.warning(f'event has no payload {json.dumps(event)}')
            return

        timestamp_msecs = parse_timestamp_msecs(event['timestamp'])

        resource_type = event['resource']['type']
        if resource_type != 'gce_instance':
            log.warning(
                f'unknown event resource type {resource_type} {json.dumps(event)}'
            )
            return

        operation = event.get('operation')
        if operation is None:
            # occurs when deleting a worker that does not exist
            log.info(
                f'received an event with no operation {json.dumps(event)}')
            return

        # An operation flagged as 'first' marks its start; anything else is
        # treated as its completion.
        event_type = 'STARTED' if operation.get('first', False) else 'COMPLETED'

        event_subtype = payload['methodName']
        resource = event['resource']
        name = parse_resource_name(payload['resourceName'])['name']

        log.info(f'event {resource_type} {event_type} {event_subtype} {name}')

        if not name.startswith(self.machine_name_prefix):
            log.warning(f'event for unknown machine {name}')
            return

        if event_subtype == 'v1.compute.instances.insert':
            if event_type == 'COMPLETED':
                succeeded = event['severity'] != 'ERROR'
                operation_id = operation['id']
                self.zone_monitor.zone_success_rate.push(
                    resource['labels']['zone'], operation_id, succeeded)
            return

        instance = self.inst_coll_manager.get_instance(name)
        if not instance:
            known_row = await self.db.select_and_fetchone(
                'SELECT name FROM instances WHERE name = %s;', (name, ))
            if not known_row:
                log.error(
                    f'event for unknown instance {name}: {json.dumps(event)}'
                )
            return

        if event_subtype == 'compute.instances.preempted':
            log.info(f'event handler: handle preempt {instance}')
            await self.handle_preempt_event(instance, timestamp_msecs)
            return

        if event_subtype != 'v1.compute.instances.delete':
            return

        if event_type == 'COMPLETED':
            log.info(f'event handler: delete {instance} done')
            await self.handle_delete_done_event(instance, timestamp_msecs)
        elif event_type == 'STARTED':
            log.info(f'event handler: handle call delete {instance}')
            await self.handle_call_delete_event(instance, timestamp_msecs)
Esempio n. 5
0
async def handle_activity_log_event(
    event: Dict[str, Any],
    db: Database,
    inst_coll_manager: InstanceCollectionManager,
    zone_success_rate: ZoneSuccessRate,
    machine_name_prefix: str,
):
    """Apply a single activity-log event to the instance bookkeeping.

    The event is dropped (with a log line) when it has no ``protoPayload``,
    is not for a ``gce_instance`` resource, has no ``operation``, or names a
    machine that does not start with ``machine_name_prefix``.

    Otherwise:

    * a completed ``v1.compute.instances.insert`` pushes its outcome
      (success unless the severity is ``ERROR``) to ``zone_success_rate``;
    * ``compute.instances.preempted`` calls ``call_delete_instance`` on the
      instance's collection with reason ``'preempted'``;
    * ``v1.compute.instances.delete`` marks the instance deleted when the
      operation started and removes it from its collection when the
      operation completed.

    Events for instances not known to ``inst_coll_manager`` are ignored; an
    error is logged if ``db`` has no record of the name either.

    Args:
        event: The event as a dict.
        db: Database queried for instance names.
        inst_coll_manager: Used to look up instances by name.
        zone_success_rate: Receives the outcome of completed inserts.
        machine_name_prefix: Prefix that machine names must carry.
    """
    payload = event.get('protoPayload')
    if payload is None:
        log.warning(f'event has no payload {json.dumps(event)}')
        return

    timestamp_msecs = parse_timestamp_msecs(event['timestamp'])

    resource_type = event['resource']['type']
    if resource_type != 'gce_instance':
        log.warning(
            f'unknown event resource type {resource_type} {json.dumps(event)}')
        return

    operation = event.get('operation')
    if operation is None:
        # occurs when deleting a worker that does not exist
        log.info(f'received an event with no operation {json.dumps(event)}')
        return

    # An operation flagged as 'first' marks its start; anything else is
    # treated as its completion.
    event_type = 'STARTED' if operation.get('first', False) else 'COMPLETED'

    event_subtype = payload['methodName']
    resource = event['resource']
    name = parse_resource_name(payload['resourceName'])['name']

    log.info(f'event {resource_type} {event_type} {event_subtype} {name}')

    if not name.startswith(machine_name_prefix):
        log.warning(f'event for unknown machine {name}')
        return

    if event_subtype == 'v1.compute.instances.insert':
        if event_type == 'COMPLETED':
            succeeded = event['severity'] != 'ERROR'
            operation_id = operation['id']
            zone_success_rate.push(resource['labels']['zone'], operation_id,
                                   succeeded)
        return

    instance = inst_coll_manager.get_instance(name)
    if not instance:
        known_row = await db.select_and_fetchone(
            'SELECT name FROM instances WHERE name = %s;', (name, ))
        if not known_row:
            log.error(
                f'event for unknown instance {name}: {json.dumps(event)}')
        return

    if event_subtype == 'compute.instances.preempted':
        await instance.inst_coll.call_delete_instance(
            instance, 'preempted', timestamp=timestamp_msecs)
        return

    if event_subtype != 'v1.compute.instances.delete':
        return

    if event_type == 'COMPLETED':
        await instance.inst_coll.remove_instance(instance, 'deleted',
                                                 timestamp_msecs)
    elif event_type == 'STARTED':
        await instance.mark_deleted('deleted', timestamp_msecs)