Code example #1
0
    def task_log_error(self, task_id, obj=None, msg='', detail='', **kwargs):
        """Write an error entry into the task log, defaulting obj to this DC's server object."""
        from api.task.utils import task_log_error

        # Fall back to the DC-wide server object when no explicit target is given
        target = obj if obj is not None else self.server_class(self.dc)
        task_log_error(task_id, msg, obj=target, detail=detail, **kwargs)
Code example #2
0
def _delete_oldest(model, define, view_function, view_item, task_id, msg):
    """
    Helper for finding oldest snapshots/backups and running DELETE view_function().

    @type model: django.db.models.Model
    @param define: SnapshotDefine/BackupDefine-like object whose retention is enforced
    @param view_function: API view called with method DELETE to remove the oldest items
    @param view_item: name of the request data key that carries the list of names
    @param task_id: task ID used for the task log entry when the DELETE call fails
    @param msg: task log message used when the DELETE call fails
    @return: response from call_api_view(), or None when nothing exceeds retention
    """
    vm = define.vm
    # TODO: check indexes
    # noinspection PyUnresolvedReferences
    total = model.objects.filter(vm=vm,
                                 disk_id=define.disk_id,
                                 define=define,
                                 status=model.OK).count()
    to_delete = total - define.retention

    if to_delete < 1:
        return None

    # List of snapshot or backup names to delete TODO: check indexes
    # noinspection PyUnresolvedReferences
    oldest = model.objects.filter(vm=vm, disk_id=define.disk_id, define=define, status=model.OK)\
        .values_list('name', flat=True).order_by('id')[:to_delete]
    view_name = view_function.__name__
    view_data = {'disk_id': define.array_disk_id, view_item: tuple(oldest)}
    request = get_dummy_request(vm.dc, method='DELETE', system_user=True)
    request.define_id = define.id  # Automatic task
    # Go!
    logger.info('Running DELETE %s(%s, %s), because %s>%s', view_name, vm,
                view_data, total, define.retention)
    res = call_api_view(request,
                        'DELETE',
                        view_function,
                        vm.hostname,
                        data=view_data)

    if res.status_code in (200, 201):
        # Success is logged with info (not the deprecated logger.warn the original
        # used) - a successful cleanup is not a warning, and the sibling beat tasks
        # in this module log their success paths with logger.info as well.
        logger.info('DELETE %s(%s, %s) was successful: %s', view_name, vm,
                    view_data, res.data)
    else:
        logger.error('Running DELETE %s(%s, %s) failed: %s (%s): %s',
                     view_name, vm, view_data, res.status_code,
                     res.status_text, res.data)
        MonitoringBackend.vm_send_alert(
            vm, 'Automatic deletion of old %ss %s/disk-%s failed to start.' %
            (model.__name__.lower(), vm.hostname, define.array_disk_id))
        # Need to log this, because nobody else does (+ there is no PENDING task)
        detail = 'hostname=%s, %s=%s, disk_id=%s, Error: %s' % (
            vm.hostname, view_item, ','.join(oldest), define.array_disk_id,
            get_task_error_message(res.data))
        task_log_error(task_id,
                       msg,
                       vm=vm,
                       detail=detail,
                       update_user_tasks=False)

    return res
Code example #3
0
def vm_snapshot_beat(snap_define_id):
    """
    This is a periodic beat task. Run POST vm_snapshot according to snapshot definition.
    """
    from api.vm.snapshot.views import vm_snapshot

    snap_define = SnapshotDefine.objects.get(id=snap_define_id)
    snap_name = snap_define.generate_snapshot_name()
    vm = snap_define.vm
    disk_id = snap_define.array_disk_id
    request = get_dummy_request(vm.dc, method='POST', system_user=True)
    request.define_id = snap_define.id  # Automatic task
    request_data = {
        'disk_id': disk_id,
        'fsfreeze': snap_define.fsfreeze
    }
    # Go!
    response = call_api_view(request, 'POST', vm_snapshot, vm.hostname,
                             snap_name, data=request_data)

    if response.status_code == 201:
        logger.info(
            'POST vm_snapshot(%s, %s, {disk_id=%s}) was successful: %s', vm,
            snap_name, disk_id, response.data)
        return

    # Need to log this, because nobody else does (+ there is no PENDING task)
    detail = 'snapname=%s, disk_id=%s, type=%s. Error: %s' % (
        snap_name, disk_id, Snapshot.AUTO,
        get_task_error_message(response.data))
    task_log_error(task_id_from_task_id(vm_snapshot_beat.request.id,
                                        dc_id=vm.dc.id),
                   LOG_SNAP_CREATE,
                   vm=vm,
                   detail=detail,
                   update_user_tasks=False)

    if response.status_code == HTTP_423_LOCKED:
        # VM is locked - transient condition, no alert needed
        logger.warning(
            'Running POST vm_snapshot(%s, %s, {disk_id=%s}) failed: %s (%s): %s',
            vm, snap_name, disk_id, response.status_code,
            response.status_text, response.data)
    else:
        logger.error(
            'Running POST vm_snapshot(%s, %s, {disk_id=%s}) failed: %s (%s): %s',
            vm, snap_name, disk_id, response.status_code,
            response.status_text, response.data)
        MonitoringBackend.vm_send_alert(
            vm, 'Automatic snapshot %s/disk-%s@%s failed to start.' %
            (vm.hostname, disk_id, snap_define.name))
Code example #4
0
def vm_backup_beat(bkp_define_id):
    """
    This is a periodic beat task. Run POST vm_backup according to backup definition.
    """
    from api.vm.backup.views import vm_backup

    bkp_define = BackupDefine.objects.get(id=bkp_define_id)
    vm = bkp_define.vm
    disk_id = bkp_define.array_disk_id
    defname = bkp_define.name
    request = get_dummy_request(vm.dc, method='POST', system_user=True)
    request.define_id = bkp_define.id  # Automatic task
    request_data = {
        'disk_id': disk_id,
        'fsfreeze': bkp_define.fsfreeze
    }
    # Go!
    response = call_api_view(request, 'POST', vm_backup, vm.hostname,
                             defname, data=request_data)

    if response.status_code == 201:
        logger.info('POST vm_backup(%s, %s, {disk_id=%s}) was successful: %s',
                    vm, defname, disk_id, response.data)
        return

    # Need to log this, because nobody else does (+ there is no PENDING task)
    detail = 'hostname=%s, bkpname=%s, disk_id=%s, Error: %s' % (
        vm.hostname, bkp_define.generate_backup_name(), disk_id,
        get_task_error_message(response.data))
    task_log_error(task_id_from_task_id(vm_backup_beat.request.id,
                                        dc_id=vm.dc.id),
                   LOG_BKP_CREATE,
                   vm=vm,
                   detail=detail,
                   update_user_tasks=False)

    if response.status_code == HTTP_423_LOCKED:
        # VM is locked - transient condition, no alert needed
        logger.warning(
            'Running POST vm_backup(%s, %s, {disk_id=%s}) failed: %s (%s): %s',
            vm, defname, disk_id, response.status_code, response.status_text,
            response.data)
    else:
        logger.error(
            'Running POST vm_backup(%s, %s, {disk_id=%s}) failed: %s (%s): %s',
            vm, defname, disk_id, response.status_code, response.status_text,
            response.data)
        Zabbix.vm_send_alert(
            vm, 'Automatic backup %s/disk-%s@%s failed to start.' %
            (vm.hostname, disk_id, defname))
Code example #5
0
def node_authorized_keys_sync_cb(result, task_id, node_uuid=None):
    """
    Callback for run_node_authorized_keys_sync().
    """
    node = Node.objects.get(uuid=node_uuid)

    if result['returncode'] != 0:
        # Sync command failed - record the error in the task log
        result['message'] = 'Compute node SSH key sync error - got bad return code (%s). Error: %s' % \
                            (result['returncode'], result.get('stderr', ''))
        task_log_error(task_id, msg=LOG_NODE_UPDATE, obj=node, task_result=result, update_user_tasks=False)
    else:
        # Command output is the fresh authorized_keys content
        node.save_authorized_keys(result['stdout'])

    return result
Code example #6
0
File: tasks.py — Project: cgvarela/esdc-ce
def node_sysinfo_cb(result, task_id, node_uuid=None):
    """
    A callback function for updating Node.json (sysinfo).

    node_uuid will be set only if called via API or GUI

    Parses esysinfo output from a compute node and either creates a new Node
    object (first contact) or updates an existing one; may also trigger SSH
    key sync, image source sync and VM status/snapshot synchronization.
    Raises TaskException on a bad return code or unparseable output.
    """
    # in case the callback is called by restarting erigonesd:fast service on compute node, the meta dict lacks
    # a lot of information; msg is required as part of exception logging inside callback decorator
    # therefore we set it explicitly
    result['meta']['msg'] = LOG_NODE_UPDATE

    # Nonzero return code means the esysinfo command itself failed - abort
    if result['returncode'] != 0:
        logger.error(
            'Found nonzero return code in result from esysinfo command on %s',
            node_uuid)
        raise TaskException(result,
                            'Got bad return code (%s)' % result['returncode'])

    # Remove the raw command output from the result before it gets logged/stored
    stdout = result.pop('stdout', '')
    result.pop('stderr', None)
    node_new = False

    try:
        esysinfo = parse_esysinfo(stdout)
        img_sources = esysinfo.pop('img_sources')
        img_initial = esysinfo.pop('img_initial')
    except Exception as e:
        logger.error(
            'Could not parse output from esysinfo command on %s. Error: %s',
            node_uuid, e)
        logger.exception(e)
        raise TaskException(result, 'Could not parse esysinfo output')
    else:
        # The node's real UUID comes from the parsed sysinfo, not node_uuid
        uuid = esysinfo['sysinfo']['UUID']

    try:
        node = Node.objects.get(uuid=uuid)
    except Node.DoesNotExist:
        # The head node must be in online state during the admin DC initialization and each compute node must be in
        # online state during ssh key exchange.
        node_new = True
        # First node ever registered becomes the head node
        is_head = not Node.objects.exists()
        logger.warn('Creating NEW node from sysinfo output from %s', node_uuid)
        node = Node.create_from_sysinfo(uuid,
                                        esysinfo,
                                        status=Node.ONLINE,
                                        is_head=is_head)
        node_created.send(task_id, node=node)  # Signal!
        result[
            'message'] = 'Successfully created new compute node %s' % node.hostname
        task_log_success(task_id,
                         msg=LOG_NODE_CREATE,
                         obj=node,
                         task_result=result,
                         update_user_tasks=True)
        # A new node with a public SSH key needs authorized_keys sync below
        sshkey_changed = bool(node.sshkey)

        if node.is_head:
            logger.warn(
                'New node %s is the first node ever created - assuming head node status. '
                'Initializing mgmt system and creating admin DC', node)
            from api.system.init import init_mgmt
            try:
                init_mgmt(node, images=img_initial)
            except Exception as e:
                # init_mgmt failure is logged, but node creation still stands
                logger.exception(e)
                result[
                    'message'] = 'Error while initializing admin datacenter (%s)' % e
                task_log_error(task_id,
                               msg=LOG_NODE_CREATE,
                               obj=node,
                               task_result=result,
                               update_user_tasks=True)

        logger.info('Saving node %s IP address "%s" into admin network', node,
                    node.ip_address)
        try:  # We should proceed even if the IP address is not registered
            node.ip_address.save()
        except Exception as e:
            logger.exception(e)
        else:
            admin_net = node.ip_address.subnet  # The network was updated by init_mgmt()
            # Reload Subnet object because it is cached inside node instance
            admin_net = admin_net.__class__.objects.get(pk=admin_net.pk)
            # We need a request object
            request = get_dummy_request(DefaultDc(), 'POST', system_user=True)
            record_cls = RecordView.Record

            if admin_net.dns_domain and admin_net.dns_domain == node.domain_name:
                logger.info('Creating forward A DNS record for node %s', node)
                # This will fail silently
                RecordView.add_or_update_record(request,
                                                record_cls.A,
                                                admin_net.dns_domain,
                                                node.hostname,
                                                node.address,
                                                task_id=task_id,
                                                related_obj=node)

            if admin_net.ptr_domain:
                logger.info('Creating reverse PTR DNS record for node %s',
                            node)
                # This will fail silently
                RecordView.add_or_update_record(request,
                                                record_cls.PTR,
                                                admin_net.ptr_domain,
                                                record_cls.get_reverse(
                                                    node.address),
                                                node.hostname,
                                                task_id=task_id,
                                                related_obj=node)

    else:
        # Existing node - update its json only when sysinfo or SSH key changed
        sshkey_changed = node.sshkey_changed(esysinfo)

        if node.sysinfo_changed(esysinfo) or sshkey_changed:
            logger.warn('Updating node %s json with sysinfo output from %s',
                        node, node_uuid)
            node.update_from_sysinfo(esysinfo)  # Will save public SSH key too
            node_json_changed.send(task_id, node=node)  # Signal!
            result[
                'message'] = 'Successfully updated compute node %s' % node.hostname
            task_log_success(task_id,
                             msg=LOG_NODE_UPDATE,
                             obj=node,
                             task_result=result,
                             update_user_tasks=True)
        else:
            result[
                'message'] = 'No changes detected on compute node %s' % node.hostname
            task_log_success(task_id,
                             msg=LOG_NODE_UPDATE,
                             obj=node,
                             task_result=result,
                             update_user_tasks=True)

    if sshkey_changed:
        logger.warn(
            'SSH key has changed on node %s - creating authorized_keys synchronization tasks',
            node)
        try:
            run_node_authorized_keys_sync()
        except Exception as e:
            logger.exception(e)

    # Keep the node's image sources in sync regardless of what changed above
    try:
        run_node_img_sources_sync(node, node_img_sources=img_sources)
    except Exception as e:
        logger.exception(e)

    if node_new:
        node.del_initializing()
        # Used by esdc-ee to change node status to unlicensed
        node_status = getattr(settings, 'VMS_NODE_STATUS_DEFAULT', None)

        if node_status:
            node.save_status(
                node_status)  # Set node status (most probably to unlicensed)
    else:
        # Always run vm_status_all for an old compute node
        vm_status_all(task_id, node)

        # Sync snapshots and backup for every node storage
        try:
            NodeVmSnapshotList.sync(node)
        except Exception as e:
            logger.exception(e)

    return result
Code example #7
0
def node_sysinfo_cb(result, task_id, node_uuid=None):
    """
    A callback function for updating Node.json (sysinfo).

    node_uuid will be set only if called via API or GUI

    Parses esysinfo output from a compute node and either creates a new Node
    object (first contact) or updates an existing one; may also trigger SSH
    key sync, image source sync, VM status/snapshot synchronization and a
    system version refresh. Raises TaskException on a bad return code or
    unparseable output.
    """
    # in case the callback is called by restarting erigonesd:fast service on compute node, the meta dict lacks
    # a lot of information; msg is required as part of exception logging inside callback decorator
    # therefore we set it explicitly
    result['meta']['msg'] = LOG_NODE_UPDATE

    # Nonzero return code means the esysinfo command itself failed - abort
    if result['returncode'] != 0:
        logger.error(
            'Found nonzero return code in result from esysinfo command on %s',
            node_uuid)
        raise TaskException(result,
                            'Got bad return code (%s)' % result['returncode'])

    # Remove the raw command output from the result before it gets logged/stored
    stdout = result.pop('stdout', '')
    result.pop('stderr', None)
    node_new = False

    try:
        esysinfo = parse_esysinfo(stdout)
        img_sources = esysinfo.pop('img_sources')
        img_initial = esysinfo.pop('img_initial')
    except Exception as e:
        logger.error(
            'Could not parse output from esysinfo command on %s. Error: %s',
            node_uuid, e)
        logger.exception(e)
        raise TaskException(result, 'Could not parse esysinfo output')
    else:
        # The node's real UUID comes from the parsed sysinfo, not node_uuid
        uuid = esysinfo['sysinfo']['UUID']

    try:
        node = Node.objects.get(uuid=uuid)
    except Node.DoesNotExist:
        # The head node must be in online state during the admin DC initialization and each compute node must be in
        # online state during ssh key exchange.
        node_new = True
        # First node ever registered becomes the head node
        is_head = not Node.objects.exists()
        logger.warn('Creating NEW node from sysinfo output from %s', node_uuid)
        node = Node.create_from_sysinfo(uuid,
                                        esysinfo,
                                        status=Node.ONLINE,
                                        is_head=is_head)
        node_created.send(task_id, node=node)  # Signal!
        result[
            'message'] = 'Successfully created new compute node %s' % node.hostname
        task_log_success(task_id,
                         msg=LOG_NODE_CREATE,
                         obj=node,
                         task_result=result,
                         update_user_tasks=True)
        # A new node with a public SSH key needs authorized_keys sync below
        sshkey_changed = bool(node.sshkey)

        if node.is_head:
            logger.warn(
                'New node %s is the first node ever created - assuming head node status. '
                'Initializing mgmt system and creating admin DC', node)
            from api.system.init import init_mgmt
            try:
                init_mgmt(node, images=img_initial)
            except Exception as e:
                # init_mgmt failure is logged, but node creation still stands
                logger.exception(e)
                result[
                    'message'] = 'Error while initializing admin datacenter (%s)' % e
                task_log_error(task_id,
                               msg=LOG_NODE_CREATE,
                               obj=node,
                               task_result=result,
                               update_user_tasks=True)

        # Register the node's IP address (and DNS records) - best effort
        try:
            _save_node_ip_address(task_id, node)
        except Exception as e:
            logger.exception(e)
    else:
        # Existing node - update its json only when sysinfo or SSH key changed
        sshkey_changed = node.sshkey_changed(esysinfo)
        sysinfo_changed = node.sysinfo_changed(esysinfo)

        if sysinfo_changed or sshkey_changed:
            logger.warn('Updating node %s json with sysinfo output from %s',
                        node, node_uuid)
            node.update_from_sysinfo(esysinfo)  # Will save public SSH key too
            node_json_changed.send(task_id, node=node)  # Signal!
            result[
                'message'] = 'Successfully updated compute node %s' % node.hostname
        else:
            node_json_unchanged.send(task_id, node=node)  # Signal!
            result[
                'message'] = 'No changes detected on compute node %s' % node.hostname

        # Both branches above end with a success log entry
        task_log_success(task_id,
                         msg=LOG_NODE_UPDATE,
                         obj=node,
                         task_result=result,
                         update_user_tasks=True)

    if sshkey_changed:
        logger.warn(
            'SSH key has changed on node %s - creating authorized_keys synchronization tasks',
            node)
        try:
            run_node_authorized_keys_sync()
        except Exception as e:
            logger.exception(e)

    # Keep the node's image sources in sync regardless of what changed above
    try:
        run_node_img_sources_sync(node, node_img_sources=img_sources)
    except Exception as e:
        logger.exception(e)

    if node_new:
        node.del_initializing()
        # Used by esdc-ee to change node status to unlicensed
        node_status = getattr(settings, 'VMS_NODE_STATUS_DEFAULT', None)

        if node_status:
            node.save_status(
                node_status)  # Set node status (most probably to unlicensed)
    else:
        # Always run vm_status_all for an old compute node
        vm_status_all(task_id, node)

        # Sync snapshots and backup for every node storage
        try:
            NodeVmSnapshotList.sync(node)
        except Exception as e:
            logger.exception(e)

    # Refresh cached version information + emit event informing about restarted erigonesd:fast
    try:
        # NOTE(review): system_version appears to be a cached/lazy attribute -
        # deleting it forces re-evaluation on next access; confirm against Node model
        del node.system_version

        # Sometimes the node worker does not respond within the given timeout so we have to try more than once
        # (each attribute access presumably re-queries the node worker - TODO confirm)
        for i in range(5):
            if node.system_version:
                break

        logger.info('Node %s has system version %s', node, node.system_version)

        if owner_id_from_task_id(task_id) == TASK_USER:  # internal user ID
            NodeSystemRestarted(node,
                                system_version=node.system_version).send()

    except Exception as e:
        logger.exception(e)

    return result