def task_log_error(self, task_id, obj=None, msg='', detail='', **kwargs):
    from api.task.utils import task_log_error

    if obj is None:
        obj = self.server_class(self.dc)

    task_log_error(task_id, msg, obj=obj, detail=detail, **kwargs)

def _delete_oldest(model, define, view_function, view_item, task_id, msg):
    """
    Helper for finding oldest snapshots/backups and running DELETE view_function().

    @type model: django.db.models.Model
    """
    vm = define.vm
    # TODO: check indexes
    # noinspection PyUnresolvedReferences
    total = model.objects.filter(vm=vm, disk_id=define.disk_id, define=define, status=model.OK).count()
    to_delete = total - define.retention

    if to_delete < 1:
        return None

    # List of snapshot or backup names to delete
    # TODO: check indexes
    # noinspection PyUnresolvedReferences
    oldest = model.objects.filter(vm=vm, disk_id=define.disk_id, define=define, status=model.OK)\
                          .values_list('name', flat=True).order_by('id')[:to_delete]
    view_name = view_function.__name__
    view_data = {'disk_id': define.array_disk_id, view_item: tuple(oldest)}
    request = get_dummy_request(vm.dc, method='DELETE', system_user=True)
    request.define_id = define.id  # Automatic task

    # Go!
    logger.info('Running DELETE %s(%s, %s), because %s > %s', view_name, vm, view_data, total, define.retention)
    res = call_api_view(request, 'DELETE', view_function, vm.hostname, data=view_data)

    if res.status_code in (200, 201):
        logger.warn('DELETE %s(%s, %s) was successful: %s', view_name, vm, view_data, res.data)
    else:
        logger.error('Running DELETE %s(%s, %s) failed: %s (%s): %s',
                     view_name, vm, view_data, res.status_code, res.status_text, res.data)
        MonitoringBackend.vm_send_alert(vm, 'Automatic deletion of old %ss %s/disk-%s failed to start.' %
                                        (model.__name__.lower(), vm.hostname, define.array_disk_id))
        # Need to log this, because nobody else does (+ there is no PENDING task)
        detail = 'hostname=%s, %s=%s, disk_id=%s, Error: %s' % (vm.hostname, view_item, ','.join(oldest),
                                                                define.array_disk_id,
                                                                get_task_error_message(res.data))
        task_log_error(task_id, msg, vm=vm, detail=detail, update_user_tasks=False)

    return res

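# Hedged usage sketch (assumption, not part of the original module): retention is enforced by
# calling _delete_oldest() after an automatic snapshot succeeds, passing the model class, its
# definition, a DELETE-capable API view and the name of that view's list parameter. The
# vm_snapshot_list view, the 'snapnames' item name and the LOG_SNAPS_DELETE message below are
# assumed identifiers used for illustration only.
def _example_apply_snapshot_retention(snap_define, task_id):
    from api.vm.snapshot.views import vm_snapshot_list  # assumed DELETE-capable list view

    # Remove the oldest OK snapshots exceeding snap_define.retention for this disk
    return _delete_oldest(Snapshot, snap_define, vm_snapshot_list, 'snapnames', task_id, LOG_SNAPS_DELETE)
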
def vm_snapshot_beat(snap_define_id):
    """
    This is a periodic beat task. Run POST vm_snapshot according to snapshot definition.
    """
    from api.vm.snapshot.views import vm_snapshot

    snap_define = SnapshotDefine.objects.get(id=snap_define_id)
    snap_name = snap_define.generate_snapshot_name()
    vm = snap_define.vm
    disk_id = snap_define.array_disk_id
    request = get_dummy_request(vm.dc, method='POST', system_user=True)
    request.define_id = snap_define.id  # Automatic task

    # Go!
    res = call_api_view(request, 'POST', vm_snapshot, vm.hostname, snap_name,
                        data={'disk_id': disk_id, 'fsfreeze': snap_define.fsfreeze})

    if res.status_code == 201:
        logger.info('POST vm_snapshot(%s, %s, {disk_id=%s}) was successful: %s', vm, snap_name, disk_id, res.data)
    else:
        # Need to log this, because nobody else does (+ there is no PENDING task)
        detail = 'snapname=%s, disk_id=%s, type=%s. Error: %s' % (snap_name, disk_id, Snapshot.AUTO,
                                                                  get_task_error_message(res.data))
        task_log_error(task_id_from_task_id(vm_snapshot_beat.request.id, dc_id=vm.dc.id), LOG_SNAP_CREATE,
                       vm=vm, detail=detail, update_user_tasks=False)

        if res.status_code == HTTP_423_LOCKED:
            logger.warning('Running POST vm_snapshot(%s, %s, {disk_id=%s}) failed: %s (%s): %s',
                           vm, snap_name, disk_id, res.status_code, res.status_text, res.data)
        else:
            logger.error('Running POST vm_snapshot(%s, %s, {disk_id=%s}) failed: %s (%s): %s',
                         vm, snap_name, disk_id, res.status_code, res.status_text, res.data)
            MonitoringBackend.vm_send_alert(vm, 'Automatic snapshot %s/disk-%s@%s failed to start.' %
                                            (vm.hostname, disk_id, snap_define.name))

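# Hedged illustration (assumption, not from this module): vm_snapshot_beat() is driven by the
# project's periodic-task scheduler, which keeps one entry per SnapshotDefine. A static Celery
# beat entry of the same shape could look like the dict below; the task path and schedule are
# only examples.
from celery.schedules import crontab

EXAMPLE_BEAT_SCHEDULE = {
    'snapshot-define-123': {
        'task': 'api.vm.snapshot.tasks.vm_snapshot_beat',  # assumed module path of this task
        'schedule': crontab(minute=0, hour='*/6'),         # hypothetical schedule
        'args': (123,),                                     # snap_define_id
    },
}
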
def vm_backup_beat(bkp_define_id):
    """
    This is a periodic beat task. Run POST vm_backup according to backup definition.
    """
    from api.vm.backup.views import vm_backup

    bkp_define = BackupDefine.objects.get(id=bkp_define_id)
    vm = bkp_define.vm
    disk_id = bkp_define.array_disk_id
    defname = bkp_define.name
    request = get_dummy_request(vm.dc, method='POST', system_user=True)
    request.define_id = bkp_define.id  # Automatic task

    # Go!
    res = call_api_view(request, 'POST', vm_backup, vm.hostname, defname,
                        data={'disk_id': disk_id, 'fsfreeze': bkp_define.fsfreeze})

    if res.status_code == 201:
        logger.info('POST vm_backup(%s, %s, {disk_id=%s}) was successful: %s', vm, defname, disk_id, res.data)
    else:
        # Need to log this, because nobody else does (+ there is no PENDING task)
        detail = 'hostname=%s, bkpname=%s, disk_id=%s, Error: %s' % (vm.hostname, bkp_define.generate_backup_name(),
                                                                     disk_id, get_task_error_message(res.data))
        task_log_error(task_id_from_task_id(vm_backup_beat.request.id, dc_id=vm.dc.id), LOG_BKP_CREATE,
                       vm=vm, detail=detail, update_user_tasks=False)

        if res.status_code == HTTP_423_LOCKED:
            logger.warning('Running POST vm_backup(%s, %s, {disk_id=%s}) failed: %s (%s): %s',
                           vm, defname, disk_id, res.status_code, res.status_text, res.data)
        else:
            logger.error('Running POST vm_backup(%s, %s, {disk_id=%s}) failed: %s (%s): %s',
                         vm, defname, disk_id, res.status_code, res.status_text, res.data)
            Zabbix.vm_send_alert(vm, 'Automatic backup %s/disk-%s@%s failed to start.' %
                                 (vm.hostname, disk_id, defname))

def node_authorized_keys_sync_cb(result, task_id, node_uuid=None):
    """
    Callback for run_node_authorized_keys_sync().
    """
    node = Node.objects.get(uuid=node_uuid)

    if result['returncode'] == 0:
        node.save_authorized_keys(result['stdout'])
    else:
        result['message'] = 'Compute node SSH key sync error - got bad return code (%s). Error: %s' % \
                            (result['returncode'], result.get('stderr', ''))
        task_log_error(task_id, msg=LOG_NODE_UPDATE, obj=node, task_result=result, update_user_tasks=False)

    return result

def node_sysinfo_cb(result, task_id, node_uuid=None):
    """
    A callback function for updating Node.json (sysinfo).

    node_uuid will be set only if called via API or GUI
    """
    # In case the callback is called by restarting the erigonesd:fast service on a compute node, the meta dict
    # lacks a lot of information; msg is required as part of exception logging inside the callback decorator,
    # therefore we set it explicitly.
    result['meta']['msg'] = LOG_NODE_UPDATE

    if result['returncode'] != 0:
        logger.error('Found nonzero return code in result from esysinfo command on %s', node_uuid)
        raise TaskException(result, 'Got bad return code (%s)' % result['returncode'])

    stdout = result.pop('stdout', '')
    result.pop('stderr', None)
    node_new = False

    try:
        esysinfo = parse_esysinfo(stdout)
        img_sources = esysinfo.pop('img_sources')
        img_initial = esysinfo.pop('img_initial')
    except Exception as e:
        logger.error('Could not parse output from esysinfo command on %s. Error: %s', node_uuid, e)
        logger.exception(e)
        raise TaskException(result, 'Could not parse esysinfo output')
    else:
        uuid = esysinfo['sysinfo']['UUID']

        try:
            node = Node.objects.get(uuid=uuid)
        except Node.DoesNotExist:
            # The head node must be in online state during the admin DC initialization and each compute node
            # must be in online state during ssh key exchange.
            node_new = True
            is_head = not Node.objects.exists()
            logger.warn('Creating NEW node from sysinfo output from %s', node_uuid)
            node = Node.create_from_sysinfo(uuid, esysinfo, status=Node.ONLINE, is_head=is_head)
            node_created.send(task_id, node=node)  # Signal!
            result['message'] = 'Successfully created new compute node %s' % node.hostname
            task_log_success(task_id, msg=LOG_NODE_CREATE, obj=node, task_result=result, update_user_tasks=True)
            sshkey_changed = bool(node.sshkey)

            if node.is_head:
                logger.warn('New node %s is the first node ever created - assuming head node status. '
                            'Initializing mgmt system and creating admin DC', node)
                from api.system.init import init_mgmt

                try:
                    init_mgmt(node, images=img_initial)
                except Exception as e:
                    logger.exception(e)
                    result['message'] = 'Error while initializing admin datacenter (%s)' % e
                    task_log_error(task_id, msg=LOG_NODE_CREATE, obj=node, task_result=result,
                                   update_user_tasks=True)

            logger.info('Saving node %s IP address "%s" into admin network', node, node.ip_address)

            try:  # We should proceed even if the IP address is not registered
                node.ip_address.save()
            except Exception as e:
                logger.exception(e)
            else:
                admin_net = node.ip_address.subnet  # The network was updated by init_mgmt()
                # Reload the Subnet object because it is cached inside the node instance
                admin_net = admin_net.__class__.objects.get(pk=admin_net.pk)
                # We need a request object
                request = get_dummy_request(DefaultDc(), 'POST', system_user=True)
                record_cls = RecordView.Record

                if admin_net.dns_domain and admin_net.dns_domain == node.domain_name:
                    logger.info('Creating forward A DNS record for node %s', node)
                    # This will fail silently
                    RecordView.add_or_update_record(request, record_cls.A, admin_net.dns_domain, node.hostname,
                                                    node.address, task_id=task_id, related_obj=node)

                if admin_net.ptr_domain:
                    logger.info('Creating reverse PTR DNS record for node %s', node)
                    # This will fail silently
                    RecordView.add_or_update_record(request, record_cls.PTR, admin_net.ptr_domain,
                                                    record_cls.get_reverse(node.address), node.hostname,
                                                    task_id=task_id, related_obj=node)
        else:
            sshkey_changed = node.sshkey_changed(esysinfo)

            if node.sysinfo_changed(esysinfo) or sshkey_changed:
                logger.warn('Updating node %s json with sysinfo output from %s', node, node_uuid)
                node.update_from_sysinfo(esysinfo)  # Will save public SSH key too
                node_json_changed.send(task_id, node=node)  # Signal!
                result['message'] = 'Successfully updated compute node %s' % node.hostname
                task_log_success(task_id, msg=LOG_NODE_UPDATE, obj=node, task_result=result,
                                 update_user_tasks=True)
            else:
                result['message'] = 'No changes detected on compute node %s' % node.hostname
                task_log_success(task_id, msg=LOG_NODE_UPDATE, obj=node, task_result=result,
                                 update_user_tasks=True)

        if sshkey_changed:
            logger.warn('SSH key has changed on node %s - creating authorized_keys synchronization tasks', node)

            try:
                run_node_authorized_keys_sync()
            except Exception as e:
                logger.exception(e)

        try:
            run_node_img_sources_sync(node, node_img_sources=img_sources)
        except Exception as e:
            logger.exception(e)

        if node_new:
            node.del_initializing()
            # Used by esdc-ee to change node status to unlicensed
            node_status = getattr(settings, 'VMS_NODE_STATUS_DEFAULT', None)

            if node_status:
                node.save_status(node_status)  # Set node status (most probably to unlicensed)
        else:
            # Always run vm_status_all for an old compute node
            vm_status_all(task_id, node)

        # Sync snapshots and backups for every node storage
        try:
            NodeVmSnapshotList.sync(node)
        except Exception as e:
            logger.exception(e)

    return result

def node_sysinfo_cb(result, task_id, node_uuid=None):
    """
    A callback function for updating Node.json (sysinfo).

    node_uuid will be set only if called via API or GUI
    """
    # In case the callback is called by restarting the erigonesd:fast service on a compute node, the meta dict
    # lacks a lot of information; msg is required as part of exception logging inside the callback decorator,
    # therefore we set it explicitly.
    result['meta']['msg'] = LOG_NODE_UPDATE

    if result['returncode'] != 0:
        logger.error('Found nonzero return code in result from esysinfo command on %s', node_uuid)
        raise TaskException(result, 'Got bad return code (%s)' % result['returncode'])

    stdout = result.pop('stdout', '')
    result.pop('stderr', None)
    node_new = False

    try:
        esysinfo = parse_esysinfo(stdout)
        img_sources = esysinfo.pop('img_sources')
        img_initial = esysinfo.pop('img_initial')
    except Exception as e:
        logger.error('Could not parse output from esysinfo command on %s. Error: %s', node_uuid, e)
        logger.exception(e)
        raise TaskException(result, 'Could not parse esysinfo output')
    else:
        uuid = esysinfo['sysinfo']['UUID']

        try:
            node = Node.objects.get(uuid=uuid)
        except Node.DoesNotExist:
            # The head node must be in online state during the admin DC initialization and each compute node
            # must be in online state during ssh key exchange.
            node_new = True
            is_head = not Node.objects.exists()
            logger.warn('Creating NEW node from sysinfo output from %s', node_uuid)
            node = Node.create_from_sysinfo(uuid, esysinfo, status=Node.ONLINE, is_head=is_head)
            node_created.send(task_id, node=node)  # Signal!
            result['message'] = 'Successfully created new compute node %s' % node.hostname
            task_log_success(task_id, msg=LOG_NODE_CREATE, obj=node, task_result=result, update_user_tasks=True)
            sshkey_changed = bool(node.sshkey)

            if node.is_head:
                logger.warn('New node %s is the first node ever created - assuming head node status. '
                            'Initializing mgmt system and creating admin DC', node)
                from api.system.init import init_mgmt

                try:
                    init_mgmt(node, images=img_initial)
                except Exception as e:
                    logger.exception(e)
                    result['message'] = 'Error while initializing admin datacenter (%s)' % e
                    task_log_error(task_id, msg=LOG_NODE_CREATE, obj=node, task_result=result,
                                   update_user_tasks=True)

            try:
                _save_node_ip_address(task_id, node)
            except Exception as e:
                logger.exception(e)
        else:
            sshkey_changed = node.sshkey_changed(esysinfo)
            sysinfo_changed = node.sysinfo_changed(esysinfo)

            if sysinfo_changed or sshkey_changed:
                logger.warn('Updating node %s json with sysinfo output from %s', node, node_uuid)
                node.update_from_sysinfo(esysinfo)  # Will save public SSH key too
                node_json_changed.send(task_id, node=node)  # Signal!
                result['message'] = 'Successfully updated compute node %s' % node.hostname
            else:
                node_json_unchanged.send(task_id, node=node)  # Signal!
                result['message'] = 'No changes detected on compute node %s' % node.hostname

            task_log_success(task_id, msg=LOG_NODE_UPDATE, obj=node, task_result=result, update_user_tasks=True)

        if sshkey_changed:
            logger.warn('SSH key has changed on node %s - creating authorized_keys synchronization tasks', node)

            try:
                run_node_authorized_keys_sync()
            except Exception as e:
                logger.exception(e)

        try:
            run_node_img_sources_sync(node, node_img_sources=img_sources)
        except Exception as e:
            logger.exception(e)

        if node_new:
            node.del_initializing()
            # Used by esdc-ee to change node status to unlicensed
            node_status = getattr(settings, 'VMS_NODE_STATUS_DEFAULT', None)

            if node_status:
                node.save_status(node_status)  # Set node status (most probably to unlicensed)
        else:
            # Always run vm_status_all for an old compute node
            vm_status_all(task_id, node)

        # Sync snapshots and backups for every node storage
        try:
            NodeVmSnapshotList.sync(node)
        except Exception as e:
            logger.exception(e)

        # Refresh cached version information + emit event informing about restarted erigonesd:fast
        try:
            del node.system_version
            # Sometimes the node worker does not respond within the given timeout, so we have to try more than once
            for i in range(5):
                if node.system_version:
                    break

            logger.info('Node %s has system version %s', node, node.system_version)

            if owner_id_from_task_id(task_id) == TASK_USER:  # internal user ID
                NodeSystemRestarted(node, system_version=node.system_version).send()
        except Exception as e:
            logger.exception(e)

    return result

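# Hedged sketch (assumption): _save_node_ip_address() is referenced by the newer node_sysinfo_cb()
# above but is not included in this excerpt. The body below is reconstructed from the inlined
# IP/DNS logic in the earlier version of the callback and is only an illustration, not the
# project's actual helper.
def _save_node_ip_address(task_id, node):
    """Save the node IP address into the admin network and create DNS records for a new compute node."""
    logger.info('Saving node %s IP address "%s" into admin network', node, node.ip_address)
    node.ip_address.save()

    admin_net = node.ip_address.subnet  # The network was updated by init_mgmt()
    # Reload the Subnet object because it is cached inside the node instance
    admin_net = admin_net.__class__.objects.get(pk=admin_net.pk)
    # We need a request object
    request = get_dummy_request(DefaultDc(), 'POST', system_user=True)
    record_cls = RecordView.Record

    if admin_net.dns_domain and admin_net.dns_domain == node.domain_name:
        logger.info('Creating forward A DNS record for node %s', node)
        # This will fail silently
        RecordView.add_or_update_record(request, record_cls.A, admin_net.dns_domain, node.hostname,
                                        node.address, task_id=task_id, related_obj=node)

    if admin_net.ptr_domain:
        logger.info('Creating reverse PTR DNS record for node %s', node)
        # This will fail silently
        RecordView.add_or_update_record(request, record_cls.PTR, admin_net.ptr_domain,
                                        record_cls.get_reverse(node.address), node.hostname,
                                        task_id=task_id, related_obj=node)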