def purge_task_history(self):
    worker_name = get_worker_name()
    task_history = TaskHistory.register(
        request=self.request, user=None, worker_name=worker_name)
    try:
        now = datetime.datetime.now()
        retention_days = Configuration.get_by_name_as_int(
            'task_history_retention_days')
        n_days_before = now - datetime.timedelta(days=retention_days)

        tasks_to_purge = TaskHistory.objects.filter(
            task_name__in=[
                'notification.tasks.database_notification',
                'notification.tasks.database_notification_for_team',
                'notification.tasks.update_database_status',
                'notification.tasks.update_database_used_size',
                'notification.tasks.update_instances_status',
                'system.tasks.set_celery_healthcheck_last_update'
            ],
            ended_at__lt=n_days_before,
            task_status__in=["SUCCESS", "ERROR"])
        tasks_to_purge.delete()

        task_history.update_status_for(
            TaskHistory.STATUS_SUCCESS, details='Purge successfully done!')
    except Exception as e:
        task_history.update_status_for(TaskHistory.STATUS_ERROR, details=e)
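# A minimal sketch of the task shape shared across this module: a bound
# Celery task registers a TaskHistory row, does its work, then records
# SUCCESS or ERROR. `app` (the Celery application) and the task name are
# assumptions for illustration; TaskHistory and get_worker_name are the
# module's own helpers.
@app.task(bind=True)  # `app` is assumed to be the project's Celery app
def example_tracked_task(self):
    task_history = TaskHistory.register(
        request=self.request, user=None, worker_name=get_worker_name())
    try:
        pass  # the actual work goes here
        task_history.update_status_for(
            TaskHistory.STATUS_SUCCESS, details='Done')
    except Exception as e:
        task_history.update_status_for(TaskHistory.STATUS_ERROR, details=e)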
def node_zone_migrate_rollback(self, migrate, task):
    task = TaskHistory.register(
        request=self.request, task_history=task, user=task.user,
        worker_name=get_worker_name()
    )
    from tasks_migrate import rollback_node_zone_migrate
    rollback_node_zone_migrate(migrate, task)
def remove_database_old_backups(self):
    worker_name = get_worker_name()
    task_history = TaskHistory.register(
        request=self.request, worker_name=worker_name, user=None
    )
    task_history.relevance = TaskHistory.RELEVANCE_WARNING

    snapshots = []
    for env in Environment.objects.all():
        snapshots += get_snapshots_by_env(env)

    msgs = []
    status = TaskHistory.STATUS_SUCCESS
    if len(snapshots) == 0:
        msgs.append("There is no snapshot to purge")

    for snapshot in snapshots:
        try:
            remove_snapshot_backup(snapshot=snapshot, msgs=msgs)
        except Exception as e:
            msg = "Error removing backup {}. Error: {}".format(snapshot, e)
            status = TaskHistory.STATUS_ERROR
            LOG.error(msg)
            msgs.append(msg)

    task_history.update_status_for(status, details="\n".join(msgs))
def update_database_used_size(self):
    LOG.info("Retrieving all databases")
    worker_name = get_worker_name()
    task_history = TaskHistory.register(
        request=self.request, user=None, worker_name=worker_name)
    try:
        databases = Database.objects.all()
        msgs = []
        for database in databases:
            if database.database_status:
                database.used_size_in_bytes = float(
                    database.database_status.used_size_in_bytes)
            else:
                database.used_size_in_bytes = 0.0
            database.save()
            msg = "\nUpdating used size in bytes for database: {}, used size: {}".format(
                database, database.used_size_in_bytes)
            msgs.append(msg)
            LOG.info(msg)

        task_history.update_status_for(
            TaskHistory.STATUS_SUCCESS, details="\n".join(msgs))
    except Exception as e:
        task_history.update_status_for(TaskHistory.STATUS_ERROR, details=e)
def database_notification(self):
    LOG.info("retrieving all teams and sending database notification")
    teams = Team.objects.all()
    msgs = {}
    for team in teams:
        ###############################################
        # create task
        ###############################################
        msgs[team] = analyzing_notification_for_team(team=team)
        ###############################################
    worker_name = get_worker_name()
    task_history = TaskHistory.register(
        request=self.request, user=None, worker_name=worker_name)
    try:
        LOG.info("Messages: ")
        LOG.info(msgs)
        task_history.update_status_for(
            TaskHistory.STATUS_SUCCESS,
            details="\n".join(
                str(key) + ': ' + ', '.join(value)
                for key, value in msgs.items()))
    except Exception as e:
        task_history.update_status_for(TaskHistory.STATUS_ERROR, details=e)
def databaseinfra_notification(self, user=None):
    worker_name = get_worker_name()
    task_history = TaskHistory.register(
        request=self.request, user=user, worker_name=worker_name)
    threshold_infra_notification = Configuration.get_by_name_as_int(
        "threshold_infra_notification", default=0)
    if threshold_infra_notification <= 0:
        LOG.warning("database infra notification is disabled")
        return

    # Sum capacity per databaseinfra, grouped by plan, environment and engine
    infras = DatabaseInfra.objects.values(
        'plan__name', 'environment__name', 'engine__engine_type__name',
        'plan__provider').annotate(capacity=Sum('capacity'))
    for infra in infras:
        # total databases created per plan, environment and engine
        used = DatabaseInfra.objects.filter(
            plan__name=infra['plan__name'],
            environment__name=infra['environment__name'],
            engine__engine_type__name=infra['engine__engine_type__name']
        ).aggregate(used=Count('databases'))
        # calculate the percentage
        percent = int(used['used'] * 100 / infra['capacity'])
        if percent >= threshold_infra_notification and infra['plan__provider'] != Plan.CLOUDSTACK:
            LOG.info('Plan %s in environment %s with %s%% occupied' % (
                infra['plan__name'], infra['environment__name'], percent))
            LOG.info("Sending database infra notification...")
            context = {
                'plan': infra['plan__name'],
                'environment': infra['environment__name'],
                'used': used['used'],
                'capacity': infra['capacity'],
                'percent': percent,
            }
            email_notifications.databaseinfra_ending(context=context)

    task_history.update_status_for(
        TaskHistory.STATUS_SUCCESS,
        details='Databaseinfra Notification successfully sent to dbaas admins!')
def resize_database(self, database, cloudstackpack, task_history=None, user=None):
    AuditRequest.new_request("resize_database", user, "localhost")
    worker_name = get_worker_name()
    task_history = TaskHistory.register(
        request=self.request, task_history=task_history, user=user,
        worker_name=worker_name)
    try:
        from util.providers import resize_database

        result = resize_database(
            database=database, cloudstackpack=cloudstackpack, task=task_history)

        if not result['created']:
            if 'exceptions' in result:
                error = "\n".join(
                    ": ".join(err) for err in result['exceptions']['error_codes'])
                traceback = "\nException Traceback\n".join(
                    result['exceptions']['traceback'])
                error = "{}\n{}\n{}".format(error, traceback, error)
            else:
                error = "Something went wrong."
            task_history.update_status_for(TaskHistory.STATUS_ERROR, details=error)
        else:
            task_history.update_status_for(
                TaskHistory.STATUS_SUCCESS, details='Resize successfully done.')
    except Exception as e:
        error = "Resize Database ERROR: {}".format(e)
        LOG.error(error)
        task_history.update_status_for(TaskHistory.STATUS_ERROR, details=error)
def database_notification(self):
    """
    Create tasks for database notification by team.

    If threshold_database_notification <= 0, the notification is disabled.
    """
    # get all teams and for each one create a new task
    LOG.info("retrieving all teams and sending database notification")
    teams = Team.objects.all()
    msgs = {}
    for team in teams:
        ###############################################
        # create task
        ###############################################
        msgs[team] = database_notification_for_team(team=team)
        ###############################################
    worker_name = get_worker_name()
    task_history = TaskHistory.register(
        request=self.request, user=None, worker_name=worker_name)
    try:
        LOG.info("Messages: ")
        LOG.info(msgs)
        task_history.update_status_for(
            TaskHistory.STATUS_SUCCESS,
            details="\n".join(
                str(key) + ': ' + ', '.join(value)
                for key, value in msgs.items()))
    except Exception as e:
        task_history.update_status_for(TaskHistory.STATUS_ERROR, details=e)
def update_instances_status(self):
    LOG.info("Retrieving all databaseinfras")
    worker_name = get_worker_name()
    task_history = TaskHistory.register(
        request=self.request, user=None, worker_name=worker_name)
    try:
        infras = DatabaseInfra.objects.all()
        msgs = []
        for databaseinfra in infras:
            LOG.info("Retrieving all instances for {}".format(databaseinfra))
            for instance in Instance.objects.filter(
                    databaseinfra=databaseinfra, is_arbiter=False):
                if instance.check_status():
                    instance.status = Instance.ALIVE
                else:
                    instance.status = Instance.DEAD
                instance.save()
                msg = "\nUpdating instance status, instance: {}, status: {}".format(
                    instance, instance.status)
                msgs.append(msg)
                LOG.info(msg)

        task_history.update_status_for(
            TaskHistory.STATUS_SUCCESS, details="\n".join(msgs))
    except Exception as e:
        task_history.update_status_for(TaskHistory.STATUS_ERROR, details=e)
def remove_database_old_backups(self):
    worker_name = get_worker_name()
    task_history = TaskHistory.register(
        request=self.request, worker_name=worker_name, user=None)

    backup_retention_days = Configuration.get_by_name_as_int(
        'backup_retention_days')

    LOG.info("Removing backups older than %s days" % backup_retention_days)

    backup_time_dt = date.today() - timedelta(days=backup_retention_days)
    snapshots = Snapshot.objects.filter(
        start_at__lte=backup_time_dt, purge_at__isnull=True,
        instance__isnull=False, snapshopt_id__isnull=False)
    msgs = []
    status = TaskHistory.STATUS_SUCCESS
    if len(snapshots) == 0:
        msgs.append("There is no snapshot to purge")
    for snapshot in snapshots:
        try:
            remove_snapshot_backup(snapshot=snapshot)
            msg = "Backup %s removed" % snapshot
            LOG.info(msg)
        except Exception as e:
            msg = "Error removing backup %s: %s" % (snapshot, e)
            status = TaskHistory.STATUS_ERROR
            LOG.error(msg)
        msgs.append(msg)
    task_history.update_status_for(status, details="\n".join(msgs))
def monitor_acl_job(self, database, job_id, bind_address,
                    bind_status=models.CREATED, user=None):
    if not user:
        user = self.request.args[-1]
    AuditRequest.new_request("create_database", user, "localhost")
    worker_name = get_worker_name()
    task_history = TaskHistory.register(
        request=self.request, user=user, worker_name=worker_name)
    LOG.info("id: %s | task: %s | kwargs: %s | args: %s" % (
        self.request.id, self.request.task, self.request.kwargs,
        str(self.request.args)))
    task_history.update_details(persist=True, details="Loading Process...")
    try:
        LOG.debug(
            "database: {}, job_id: {}, bind_address: {}, bind_status: {}, "
            "user: {}".format(database, job_id, bind_address, bind_status, user))

        status = tasks.monitor_acl_job(database, job_id, bind_address)
        LOG.debug("Job status return: {}".format(status))

        if status:
            from dbaas_aclapi.util import update_bind_status
            LOG.info("Updating Bind Status")
            update_bind_status(database, bind_address, bind_status)
            task_history.update_status_for(
                TaskHistory.STATUS_SUCCESS, details='Bind created successfully')
            return
        else:
            raise Exception("Error when monitoring the Bind Process")
    except Exception as e:
        LOG.info("DatabaseBindMonitoring ERROR: {}".format(e))
        task_history.update_status_for(
            TaskHistory.STATUS_ERROR, details='Bind could not be granted')
        return
def update_database_status(self):
    LOG.info("Retrieving all databases")
    worker_name = get_worker_name()
    task_history = TaskHistory.register(
        request=self.request, user=None, worker_name=worker_name)
    try:
        databases = Database.objects.all()
        msgs = []
        for database in databases:
            if database.database_status.is_alive:
                database.status = Database.ALIVE

                instances_status = database.databaseinfra.check_instances_status()
                if instances_status == database.databaseinfra.ALERT:
                    database.status = Database.ALERT
            else:
                database.status = Database.DEAD

            database.save()
            msg = "\nUpdating status for database: {}, status: {}".format(
                database, database.status)
            msgs.append(msg)
            LOG.info(msg)

        task_history.update_status_for(
            TaskHistory.STATUS_SUCCESS, details="\n".join(msgs))
    except Exception as e:
        task_history.update_status_for(TaskHistory.STATUS_ERROR, details=e)
def make_databases_backup(self):
    LOG.info("Making databases backups")
    worker_name = get_worker_name()
    task_history = TaskHistory.register(
        request=self.request, worker_name=worker_name, user=None)

    msgs = []
    status = TaskHistory.STATUS_SUCCESS
    databaseinfras = DatabaseInfra.objects.filter(plan__provider=Plan.CLOUDSTACK)
    error = {}
    for databaseinfra in databaseinfras:
        instances = Instance.objects.filter(databaseinfra=databaseinfra)
        for instance in instances:
            if not instance.databaseinfra.get_driver().check_instance_is_eligible_for_backup(instance):
                LOG.info('Instance %s is not eligible for backup' % str(instance))
                continue
            try:
                if make_instance_snapshot_backup(instance=instance, error=error):
                    msg = "Backup for %s was successful" % str(instance)
                    LOG.info(msg)
                else:
                    status = TaskHistory.STATUS_ERROR
                    msg = "Backup for %s was unsuccessful. Error: %s" % (
                        str(instance), error['errormsg'])
                    LOG.error(msg)
                print(msg)
            except Exception as e:
                status = TaskHistory.STATUS_ERROR
                msg = "Backup for %s was unsuccessful. Error: %s" % (
                    str(instance), str(e))
                LOG.error(msg)

            msgs.append(msg)
def database_environment_migrate_rollback(self, migrate, task):
    task = TaskHistory.register(
        request=self.request, task_history=task, user=task.user,
        worker_name=get_worker_name()
    )
    from tasks_database_migrate import rollback_database_environment_migrate
    rollback_database_environment_migrate(migrate, task)
def _create_database_rollback(self, rollback_from, task, user):
    task = TaskHistory.register(
        request=self.request, task_history=task, user=user,
        worker_name=get_worker_name()
    )
    from tasks_create_database import rollback_create
    rollback_create(rollback_from, task, user)
def create_database(self, name, plan, environment, team, project, description,
                    task_history=None, user=None):
    AuditRequest.new_request("create_database", user, "localhost")
    try:
        worker_name = get_worker_name()
        task_history = TaskHistory.register(
            request=self.request, task_history=task_history, user=user,
            worker_name=worker_name)

        LOG.info("id: %s | task: %s | kwargs: %s | args: %s" % (
            self.request.id, self.request.task, self.request.kwargs,
            str(self.request.args)))

        task_history.update_details(persist=True, details="Loading Process...")

        result = make_infra(
            plan=plan, environment=environment, name=name, team=team,
            project=project, description=description, task=task_history,
        )

        if not result['created']:
            if 'exceptions' in result:
                error = "\n".join(
                    ": ".join(err) for err in result['exceptions']['error_codes'])
                traceback = "\nException Traceback\n".join(
                    result['exceptions']['traceback'])
                error = "{}\n{}\n{}".format(error, traceback, error)
            else:
                error = "There is no infrastructure to allocate this database."

            task_history.update_status_for(
                TaskHistory.STATUS_ERROR, details=error)
            return

        task_history.update_dbid(db=result['database'])
        task_history.update_status_for(
            TaskHistory.STATUS_SUCCESS, details='Database created successfully')
        return
    except Exception as e:
        traceback = full_stack()
        LOG.error("Ops... something went wrong: %s" % e)
        LOG.error(traceback)

        if 'result' in locals() and result['created']:
            destroy_infra(
                databaseinfra=result['databaseinfra'], task=task_history)

        task_history.update_status_for(
            TaskHistory.STATUS_ERROR, details=traceback)
        return
    finally:
        AuditRequest.cleanup_request()
def node_zone_migrate(
    self, host, zone, new_environment, task, since_step=None, step_manager=None
):
    task = TaskHistory.register(
        request=self.request, task_history=task, user=task.user,
        worker_name=get_worker_name()
    )
    from tasks_migrate import node_zone_migrate
    node_zone_migrate(host, zone, new_environment, task, since_step,
                      step_manager=step_manager)
def restore_snapshot(self, database, snapshot, user, task_history):
    from dbaas_nfsaas.models import HostAttr
    LOG.info("Restoring snapshot")
    worker_name = get_worker_name()

    task_history = models.TaskHistory.objects.get(id=task_history)
    task_history = TaskHistory.register(
        request=self.request, task_history=task_history, user=user,
        worker_name=worker_name)

    databaseinfra = database.databaseinfra

    snapshot = Snapshot.objects.get(id=snapshot)
    snapshot_id = snapshot.snapshopt_id

    host_attr = HostAttr.objects.get(nfsaas_path=snapshot.export_path)
    host = host_attr.host
    host_attr = HostAttr.objects.get(host=host, is_active=True)

    export_id = host_attr.nfsaas_export_id
    export_path = host_attr.nfsaas_path

    steps = RESTORE_SNAPSHOT_SINGLE
    if databaseinfra.plan.is_ha and databaseinfra.engine_name == 'mysql':
        steps = RESTORE_SNAPSHOT_MYSQL_HA

    not_primary_instances = databaseinfra.instances.exclude(
        hostname=host).exclude(instance_type__in=[
            Instance.MONGODB_ARBITER, Instance.REDIS_SENTINEL])
    not_primary_hosts = [
        instance.hostname for instance in not_primary_instances]

    workflow_dict = build_dict(
        databaseinfra=databaseinfra, database=database,
        snapshot_id=snapshot_id, export_path=export_path, export_id=export_id,
        host=host, steps=steps, not_primary_hosts=not_primary_hosts,
    )

    start_workflow(workflow_dict=workflow_dict, task=task_history)

    if workflow_dict['exceptions']['traceback']:
        error = "\n".join(
            ": ".join(err) for err in workflow_dict['exceptions']['error_codes'])
        traceback = "\nException Traceback\n".join(
            workflow_dict['exceptions']['traceback'])
        error = "{}\n{}\n{}".format(error, traceback, error)
        task_history.update_status_for(
            TaskHistory.STATUS_ERROR, details=error)
    else:
        task_history.update_status_for(
            TaskHistory.STATUS_SUCCESS,
            details='Database successfully recovered!')
def restore_database(self, database, task, snapshot, user, retry_from=None):
    task = TaskHistory.register(
        request=self.request, task_history=task, user=user,
        worker_name=get_worker_name()
    )

    from backup.models import Snapshot
    snapshot = Snapshot.objects.get(id=snapshot)

    from tasks_restore_backup import restore_snapshot
    restore_snapshot(database, snapshot.group, task, retry_from)
def database_environment_migrate(
    self, database, new_environment, new_offering, task, hosts_zones,
    since_step=None, step_manager=None
):
    task = TaskHistory.register(
        request=self.request, task_history=task, user=task.user,
        worker_name=get_worker_name()
    )
    from tasks_database_migrate import database_environment_migrate
    database_environment_migrate(
        database, new_environment, new_offering, task, hosts_zones,
        since_step, step_manager=step_manager
    )
def purge_unused_exports_task(self):
    from notification.tasks import TaskRegister
    task = TaskRegister.purge_unused_exports()

    task = TaskHistory.register(
        request=self.request, worker_name=get_worker_name(), task_history=task
    )

    task.add_detail('Getting all inactive exports without snapshots')
    if purge_unused_exports(task):
        task.set_status_success('Done')
    else:
        task.set_status_error('Error')
def create_database(
    self, name, plan, environment, team, project, description, task,
    subscribe_to_email_events=True, is_protected=False, user=None,
    retry_from=None
):
    task = TaskHistory.register(
        request=self.request, task_history=task, user=user,
        worker_name=get_worker_name()
    )
    from tasks_create_database import create_database
    create_database(
        name, plan, environment, team, project, description, task,
        subscribe_to_email_events, is_protected, user, retry_from
    )
def analyze_databases(self, task_history=None):
    endpoint, health_check_route, health_check_string = get_analyzing_credentials()
    user = User.objects.get(username='******')
    worker_name = get_worker_name()
    task_history = TaskHistory.register(
        task_history=task_history, request=self.request, user=user,
        worker_name=worker_name)
    task_history.update_details(persist=True, details="Loading Process...")
    AuditRequest.new_request("analyze_databases", user, "localhost")
    try:
        analyze_service = AnalyzeService(
            endpoint, health_check_route, health_check_string)
        with transaction.atomic():
            databases = Database.objects.filter(is_in_quarantine=False)
            today = datetime.now()
            for database in databases:
                (database_name, engine, instances, environment_name,
                 databaseinfra_name) = setup_database_info(database)
                for execution_plan in ExecutionPlan.objects.all():
                    if database_can_not_be_resized(database, execution_plan):
                        continue
                    params = execution_plan.setup_execution_params()
                    result = analyze_service.run(
                        engine=engine, database=database_name,
                        instances=instances, **params)
                    if result['status'] == 'success':
                        task_history.update_details(
                            persist=True,
                            details="\nDatabase {} {} was analysed.".format(
                                database, execution_plan.plan_name))
                        if result['msg'] != instances:
                            continue
                        for instance in result['msg']:
                            insert_analyze_repository_record(
                                today, database_name, instance, engine,
                                databaseinfra_name, environment_name,
                                execution_plan)
                    else:
                        raise Exception("Check your service logs..")
        task_history.update_status_for(
            TaskHistory.STATUS_SUCCESS, details='Analysis ok!')
    except Exception:
        try:
            task_history.update_details(
                persist=True,
                details="\nDatabase {} {} could not be analysed.".format(
                    database, execution_plan.plan_name))
            task_history.update_status_for(
                TaskHistory.STATUS_ERROR,
                details='Analysis finished with errors!\nError: {}'.format(
                    result['msg']))
        except UnboundLocalError:
            task_history.update_details(
                persist=True, details="\nProcess crashed")
            task_history.update_status_for(
                TaskHistory.STATUS_ERROR,
                details='Analysis could not be started')
    finally:
        AuditRequest.cleanup_request()
def remove_database_backup(self, task, snapshot):
    worker_name = get_worker_name()
    task_history = TaskHistory.register(
        request=self.request, worker_name=worker_name, task_history=task
    )

    task_history.add_detail('Removing {}'.format(snapshot))
    try:
        remove_snapshot_backup(snapshot, force=1)
    except Exception as e:
        task_history.add_detail('Error: {}'.format(e))
        task.set_status_error('Could not delete backup')
        return False
    else:
        task.set_status_success('Backup successfully deleted')
        return True
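# A hedged usage sketch: like the other tasks in this module,
# remove_database_backup is expected to be dispatched asynchronously through
# Celery's standard `.delay(...)` API. `some_task` and `some_snapshot` are
# illustrative names, not objects defined here:
#
#     remove_database_backup.delay(task=some_task, snapshot=some_snapshot)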
def bind_address_on_database(self, database, acl_environment, acl_vlan,
                             action="permit", user=None):
    if not user:
        user = self.request.args[-1]

    LOG.info("User: {}, action: {}".format(user, action))
    worker_name = get_worker_name()
    task_history = TaskHistory.register(
        request=self.request, user=user, worker_name=worker_name)

    LOG.info("id: %s | task: %s | kwargs: %s | args: %s" % (
        self.request.id, self.request.task, self.request.kwargs,
        str(self.request.args)))

    task_history.update_details(persist=True, details="Loading Process...")
    try:
        if action == "permit":
            bind_status = models.CREATING
        else:
            bind_status = models.DESTROYING

        LOG.info(
            "Params database: {}, acl_environment: {}, acl_vlan: {}, "
            "action: {}, bind_status: {}".format(
                database, acl_environment, acl_vlan, action, bind_status))

        # forward the requested action (permit or deny)
        job = tasks.bind_unbind_address_on_database(
            database=database, acl_environment=acl_environment,
            acl_vlan=acl_vlan, action=action, bind_status=bind_status)

        if not job:
            raise Exception("Error when executing the Bind")

        task_history.update_status_for(
            TaskHistory.STATUS_SUCCESS, details='Bind created successfully')

        if bind_status == models.CREATING:
            bind_status = models.CREATED
        else:
            bind_status = models.ERROR

        LOG.debug("Bind Status: {}".format(bind_status))

        monitor_acl_job.delay(database, job, acl_environment + '/' + acl_vlan,
                              bind_status, user=user)
        return
    except Exception as e:
        LOG.info("DatabaseBind ERROR: {}".format(e))
        task_history.update_status_for(
            TaskHistory.STATUS_ERROR, details='Bind could not be created')
        return
def make_database_backup(self, database, task):
    worker_name = get_worker_name()
    task_history = TaskHistory.register(
        request=self.request, worker_name=worker_name, task_history=task
    )

    if not database.pin_task(task):
        task.error_in_lock(database)
        return False

    task_history.add_detail('Starting database {} backup'.format(database))

    instances = _get_backup_instance(database, task)
    if not instances:
        task.set_status_error('Could not find eligible instances', database)
        return False

    _check_snapshot_limit(instances, task)

    group = BackupGroup()
    group.save()

    has_warning = False
    for instance in instances:
        snapshot = _create_database_backup(instance, task, group)

        if not snapshot:
            task.set_status_error(
                'Backup was unsuccessful in {}'.format(instance), database
            )
            return False

        snapshot.is_automatic = False
        snapshot.save()

        if not has_warning:
            has_warning = snapshot.has_warning

    if has_warning:
        task.set_status_warning('Backup finished with warning', database)
    else:
        task.set_status_success('Backup was successful', database)

    return True
def destroy_database(self, database, task_history=None, user=None):
    # register History
    AuditRequest.new_request("destroy_database", user, "localhost")
    try:
        worker_name = get_worker_name()
        task_history = TaskHistory.register(
            request=self.request, task_history=task_history, user=user,
            worker_name=worker_name)

        LOG.info("id: %s | task: %s | kwargs: %s | args: %s" % (
            self.request.id, self.request.task, self.request.kwargs,
            str(self.request.args)))

        task_history.update_details(persist=True, details="Loading Process...")

        databaseinfra = database.databaseinfra

        destroy_infra(databaseinfra=databaseinfra, task=task_history)

        task_history.update_status_for(
            TaskHistory.STATUS_SUCCESS, details='Database destroyed successfully')
        return
    finally:
        AuditRequest.cleanup_request()
def update_ssl(self, database, task, since_step=None, step_manager=None):
    from maintenance.models import UpdateSsl
    task = TaskHistory.register(
        request=self.request, task_history=task, user=task.user,
        worker_name=get_worker_name())
    if step_manager:
        step_manager.id = None
        step_manager.started_at = None
        since_step = step_manager.current_step
    else:
        retry_from = UpdateSsl.objects.filter(
            can_do_retry=True, database=database,
            status=UpdateSsl.ERROR).last()
        step_manager = UpdateSsl()
        if retry_from:
            step_manager.current_step = retry_from.current_step
            since_step = retry_from.current_step
    step_manager.database = database
    step_manager.task = task
    step_manager.save()

    steps = database.databaseinfra.update_ssl_steps()
    instances = database.infra.get_driver().get_database_instances()

    result = steps_for_instances(
        steps, instances, task, step_manager.update_step, since_step,
        step_manager=step_manager)
    step_manager = UpdateSsl.objects.get(id=step_manager.id)
    if result:
        step_manager.set_success()
        task.set_status_success('SSL updated successfully')
    else:
        step_manager.set_error()
        task.set_status_error('Could not update SSL')
def recreate_slave(self, host, task, since_step=None, step_manager=None):
    from maintenance.models import RecreateSlave
    task = TaskHistory.register(
        request=self.request, task_history=task, user=task.user,
        worker_name=get_worker_name())
    instance = host.instances.first()
    if step_manager:
        step_manager.id = None
        step_manager.started_at = None
        since_step = step_manager.current_step
    else:
        retry_from = RecreateSlave.objects.filter(
            can_do_retry=True, host=host, status=RecreateSlave.ERROR).last()
        step_manager = RecreateSlave()
        if retry_from:
            step_manager.current_step = retry_from.current_step
            step_manager.snapshot = retry_from.snapshot
            since_step = retry_from.current_step
    step_manager.host = instance.hostname
    step_manager.task = task
    step_manager.save()

    steps = host.instances.first().databaseinfra.recreate_slave_steps()
    result = steps_for_instances(
        steps, [instance], task, step_manager.update_step, since_step,
        step_manager=step_manager)
    step_manager = RecreateSlave.objects.get(id=step_manager.id)
    if result:
        step_manager.set_success()
        task.set_status_success('Slave successfully recreated')
    else:
        step_manager.set_error()
        task.set_status_error('Could not recreate slave')
def check_ssl_expire_at(self):
    LOG.info("Retrieving all SSL MySQL databases")
    worker_name = get_worker_name()
    task = TaskHistory.register(
        request=self.request, user=None, worker_name=worker_name)
    task.relevance = TaskHistory.RELEVANCE_CRITICAL
    one_month_later = date.today() + timedelta(days=30)
    try:
        infras = DatabaseInfra.objects.filter(
            ssl_configured=True,
            engine__engine_type__name='mysql',
            instances__hostname__ssl_expire_at__lte=one_month_later
        ).distinct()
        for infra in infras:
            database = infra.databases.first()
            task.update_details(
                "Checking database {}...".format(database), persist=True
            )
            scheduled_tasks = TaskSchedule.objects.filter(
                scheduled_for__lte=one_month_later,
                status=TaskSchedule.SCHEDULED,
                database=database
            )
            if scheduled_tasks:
                task.update_details("Already scheduled!\n", persist=True)
            else:
                TaskSchedule.objects.create(
                    method_path='ddd',
                    scheduled_for=one_month_later,
                    database=database
                )
                task.update_details("Schedule created!\n", persist=True)
        task.update_status_for(TaskHistory.STATUS_SUCCESS, details="\nDone")
    except Exception as err:
        task.update_status_for(TaskHistory.STATUS_ERROR, details=err)
def create_database(self, name, plan, environment, team, project, description,
                    task, backup_hour, maintenance_window, maintenance_day,
                    subscribe_to_email_events=True, is_protected=False,
                    user=None, retry_from=None):
    task = TaskHistory.register(
        request=self.request, task_history=task, user=user,
        worker_name=get_worker_name())
    from tasks_create_database import create_database
    create_database(
        name, plan, environment, team, project, description, task,
        backup_hour, maintenance_window, maintenance_day,
        subscribe_to_email_events, is_protected, user, retry_from)
def make_databases_backup(self):
    LOG.info("Making databases backups")
    worker_name = get_worker_name()
    task_history = TaskHistory.register(
        request=self.request, worker_name=worker_name, user=None)

    status = TaskHistory.STATUS_SUCCESS
    databaseinfras = DatabaseInfra.objects.filter(
        plan__provider=Plan.CLOUDSTACK, plan__has_persistence=True
    )
    error = {}
    backup_number = 0
    backups_per_group = len(databaseinfras) / 12
    for databaseinfra in databaseinfras:
        if backups_per_group > 0:
            if backup_number < backups_per_group:
                backup_number += 1
            else:
                backup_number = 0
                waiting_msg = "\nWaiting 5 minutes to start the next backup group"
                task_history.update_details(persist=True, details=waiting_msg)
                time.sleep(300)

        instances = Instance.objects.filter(databaseinfra=databaseinfra)
        for instance in instances:
            try:
                if not instance.databaseinfra.get_driver().check_instance_is_eligible_for_backup(instance):
                    LOG.info('Instance %s is not eligible for backup' % str(instance))
                    continue
            except Exception as e:
                status = TaskHistory.STATUS_ERROR
                msg = "Backup for %s was unsuccessful. Error: %s" % (
                    str(instance), str(e))
                LOG.error(msg)
            else:
                time_now = str(time.strftime("%m/%d/%Y %H:%M:%S"))
                start_msg = "\n{} - Starting backup for {} ...".format(
                    time_now, instance)
                task_history.update_details(persist=True, details=start_msg)
                try:
                    snapshot = make_instance_snapshot_backup(
                        instance=instance, error=error
                    )
                    if snapshot and snapshot.was_successful:
                        msg = "Backup for %s was successful" % str(instance)
                        LOG.info(msg)
                    elif snapshot and snapshot.has_warning:
                        status = TaskHistory.STATUS_WARNING
                        msg = "Backup for %s has warning" % str(instance)
                        LOG.info(msg)
                    else:
                        status = TaskHistory.STATUS_ERROR
                        msg = "Backup for %s was unsuccessful. Error: %s" % (
                            str(instance), error['errormsg'])
                        LOG.error(msg)
                    LOG.info(msg)
                except Exception as e:
                    status = TaskHistory.STATUS_ERROR
                    msg = "Backup for %s was unsuccessful. Error: %s" % (
                        str(instance), str(e))
                    LOG.error(msg)

            time_now = str(time.strftime("%m/%d/%Y %H:%M:%S"))
            msg = "\n{} - {}".format(time_now, msg)
            task_history.update_details(persist=True, details=msg)

    task_history.update_status_for(status, details="\nBackup finished")
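# The grouping logic above paces the backups: the infras are split into
# roughly 12 groups, and the task sleeps five minutes between groups. The
# same pacing in isolation (illustrative only; integer division under
# Python 2, as above):
#
#     backups_per_group = len(items) / 12
#     for index, item in enumerate(items):
#         if backups_per_group and index and index % backups_per_group == 0:
#             time.sleep(300)  # pause between backup groups
#         process(item)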
def execute_scheduled_maintenance(self, maintenance_id):
    LOG.debug("Maintenance id: {}".format(maintenance_id))
    maintenance = models.Maintenance.objects.get(id=maintenance_id)

    models.Maintenance.objects.filter(id=maintenance_id).update(
        status=maintenance.RUNNING, started_at=datetime.now())
    LOG.info("Maintenance {} is RUNNING".format(maintenance))

    worker_name = get_worker_name()
    task_history = TaskHistory.register(
        request=self.request, worker_name=worker_name)
    LOG.info("id: %s | task: %s | kwargs: %s | args: %s" % (
        self.request.id, self.request.task, self.request.kwargs,
        str(self.request.args)))

    task_history.update_details(
        persist=True, details="Executing Maintenance: {}".format(maintenance))
    for hm in models.HostMaintenance.objects.filter(maintenance=maintenance):
        main_output = {}
        hm.status = hm.RUNNING
        hm.started_at = datetime.now()
        hm.save()

        if hm.host is None:
            hm.status = hm.UNAVAILABLEHOST
            hm.finished_at = datetime.now()
            hm.save()
            continue

        host = hm.host
        update_task = "\nRunning Maintenance on {}".format(host)

        param_dict = {}
        for param in models.MaintenanceParameters.objects.filter(
                maintenance=maintenance):
            param_function = _get_function(param.function_name)
            param_dict[param.parameter_name] = param_function(host.id)

        main_script = build_context_script(param_dict, maintenance.main_script)
        exit_status = exec_remote_command_host(host, main_script, main_output)

        if exit_status == 0:
            hm.status = hm.SUCCESS
        else:
            if maintenance.rollback_script:
                rollback_output = {}
                hm.status = hm.ROLLBACK
                hm.save()

                rollback_script = build_context_script(
                    param_dict, maintenance.rollback_script)
                exit_status = exec_remote_command_host(
                    host, rollback_script, rollback_output)

                if exit_status == 0:
                    hm.status = hm.ROLLBACK_SUCCESS
                else:
                    hm.status = hm.ROLLBACK_ERROR

                hm.rollback_log = get_dict_lines(rollback_output)
            else:
                hm.status = hm.ERROR

        update_task += "...status: {}".format(hm.status)
        task_history.update_details(persist=True, details=update_task)

        hm.main_log = get_dict_lines(main_output)
        hm.finished_at = datetime.now()
        hm.save()

    models.Maintenance.objects.filter(id=maintenance_id).update(
        status=maintenance.FINISHED, finished_at=datetime.now())
    task_history.update_status_for(
        TaskHistory.STATUS_SUCCESS, details='Maintenance executed successfully')
    LOG.info("Maintenance: {} has FINISHED".format(maintenance))
def analyze_databases(self, task_history=None):
    endpoint, health_check_route, health_check_string = get_analyzing_credentials()
    user = User.objects.get(username='******')
    worker_name = get_worker_name()
    task_history = TaskHistory.register(
        task_history=task_history, request=self.request, user=user,
        worker_name=worker_name)
    task_history.update_details(persist=True, details="Loading Process...")
    AuditRequest.new_request("analyze_databases", user, "localhost")
    try:
        analyze_service = AnalyzeService(
            endpoint, health_check_route, health_check_string)
        with transaction.atomic():
            databases = Database.objects.filter(is_in_quarantine=False)
            today = datetime.now()
            for database in databases:
                (database_name, engine, instances, environment_name,
                 databaseinfra_name) = setup_database_info(database)
                for execution_plan in ExecutionPlan.objects.all():
                    if not database_can_be_resized(database, execution_plan):
                        continue
                    params = execution_plan.setup_execution_params()
                    result = {
                        'msg': 'Could not analyse {}'.format(database_name)
                    }
                    try:
                        result = analyze_service.run(
                            engine=engine, database=database_name,
                            instances=instances, **params)
                        if result['status'] == 'success':
                            task_history.update_details(
                                persist=True,
                                details=("\nDatabase {} {} was "
                                         "analysed.").format(
                                    database, execution_plan.plan_name))
                            if result['msg'] != instances:
                                continue
                            for instance in result['msg']:
                                insert_analyze_repository_record(
                                    today, database_name, instance, engine,
                                    databaseinfra_name, environment_name,
                                    execution_plan)
                        else:
                            raise Exception("Check your service logs..")
                    except Exception:
                        task_history.update_details(
                            persist=True,
                            details=("\nDatabase {} {} could not be "
                                     "analysed.").format(
                                database, execution_plan.plan_name))
                        task_history.update_status_for(
                            TaskHistory.STATUS_ERROR,
                            details='Analysis finished with errors!'
                                    '\nError: {}'.format(result['msg']))
        task_history.update_status_for(
            TaskHistory.STATUS_SUCCESS, details='Analysis ok!')
    except Exception:
        task_history.update_details(persist=True, details="\nProcess crashed")
        task_history.update_status_for(
            TaskHistory.STATUS_ERROR, details='Analysis could not be started')
    finally:
        AuditRequest.cleanup_request()
def upgrade_mongodb_24_to_30(self, database, user, task_history=None):
    from workflow.settings import MONGODB_UPGRADE_24_TO_30_SINGLE
    from workflow.settings import MONGODB_UPGRADE_24_TO_30_HA
    from util import build_dict
    from workflow.workflow import start_workflow

    worker_name = get_worker_name()
    task_history = TaskHistory.register(
        request=self.request, task_history=task_history, user=user,
        worker_name=worker_name)

    databaseinfra = database.databaseinfra
    driver = databaseinfra.get_driver()
    instances = driver.get_database_instances()
    source_plan = databaseinfra.plan
    target_plan = source_plan.engine_equivalent_plan
    source_engine = databaseinfra.engine
    target_engine = source_engine.engine_upgrade_option

    if source_plan.is_ha:
        steps = MONGODB_UPGRADE_24_TO_30_HA
    else:
        steps = MONGODB_UPGRADE_24_TO_30_SINGLE

    stop_now = False
    if not target_plan:
        msg = "There is no Engine Equivalent Plan!"
        stop_now = True
    if not target_engine:
        msg = "There is no Engine Upgrade Option!"
        stop_now = True
    if database.status != Database.ALIVE or not database.database_status.is_alive:
        msg = "Database is not alive!"
        stop_now = True
    if database.is_beeing_used_elsewhere(task_id=self.request.id):
        msg = "Database is in use by another task!"
        stop_now = True
    if not source_engine.version.startswith('2.4.'):
        msg = "Database version must be 2.4!"
        stop_now = True
    if target_engine and target_engine.version != '3.0.12':
        msg = "Target database version must be 3.0.12!"
        stop_now = True

    if stop_now:
        task_history.update_status_for(TaskHistory.STATUS_ERROR, details=msg)
        LOG.info("Upgrade finished")
        return

    try:
        delete_zabbix_alarms(database)
    except Exception as e:
        message = "Could not delete Zabbix alarms: {}".format(e)
        task_history.update_status_for(TaskHistory.STATUS_ERROR, details=message)
        LOG.error(message)
        return

    try:
        workflow_dict = build_dict(
            steps=steps, databaseinfra=databaseinfra, instances=instances,
            source_plan=source_plan, target_plan=target_plan,
            source_engine=source_engine, target_engine=target_engine)

        start_workflow(workflow_dict=workflow_dict, task=task_history)

        if workflow_dict['exceptions']['traceback']:
            error = "\n".join(
                ": ".join(err)
                for err in workflow_dict['exceptions']['error_codes'])
            traceback = "\nException Traceback\n".join(
                workflow_dict['exceptions']['traceback'])
            error = "{}\n{}\n{}".format(error, traceback, error)
            task_history.update_status_for(
                TaskHistory.STATUS_ERROR, details=error)
            LOG.info("MongoDB Upgrade finished with errors")
            return

        task_history.update_status_for(
            TaskHistory.STATUS_SUCCESS,
            details='MongoDB successfully upgraded!')
        LOG.info("MongoDB Upgrade finished")
    except Exception as e:
        task_history.update_status_for(TaskHistory.STATUS_ERROR, details=e)
        LOG.warning("MongoDB Upgrade finished with errors")

    try:
        create_zabbix_alarms(database)
    except Exception as e:
        message = "Could not create Zabbix alarms: {}".format(e)
        task_history.update_status_for(TaskHistory.STATUS_ERROR, details=message)
        LOG.error(message)
def create_database(self, name, plan, environment, team, project, description,
                    subscribe_to_email_events=True, task_history=None,
                    user=None, is_protected=False):
    AuditRequest.new_request("create_database", user, "localhost")
    try:
        worker_name = get_worker_name()
        task_history = TaskHistory.register(
            request=self.request, task_history=task_history, user=user,
            worker_name=worker_name)

        LOG.info("id: %s | task: %s | kwargs: %s | args: %s" % (
            self.request.id, self.request.task, self.request.kwargs,
            str(self.request.args)))

        task_history.update_details(persist=True, details="Loading Process...")

        result = make_infra(
            plan=plan, environment=environment, name=name, team=team,
            project=project, description=description,
            subscribe_to_email_events=subscribe_to_email_events,
            task=task_history, is_protected=is_protected)

        if result['created'] is False:
            if 'exceptions' in result:
                error = "\n".join(
                    ": ".join(err) for err in result['exceptions']['error_codes'])
                traceback = "\nException Traceback\n".join(
                    result['exceptions']['traceback'])
                error = "{}\n{}\n{}".format(error, traceback, error)
            else:
                error = "There is no infrastructure to allocate this database."
            task_history.update_status_for(TaskHistory.STATUS_ERROR, details=error)
            return

        task_history.update_dbid(db=result['database'])
        task_history.update_status_for(
            TaskHistory.STATUS_SUCCESS, details='Database created successfully')
        return
    except Exception as e:
        traceback = full_stack()
        LOG.error("Ops... something went wrong: %s" % e)
        LOG.error(traceback)

        if 'result' in locals() and result['created']:
            destroy_infra(databaseinfra=result['databaseinfra'], task=task_history)

        task_history.update_status_for(TaskHistory.STATUS_ERROR, details=traceback)
        return
    finally:
        AuditRequest.cleanup_request()
def execute_database_region_migration(self, database_region_migration_detail_id,
                                      task_history=None, user=None):
    # AuditRequest.new_request("execute_database_region_migration", user, "localhost")
    try:
        if task_history:
            arguments = task_history.arguments
        else:
            arguments = None

        task_history = TaskHistory.register(
            request=self.request, task_history=task_history, user=user,
            worker_name=get_worker_name())

        if arguments:
            task_history.arguments = arguments
            task_history.save()

        database_region_migration_detail = DatabaseRegionMigrationDetail.objects.get(
            id=database_region_migration_detail_id)
        database_region_migration = database_region_migration_detail.database_region_migration
        database = database_region_migration.database
        databaseinfra = database.databaseinfra
        source_environment = databaseinfra.environment
        target_environment = source_environment.equivalent_environment
        engine = database.engine_type
        steps = get_engine_steps(engine)
        workflow_steps = steps[database_region_migration_detail.step].step_classes

        source_instances = []
        source_hosts = []
        for instance in Instance.objects.filter(databaseinfra=databaseinfra):
            source_instances.append(instance)
            if instance.instance_type != instance.REDIS_SENTINEL:
                source_hosts.append(instance.hostname)

        source_plan = databaseinfra.plan
        target_plan = source_plan.equivalent_plan_id

        workflow_dict = build_dict(
            # database_region_migration_detail=database_region_migration_detail,
            # database_region_migration=database_region_migration,
            # database=database,
            databaseinfra=databaseinfra,
            # source_environment=source_environment,
            target_environment=target_environment,
            steps=workflow_steps,
            # engine=engine,
            source_instances=source_instances,
            source_hosts=source_hosts,
            # source_plan=source_plan,
            target_plan=target_plan,
        )

        start_workflow(workflow_dict=workflow_dict, task=task_history)

        if not workflow_dict['created']:
            if 'exceptions' in workflow_dict:
                error = "\n".join(
                    ": ".join(err)
                    for err in workflow_dict['exceptions']['error_codes'])
                traceback = "\nException Traceback\n".join(
                    workflow_dict['exceptions']['traceback'])
                error = "{}\n{}\n{}".format(error, traceback, error)
            else:
                error = "There is no infrastructure to allocate this database."

            task_history.update_status_for(TaskHistory.STATUS_ERROR, details=error)
            return
        else:
            task_history.update_status_for(
                TaskHistory.STATUS_SUCCESS,
                details='Database region migration was successful')
            return
    except Exception as e:
        traceback = full_stack()
        LOG.error("Ops... something went wrong: %s" % e)
        LOG.error(traceback)
        task_history.update_status_for(TaskHistory.STATUS_ERROR, details=traceback)
        return
def update_ssl(self, database, task, since_step=None, step_manager=None,
               scheduled_task=None, auto_rollback=False):
    from maintenance.models import UpdateSsl
    task = TaskHistory.register(
        request=self.request, task_history=task, user=task.user,
        worker_name=get_worker_name())
    if step_manager:
        step_manager.id = None
        step_manager.started_at = None
        since_step = step_manager.current_step
    else:
        retry_from = UpdateSsl.objects.filter(
            can_do_retry=True, database=database,
            status=UpdateSsl.ERROR).last()
        step_manager = UpdateSsl()
        if retry_from:
            step_manager.current_step = retry_from.current_step
            since_step = retry_from.current_step
            step_manager.task_schedule = retry_from.task_schedule
    step_manager.database = database
    step_manager.task = task
    if scheduled_task:
        step_manager.task_schedule = scheduled_task
    step_manager.set_running()
    step_manager.save()

    steps = database.databaseinfra.update_ssl_steps()

    # one instance per host, deduplicated
    hosts = []
    for instance in database.infra.instances.all():
        if instance.hostname not in hosts:
            hosts.append(instance.hostname)
    instances = []
    for host in hosts:
        instances.append(host.instances.all()[0])

    result = steps_for_instances(
        steps, instances, task, step_manager.update_step, since_step,
        step_manager=step_manager)
    step_manager = UpdateSsl.objects.get(id=step_manager.id)
    if result:
        step_manager.set_success()
        task.set_status_success('SSL updated successfully')
    else:
        step_manager.set_error()
        task.set_status_error('Could not update SSL')
        if auto_rollback:
            from workflow.workflow import rollback_for_instances_full
            new_task = task
            new_task.id = None
            new_task.details = ''
            new_task.task_name += '_rollback'
            new_task.task_status = new_task.STATUS_RUNNING
            new_task.save()
            rollback_step_manager = step_manager
            rollback_step_manager.id = None
            rollback_step_manager.task_schedule = None
            rollback_step_manager.can_do_retry = 0
            rollback_step_manager.save()
            result = rollback_for_instances_full(
                steps, instances, new_task,
                rollback_step_manager.get_current_step,
                rollback_step_manager.update_step,
            )
            if result:
                rollback_step_manager.set_success()
                task.set_status_success('SSL update successfully rolled back')
            else:
                if hasattr(rollback_step_manager, 'cleanup'):
                    rollback_step_manager.cleanup(instances)
                rollback_step_manager.set_error()
                task.set_status_error('Could not roll back SSL update')
def make_databases_backup(self):
    LOG.info("Making databases backups")
    worker_name = get_worker_name()
    task_history = TaskHistory.register(
        request=self.request, worker_name=worker_name, user=None
    )
    task_history.relevance = TaskHistory.RELEVANCE_ERROR

    backup_group_interval = Configuration.get_by_name_as_int(
        'backup_group_interval', default=1
    )
    waiting_msg = "\nWaiting {} minute(s) to start the next backup group".format(
        backup_group_interval
    )
    status = TaskHistory.STATUS_SUCCESS
    environments = Environment.objects.all()
    prod_envs = Environment.prod_envs()
    dev_envs = Environment.dev_envs()
    env_names_order = list(prod_envs) + list(dev_envs)
    if not env_names_order:
        env_names_order = [env.name for env in environments]

    current_time = datetime.now()
    current_hour = current_time.hour

    # Get all infras with a successful backup today, up to the current hour
    infras_with_backup_today = DatabaseInfra.objects.filter(
        instances__backup_instance__status=Snapshot.SUCCESS,
        backup_hour__lt=current_hour,
        plan__has_persistence=True,
        instances__backup_instance__end_at__year=current_time.year,
        instances__backup_instance__end_at__month=current_time.month,
        instances__backup_instance__end_at__day=current_time.day).distinct()

    # Get all infras with pending backups, excluding infras_with_backup_today
    infras_pending_backup = DatabaseInfra.objects.filter(
        backup_hour__lt=current_hour,
        plan__has_persistence=True,
    ).exclude(pk__in=[infra.pk for infra in infras_with_backup_today])

    # Get all infras to back up in the current hour
    infras_current_hour = DatabaseInfra.objects.filter(
        plan__has_persistence=True,
        backup_hour=current_time.hour
    )

    # Merge pending and current-hour infras into the backup list
    infras = infras_current_hour | infras_pending_backup

    for env_name in env_names_order:
        try:
            env = environments.get(name=env_name)
        except Environment.DoesNotExist:
            continue
        msg = '\nStarting Backup for env {}'.format(env.name)
        task_history.update_details(persist=True, details=msg)
        databaseinfras_by_env = infras.filter(environment=env)
        error = {}
        backup_number = 0
        backups_per_group = len(infras) / 12
        for infra in databaseinfras_by_env:
            if not infra.databases.first():
                continue
            if backups_per_group > 0:
                if backup_number < backups_per_group:
                    backup_number += 1
                else:
                    backup_number = 0
                    task_history.update_details(
                        persist=True, details=waiting_msg)
                    sleep(backup_group_interval * 60)

            group = BackupGroup()
            group.save()

            instances_backup = infra.instances.filter(
                read_only=False, is_active=True
            )
            for instance in instances_backup:
                try:
                    driver = instance.databaseinfra.get_driver()
                    is_eligible = driver.check_instance_is_eligible_for_backup(
                        instance
                    )
                    if not is_eligible:
                        LOG.info(
                            'Instance {} is not eligible for backup'.format(
                                instance
                            )
                        )
                        continue
                except Exception as e:
                    status = TaskHistory.STATUS_ERROR
                    msg = "Backup for %s was unsuccessful. Error: %s" % (
                        str(instance), str(e))
                    LOG.error(msg)

                time_now = str(strftime("%m/%d/%Y %H:%M:%S"))
                start_msg = "\n{} - Starting backup for {} ...".format(
                    time_now, instance
                )
                task_history.update_details(persist=True, details=start_msg)
                try:
                    snapshot = make_instance_snapshot_backup(
                        instance=instance, error=error, group=group,
                        current_hour=current_hour
                    )
                    if snapshot and snapshot.was_successful:
                        msg = "Backup for %s was successful" % str(instance)
                        LOG.info(msg)
                    elif snapshot and snapshot.was_error:
                        status = TaskHistory.STATUS_ERROR
                        msg = "Backup for %s was unsuccessful. Error: %s" % (
                            str(instance), error['errormsg'])
                        LOG.error(msg)
                    else:
                        status = TaskHistory.STATUS_WARNING
                        msg = "Backup for %s has warning" % str(instance)
                        LOG.info(msg)
                    LOG.info(msg)
                except Exception as e:
                    status = TaskHistory.STATUS_ERROR
                    msg = "Backup for %s was unsuccessful. Error: %s" % (
                        str(instance), str(e))
                    LOG.error(msg)

                time_now = str(strftime("%m/%d/%Y %H:%M:%S"))
                msg = "\n{} - {}".format(time_now, msg)
                task_history.update_details(persist=True, details=msg)

    task_history.update_status_for(status, details="\nBackup finished")
def make_databases_backup(self):
    LOG.info("Making databases backups")
    worker_name = get_worker_name()
    task_history = TaskHistory.register(
        request=self.request, worker_name=worker_name, user=None)
    status = TaskHistory.STATUS_SUCCESS

    envs = Environment.objects.all()
    # TODO: back here to do right
    env_names_order = ['prod', 'qa2', 'dev-cta-nao-usar', 'dev']
    databaseinfras = DatabaseInfra.objects.filter(
        plan__provider=Plan.CLOUDSTACK, plan__has_persistence=True
    )

    for env_name in env_names_order:
        try:
            env = envs.get(name=env_name)
        except Environment.DoesNotExist:
            continue
        msg = 'Starting Backup for env {}'.format(env.name)
        task_history.update_details(persist=True, details=msg)
        databaseinfras_by_env = databaseinfras.filter(environment=env)
        error = {}
        backup_number = 0
        backups_per_group = len(databaseinfras) / 12
        for databaseinfra in databaseinfras_by_env:
            if backups_per_group > 0:
                if backup_number < backups_per_group:
                    backup_number += 1
                else:
                    backup_number = 0
                    waiting_msg = "\nWaiting 5 minutes to start the next backup group"
                    task_history.update_details(persist=True, details=waiting_msg)
                    time.sleep(300)

            instances = Instance.objects.filter(
                databaseinfra=databaseinfra, read_only=False
            )
            group = BackupGroup()
            group.save()
            for instance in instances:
                try:
                    if not instance.databaseinfra.get_driver().check_instance_is_eligible_for_backup(instance):
                        LOG.info('Instance %s is not eligible for backup' % str(instance))
                        continue
                except Exception as e:
                    status = TaskHistory.STATUS_ERROR
                    msg = "Backup for %s was unsuccessful. Error: %s" % (
                        str(instance), str(e))
                    LOG.error(msg)

                time_now = str(time.strftime("%m/%d/%Y %H:%M:%S"))
                start_msg = "\n{} - Starting backup for {} ...".format(
                    time_now, instance)
                task_history.update_details(persist=True, details=start_msg)
                try:
                    snapshot = make_instance_snapshot_backup(
                        instance=instance, error=error, group=group
                    )
                    if snapshot and snapshot.was_successful:
                        msg = "Backup for %s was successful" % str(instance)
                        LOG.info(msg)
                    elif snapshot and snapshot.has_warning:
                        status = TaskHistory.STATUS_WARNING
                        msg = "Backup for %s has warning" % str(instance)
                        LOG.info(msg)
                    else:
                        status = TaskHistory.STATUS_ERROR
                        msg = "Backup for %s was unsuccessful. Error: %s" % (
                            str(instance), error['errormsg'])
                        LOG.error(msg)
                    LOG.info(msg)
                except Exception as e:
                    status = TaskHistory.STATUS_ERROR
                    msg = "Backup for %s was unsuccessful. Error: %s" % (
                        str(instance), str(e))
                    LOG.error(msg)

                time_now = str(time.strftime("%m/%d/%Y %H:%M:%S"))
                msg = "\n{} - {}".format(time_now, msg)
                task_history.update_details(persist=True, details=msg)

    task_history.update_status_for(status, details="\nBackup finished")
def execute_scheduled_maintenance(self, maintenance_id):
    LOG.debug("Maintenance id: {}".format(maintenance_id))
    maintenance = models.Maintenance.objects.get(id=maintenance_id)

    models.Maintenance.objects.filter(id=maintenance_id).update(
        status=maintenance.RUNNING, started_at=datetime.now())
    LOG.info("Maintenance {} is RUNNING".format(maintenance))

    worker_name = get_worker_name()
    task_history = TaskHistory.register(
        request=self.request, worker_name=worker_name)
    LOG.info("id: %s | task: %s | kwargs: %s | args: %s" % (
        self.request.id, self.request.task, self.request.kwargs,
        str(self.request.args)))

    task_history.update_details(
        persist=True, details="Executing Maintenance: {}".format(maintenance))
    for hm in models.HostMaintenance.objects.filter(maintenance=maintenance):
        main_output = {}
        hm.status = hm.RUNNING
        hm.started_at = datetime.now()
        hm.save()

        if hm.host is None:
            hm.status = hm.UNAVAILABLEHOST
            hm.finished_at = datetime.now()
            hm.save()
            continue

        host = hm.host
        update_task = "\nRunning Maintenance on {}".format(host)

        try:
            cloudstack_host_attributes = host.cs_host_attributes.get()
        except ObjectDoesNotExist as e:
            LOG.warn("Host {} does not have cloudstack attrs...{}".format(
                hm.host, e))
            hm.status = hm.UNAVAILABLECSHOSTATTR
            hm.finished_at = datetime.now()
            hm.save()
            continue

        param_dict = {}
        for param in models.MaintenanceParameters.objects.filter(
                maintenance=maintenance):
            param_function = _get_function(param.function_name)
            param_dict[param.parameter_name] = param_function(host.id)

        main_script = build_context_script(param_dict, maintenance.main_script)
        exit_status = exec_remote_command(
            server=host.address,
            username=cloudstack_host_attributes.vm_user,
            password=cloudstack_host_attributes.vm_password,
            command=main_script, output=main_output)

        if exit_status == 0:
            hm.status = hm.SUCCESS
        else:
            if maintenance.rollback_script:
                rollback_output = {}
                hm.status = hm.ROLLBACK
                hm.save()

                rollback_script = build_context_script(
                    param_dict, maintenance.rollback_script)
                exit_status = exec_remote_command(
                    server=host.address,
                    username=cloudstack_host_attributes.vm_user,
                    password=cloudstack_host_attributes.vm_password,
                    command=rollback_script, output=rollback_output)

                if exit_status == 0:
                    hm.status = hm.ROLLBACK_SUCCESS
                else:
                    hm.status = hm.ROLLBACK_ERROR

                hm.rollback_log = get_dict_lines(rollback_output)
            else:
                hm.status = hm.ERROR

        update_task += "...status: {}".format(hm.status)
        task_history.update_details(persist=True, details=update_task)

        hm.main_log = get_dict_lines(main_output)
        hm.finished_at = datetime.now()
        hm.save()
def execute_scheduled_maintenance(self, maintenance_id):
    LOG.debug("Maintenance id: {}".format(maintenance_id))
    maintenance = models.Maintenance.objects.get(id=maintenance_id)

    models.Maintenance.objects.filter(id=maintenance_id).update(
        status=maintenance.RUNNING, started_at=datetime.now()
    )
    LOG.info("Maintenance {} is RUNNING".format(maintenance))

    worker_name = get_worker_name()
    task_history = TaskHistory.register(
        request=self.request, worker_name=worker_name
    )
    task_history.relevance = TaskHistory.RELEVANCE_CRITICAL
    LOG.info("id: {} | task: {} | kwargs: {} | args: {}".format(
        self.request.id, self.request.task, self.request.kwargs,
        str(self.request.args)
    ))
    task_history.update_details(
        persist=True, details="Executing Maintenance: {}".format(maintenance)
    )
    for hm in models.HostMaintenance.objects.filter(maintenance=maintenance):
        main_output = {}
        hm.status = hm.RUNNING
        hm.started_at = datetime.now()
        hm.save()

        if hm.host is None:
            hm.status = hm.UNAVAILABLEHOST
            hm.finished_at = datetime.now()
            hm.save()
            continue

        host = hm.host
        update_task = "\nRunning Maintenance on {}".format(host)

        if maintenance.disable_alarms:
            disable_alarms(hm.host)

        param_dict = {}
        params = models.MaintenanceParameters.objects.filter(
            maintenance=maintenance
        )
        for param in params:
            param_function = get_function(param.function_name)
            param_dict[param.parameter_name] = param_function(host.id)

        main_script = build_context_script(param_dict, maintenance.main_script)
        exit_status = exec_remote_command_host(host, main_script, main_output)

        if exit_status == 0:
            hm.status = hm.SUCCESS
        else:
            if maintenance.rollback_script:
                rollback_output = {}
                hm.status = hm.ROLLBACK
                hm.save()

                rollback_script = build_context_script(
                    param_dict, maintenance.rollback_script
                )
                exit_status = exec_remote_command_host(
                    host, rollback_script, rollback_output
                )
                if exit_status == 0:
                    hm.status = hm.ROLLBACK_SUCCESS
                else:
                    hm.status = hm.ROLLBACK_ERROR

                hm.rollback_log = get_dict_lines(rollback_output)
            else:
                hm.status = hm.ERROR

        if maintenance.disable_alarms:
            enable_alarms(hm.host)

        update_task += "...status: {}".format(hm.status)
        task_history.update_details(persist=True, details=update_task)

        hm.main_log = get_dict_lines(main_output)
        hm.finished_at = datetime.now()
        hm.save()

    models.Maintenance.objects.filter(id=maintenance_id).update(
        status=maintenance.FINISHED, finished_at=datetime.now()
    )
    task_history.update_status_for(
        TaskHistory.STATUS_SUCCESS, details='Maintenance executed successfully'
    )
    LOG.info("Maintenance: {} has FINISHED".format(maintenance))
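# `build_context_script` above is assumed to render the maintenance script
# with the collected parameters before remote execution. A naive stand-in
# under that assumption (the real helper may template or escape differently):
#
#     def build_context_script(context, script):
#         return script.format(**context)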
def bind_address_on_database(self, database, acl_environment, acl_vlan,
                             action="permit", user=None):
    if not user:
        user = self.request.args[-1]
    LOG.info("User: {}, action: {}".format(user, action))

    worker_name = get_worker_name()
    task_history = TaskHistory.register(request=self.request, user=user,
                                        worker_name=worker_name)

    LOG.info("id: %s | task: %s | kwargs: %s | args: %s" % (
        self.request.id, self.request.task,
        self.request.kwargs, str(self.request.args)))

    task_history.update_details(persist=True, details="Loading Process...")

    try:
        if action == "permit":
            bind_status = models.CREATING
        else:
            bind_status = models.DESTROYING

        LOG.info("Params database: {}, acl_environment: {}, acl_vlan: {}, "
                 "action: {}, bind_status: {}".format(
                     database, acl_environment, acl_vlan, action, bind_status))

        # pass the caller's action through; the original hardcoded
        # action="permit" here, which would turn unbinds into binds
        job = tasks.bind_unbind_address_on_database(
            database=database, acl_environment=acl_environment,
            acl_vlan=acl_vlan, action=action, bind_status=bind_status)

        if not job:
            raise Exception("Error when executing the Bind")

        task_history.update_status_for(TaskHistory.STATUS_SUCCESS,
                                       details='Bind created successfully')

        if bind_status == models.CREATING:
            bind_status = models.CREATED
        else:
            bind_status = models.ERROR

        LOG.debug("Bind Status: {}".format(bind_status))

        monitor_acl_job.delay(database, job, acl_environment + '/' + acl_vlan,
                              bind_status, user=user)
        return
    except Exception as e:
        LOG.info("DatabaseBind ERROR: {}".format(e))
        task_history.update_status_for(TaskHistory.STATUS_ERROR,
                                       details='Bind could not be created')
        return
def restore_snapshot(self, database, snapshot, user, task_history):
    try:
        from dbaas_nfsaas.models import HostAttr
        LOG.info("Restoring snapshot")
        worker_name = get_worker_name()

        task_history = models.TaskHistory.objects.get(id=task_history)
        task_history = TaskHistory.register(request=self.request,
                                            task_history=task_history,
                                            user=user,
                                            worker_name=worker_name)

        databaseinfra = database.databaseinfra

        snapshot = Snapshot.objects.get(id=snapshot)
        snapshot_id = snapshot.snapshopt_id  # (sic) the model's field name

        host_attr_snapshot = HostAttr.objects.get(
            nfsaas_path=snapshot.export_path)
        host = host_attr_snapshot.host
        host_attr = HostAttr.objects.get(host=host, is_active=True)

        export_id_snapshot = host_attr_snapshot.nfsaas_export_id
        export_id = host_attr.nfsaas_export_id
        export_path = host_attr.nfsaas_path

        steps = get_restore_snapshot_settings(
            database.plan.replication_topology.class_path
        )

        not_primary_instances = databaseinfra.instances.exclude(
            hostname=host).exclude(instance_type__in=[
                Instance.MONGODB_ARBITER, Instance.REDIS_SENTINEL])
        not_primary_hosts = [
            instance.hostname for instance in not_primary_instances]

        workflow_dict = build_dict(
            databaseinfra=databaseinfra,
            database=database,
            snapshot_id=snapshot_id,
            export_path=export_path,
            export_id=export_id,
            export_id_snapshot=export_id_snapshot,
            host=host,
            steps=steps,
            not_primary_hosts=not_primary_hosts,
        )

        start_workflow(workflow_dict=workflow_dict, task=task_history)

        if workflow_dict['exceptions']['traceback']:
            raise Exception('Restore could not be finished')
        else:
            task_history.update_status_for(
                TaskHistory.STATUS_SUCCESS,
                details='Database successfully recovered!')
    except Exception as e:
        if 'workflow_dict' in locals():
            error = "\n".join(": ".join(err) for err in
                              workflow_dict['exceptions']['error_codes'])
            traceback = "\nException Traceback\n".join(
                workflow_dict['exceptions']['traceback'])
            error = "{}\n{}".format(error, traceback)
        else:
            error = str(e)

        task_history.update_status_for(TaskHistory.STATUS_ERROR,
                                       details=error)
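# --- Hedged sketch (illustrative, not part of the task above) ---
# How restore_snapshot's except branch assembles its failure report: the
# per-step (code, message) pairs are joined one per line, then the raw
# tracebacks are appended. Plain dict literals stand in for the real
# workflow_dict['exceptions'] payload.
def format_workflow_errors(exceptions):
    error = "\n".join(": ".join(err) for err in exceptions['error_codes'])
    traceback = "\nException Traceback\n".join(exceptions['traceback'])
    return "{}\n{}".format(error, traceback)

print(format_workflow_errors({
    'error_codes': [('0001', 'could not umount export'),
                    ('0002', 'could not mount snapshot')],
    'traceback': ['Traceback (most recent call last): ...'],
}))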
def execute_scheduled_maintenance(self, maintenance_id):
    LOG.debug("Maintenance id: {}".format(maintenance_id))
    maintenance = models.Maintenance.objects.get(id=maintenance_id)
    models.Maintenance.objects.filter(id=maintenance_id).update(
        status=maintenance.RUNNING, started_at=datetime.now())
    LOG.info("Maintenance {} is RUNNING".format(maintenance))

    worker_name = get_worker_name()
    task_history = TaskHistory.register(request=self.request,
                                        worker_name=worker_name)
    task_history.relevance = TaskHistory.RELEVANCE_CRITICAL

    LOG.info("id: {} | task: {} | kwargs: {} | args: {}".format(
        self.request.id, self.request.task,
        self.request.kwargs, str(self.request.args)))

    task_history.update_details(
        persist=True, details="Executing Maintenance: {}".format(maintenance))
    for hm in models.HostMaintenance.objects.filter(maintenance=maintenance):
        hm.status = hm.RUNNING
        hm.started_at = datetime.now()
        hm.save()

        if hm.host is None:
            hm.status = hm.UNAVAILABLEHOST
            hm.finished_at = datetime.now()
            hm.save()
            continue

        host = hm.host
        update_task = "\nRunning Maintenance on {}".format(host)

        if maintenance.disable_alarms:
            disable_alarms(hm.host)

        param_dict = {}
        params = models.MaintenanceParameters.objects.filter(
            maintenance=maintenance)
        for param in params:
            param_function = get_function(param.function_name)
            param_dict[param.parameter_name] = param_function(host.id)

        main_script = build_context_script(param_dict, maintenance.main_script)
        main_output = host.ssh.run_script(script=main_script,
                                          raise_if_error=False)

        if main_output['exit_code'] == 0:
            hm.status = hm.SUCCESS
        else:
            if maintenance.rollback_script:
                hm.status = hm.ROLLBACK
                hm.save()

                rollback_script = build_context_script(
                    param_dict, maintenance.rollback_script)
                rollback_output = host.ssh.run_script(
                    script=rollback_script, raise_if_error=False)

                if rollback_output['exit_code'] == 0:
                    hm.status = hm.ROLLBACK_SUCCESS
                else:
                    hm.status = hm.ROLLBACK_ERROR

                hm.rollback_log = get_dict_lines(rollback_output)
            else:
                hm.status = hm.ERROR

        if maintenance.disable_alarms:
            enable_alarms(hm.host)

        update_task += "...status: {}".format(hm.status)
        task_history.update_details(persist=True, details=update_task)

        hm.main_log = get_dict_lines(main_output)
        hm.finished_at = datetime.now()
        hm.save()

    models.Maintenance.objects.filter(id=maintenance_id).update(
        status=maintenance.FINISHED, finished_at=datetime.now())
    task_history.update_status_for(
        TaskHistory.STATUS_SUCCESS,
        details='Maintenance executed successfully')
    LOG.info("Maintenance: {} has FINISHED".format(maintenance))
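# --- Hedged sketch (illustrative, not part of the tasks above) ---
# All maintenance variants resolve one function per MaintenanceParameters row,
# call it with the host id, and render the resulting dict into the script via
# build_context_script. A minimal stand-in, assuming simple "{{ name }}"
# placeholders; the real helper may use a different template syntax.
def render_context_script(context, template):
    script = template
    for key, value in context.items():
        script = script.replace('{{ %s }}' % key, str(value))
    return script

params = {'service': 'mysql', 'data_path': '/data'}
print(render_context_script(
    params, 'systemctl stop {{ service }} && du -sh {{ data_path }}'))
# systemctl stop mysql && du -sh /data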
def volume_migration(self, database, user, task_history=None):
    from dbaas_nfsaas.models import HostAttr, PlanAttr
    from workflow.settings import VOLUME_MIGRATION
    from util import build_dict
    from workflow.workflow import start_workflow
    from time import sleep

    def switch_master(databaseinfra, instance):
        driver = databaseinfra.get_driver()
        for attempt in range(0, 21):
            if driver.is_replication_ok(instance):
                driver.switch_master()
                return
            LOG.info("Waiting 10s to check replication...")
            sleep(10)

    worker_name = get_worker_name()
    task_history = TaskHistory.register(request=self.request,
                                        task_history=task_history,
                                        user=user, worker_name=worker_name)

    stop_now = False
    if database.status != Database.ALIVE or \
            not database.database_status.is_alive:
        msg = "Database is not alive!"
        stop_now = True
    if database.is_beeing_used_elsewhere(task_id=self.request.id):
        msg = "Database is in use by another task!"
        stop_now = True
    if database.has_migration_started():
        msg = "Region migration for this database has already started!"
        stop_now = True
    if stop_now:
        task_history.update_status_for(TaskHistory.STATUS_SUCCESS, details=msg)
        LOG.info("Migration finished")
        return

    default_plan_size = PlanAttr.objects.get(
        dbaas_plan=database.plan).nfsaas_plan
    LOG.info("Migrating {} volumes".format(database))

    databaseinfra = database.databaseinfra
    driver = databaseinfra.get_driver()
    environment = database.environment
    plan = database.plan

    instances = driver.get_slave_instances()
    master_instance = driver.get_master_instance()
    instances.append(master_instance)
    LOG.info('Instances: {}'.format(str(instances)))

    hosts = [instance.hostname for instance in instances]
    volumes = HostAttr.objects.filter(host__in=hosts, is_active=True,
                                      nfsaas_size_id=default_plan_size)

    if len(volumes) == len(hosts):
        task_history.update_status_for(TaskHistory.STATUS_SUCCESS,
                                       details='Volumes already migrated!')
        LOG.info("Migration finished")
        return

    for index, instance in enumerate(instances):
        if not driver.check_instance_is_eligible_for_backup(
                instance=instance):
            LOG.info('Instance is not eligible for backup {}'.format(
                str(instance)))
            continue

        LOG.info('Volume migration for instance {}'.format(str(instance)))
        host = instance.hostname
        old_volume = HostAttr.objects.get(host=host, is_active=True)

        # volume already on the target plan size; on HA plans just make
        # sure the master role moves on before skipping
        if old_volume.nfsaas_size_id == default_plan_size:
            if databaseinfra.plan.is_ha:
                switch_master(databaseinfra, instance)
            continue

        workflow_dict = build_dict(
            databaseinfra=databaseinfra,
            database=database,
            environment=environment,
            plan=plan,
            host=host,
            instance=instance,
            old_volume=old_volume,
            steps=VOLUME_MIGRATION,
        )

        start_workflow(workflow_dict=workflow_dict, task=task_history)

        if workflow_dict['exceptions']['traceback']:
            error = "\n".join(": ".join(err) for err in
                              workflow_dict['exceptions']['error_codes'])
            traceback = "\nException Traceback\n".join(
                workflow_dict['exceptions']['traceback'])
            error = "{}\n{}".format(error, traceback)
            task_history.update_status_for(TaskHistory.STATUS_ERROR,
                                           details=error)
            LOG.info("Migration finished with errors")
            return

        if databaseinfra.plan.is_ha:
            LOG.info("Waiting 60s to check continue...")
            sleep(60)
            switch_master(databaseinfra, instance)
            LOG.info("Waiting 60s to check continue...")
            sleep(60)

    task_history.update_status_for(TaskHistory.STATUS_SUCCESS,
                                   details='Volumes successfully migrated!')
    LOG.info("Migration finished")
    return
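# --- Hedged sketch (illustrative, not part of the task above) ---
# The inner switch_master above is a bounded poll: up to 21 attempts, 10s
# apart (about 3.5 minutes), and it gives up silently if replication never
# catches up. The same pattern, generalized and testable:
import time

def wait_until(predicate, attempts=21, interval=10):
    for _ in range(attempts):
        if predicate():
            return True
        time.sleep(interval)
    return False

# e.g. wait_until(lambda: driver.is_replication_ok(instance))
assert wait_until(lambda: True, attempts=1, interval=0)
assert not wait_until(lambda: False, attempts=3, interval=0)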
def execute_database_region_migration_undo(
        self, database_region_migration_detail_id,
        task_history=None, user=None):
    # AuditRequest.new_request("execute_database_region_migration", user,
    #                          "localhost")
    try:
        if task_history:
            arguments = task_history.arguments
        else:
            arguments = None

        task_history = TaskHistory.register(request=self.request,
                                            task_history=task_history,
                                            user=user,
                                            worker_name=get_worker_name())
        if arguments:
            task_history.arguments = arguments
            task_history.save()

        database_region_migration_detail = \
            DatabaseRegionMigrationDetail.objects.get(
                id=database_region_migration_detail_id)
        database_region_migration = \
            database_region_migration_detail.database_region_migration
        database = database_region_migration.database
        databaseinfra = database.databaseinfra
        source_environment = databaseinfra.environment
        target_environment = source_environment.equivalent_environment
        engine = database.engine_type
        steps = get_engine_steps(engine)
        workflow_steps = steps[
            database_region_migration_detail.step].step_classes

        source_instances = []
        source_hosts = []
        for instance in databaseinfra.instances.filter(
                future_instance__isnull=False):
            source_instances.append(instance)
            source_hosts.append(instance.hostname)

        target_instances = []
        target_hosts = []
        for instance in databaseinfra.instances.filter(
                future_instance__isnull=True):
            target_instances.append(instance)
            target_hosts.append(instance.hostname)

        source_plan = databaseinfra.plan
        target_plan = source_plan.equivalent_plan_id

        workflow_dict = build_dict(
            database_region_migration_detail=database_region_migration_detail,
            database_region_migration=database_region_migration,
            database=database,
            databaseinfra=databaseinfra,
            source_environment=source_environment,
            target_environment=target_environment,
            steps=workflow_steps,
            engine=engine,
            source_instances=source_instances,
            source_plan=source_plan,
            target_plan=target_plan,
            source_hosts=source_hosts,
            target_instances=target_instances,
            target_hosts=target_hosts)

        stop_workflow(workflow_dict=workflow_dict, task=task_history)

        task_history.update_status_for(
            TaskHistory.STATUS_SUCCESS,
            details='Database region migration finished successfully')
    except Exception as e:
        traceback = full_stack()
        LOG.error("Ops... something went wrong: %s" % e)
        LOG.error(traceback)
        task_history.update_status_for(TaskHistory.STATUS_ERROR,
                                       details=traceback)
    return
def clone_database(self, origin_database, clone_name, plan, environment,
                   task_history=None, user=None):
    AuditRequest.new_request("clone_database", user, "localhost")
    try:
        worker_name = get_worker_name()
        LOG.info("id: %s | task: %s | kwargs: %s | args: %s" % (
            self.request.id, self.request.task,
            self.request.kwargs, str(self.request.args)))
        task_history = TaskHistory.register(request=self.request,
                                            task_history=task_history,
                                            user=user,
                                            worker_name=worker_name)

        LOG.info("origin_database: %s" % origin_database)

        task_history.update_details(persist=True, details="Loading Process...")
        result = clone_infra(
            plan=plan, environment=environment, name=clone_name,
            team=origin_database.team, project=origin_database.project,
            description=origin_database.description,
            task=task_history, clone=origin_database,
        )

        if not result['created']:
            if 'exceptions' in result:
                error = "\n\n".join(
                    ": ".join(err) for err in
                    result['exceptions']['error_codes'])
                traceback = "\n\nException Traceback\n".join(
                    result['exceptions']['traceback'])
                error = "{}\n{}".format(error, traceback)
            else:
                error = ("There is no available infrastructure to allocate "
                         "this database.")
            task_history.update_status_for(TaskHistory.STATUS_ERROR,
                                           details=error)
            return

        task_history.update_dbid(db=result['database'])
        task_history.update_status_for(
            TaskHistory.STATUS_SUCCESS,
            details='\nDatabase cloned successfully')
    except SoftTimeLimitExceeded:
        LOG.error("task id %s - timeout exceeded" % self.request.id)
        task_history.update_status_for(TaskHistory.STATUS_ERROR,
                                       details="timeout exceeded")
        if 'result' in locals() and result['created']:
            destroy_infra(databaseinfra=result['databaseinfra'],
                          task=task_history)
            return
    except Exception as e:
        traceback = full_stack()
        LOG.error("Ops... something went wrong: %s" % e)
        LOG.error(traceback)

        if 'result' in locals() and result['created']:
            destroy_infra(databaseinfra=result['databaseinfra'],
                          task=task_history)

        task_history.update_status_for(TaskHistory.STATUS_ERROR,
                                       details=traceback)
    return
def change_parameters_database(self, database, user, task, since_step=0):
    worker_name = get_worker_name()
    task = TaskHistory.register(self.request, user, task, worker_name)

    infra = database.infra
    plan = infra.plan
    class_path = plan.replication_topology.class_path

    from physical.models import DatabaseInfraParameter
    changed_parameters = \
        DatabaseInfraParameter.get_databaseinfra_changed_parameters(
            databaseinfra=infra,
        )

    # a single static parameter forces the non-dynamic (restart) plan
    all_dynamic = True
    custom_procedure = None
    for changed_parameter in changed_parameters:
        if changed_parameter.parameter.dynamic is False:
            all_dynamic = False
            break
    for changed_parameter in changed_parameters:
        if changed_parameter.parameter.custom_method:
            custom_procedure = changed_parameter.parameter.custom_method
            break

    steps = get_database_change_parameter_setting(
        class_path, all_dynamic, custom_procedure)
    LOG.info(steps)

    task.add_detail("Changed parameters:", level=0)
    for changed_parameter in changed_parameters:
        msg = "{}: old value: [{}], new value: [{}]".format(
            changed_parameter.parameter.name,
            changed_parameter.current_value,
            changed_parameter.value
        )
        task.add_detail(msg, level=1)
    task.add_detail("", level=0)

    if since_step > 0:
        steps_dec = get_database_change_parameter_retry_steps_count(
            class_path, all_dynamic, custom_procedure)
        LOG.info('since_step: {}, steps_dec: {}'.format(since_step, steps_dec))
        since_step = since_step - steps_dec
        if since_step < 0:
            since_step = 0

    database_change_parameter = DatabaseChangeParameter()
    database_change_parameter.database = database
    database_change_parameter.task = task
    database_change_parameter.save()

    instances_to_change_parameters = \
        infra.get_driver().get_database_instances()

    success = steps_for_instances(
        steps, instances_to_change_parameters, task,
        database_change_parameter.update_step, since_step
    )

    if success:
        database_change_parameter.set_success()
        task.update_status_for(TaskHistory.STATUS_SUCCESS, 'Done')
    else:
        database_change_parameter.set_error()
        task.update_status_for(
            TaskHistory.STATUS_ERROR,
            'Could not change the database parameters.\n'
            'Changing parameters has no rollback'
        )
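# --- Hedged sketch (illustrative, not part of the task above) ---
# One reading of the retry adjustment above: when resuming from since_step,
# get_database_change_parameter_retry_steps_count reports by how many steps
# the retry plan and the original plan differ, so the recorded offset is
# shifted by that count and clamped at zero before being replayed.
def adjust_since_step(since_step, steps_dec):
    if since_step <= 0:
        return 0
    return max(since_step - steps_dec, 0)

assert adjust_since_step(0, 2) == 0
assert adjust_since_step(5, 2) == 3
assert adjust_since_step(1, 2) == 0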
def execute_database_region_migration(
        self, database_region_migration_detail_id,
        task_history=None, user=None):
    # AuditRequest.new_request("execute_database_region_migration", user,
    #                          "localhost")
    try:
        if task_history:
            arguments = task_history.arguments
        else:
            arguments = None

        task_history = TaskHistory.register(request=self.request,
                                            task_history=task_history,
                                            user=user,
                                            worker_name=get_worker_name())
        if arguments:
            task_history.arguments = arguments
            task_history.save()

        database_region_migration_detail = \
            DatabaseRegionMigrationDetail.objects.get(
                id=database_region_migration_detail_id)
        database_region_migration = \
            database_region_migration_detail.database_region_migration
        database = database_region_migration.database
        databaseinfra = database.databaseinfra
        source_environment = databaseinfra.environment
        target_environment = source_environment.equivalent_environment
        engine = database.engine_type
        steps = get_engine_steps(engine)
        workflow_steps = steps[
            database_region_migration_detail.step].step_classes

        source_instances = []
        source_hosts = []
        for instance in Instance.objects.filter(databaseinfra=databaseinfra):
            source_instances.append(instance)
            if instance.instance_type != instance.REDIS_SENTINEL:
                source_hosts.append(instance.hostname)

        source_plan = databaseinfra.plan
        target_plan = source_plan.equivalent_plan_id

        workflow_dict = build_dict(
            # database_region_migration_detail=database_region_migration_detail,
            # database_region_migration=database_region_migration,
            # database=database,
            databaseinfra=databaseinfra,
            # source_environment=source_environment,
            target_environment=target_environment,
            steps=workflow_steps,
            # engine=engine,
            source_instances=source_instances,
            source_hosts=source_hosts,
            # source_plan=source_plan,
            target_plan=target_plan,
        )

        start_workflow(workflow_dict=workflow_dict, task=task_history)

        if not workflow_dict['created']:
            if 'exceptions' in workflow_dict:
                error = "\n".join(
                    ": ".join(err) for err in
                    workflow_dict['exceptions']['error_codes'])
                traceback = "\nException Traceback\n".join(
                    workflow_dict['exceptions']['traceback'])
                error = "{}\n{}".format(error, traceback)
            else:
                error = ("There is no available infrastructure to allocate "
                         "this database.")
            task_history.update_status_for(TaskHistory.STATUS_ERROR,
                                           details=error)
            return
        else:
            task_history.update_status_for(
                TaskHistory.STATUS_SUCCESS,
                details='Database region migration finished successfully')
            return
    except Exception as e:
        traceback = full_stack()
        LOG.error("Ops... something went wrong: %s" % e)
        LOG.error(traceback)
        task_history.update_status_for(TaskHistory.STATUS_ERROR,
                                       details=traceback)
        return
def resize_database(self, database, cloudstackpack, task_history=None,
                    user=None):
    AuditRequest.new_request("resize_database", user, "localhost")
    try:
        worker_name = get_worker_name()
        task_history = TaskHistory.register(request=self.request,
                                            task_history=task_history,
                                            user=user,
                                            worker_name=worker_name)
        from util.providers import resize_database_instances
        from util import get_credentials_for
        from dbaas_cloudstack.provider import CloudStackProvider
        from dbaas_credentials.models import CredentialType

        cs_credentials = get_credentials_for(
            environment=database.environment,
            credential_type=CredentialType.CLOUDSTACK)
        cs_provider = CloudStackProvider(credentials=cs_credentials)

        databaseinfra = database.databaseinfra
        driver = databaseinfra.get_driver()
        instances = driver.get_slave_instances()
        instances.append(driver.get_master_instance())
        instances_to_resize = []
        resized_instances = []

        disable_zabbix_alarms(database)

        # instances already on the target offering are counted as done
        for instance in instances:
            host = instance.hostname
            host_attr = host.cs_host_attributes.get()
            offering_id = cs_provider.get_vm_offering_id(
                vm_id=host_attr.vm_id, project_id=cs_credentials.project)

            if offering_id == cloudstackpack.offering.serviceofferingid:
                LOG.info("Instance offering: {}".format(offering_id))
                resized_instances.append(instance)
            else:
                instances_to_resize.append(instance)

        result = resize_database_instances(database=database,
                                           cloudstackpack=cloudstackpack,
                                           instances=instances_to_resize,
                                           task=task_history)

        if result['created']:
            resized_instances += result['completed_instances']
        else:
            if 'exceptions' not in result:
                error = "Something went wrong."
            else:
                error = "\n".join(
                    ": ".join(err) for err in
                    result['exceptions']['error_codes'])
                traceback = "\nException Traceback\n".join(
                    result['exceptions']['traceback'])
                error = "{}\n{}".format(error, traceback)

        if databaseinfra.plan.is_ha:
            LOG.info("Waiting 60s to check continue...")
            sleep(60)
            instance = driver.get_slave_instances()[0]
            driver.check_replication_and_switch(instance)

        if len(instances) == len(resized_instances):
            from dbaas_cloudstack.models import DatabaseInfraOffering
            LOG.info('Updating offering DatabaseInfra.')

            databaseinfraoffering = DatabaseInfraOffering.objects.get(
                databaseinfra=databaseinfra)
            databaseinfraoffering.offering = cloudstackpack.offering
            databaseinfraoffering.save()

            if databaseinfra.engine.engine_type.name == 'redis':
                new_max_memory = databaseinfraoffering.offering.memory_size_mb
                resize_factor = 0.5
                if new_max_memory > 1024:
                    resize_factor = 0.75

                new_max_memory *= resize_factor
                databaseinfra.per_database_size_mbytes = int(new_max_memory)
                databaseinfra.save()

            task_history.update_status_for(
                TaskHistory.STATUS_SUCCESS,
                details='Resize successfully done.')
            return

        task_history.update_status_for(TaskHistory.STATUS_ERROR,
                                       details=error)
        return
    except Exception as e:
        error = "Resize Database ERROR: {}".format(e)
        LOG.error(error)
        task_history.update_status_for(TaskHistory.STATUS_ERROR,
                                       details=error)
    finally:
        enable_zabbix_alarms(database)
        AuditRequest.cleanup_request()
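# --- Hedged sketch (illustrative, not part of the task above) ---
# The Redis sizing rule in resize_database, isolated: the per-database limit
# is 50% of the offering's memory up to 1024 MB and 75% above it, presumably
# to leave proportionally more headroom on small machines.
def redis_per_database_size_mb(offering_memory_mb):
    resize_factor = 0.75 if offering_memory_mb > 1024 else 0.5
    return int(offering_memory_mb * resize_factor)

assert redis_per_database_size_mb(1024) == 512
assert redis_per_database_size_mb(4096) == 3072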
def register_task_history(self, task):
    return TaskHistory.register(request=self.request, task_history=task,
                                user=task.user,
                                worker_name=get_worker_name())
def database_disk_resize(self, database, disk_offering, task_history, user):
    from dbaas_nfsaas.models import HostAttr
    from workflow.steps.util.nfsaas_utils import resize_disk

    AuditRequest.new_request("database_disk_resize", user, "localhost")

    databaseinfra = database.databaseinfra
    old_disk_offering = database.databaseinfra.disk_offering
    resized = []

    try:
        worker_name = get_worker_name()
        task_history = TaskHistory.register(request=self.request,
                                            task_history=task_history,
                                            user=user,
                                            worker_name=worker_name)

        task_history.update_details(persist=True,
                                    details='\nLoading Disk offering')

        for instance in databaseinfra.get_driver().get_database_instances():
            if not HostAttr.objects.filter(
                    host_id=instance.hostname_id).exists():
                continue

            task_history.update_details(
                persist=True,
                details='\nChanging instance {} to NFS {}'.format(
                    instance, disk_offering))
            if resize_disk(environment=database.environment,
                           host=instance.hostname,
                           disk_offering=disk_offering):
                resized.append(instance)

        task_history.update_details(
            persist=True,
            details='\nUpdate DBaaS metadata from {} to {}'.format(
                databaseinfra.disk_offering, disk_offering))
        databaseinfra.disk_offering = disk_offering
        databaseinfra.save()

        task_history.update_status_for(
            status=TaskHistory.STATUS_SUCCESS,
            details='\nDisk resize successfully done.')
        return True
    except Exception as e:
        error = "Disk resize ERROR: {}".format(e)
        LOG.error(error)

        if databaseinfra.disk_offering != old_disk_offering:
            task_history.update_details(
                persist=True, details='\nUndo update DBaaS metadata')
            databaseinfra.disk_offering = old_disk_offering
            databaseinfra.save()

        for instance in resized:
            task_history.update_details(
                persist=True,
                details='\nUndo NFS change for instance {}'.format(instance))
            resize_disk(environment=database.environment,
                        host=instance.hostname,
                        disk_offering=old_disk_offering)

        task_history.update_status_for(TaskHistory.STATUS_ERROR,
                                       details=error)
    finally:
        AuditRequest.cleanup_request()
def execute_database_region_migration_undo(
        self, database_region_migration_detail_id,
        task_history=None, user=None):
    AuditRequest.new_request("execute_database_region_migration", user,
                             "localhost")
    try:
        if task_history:
            arguments = task_history.arguments
        else:
            arguments = None

        task_history = TaskHistory.register(request=self.request,
                                            task_history=task_history,
                                            user=user,
                                            worker_name=get_worker_name())
        if arguments:
            task_history.arguments = arguments
            task_history.save()

        database_region_migration_detail = \
            DatabaseRegionMigrationDetail.objects.get(
                id=database_region_migration_detail_id)

        database_region_migration_detail.started_at = datetime.now()
        database_region_migration_detail.status = \
            database_region_migration_detail.RUNNING
        database_region_migration_detail.is_migration_up = False
        database_region_migration_detail.save()

        database_region_migration = \
            database_region_migration_detail.database_region_migration
        database = database_region_migration.database
        databaseinfra = database.databaseinfra
        source_environment = databaseinfra.environment
        target_environment = source_environment.equivalent_environment
        engine = database.engine_type
        steps = get_engine_steps(engine)
        workflow_steps = steps[
            database_region_migration_detail.step].step_classes

        source_instances = []
        source_hosts = []
        for instance in databaseinfra.instances.filter(
                future_instance__isnull=False):
            source_instances.append(instance)
            if instance.instance_type != instance.REDIS:
                source_hosts.append(instance.hostname)

        target_instances = []
        target_hosts = []
        for instance in databaseinfra.instances.filter(
                future_instance__isnull=True):
            target_instances.append(instance)
            if instance.instance_type != instance.REDIS:
                target_hosts.append(instance.hostname)

        source_plan = databaseinfra.plan
        target_plan = source_plan.equivalent_plan_id

        if not source_hosts:
            raise Exception('There is no source host')
        if not source_instances:
            raise Exception('There is no source instance')
        if not target_hosts:
            raise Exception('There is no target host')
        if not target_instances:
            raise Exception('There is no target instance')

        source_secondary_ips = DatabaseInfraAttr.objects.filter(
            databaseinfra=databaseinfra,
            equivalent_dbinfraattr__isnull=False)
        source_secondary_ips = list(source_secondary_ips)

        workflow_dict = build_dict(
            database_region_migration_detail=database_region_migration_detail,
            database_region_migration=database_region_migration,
            database=database,
            databaseinfra=databaseinfra,
            source_environment=source_environment,
            target_environment=target_environment,
            steps=workflow_steps,
            engine=engine,
            source_instances=source_instances,
            source_plan=source_plan,
            target_plan=target_plan,
            source_hosts=source_hosts,
            target_instances=target_instances,
            target_hosts=target_hosts,
            source_secondary_ips=source_secondary_ips,
        )

        stop_workflow(workflow_dict=workflow_dict, task=task_history)

        # the undo rewinds the migration cursor by one step
        current_step = database_region_migration.current_step
        database_region_migration.current_step = current_step - 1
        database_region_migration.save()

        database_region_migration_detail.status = \
            database_region_migration_detail.SUCCESS
        database_region_migration_detail.finished_at = datetime.now()
        database_region_migration_detail.save()

        task_history.update_status_for(
            TaskHistory.STATUS_SUCCESS,
            details='Database region migration finished successfully')
    except Exception as e:
        traceback = full_stack()
        LOG.error("Ops... something went wrong: %s" % e)
        LOG.error(traceback)

        task_history.update_status_for(TaskHistory.STATUS_ERROR,
                                       details=traceback)

        database_region_migration_detail.status = \
            database_region_migration_detail.ERROR
        database_region_migration_detail.finished_at = datetime.now()
        database_region_migration_detail.save()
    return
def execute_database_region_migration(
        self, database_region_migration_detail_id,
        task_history=None, user=None):
    AuditRequest.new_request("execute_database_region_migration", user,
                             "localhost")
    try:
        if task_history:
            arguments = task_history.arguments
        else:
            arguments = None

        task_history = TaskHistory.register(request=self.request,
                                            task_history=task_history,
                                            user=user,
                                            worker_name=get_worker_name())
        if arguments:
            task_history.arguments = arguments
            task_history.save()

        database_region_migration_detail = \
            DatabaseRegionMigrationDetail.objects.get(
                id=database_region_migration_detail_id)

        database_region_migration_detail.started_at = datetime.now()
        database_region_migration_detail.status = \
            database_region_migration_detail.RUNNING
        database_region_migration_detail.save()

        database_region_migration = \
            database_region_migration_detail.database_region_migration
        database = database_region_migration.database
        databaseinfra = database.databaseinfra
        source_environment = databaseinfra.environment
        target_environment = source_environment.equivalent_environment
        engine = database.engine_type
        steps = get_engine_steps(engine)
        workflow_steps = steps[
            database_region_migration_detail.step].step_classes

        source_instances = []
        source_hosts = []
        for instance in Instance.objects.filter(databaseinfra=databaseinfra):
            # past the first step, only instances already paired with a
            # future instance take part in the migration
            if database_region_migration.current_step > 0 and \
                    not instance.future_instance:
                continue
            source_instances.append(instance)
            if instance.instance_type != instance.REDIS:
                source_hosts.append(instance.hostname)

        source_plan = databaseinfra.plan
        target_plan = source_plan.equivalent_plan

        source_offering = databaseinfra.cs_dbinfra_offering.get().offering
        target_offering = source_offering.equivalent_offering

        source_secondary_ips = []
        for secondary_ip in DatabaseInfraAttr.objects.filter(
                databaseinfra=databaseinfra):
            if database_region_migration.current_step > 0 and \
                    not secondary_ip.equivalent_dbinfraattr:
                continue
            source_secondary_ips.append(secondary_ip)

        workflow_dict = build_dict(
            databaseinfra=databaseinfra,
            database=database,
            source_environment=source_environment,
            target_environment=target_environment,
            steps=workflow_steps,
            source_instances=source_instances,
            source_hosts=source_hosts,
            source_plan=source_plan,
            target_plan=target_plan,
            source_offering=source_offering,
            target_offering=target_offering,
            source_secondary_ips=source_secondary_ips,
        )

        start_workflow(workflow_dict=workflow_dict, task=task_history)

        if not workflow_dict['created']:
            if 'exceptions' in workflow_dict:
                error = "\n".join(
                    ": ".join(err) for err in
                    workflow_dict['exceptions']['error_codes'])
                traceback = "\nException Traceback\n".join(
                    workflow_dict['exceptions']['traceback'])
                error = "{}\n{}".format(error, traceback)
            else:
                error = ("There is no available infrastructure to allocate "
                         "this database.")

            database_region_migration_detail.status = \
                database_region_migration_detail.ROLLBACK
            database_region_migration_detail.finished_at = datetime.now()
            database_region_migration_detail.save()

            task_history.update_status_for(TaskHistory.STATUS_ERROR,
                                           details=error)
            return
        else:
            database_region_migration_detail.status = \
                database_region_migration_detail.SUCCESS
            database_region_migration_detail.finished_at = datetime.now()
            database_region_migration_detail.save()

            # success advances the migration cursor by one step
            current_step = database_region_migration.current_step
            database_region_migration.current_step = current_step + 1
            database_region_migration.save()

            task_history.update_status_for(
                TaskHistory.STATUS_SUCCESS,
                details='Database region migration finished successfully')
            return
    except Exception as e:
        traceback = full_stack()
        LOG.error("Ops... something went wrong: %s" % e)
        LOG.error(traceback)

        database_region_migration_detail.status = \
            database_region_migration_detail.ROLLBACK
        database_region_migration_detail.finished_at = datetime.now()
        database_region_migration_detail.save()

        task_history.update_status_for(TaskHistory.STATUS_ERROR,
                                       details=traceback)
    return
def make_databases_backup(self):
    LOG.info("Making databases backups")
    worker_name = get_worker_name()
    task_history = TaskHistory.register(request=self.request,
                                        worker_name=worker_name, user=None)
    task_history.relevance = TaskHistory.RELEVANCE_ERROR

    waiting_msg = "\nWaiting 5 minutes to start the next backup group"
    status = TaskHistory.STATUS_SUCCESS
    environments = Environment.objects.all()
    prod_envs = Configuration.get_by_name_as_list('prod_envs')
    dev_envs = Configuration.get_by_name_as_list('dev_envs')
    env_names_order = prod_envs + dev_envs
    if not env_names_order:
        env_names_order = [env.name for env in environments]

    infras = DatabaseInfra.objects.filter(plan__has_persistence=True)
    for env_name in env_names_order:
        try:
            env = environments.get(name=env_name)
        except Environment.DoesNotExist:
            continue

        msg = '\nStarting Backup for env {}'.format(env.name)
        task_history.update_details(persist=True, details=msg)
        databaseinfras_by_env = infras.filter(environment=env)
        error = {}
        backup_number = 0
        # Python 2 integer division: how many backups run between the
        # five-minute pauses (zero disables pausing)
        backups_per_group = len(infras) / 12
        for infra in databaseinfras_by_env:
            if not infra.databases.first():
                continue

            if backups_per_group > 0:
                if backup_number < backups_per_group:
                    backup_number += 1
                else:
                    backup_number = 0
                    task_history.update_details(waiting_msg, True)
                    sleep(300)

            group = BackupGroup()
            group.save()

            for instance in infra.instances.filter(read_only=False):
                try:
                    driver = instance.databaseinfra.get_driver()
                    is_eligible = driver.check_instance_is_eligible_for_backup(
                        instance)
                    if not is_eligible:
                        LOG.info('Instance {} is not eligible for '
                                 'backup'.format(instance))
                        continue
                except Exception as e:
                    status = TaskHistory.STATUS_ERROR
                    msg = "Backup for %s was unsuccessful. Error: %s" % (
                        str(instance), str(e))
                    LOG.error(msg)

                time_now = str(strftime("%m/%d/%Y %H:%M:%S"))
                start_msg = "\n{} - Starting backup for {} ...".format(
                    time_now, instance)
                task_history.update_details(persist=True, details=start_msg)
                try:
                    snapshot = make_instance_snapshot_backup(
                        instance=instance, error=error, group=group)
                    if snapshot and snapshot.was_successful:
                        msg = "Backup for %s was successful" % (str(instance))
                        LOG.info(msg)
                    elif snapshot and snapshot.has_warning:
                        status = TaskHistory.STATUS_WARNING
                        msg = "Backup for %s has warning" % (str(instance))
                        LOG.info(msg)
                    else:
                        status = TaskHistory.STATUS_ERROR
                        msg = "Backup for %s was unsuccessful. Error: %s" % (
                            str(instance), error['errormsg'])
                        LOG.error(msg)
                        LOG.info(msg)
                except Exception as e:
                    status = TaskHistory.STATUS_ERROR
                    msg = "Backup for %s was unsuccessful. Error: %s" % (
                        str(instance), str(e))
                    LOG.error(msg)

                time_now = str(strftime("%m/%d/%Y %H:%M:%S"))
                msg = "\n{} - {}".format(time_now, msg)
                task_history.update_details(persist=True, details=msg)

    task_history.update_status_for(status, details="\nBackup finished")
    return
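# --- Hedged sketch (illustrative, not part of the task above) ---
# The pacing logic in make_databases_backup, isolated: the counter pauses the
# task for five minutes after each full group, where the group size is the
# total infra count (note: all infras, not just the current environment's)
# integer-divided by 12. Below 12 infras the guard disables pausing entirely.
def count_pauses(total_infras, waves=12):
    backups_per_group = total_infras // waves
    backup_number = 0
    pauses = 0
    for _ in range(total_infras):
        if backups_per_group > 0:
            if backup_number < backups_per_group:
                backup_number += 1
            else:
                backup_number = 0
                pauses += 1  # the task sleeps 300s here
    return pauses

assert count_pauses(10) == 0  # fewer than 12 infras: never pauses
assert count_pauses(24) == 8  # groups of 2: a pause every third infra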