def schedule_failed_job(self, failed_task_id):
    """Schedule a retry job for a failed telemetry collection task."""
    if self.stopped:
        return

    try:
        job = db.failed_task_get(self.ctx, failed_task_id)
        retry_count = job['retry_count']
        result = job['result']
        job_id = job['job_id']
        if retry_count >= \
                TelemetryCollection.MAX_FAILED_JOB_RETRY_COUNT or \
                result == TelemetryJobStatus.FAILED_JOB_STATUS_SUCCESS:
            LOG.info("Exiting failed task processing for task [%d] "
                     "with result [%s] and retry count [%d] "
                     % (job['id'], result, retry_count))
            self._teardown_task(self.ctx, job['id'], job_id)
            return

        # If the job is already scheduled, skip it
        if job_id and self.scheduler.get_job(job_id):
            return

        try:
            db.task_get(self.ctx, job['task_id'])
        except TaskNotFound as e:
            LOG.info("Removing failed telemetry job as parent job "
                     "does not exist: %s", six.text_type(e))
            # Tear down if the original task is no longer available
            self._teardown_task(self.ctx, job['id'], job_id)
            return

        if not (job_id and self.scheduler.get_job(job_id)):
            job_id = uuidutils.generate_uuid()
            db.failed_task_update(self.ctx, job['id'],
                                  {'job_id': job_id})

            collection_class = importutils.import_class(job['method'])
            instance = collection_class.get_instance(self.ctx, job['id'])
            self.scheduler.add_job(
                instance, 'interval',
                seconds=job['interval'],
                next_run_time=datetime.now(),
                id=job_id,
                misfire_grace_time=int(
                    CONF.telemetry.performance_collection_interval / 2))
            self.job_ids.add(job_id)
    except Exception as e:
        LOG.error("Failed to schedule retry tasks for performance "
                  "collection, reason: %s", six.text_type(e))
    else:
        LOG.info("Schedule collection completed")
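# A minimal, self-contained sketch (not part of the code above) of how such a
# retry handler is registered with APScheduler, mirroring the add_job() call
# in schedule_failed_job(): an 'interval' trigger, an immediate first run via
# next_run_time, and a misfire_grace_time of half the interval. The 900-second
# interval and the print() stand-in for the collection handler are assumptions
# for illustration only.
def _example_schedule_retry():
    from datetime import datetime

    from apscheduler.schedulers.background import BackgroundScheduler
    from oslo_utils import uuidutils

    scheduler = BackgroundScheduler()
    scheduler.start()

    interval = 900                      # assumed collection interval, seconds
    job_id = uuidutils.generate_uuid()  # same uuid scheme as the job table

    scheduler.add_job(
        lambda: print("retrying failed collection"),  # stands in for handler
        'interval',
        seconds=interval,
        next_run_time=datetime.now(),   # fire once now, then every interval
        id=job_id,
        misfire_grace_time=int(interval / 2))
    return scheduler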
def __call__(self):
    # Upon periodic job callback, if the storage is already deleted or
    # soft deleted, do not proceed with the failed performance
    # collection flow
    try:
        failed_task = db.failed_task_get(self.ctx, self.failed_task_id)
        if failed_task["deleted"]:
            LOG.debug('Storage %s getting deleted, ignoring '
                      'performance collection cycle for failed task '
                      'id %s.' % (self.storage_id, self.failed_task_id))
            return
    except exception.FailedTaskNotFound:
        LOG.debug('Storage %s already deleted, ignoring '
                  'performance collection cycle for failed task id %s.'
                  % (self.storage_id, self.failed_task_id))
        return

    # Pull performance collection info
    self.retry_count = self.retry_count + 1
    try:
        status = self.task_rpcapi.collect_telemetry(
            self.ctx, self.storage_id,
            PerformanceCollectionTask.__module__ + '.' +
            PerformanceCollectionTask.__name__,
            self.args, self.start_time, self.end_time)

        if not status:
            raise exception.TelemetryTaskExecError()
    except Exception as e:
        LOG.error(e)
        msg = _("Failed to collect performance metrics for storage "
                "id:{0}, reason:{1}".format(self.storage_id,
                                            six.text_type(e)))
        LOG.error(msg)
    else:
        LOG.info("Successfully completed performance metrics collection "
                 "for storage id:{0}".format(self.storage_id))
        self.result = TelemetryJobStatus.FAILED_JOB_STATUS_SUCCESS
        self._stop_task()
        return

    if self.retry_count >= TelemetryCollection.MAX_FAILED_JOB_RETRY_COUNT:
        msg = _("Failed to collect performance metrics of task instance "
                "id:{0} for start time:{1} and end time:{2} with "
                "maximum retry. Giving up on retry"
                .format(self.failed_task_id, self.start_time,
                        self.end_time))
        LOG.error(msg)
        self._stop_task()
        return

    self.result = TelemetryJobStatus.FAILED_JOB_STATUS_RETRYING
    db.failed_task_update(self.ctx, self.failed_task_id,
                          {FailedTask.retry_count.name: self.retry_count,
                           FailedTask.result.name: self.result})
def remove_failed_job(self, failed_task_id):
    try:
        LOG.info("Received failed job %s to remove", failed_task_id)
        job = db.failed_task_get(self.ctx, failed_task_id)
        job_id = job['job_id']
        self.remove_scheduled_job(job_id)
        db.failed_task_delete(self.ctx, job['id'])
        LOG.info("Removed failed_task entry %s", job['id'])
    except Exception as e:
        LOG.error("Failed to remove periodic scheduling job, reason: %s.",
                  six.text_type(e))
def get_instance(ctx, failed_task_id):
    # Build a handler from the failed_task row and its parent task row
    failed_task = db.failed_task_get(ctx, failed_task_id)
    task = db.task_get(ctx, failed_task[FailedTask.task_id.name])
    return FailedPerformanceCollectionHandler(
        ctx,
        failed_task[FailedTask.id.name],
        task[Task.storage_id.name],
        task[Task.args.name],
        failed_task[FailedTask.job_id.name],
        failed_task[FailedTask.retry_count.name],
        failed_task[FailedTask.start_time.name],
        failed_task[FailedTask.end_time.name],
    )
def get_local_executor(self, context, task_id, failed_task_id, executor):
    executor_names = self.executor_map.keys()
    storage_id = None
    if task_id:
        job = db.task_get(context, task_id)
        storage_id = job['storage_id']
    elif failed_task_id:
        job = db.failed_task_get(context, failed_task_id)
        storage_id = job['storage_id']
    else:
        raise exception.InvalidInput("Missing task id")

    # Storage already assigned to a local executor: reuse its topic
    for name in executor_names:
        executor_topic = "{0}:{1}".format(executor, name)
        if storage_id in self.executor_map[name]["storages"]:
            return executor_topic

    # Reuse an existing executor that still has spare capacity
    for name in executor_names:
        no_of_storages = len(self.executor_map[name]["storages"])
        if no_of_storages and \
                no_of_storages < CONF.telemetry.max_storages_in_child:
            executor_topic = "{0}:{1}".format(executor, name)
            LOG.info("Selecting existing local executor {0} for {1}"
                     .format(executor_topic, storage_id))
            self.executor_map[name]["storages"].append(storage_id)
            return executor_topic

    # Otherwise create a new local executor process for this storage
    for index in range(CONF.telemetry.max_childs_in_node):
        name = "executor_{0}".format(index + 1)
        if name not in executor_names:
            executor_topic = "{0}:{1}".format(executor, name)
            LOG.info("Creating a new local executor {0} for {1}".format(
                executor_topic, storage_id))
            launcher = self.create_process(topic=executor_topic,
                                           host=executor)
            self.executor_map[name] = {
                "storages": [storage_id],
                "launcher": launcher,
                "cleanup_delay": 0
            }
            return executor_topic

    msg = "Reached maximum number of ({0}) local executors".format(
        CONF.telemetry.max_childs_in_node)
    LOG.error(msg)
    raise RuntimeError(msg)
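# A minimal, self-contained sketch (assumptions only, not the API above) of
# the selection policy in get_local_executor(): route a storage to the
# executor that already holds it, else to an existing executor with spare
# capacity, else to a newly created one, up to a per-node limit. The two
# limits below stand in for CONF.telemetry.max_storages_in_child and
# CONF.telemetry.max_childs_in_node.
MAX_STORAGES_IN_CHILD = 5   # assumed capacity per local executor
MAX_CHILDS_IN_NODE = 4      # assumed number of executors per node


def _pick_executor(executor_map, storage_id):
    # 1. Storage already assigned: reuse its executor.
    for name, storages in executor_map.items():
        if storage_id in storages:
            return name
    # 2. Reuse a non-empty executor that still has capacity.
    for name, storages in executor_map.items():
        if storages and len(storages) < MAX_STORAGES_IN_CHILD:
            storages.append(storage_id)
            return name
    # 3. Create a new executor slot while the per-node limit allows it.
    for index in range(MAX_CHILDS_IN_NODE):
        name = "executor_{0}".format(index + 1)
        if name not in executor_map:
            executor_map[name] = [storage_id]
            return name
    raise RuntimeError("Reached maximum number of local executors")


# e.g. _pick_executor({"executor_1": ["storage-a"]}, "storage-b") appends
# "storage-b" to executor_1 and returns "executor_1".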
def remove_failed_job(self, context, failed_task_id, executor):
    if not self.enable_sub_process:
        instance = FailedJobHandler.get_instance(context, failed_task_id)
        instance.remove_failed_job(failed_task_id)
    else:
        job = db.failed_task_get(context, failed_task_id)
        storage_id = job['storage_id']
        # Iterate over a copy of the keys, since stop_executor() may
        # remove entries from executor_map while we loop
        for name in list(self.executor_map.keys()):
            if storage_id in self.executor_map[name]["storages"]:
                local_executor = "{0}:{1}".format(executor, name)
                self.rpcapi.remove_failed_job_local(
                    context, failed_task_id, local_executor)

                # Stop the local executor once the storage has no
                # remaining tasks or failed tasks
                tasks, failed_tasks = self.get_all_tasks(storage_id)
                if len(failed_tasks) == 0 and len(tasks) == 0:
                    self.stop_executor(name, local_executor, storage_id)