def schedule_failed_job(self, failed_task_id):
    if self.stopped:
        return

    try:
        job = db.failed_task_get(self.ctx, failed_task_id)
        retry_count = job['retry_count']
        result = job['result']
        job_id = job['job_id']
        if retry_count >= \
                TelemetryCollection.MAX_FAILED_JOB_RETRY_COUNT or \
                result == TelemetryJobStatus.FAILED_JOB_STATUS_SUCCESS:
            LOG.info("Exiting failure task processing for task [%d] "
                     "with result [%s] and retry count [%d]"
                     % (job['id'], result, retry_count))
            self._teardown_task(self.ctx, job['id'], job_id)
            return

        # If the job is already scheduled, skip
        if job_id and self.scheduler.get_job(job_id):
            return

        try:
            db.task_get(self.ctx, job['task_id'])
        except TaskNotFound as e:
            LOG.info("Removing failed telemetry job as parent job "
                     "does not exist: %s", six.text_type(e))
            # Tear down if the original task is not available
            self._teardown_task(self.ctx, job['id'], job_id)
            return

        if not (job_id and self.scheduler.get_job(job_id)):
            job_id = uuidutils.generate_uuid()
            db.failed_task_update(self.ctx, job['id'],
                                  {'job_id': job_id})

            collection_class = importutils.import_class(job['method'])
            instance = \
                collection_class.get_instance(self.ctx, job['id'])
            self.scheduler.add_job(
                instance, 'interval',
                seconds=job['interval'],
                next_run_time=datetime.now(), id=job_id,
                misfire_grace_time=int(
                    CONF.telemetry.performance_collection_interval / 2))
            self.job_ids.add(job_id)
    except Exception as e:
        LOG.error("Failed to schedule retry tasks for performance "
                  "collection, reason: %s", six.text_type(e))
    else:
        LOG.info("Schedule collection completed")
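# A minimal sketch of the retry exit criteria above (constant names are
# taken from this module; the values shown are illustrative only):
#
#   retry_count = 5                      # hypothetical current retry count
#   max_retries = TelemetryCollection.MAX_FAILED_JOB_RETRY_COUNT
#   result = TelemetryJobStatus.FAILED_JOB_STATUS_SUCCESS
#   # The failed job is torn down once it either succeeds or exhausts
#   # its retries; otherwise a retry job is (re)scheduled:
#   if retry_count >= max_retries or \
#           result == TelemetryJobStatus.FAILED_JOB_STATUS_SUCCESS:
#       teardown()                       # hypothetical stand-in for
#                                        # self._teardown_task(...)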
def remove_job(self, task_id):
    try:
        LOG.info("Received job %s to remove", task_id)
        job = db.task_get(self.ctx, task_id)
        job_id = job['job_id']
        self.remove_scheduled_job(job_id)
    except Exception as e:
        LOG.error("Failed to remove periodic scheduling job, reason: %s.",
                  six.text_type(e))
def get_instance(ctx, failed_task_id):
    failed_task = db.failed_task_get(ctx, failed_task_id)
    task = db.task_get(ctx, failed_task[FailedTask.task_id.name])
    return FailedPerformanceCollectionHandler(
        ctx,
        failed_task[FailedTask.id.name],
        task[Task.storage_id.name],
        task[Task.args.name],
        failed_task[FailedTask.job_id.name],
        failed_task[FailedTask.retry_count.name],
        failed_task[FailedTask.start_time.name],
        failed_task[FailedTask.end_time.name],
    )
def get_local_executor(self, context, task_id, failed_task_id, executor):
    executor_names = self.executor_map.keys()
    storage_id = None
    if task_id:
        job = db.task_get(context, task_id)
        storage_id = job['storage_id']
    elif failed_task_id:
        job = db.failed_task_get(context, failed_task_id)
        storage_id = job['storage_id']
    else:
        raise exception.InvalidInput("Missing task id")

    # Reuse the executor that already handles this storage
    for name in executor_names:
        executor_topic = "{0}:{1}".format(executor, name)
        if storage_id in self.executor_map[name]["storages"]:
            return executor_topic

    # Reuse an existing executor that still has spare capacity
    for name in executor_names:
        no_of_storages = len(self.executor_map[name]["storages"])
        if no_of_storages and \
                no_of_storages < CONF.telemetry.max_storages_in_child:
            executor_topic = "{0}:{1}".format(executor, name)
            LOG.info("Selecting existing local executor {0} for {1}"
                     .format(executor_topic, storage_id))
            self.executor_map[name]["storages"].append(storage_id)
            return executor_topic

    # Otherwise create a new executor, up to max_childs_in_node
    for index in range(CONF.telemetry.max_childs_in_node):
        name = "executor_{0}".format(index + 1)
        if name not in executor_names:
            executor_topic = "{0}:{1}".format(executor, name)
            LOG.info("Creating a new local executor {0} for {1}".format(
                executor_topic, storage_id))
            launcher = self.create_process(topic=executor_topic,
                                           host=executor)
            self.executor_map[name] = {
                "storages": [storage_id],
                "launcher": launcher,
                "cleanup_delay": 0
            }
            return executor_topic

    msg = "Reached maximum number of ({0}) local executors".format(
        CONF.telemetry.max_childs_in_node)
    LOG.error(msg)
    raise RuntimeError(msg)
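# Illustrative sketch of the selection order above, assuming a parent
# executor topic of "telemetry-executor" and a hypothetical executor_map
# state; the returned topic is always "<executor>:<child name>":
#
#   executor_map = {
#       "executor_1": {"storages": ["storage-a"], ...},
#   }
#   # 1. storage already mapped        -> "telemetry-executor:executor_1"
#   # 2. a child with spare capacity   -> append the storage, reuse topic
#   # 3. otherwise spawn executor_2    -> "telemetry-executor:executor_2"
#   # 4. all max_childs_in_node in use -> RuntimeError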
def remove_job(self, context, task_id, executor):
    if not self.enable_sub_process:
        instance = JobHandler.get_instance(context, task_id)
        instance.remove_job(task_id)
    else:
        job = db.task_get(context, task_id)
        storage_id = job['storage_id']
        for name in self.executor_map.keys():
            if storage_id in self.executor_map[name]["storages"]:
                local_executor = "{0}:{1}".format(executor, name)
                self.rpcapi.remove_job_local(context, task_id,
                                             local_executor)
                tasks, failed_tasks = self.get_all_tasks(storage_id)
                if len(failed_tasks) == 0 and len(tasks) == 0:
                    self.stop_executor(name, local_executor, storage_id)
def __call__(self):
    # Upon periodic job callback, if the storage is already deleted or
    # soft-deleted, do not proceed with the performance collection flow
    try:
        task = db.task_get(self.ctx, self.task_id)
        if task["deleted"]:
            LOG.debug('Storage %s getting deleted, ignoring performance '
                      'collection cycle for task id %s.'
                      % (self.storage_id, self.task_id))
            return
    except exception.TaskNotFound:
        LOG.debug('Storage %s already deleted, ignoring performance '
                  'collection cycle for task id %s.'
                  % (self.storage_id, self.task_id))
        return

    # Handles performance collection from driver and dispatch
    start_time = None
    end_time = None
    try:
        LOG.debug('Collecting performance metrics for task id: %s'
                  % self.task_id)
        current_time = int(datetime.utcnow().timestamp())

        # Times are epoch time in milliseconds
        end_time = current_time * 1000
        start_time = end_time - (self.interval * 1000)
        status = self.task_rpcapi.collect_telemetry(
            self.ctx, self.storage_id,
            telemetry.TelemetryTask.__module__ + '.'
            + 'PerformanceCollectionTask',
            self.args, start_time, end_time)
        db.task_update(self.ctx, self.task_id,
                       {'last_run_time': current_time})

        if not status:
            raise exception.TelemetryTaskExecError()
    except Exception as e:
        LOG.error("Failed to collect performance metrics for "
                  "task id: {0}, reason: {1}".format(self.task_id,
                                                     six.text_type(e)))
        self._handle_task_failure(start_time, end_time)
    else:
        LOG.debug("Performance collection done for storage id: {0}, "
                  "task id: {1} and interval (in sec): {2}"
                  .format(self.storage_id, self.task_id, self.interval))
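# The collection window above in concrete numbers (a sketch; the timestamp
# is illustrative and self.interval is assumed to be in seconds):
#
#   current_time = 1700000000        # int(datetime.utcnow().timestamp())
#   end_time     = 1700000000000     # current_time * 1000 (epoch ms)
#   start_time   = 1699999700000     # end_time - 300 * 1000, i.e. one
#                                    # 300-second interval before end_time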
def get_instance(ctx, task_id):
    task = db.task_get(ctx, task_id)
    return PerformanceCollectionHandler(ctx, task_id,
                                        task['storage_id'], task['args'],
                                        task['interval'])
def __call__(self): """ :return: """ try: # Remove jobs from scheduler when marked for delete filters = {'deleted': True} failed_tasks = db.failed_task_get_all(self.ctx, filters=filters) LOG.debug("Total failed_tasks found deleted " "in this cycle:%s" % len(failed_tasks)) for failed_task in failed_tasks: job_id = failed_task['job_id'] if job_id and self.scheduler.get_job(job_id): self.scheduler.remove_job(job_id) db.failed_task_delete(self.ctx, failed_task['id']) except Exception as e: LOG.error("Failed to remove periodic scheduling job , reason: %s.", six.text_type(e)) try: # Create the object of periodic scheduler failed_tasks = db.failed_task_get_all(self.ctx) if not len(failed_tasks): LOG.info("No failed task found for performance collection") return LOG.debug("Schedule performance collection triggered: total " "failed tasks:%s" % len(failed_tasks)) for failed_task in failed_tasks: failed_task_id = failed_task[FailedTask.id.name] LOG.info("Processing failed task : %s" % failed_task_id) # Get failed jobs, if retry count has reached max, # remove job and delete db entry retry_count = failed_task[FailedTask.retry_count.name] result = failed_task[FailedTask.result.name] job_id = failed_task[FailedTask.job_id.name] if retry_count >= \ TelemetryCollection.MAX_FAILED_JOB_RETRY_COUNT or \ result == TelemetryJobStatus.FAILED_JOB_STATUS_SUCCESS: LOG.info("Exiting Failure task processing for task [%d] " "with result [%s] and retry count [%d] " % (failed_task_id, result, retry_count)) # task ID is same as job id self._teardown_task(self.ctx, failed_task_id, job_id) continue # If job already scheduled, skip if job_id and self.scheduler.get_job(job_id): continue try: db.task_get(self.ctx, failed_task[FailedTask.task_id.name]) except TaskNotFound as e: LOG.info( "Removing failed telemetry job as parent job " "do not exist: %s", six.text_type(e)) # tear down if original task is not available self._teardown_task(self.ctx, failed_task_id, job_id) continue if not job_id: job_id = uuidutils.generate_uuid() db.failed_task_update(self.ctx, failed_task_id, {FailedTask.job_id.name: job_id}) collection_class = importutils.import_class( failed_task[FailedTask.method.name]) instance = \ collection_class.get_instance(self.ctx, failed_task_id) self.scheduler.add_job( instance, 'interval', seconds=failed_task[FailedTask.interval.name], next_run_time=datetime.now(), id=job_id) except Exception as e: LOG.error( "Failed to schedule retry tasks for performance " "collection, reason: %s", six.text_type(e)) else: LOG.info("Schedule collection completed")
def schedule_job(self, task_id):
    if self.stopped:
        # If the job handler is stopped, return immediately
        return
    LOG.info("JobHandler received a job %s to schedule" % task_id)
    job = db.task_get(self.ctx, task_id)
    # Check delete status of the task
    deleted = job['deleted']
    if deleted:
        return

    collection_class = importutils.import_class(job['method'])
    instance = collection_class.get_instance(self.ctx, self.task_id)
    current_time = int(datetime.now().timestamp())
    last_run_time = current_time
    next_collection_time = last_run_time + job['interval']
    job_id = uuidutils.generate_uuid()
    next_collection_time = datetime \
        .fromtimestamp(next_collection_time) \
        .strftime('%Y-%m-%d %H:%M:%S')

    existing_job_id = job['job_id']
    scheduler_job = self.scheduler.get_job(existing_job_id)

    if not (existing_job_id and scheduler_job):
        LOG.info('JobHandler scheduling a new job')
        self.scheduler.add_job(instance, 'interval',
                               seconds=job['interval'],
                               next_run_time=next_collection_time,
                               id=job_id,
                               misfire_grace_time=int(job['interval'] / 2))

        update_task_dict = {'job_id': job_id}
        db.task_update(self.ctx, self.task_id, update_task_dict)
        self.job_ids.add(job_id)
        LOG.info('Periodic collection task scheduled for task id: '
                 '%s' % self.task_id)

        # Check if historic collection is needed for this task.
        # If the last run time is already set, adjust start_time based on
        # last run time or history_on_reschedule, whichever is smaller.
        # If a job id exists but last run time is not yet set, adjust
        # start_time based on interval or history_on_reschedule,
        # whichever is smaller.
        end_time = current_time * 1000
        # Maximum supported history duration on restart
        history_on_reschedule = CONF.telemetry. \
            performance_history_on_reschedule
        if job['last_run_time']:
            start_time = job['last_run_time'] * 1000 \
                if current_time - job['last_run_time'] < \
                history_on_reschedule \
                else (end_time - history_on_reschedule * 1000)
            self.perform_history_collection(start_time, end_time,
                                            last_run_time)
        elif existing_job_id:
            interval_in_sec = job['interval']
            start_time = (end_time - interval_in_sec * 1000) \
                if interval_in_sec < history_on_reschedule \
                else (end_time - history_on_reschedule * 1000)
            self.perform_history_collection(start_time, end_time,
                                            last_run_time)
    else:
        LOG.info('Job already exists in the scheduler')
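# Sketch of the history-window selection above, assuming job['interval']
# and performance_history_on_reschedule are both in seconds (the values
# here are illustrative only):
#
#   history_on_reschedule = 1800     # cap on backfill after a restart
#   # last_run_time known and recent -> resume from last_run_time:
#   #   start_time = last_run_time * 1000
#   # last_run_time known but stale  -> cap the backfill:
#   #   start_time = end_time - history_on_reschedule * 1000
#   # job existed but never ran      -> backfill one interval,
#   #   capped at the same maximum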
def get_instance(ctx, task_id):
    task = db.task_get(ctx, task_id)
    return JobHandler(ctx, task_id, task['storage_id'],
                      task['args'], task['interval'])