Example #1
    def schedule_failed_job(self, failed_task_id):

        if self.stopped:
            return

        try:
            job = db.failed_task_get(self.ctx, failed_task_id)
            retry_count = job['retry_count']
            result = job['result']
            job_id = job['job_id']
            if retry_count >= \
                    TelemetryCollection.MAX_FAILED_JOB_RETRY_COUNT or \
                    result == TelemetryJobStatus.FAILED_JOB_STATUS_SUCCESS:
                LOG.info("Exiting Failure task processing for task [%d] "
                         "with result [%s] and retry count [%d] " %
                         (job['id'], result, retry_count))
                self._teardown_task(self.ctx, job['id'], job_id)
                return
            # If job already scheduled, skip
            if job_id and self.scheduler.get_job(job_id):
                return

            try:
                db.task_get(self.ctx, job['task_id'])
            except TaskNotFound as e:
                LOG.info(
                    "Removing failed telemetry job as parent job "
                    "does not exist: %s", six.text_type(e))
                # tear down if original task is not available
                self._teardown_task(self.ctx, job['id'], job_id)
                return

            if not (job_id and self.scheduler.get_job(job_id)):
                job_id = uuidutils.generate_uuid()
                db.failed_task_update(self.ctx, job['id'], {'job_id': job_id})

                collection_class = importutils.import_class(job['method'])
                instance = \
                    collection_class.get_instance(self.ctx, job['id'])
                self.scheduler.add_job(
                    instance,
                    'interval',
                    seconds=job['interval'],
                    next_run_time=datetime.now(),
                    id=job_id,
                    misfire_grace_time=int(
                        CONF.telemetry.performance_collection_interval / 2))
                self.job_ids.add(job_id)

        except Exception as e:
            LOG.error(
                "Failed to schedule retry tasks for performance "
                "collection, reason: %s", six.text_type(e))
        else:
            LOG.info("Schedule collection completed")

    def __call__(self):
        # Upon periodic job callback, if storage is already deleted or soft
        # deleted, do not proceed with failed performance collection flow
        try:
            failed_task = db.failed_task_get(self.ctx, self.failed_task_id)
            if failed_task["deleted"]:
                LOG.debug('Storage %s is being deleted, ignoring '
                          'performance collection cycle for failed task id %s.'
                          % (self.storage_id, self.failed_task_id))
                return
        except exception.FailedTaskNotFound:
            LOG.debug('Storage %s already deleted, ignoring '
                      'performance collection cycle for failed task id %s.'
                      % (self.storage_id, self.failed_task_id))
            return

        # Pull performance collection info
        self.retry_count += 1
        try:
            status = self.task_rpcapi.collect_telemetry(
                self.ctx, self.storage_id,
                PerformanceCollectionTask.__module__ + '.' +
                PerformanceCollectionTask.__name__,
                self.args, self.start_time, self.end_time)

            if not status:
                raise exception.TelemetryTaskExecError()
        except Exception as e:
            LOG.error(e)
            msg = _("Failed to collect performance metrics for storage "
                    "id:{0}, reason:{1}".format(self.storage_id,
                                                six.text_type(e)))
            LOG.error(msg)
        else:
            LOG.info("Successfully completed Performance metrics collection "
                     "for storage id :{0} ".format(self.storage_id))
            self.result = TelemetryJobStatus.FAILED_JOB_STATUS_SUCCESS
            self._stop_task()
            return

        if self.retry_count >= TelemetryCollection.MAX_FAILED_JOB_RETRY_COUNT:
            msg = _(
                "Failed to collect performance metrics of task instance "
                "id:{0} for start time:{1} and end time:{2} with "
                "maximum retry. Giving up on "
                "retry".format(self.failed_task_id, self.start_time,
                               self.end_time))
            LOG.error(msg)
            self._stop_task()
            return

        self.result = TelemetryJobStatus.FAILED_JOB_STATUS_RETRYING
        db.failed_task_update(self.ctx, self.failed_task_id,
                              {FailedTask.retry_count.name: self.retry_count,
                               FailedTask.result.name: self.result})
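
Example #1 above relies on a few APScheduler behaviours: the 'interval' trigger, next_run_time=datetime.now() to force an immediate first run, get_job() returning None for unknown job ids (which makes it usable as an "already scheduled?" check), and misfire_grace_time to skip runs that start too late. Below is a minimal, self-contained sketch of those semantics, assuming apscheduler and oslo.utils are installed; the job function and the 300-second interval are placeholders rather than project code.

# Hedged sketch of the APScheduler semantics used above; not project code.
from datetime import datetime

from apscheduler.schedulers.background import BackgroundScheduler
from oslo_utils import uuidutils


def retry_collection():
    print("retrying failed telemetry collection")


scheduler = BackgroundScheduler()
scheduler.start()

job_id = uuidutils.generate_uuid()
scheduler.add_job(
    retry_collection,
    'interval',
    seconds=300,                    # placeholder retry interval
    next_run_time=datetime.now(),   # fire once immediately, then every 300s
    id=job_id,
    misfire_grace_time=150)         # skip a run that starts >150s late

# get_job() returns None for unknown ids, which is why the handler can use
# it to check whether a failed task is already scheduled.
assert scheduler.get_job(job_id) is not None
scheduler.remove_job(job_id)
scheduler.shutdown()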
Example #3
    def remove_failed_job(self, failed_task_id):
        try:
            LOG.info("Received failed job %s to remove", failed_task_id)
            job = db.failed_task_get(self.ctx, failed_task_id)
            job_id = job['job_id']
            self.remove_scheduled_job(job_id)
            db.failed_task_delete(self.ctx, job['id'])
            LOG.info("Removed failed_task entry %s", job['id'])
        except Exception as e:
            LOG.error("Failed to remove periodic scheduling job, reason: %s.",
                      six.text_type(e))

    @staticmethod
    def get_instance(ctx, failed_task_id):
        failed_task = db.failed_task_get(ctx, failed_task_id)
        task = db.task_get(ctx, failed_task[FailedTask.task_id.name])
        return FailedPerformanceCollectionHandler(
            ctx,
            failed_task[FailedTask.id.name],
            task[Task.storage_id.name],
            task[Task.args.name],
            failed_task[FailedTask.job_id.name],
            failed_task[FailedTask.retry_count.name],
            failed_task[FailedTask.start_time.name],
            failed_task[FailedTask.end_time.name],
        )
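
Because get_instance() passes the failed-task and parent-task fields positionally, the handler's constructor must accept them in exactly that order. The sketch below shows the constructor shape this implies; it is inferred from the call above rather than copied from the project, and the real class also provides the __call__ method shown in Example #1.

# Hedged sketch: constructor shape inferred from the get_instance() call
# above; the real FailedPerformanceCollectionHandler may hold more state.
class FailedPerformanceCollectionHandler(object):
    def __init__(self, ctx, failed_task_id, storage_id, args, job_id,
                 retry_count, start_time, end_time):
        self.ctx = ctx
        self.failed_task_id = failed_task_id
        self.storage_id = storage_id
        self.args = args
        self.job_id = job_id
        self.retry_count = retry_count
        self.start_time = start_time
        self.end_time = end_time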
Example #5
    def get_local_executor(self, context, task_id, failed_task_id, executor):
        executor_names = self.executor_map.keys()
        storage_id = None
        if task_id:
            job = db.task_get(context, task_id)
            storage_id = job['storage_id']
        elif failed_task_id:
            job = db.failed_task_get(context, failed_task_id)
            storage_id = job['storage_id']
        else:
            raise exception.InvalidInput("Missing task id")

        # Reuse the executor that already handles this storage
        for name in executor_names:
            executor_topic = "{0}:{1}".format(executor, name)
            if storage_id in self.executor_map[name]["storages"]:
                return executor_topic

        # Otherwise, pick an existing executor that still has capacity
        for name in executor_names:
            no_of_storages = len(self.executor_map[name]["storages"])
            if no_of_storages and (no_of_storages <
                                   CONF.telemetry.max_storages_in_child):
                executor_topic = "{0}:{1}".format(executor, name)
                LOG.info(
                    "Selecting existing local executor {0} for {1}".format(
                        executor_topic, storage_id))
                self.executor_map[name]["storages"].append(storage_id)
                return executor_topic

        # Otherwise, create a new local executor and return its topic
        for index in range(CONF.telemetry.max_childs_in_node):
            name = "executor_{0}".format(index + 1)
            if name not in executor_names:
                executor_topic = "{0}:{1}".format(executor, name)
                LOG.info("Create a new local executor {0} for {1}".format(
                    executor_topic, storage_id))
                launcher = self.create_process(topic=executor_topic,
                                               host=executor)
                self.executor_map[name] = {
                    "storages": [storage_id],
                    "launcher": launcher,
                    "cleanup_delay": 0
                }
                return executor_topic

        msg = "Reached maximum number of ({0}) local executors". \
            format(CONF.telemetry.max_childs_in_node)
        LOG.error(msg)
        raise RuntimeError(msg)
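
The method above makes three passes over executor_map: reuse the executor that already owns the storage, fill an existing executor below capacity, then create a new one up to the per-node limit. A stripped-down, dependency-free sketch of that policy follows; the function and parameter names are illustrative, not the project's API.

# Hedged sketch (illustrative names): the selection policy from
# get_local_executor() as a standalone, testable function.
def pick_executor(executor_map, storage_id, max_storages, max_children,
                  create_process):
    # Pass 1: reuse the executor already assigned to this storage.
    for name, info in executor_map.items():
        if storage_id in info["storages"]:
            return name
    # Pass 2: fill a non-empty executor that is still below capacity.
    for name, info in executor_map.items():
        if 0 < len(info["storages"]) < max_storages:
            info["storages"].append(storage_id)
            return name
    # Pass 3: create a new executor slot, up to the per-node limit.
    for index in range(max_children):
        name = "executor_{0}".format(index + 1)
        if name not in executor_map:
            executor_map[name] = {"storages": [storage_id],
                                  "launcher": create_process(name),
                                  "cleanup_delay": 0}
            return name
    raise RuntimeError(
        "Reached maximum number of ({0}) local executors".format(max_children))


# Usage: an empty map gets a fresh executor slot.
executors = {}
assert pick_executor(executors, "storage-1", max_storages=5, max_children=4,
                     create_process=lambda name: None) == "executor_1"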
Example #6
    def remove_failed_job(self, context, failed_task_id, executor):
        if not self.enable_sub_process:
            instance = FailedJobHandler.get_instance(context, failed_task_id)
            instance.remove_failed_job(failed_task_id)
        else:
            job = db.failed_task_get(context, failed_task_id)
            storage_id = job['storage_id']
            # Iterate over a snapshot of the keys: stop_executor() may
            # remove entries from executor_map while we are looping.
            for name in list(self.executor_map.keys()):
                if storage_id in self.executor_map[name]["storages"]:
                    local_executor = "{0}:{1}".format(executor, name)
                    self.rpcapi.remove_failed_job_local(
                        context, failed_task_id, local_executor)
                    tasks, failed_tasks = self.get_all_tasks(storage_id)
                    if len(failed_tasks) == 0 and len(tasks) == 0:
                        self.stop_executor(name, local_executor, storage_id)
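
A note on the design shared by Examples #5 and #6: each child executor is addressed by a topic of the form "<executor>:<name>", so a removal request can be routed over RPC to exactly the child process that owns the storage, and the parent tears the child down via stop_executor() once get_all_tasks() reports no remaining tasks or failed tasks for that storage.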