    def _stop_task(self):
        db.failed_task_update(
            self.ctx, self.failed_task_id, {
                FailedTask.retry_count.name: self.retry_count,
                FailedTask.result.name: self.result
            })
        self.metrics_task_rpcapi.remove_failed_job(self.ctx,
                                                   self.failed_task_id,
                                                   self.executor)
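For context, here is a minimal stand-alone sketch of the same "persist the outcome, then unschedule" pattern, using APScheduler directly; the state_store dict is a hypothetical stand-in for the project's failed_task table and RPC layer:

from apscheduler.schedulers.background import BackgroundScheduler

scheduler = BackgroundScheduler()
scheduler.start()
state_store = {}  # hypothetical stand-in for the failed_task DB table

def stop_failed_job(job_id, retry_count, result):
    # Persist the final retry count and result first, so the outcome
    # survives even if unscheduling fails afterwards.
    state_store[job_id] = {'retry_count': retry_count, 'result': result}
    # get_job() returns None for unknown ids; remove_job() on an unknown
    # id would raise JobLookupError, so guard the call.
    if scheduler.get_job(job_id):
        scheduler.remove_job(job_id)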
Example #2
    def schedule_failed_job(self, failed_task_id):

        if self.stopped:
            return

        try:
            job = db.failed_task_get(self.ctx, failed_task_id)
            retry_count = job['retry_count']
            result = job['result']
            job_id = job['job_id']
            if retry_count >= \
                    TelemetryCollection.MAX_FAILED_JOB_RETRY_COUNT or \
                    result == TelemetryJobStatus.FAILED_JOB_STATUS_SUCCESS:
                LOG.info("Exiting Failure task processing for task [%d] "
                         "with result [%s] and retry count [%d] " %
                         (job['id'], result, retry_count))
                self._teardown_task(self.ctx, job['id'], job_id)
                return
            # If job already scheduled, skip
            if job_id and self.scheduler.get_job(job_id):
                return

            try:
                db.task_get(self.ctx, job['task_id'])
            except TaskNotFound as e:
                LOG.info(
                    "Removing failed telemetry job as the parent job "
                    "does not exist: %s", six.text_type(e))
                # tear down if original task is not available
                self._teardown_task(self.ctx, job['id'], job_id)
                return

            if not (job_id and self.scheduler.get_job(job_id)):
                job_id = uuidutils.generate_uuid()
                db.failed_task_update(self.ctx, job['id'], {'job_id': job_id})

                collection_class = importutils.import_class(job['method'])
                instance = \
                    collection_class.get_instance(self.ctx, job['id'])
                self.scheduler.add_job(
                    instance,
                    'interval',
                    seconds=job['interval'],
                    next_run_time=datetime.now(),
                    id=job_id,
                    misfire_grace_time=int(
                        CONF.telemetry.performance_collection_interval / 2))
                self.job_ids.add(job_id)

        except Exception as e:
            LOG.error(
                "Failed to schedule retry tasks for performance "
                "collection, reason: %s", six.text_type(e))
        else:
            LOG.info("Schedule collection completed")
    def __call__(self):
        # Upon periodic job callback, if the storage is already deleted or
        # soft deleted, do not proceed with the failed performance
        # collection flow
        try:
            failed_task = db.failed_task_get(self.ctx, self.failed_task_id)
            if failed_task["deleted"]:
                LOG.debug('Storage %s is being deleted, ignoring '
                          'performance collection cycle for failed task id %s.'
                          % (self.storage_id, self.failed_task_id))
                return
        except exception.FailedTaskNotFound:
            LOG.debug('Storage %s already deleted, ignoring '
                      'performance collection cycle for failed task id %s.'
                      % (self.storage_id, self.failed_task_id))
            return

        # Pull performance collection info
        self.retry_count += 1
        try:
            status = self.task_rpcapi.collect_telemetry(
                self.ctx, self.storage_id,
                PerformanceCollectionTask.__module__ + '.' +
                PerformanceCollectionTask.__name__,
                self.args, self.start_time, self.end_time)

            if not status:
                raise exception.TelemetryTaskExecError()
        except Exception as e:
            msg = _("Failed to collect performance metrics for storage "
                    "id:{0}, reason:{1}".format(self.storage_id,
                                                six.text_type(e)))
            LOG.error(msg)
        else:
            LOG.info("Successfully completed Performance metrics collection "
                     "for storage id :{0} ".format(self.storage_id))
            self.result = TelemetryJobStatus.FAILED_JOB_STATUS_SUCCESS
            self._stop_task()
            return

        if self.retry_count >= TelemetryCollection.MAX_FAILED_JOB_RETRY_COUNT:
            msg = _(
                "Failed to collect performance metrics of task instance "
                "id:{0} for start time:{1} and end time:{2} after "
                "reaching the maximum retry count; giving up on "
                "retries".format(self.failed_task_id, self.start_time,
                                 self.end_time))
            LOG.error(msg)
            self._stop_task()
            return

        self.result = TelemetryJobStatus.FAILED_JOB_STATUS_RETRYING
        db.failed_task_update(self.ctx, self.failed_task_id,
                              {FailedTask.retry_count.name: self.retry_count,
                               FailedTask.result.name: self.result})
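The scheduler.add_job call used in the examples above is standard APScheduler usage: an interval trigger, an immediate first run via next_run_time, and a misfire grace window of half the interval. A self-contained sketch, with a placeholder collect() and a literal interval standing in for CONF.telemetry.performance_collection_interval:

from datetime import datetime
import time

from apscheduler.schedulers.background import BackgroundScheduler

def collect():
    print('collecting telemetry at', datetime.now())

scheduler = BackgroundScheduler()
scheduler.start()

interval = 10  # seconds; stand-in for the configured collection interval
scheduler.add_job(
    collect,
    'interval',
    seconds=interval,
    next_run_time=datetime.now(),  # fire once immediately, then every interval
    id='failed-job-demo',
    misfire_grace_time=int(interval / 2))  # tolerate up to half an interval of lag

time.sleep(25)  # let a few runs happen
scheduler.shutdown()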
Example #4
    def distribute_failed_job(self, failed_task_id, executor):

        try:
            db.failed_task_update(self.ctx, failed_task_id,
                                  {'executor': executor})
            LOG.info('Distributing failed job, id: %s', failed_task_id)
            self.task_rpcapi.assign_failed_job(self.ctx, failed_task_id,
                                               executor)
        except Exception as e:
            LOG.error('Failed to distribute failed job, reason: %s',
                      six.text_type(e))
            raise
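distribute_failed_job writes the executor assignment to the database before casting the RPC, presumably so a lost or failed dispatch can be re-driven from the stored assignment. A toy sketch of that ordering, with an in-process queue standing in for the RPC fan-out (all names illustrative):

import queue

assignments = {}                 # failed_task_id -> executor; stand-in for the DB column
executor_inbox = queue.Queue()   # stand-in for task_rpcapi.assign_failed_job

def distribute(failed_task_id, executor):
    # Record the assignment first; if the dispatch below is lost, a
    # reconciler could re-send it from `assignments`.
    assignments[failed_task_id] = executor
    executor_inbox.put((failed_task_id, executor))

distribute('task-42', 'executor-1')
print(executor_inbox.get())  # ('task-42', 'executor-1')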
Example #5
    def __call__(self):
        """
        :return:
        """
        try:
            # Remove jobs from scheduler when marked for delete
            filters = {'deleted': True}
            failed_tasks = db.failed_task_get_all(self.ctx, filters=filters)
            LOG.debug("Total failed_tasks found deleted "
                      "in this cycle:%s" % len(failed_tasks))
            for failed_task in failed_tasks:
                job_id = failed_task['job_id']
                if job_id and self.scheduler.get_job(job_id):
                    self.scheduler.remove_job(job_id)
                db.failed_task_delete(self.ctx, failed_task['id'])
        except Exception as e:
            LOG.error("Failed to remove periodic scheduling job , reason: %s.",
                      six.text_type(e))
        try:
            # Schedule retry jobs for the remaining failed tasks
            failed_tasks = db.failed_task_get_all(self.ctx)

            if not failed_tasks:
                LOG.info("No failed task found for performance collection")
                return

            LOG.debug("Schedule performance collection triggered: total "
                      "failed tasks: %s" % len(failed_tasks))

            for failed_task in failed_tasks:
                failed_task_id = failed_task[FailedTask.id.name]
                LOG.info("Processing failed task : %s" % failed_task_id)

                # Get failed jobs, if retry count has reached max,
                # remove job and delete db entry
                retry_count = failed_task[FailedTask.retry_count.name]
                result = failed_task[FailedTask.result.name]
                job_id = failed_task[FailedTask.job_id.name]
                if retry_count >= \
                        TelemetryCollection.MAX_FAILED_JOB_RETRY_COUNT or \
                        result == TelemetryJobStatus.FAILED_JOB_STATUS_SUCCESS:
                    LOG.info("Exiting Failure task processing for task [%d] "
                             "with result [%s] and retry count [%d] " %
                             (failed_task_id, result, retry_count))
                    # task ID is same as job id
                    self._teardown_task(self.ctx, failed_task_id, job_id)
                    continue

                # If job already scheduled, skip
                if job_id and self.scheduler.get_job(job_id):
                    continue

                try:
                    db.task_get(self.ctx, failed_task[FailedTask.task_id.name])
                except TaskNotFound as e:
                    LOG.info(
                        "Removing failed telemetry job as the parent job "
                        "does not exist: %s", six.text_type(e))
                    # tear down if original task is not available
                    self._teardown_task(self.ctx, failed_task_id, job_id)
                    continue

                if not job_id:
                    job_id = uuidutils.generate_uuid()
                    db.failed_task_update(self.ctx, failed_task_id,
                                          {FailedTask.job_id.name: job_id})

                collection_class = importutils.import_class(
                    failed_task[FailedTask.method.name])
                instance = \
                    collection_class.get_instance(self.ctx, failed_task_id)
                self.scheduler.add_job(
                    instance,
                    'interval',
                    seconds=failed_task[FailedTask.interval.name],
                    next_run_time=datetime.now(),
                    id=job_id)

        except Exception as e:
            LOG.error(
                "Failed to schedule retry tasks for performance "
                "collection, reason: %s", six.text_type(e))
        else:
            LOG.info("Schedule collection completed")
    def _stop_task(self):
        db.failed_task_update(self.ctx, self.failed_task_id,
                              {FailedTask.retry_count.name: self.retry_count,
                               FailedTask.result.name: self.result})
        self.scheduler_instance.pause_job(self.job_id)
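Unlike the first _stop_task, this variant pauses the job instead of removing it, so the job stays registered with the scheduler and can be resumed later. A minimal illustration of the difference in plain APScheduler:

from apscheduler.schedulers.background import BackgroundScheduler

scheduler = BackgroundScheduler()
scheduler.start()
scheduler.add_job(lambda: None, 'interval', seconds=5, id='demo')

scheduler.pause_job('demo')                   # still registered, but no runs
assert scheduler.get_job('demo') is not None
scheduler.resume_job('demo')                  # runs resume on the old interval
scheduler.remove_job('demo')                  # now discarded entirely
assert scheduler.get_job('demo') is None
scheduler.shutdown()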