Code example #1
    def schedule_failed_job(self, failed_task_id):

        if self.stopped:
            return

        try:
            job = db.failed_task_get(self.ctx, failed_task_id)
            retry_count = job['retry_count']
            result = job['result']
            job_id = job['job_id']
            if retry_count >= \
                    TelemetryCollection.MAX_FAILED_JOB_RETRY_COUNT or \
                    result == TelemetryJobStatus.FAILED_JOB_STATUS_SUCCESS:
                LOG.info("Exiting Failure task processing for task [%d] "
                         "with result [%s] and retry count [%d] " %
                         (job['id'], result, retry_count))
                self._teardown_task(self.ctx, job['id'], job_id)
                return
            # If job already scheduled, skip
            if job_id and self.scheduler.get_job(job_id):
                return

            try:
                db.task_get(self.ctx, job['task_id'])
            except TaskNotFound as e:
                LOG.info(
                    "Removing failed telemetry job as parent job "
                    "does not exist: %s", six.text_type(e))
                # tear down if original task is not available
                self._teardown_task(self.ctx, job['id'], job_id)
                return

            # No job id yet, or the job is no longer scheduled: register anew
            if not (job_id and self.scheduler.get_job(job_id)):
                job_id = uuidutils.generate_uuid()
                db.failed_task_update(self.ctx, job['id'], {'job_id': job_id})

                collection_class = importutils.import_class(job['method'])
                instance = \
                    collection_class.get_instance(self.ctx, job['id'])
                self.scheduler.add_job(
                    instance,
                    'interval',
                    seconds=job['interval'],
                    next_run_time=datetime.now(),
                    id=job_id,
                    misfire_grace_time=int(
                        CONF.telemetry.performance_collection_interval / 2))
                self.job_ids.add(job_id)

        except Exception as e:
            LOG.error(
                "Failed to schedule retry tasks for performance "
                "collection, reason: %s", six.text_type(e))
        else:
            LOG.info("Schedule collection completed")
Code example #2
    def remove_job(self, task_id):
        try:
            LOG.info("Received job %s to remove", task_id)
            job = db.task_get(self.ctx, task_id)
            job_id = job['job_id']
            self.remove_scheduled_job(job_id)
        except Exception as e:
            LOG.error("Failed to remove periodic scheduling job, reason: %s.",
                      six.text_type(e))
Code example #3
    @staticmethod
    def get_instance(ctx, failed_task_id):
        failed_task = db.failed_task_get(ctx, failed_task_id)
        task = db.task_get(ctx, failed_task[FailedTask.task_id.name])
        return FailedPerformanceCollectionHandler(
            ctx,
            failed_task[FailedTask.id.name],
            task[Task.storage_id.name],
            task[Task.args.name],
            failed_task[FailedTask.job_id.name],
            failed_task[FailedTask.retry_count.name],
            failed_task[FailedTask.start_time.name],
            failed_task[FailedTask.end_time.name],
        )
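
For context, this factory is the target of the dynamic dispatch in Code examples #1 and #8: the task row stores the handler class as a dotted path in its method column, oslo_utils resolves the class, and the static get_instance builds the handler. A minimal sketch using a stub class; the names below are illustrative, not project code.

from oslo_utils import importutils


class StubHandler:
    @staticmethod
    def get_instance(ctx, failed_task_id):
        return StubHandler()


# In the project, the dotted path would come from job['method'] in the DB
cls = importutils.import_class(__name__ + '.StubHandler')
handler = cls.get_instance(ctx=None, failed_task_id=42)
print(handler)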
Code example #4
    def get_local_executor(self, context, task_id, failed_task_id, executor):
        executor_names = self.executor_map.keys()
        storage_id = None
        if task_id:
            job = db.task_get(context, task_id)
            storage_id = job['storage_id']
        elif failed_task_id:
            job = db.failed_task_get(context, failed_task_id)
            storage_id = job['storage_id']
        else:
            raise exception.InvalidInput("Missing task id")

        # Reuse the executor already assigned to this storage
        for name in executor_names:
            executor_topic = "{0}:{1}".format(executor, name)
            if storage_id in self.executor_map[name]["storages"]:
                return executor_topic

        # Reuse a non-empty executor that still has capacity
        for name in executor_names:
            no_of_storages = len(self.executor_map[name]["storages"])
            if no_of_storages and (no_of_storages <
                                   CONF.telemetry.max_storages_in_child):
                executor_topic = "{0}:{1}".format(executor, name)
                LOG.info(
                    "Selecting existing local executor {0} for {1}".format(
                        executor_topic, storage_id))
                self.executor_map[name]["storages"].append(storage_id)
                return executor_topic

        # Otherwise create a new executor and return its topic
        for index in range(CONF.telemetry.max_childs_in_node):
            name = "executor_{0}".format(index + 1)
            if name not in executor_names:
                executor_topic = "{0}:{1}".format(executor, name)
                LOG.info("Create a new local executor {0} for {1}".format(
                    executor_topic, storage_id))
                launcher = self.create_process(topic=executor_topic,
                                               host=executor)
                self.executor_map[name] = {
                    "storages": [storage_id],
                    "launcher": launcher,
                    "cleanup_delay": 0
                }
                return executor_topic

        msg = "Reached maximum number of ({0}) local executors". \
            format(CONF.telemetry.max_childs_in_node)
        LOG.error(msg)
        raise RuntimeError(msg)
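
The three loops above implement a simple placement policy: reuse the executor already mapped to the storage, else reuse a non-empty executor with spare capacity, else create a new one up to a per-node limit. A standalone sketch of just that bookkeeping, with illustrative limits standing in for the two config options:

def pick_executor(executor_map, storage_id, executor,
                  max_storages=5, max_childs=4):
    # 1. Reuse the executor already handling this storage
    for name, info in executor_map.items():
        if storage_id in info["storages"]:
            return "{0}:{1}".format(executor, name)

    # 2. Reuse a non-empty executor that still has capacity
    for name, info in executor_map.items():
        if 0 < len(info["storages"]) < max_storages:
            info["storages"].append(storage_id)
            return "{0}:{1}".format(executor, name)

    # 3. Create a new executor slot, up to the per-node limit
    for index in range(max_childs):
        name = "executor_{0}".format(index + 1)
        if name not in executor_map:
            executor_map[name] = {"storages": [storage_id],
                                  "launcher": None,  # process launcher omitted
                                  "cleanup_delay": 0}
            return "{0}:{1}".format(executor, name)

    raise RuntimeError("Reached maximum number of local executors")


executor_map = {}
print(pick_executor(executor_map, "storage-1", "host-a"))  # host-a:executor_1
print(pick_executor(executor_map, "storage-2", "host-a"))  # host-a:executor_1 (has capacity)
print(pick_executor(executor_map, "storage-1", "host-a"))  # host-a:executor_1 (already mapped)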
Code example #5
    def remove_job(self, context, task_id, executor):
        if not self.enable_sub_process:
            instance = JobHandler.get_instance(context, task_id)
            instance.remove_job(task_id)
        else:
            job = db.task_get(context, task_id)
            storage_id = job['storage_id']
            for name in self.executor_map.keys():
                if storage_id in self.executor_map[name]["storages"]:
                    local_executor = "{0}:{1}".format(executor, name)
                    self.rpcapi.remove_job_local(context, task_id,
                                                 local_executor)
                    tasks, failed_tasks = self.get_all_tasks(storage_id)
                    if len(failed_tasks) == 0 and len(tasks) == 0:
                        self.stop_executor(name, local_executor, storage_id)
Code example #6
    def __call__(self):
        # Upon periodic job callback, if the storage is already deleted or
        # soft deleted, do not proceed with the performance collection flow
        try:
            task = db.task_get(self.ctx, self.task_id)
            if task["deleted"]:
                LOG.debug('Storage %s is being deleted, ignoring performance '
                          'collection cycle for task id %s.'
                          % (self.storage_id, self.task_id))
                return
        except exception.TaskNotFound:
            LOG.debug('Storage %s already deleted, ignoring performance '
                      'collection cycle for task id %s.'
                      % (self.storage_id, self.task_id))
            return

        # Handles performance collection from driver and dispatch
        start_time = None
        end_time = None
        try:
            LOG.debug('Collecting performance metrics for task id: %s'
                      % self.task_id)
            current_time = int(datetime.utcnow().timestamp())

            # Times are epoch time in milliseconds
            end_time = current_time * 1000
            start_time = end_time - (self.interval * 1000)
            status = self.task_rpcapi. \
                collect_telemetry(self.ctx, self.storage_id,
                                  telemetry.TelemetryTask.__module__ + '.' +
                                  'PerformanceCollectionTask', self.args,
                                  start_time, end_time)

            db.task_update(self.ctx, self.task_id,
                           {'last_run_time': current_time})

            if not status:
                raise exception.TelemetryTaskExecError()
        except Exception as e:
            LOG.error("Failed to collect performance metrics for "
                      "task id :{0}, reason:{1}".format(self.task_id,
                                                        six.text_type(e)))
            self._handle_task_failure(start_time, end_time)
        else:
            LOG.debug("Performance collection done for storage id :{0}"
                      ",task id :{1} and interval(in sec):{2}"
                      .format(self.storage_id, self.task_id, self.interval))
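
The collection window handed to collect_telemetry is plain epoch arithmetic in milliseconds. A minimal sketch with an illustrative interval; it uses a timezone-aware clock to avoid the pitfall of naive utcnow().timestamp(), which gets converted as local time:

from datetime import datetime, timezone

interval = 900  # seconds; stand-in for self.interval
current_time = int(datetime.now(timezone.utc).timestamp())

end_time = current_time * 1000           # epoch time in milliseconds
start_time = end_time - interval * 1000  # window reaches one interval back
print(start_time, end_time)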
Code example #7
    @staticmethod
    def get_instance(ctx, task_id):
        task = db.task_get(ctx, task_id)
        return PerformanceCollectionHandler(ctx, task_id, task['storage_id'],
                                            task['args'], task['interval'])
Code example #8
    def __call__(self):
        """Purge deleted failed tasks, then schedule retries for the rest."""
        try:
            # Remove jobs from scheduler when marked for delete
            filters = {'deleted': True}
            failed_tasks = db.failed_task_get_all(self.ctx, filters=filters)
            LOG.debug("Total failed_tasks found deleted "
                      "in this cycle:%s" % len(failed_tasks))
            for failed_task in failed_tasks:
                job_id = failed_task['job_id']
                if job_id and self.scheduler.get_job(job_id):
                    self.scheduler.remove_job(job_id)
                db.failed_task_delete(self.ctx, failed_task['id'])
        except Exception as e:
            LOG.error("Failed to remove periodic scheduling job , reason: %s.",
                      six.text_type(e))
        try:
            # Create the object of periodic scheduler
            failed_tasks = db.failed_task_get_all(self.ctx)

            if not len(failed_tasks):
                LOG.info("No failed task found for performance collection")
                return

            LOG.debug("Schedule performance collection triggered: total "
                      "failed tasks:%s" % len(failed_tasks))

            for failed_task in failed_tasks:
                failed_task_id = failed_task[FailedTask.id.name]
                LOG.info("Processing failed task : %s" % failed_task_id)

                # Get failed jobs, if retry count has reached max,
                # remove job and delete db entry
                retry_count = failed_task[FailedTask.retry_count.name]
                result = failed_task[FailedTask.result.name]
                job_id = failed_task[FailedTask.job_id.name]
                if retry_count >= \
                        TelemetryCollection.MAX_FAILED_JOB_RETRY_COUNT or \
                        result == TelemetryJobStatus.FAILED_JOB_STATUS_SUCCESS:
                    LOG.info("Exiting Failure task processing for task [%d] "
                             "with result [%s] and retry count [%d] " %
                             (failed_task_id, result, retry_count))
                    # task ID is same as job id
                    self._teardown_task(self.ctx, failed_task_id, job_id)
                    continue

                # If job already scheduled, skip
                if job_id and self.scheduler.get_job(job_id):
                    continue

                try:
                    db.task_get(self.ctx, failed_task[FailedTask.task_id.name])
                except TaskNotFound as e:
                    LOG.info(
                        "Removing failed telemetry job as parent job "
                        "does not exist: %s", six.text_type(e))
                    # tear down if original task is not available
                    self._teardown_task(self.ctx, failed_task_id, job_id)
                    continue

                if not job_id:
                    job_id = uuidutils.generate_uuid()
                    db.failed_task_update(self.ctx, failed_task_id,
                                          {FailedTask.job_id.name: job_id})

                collection_class = importutils.import_class(
                    failed_task[FailedTask.method.name])
                instance = \
                    collection_class.get_instance(self.ctx, failed_task_id)
                self.scheduler.add_job(
                    instance,
                    'interval',
                    seconds=failed_task[FailedTask.interval.name],
                    next_run_time=datetime.now(),
                    id=job_id)

        except Exception as e:
            LOG.error(
                "Failed to schedule retry tasks for performance "
                "collection, reason: %s", six.text_type(e))
        else:
            LOG.info("Schedule collection completed")
Code example #9
    def schedule_job(self, task_id):

        if self.stopped:
            # If Job is stopped return immediately
            return

        LOG.info("JobHandler received A job %s to schedule" % task_id)
        job = db.task_get(self.ctx, task_id)
        # Check delete status of the task
        deleted = job['deleted']
        if deleted:
            return
        collection_class = importutils.import_class(job['method'])
        instance = collection_class.get_instance(self.ctx, self.task_id)
        current_time = int(datetime.now().timestamp())
        last_run_time = current_time
        next_collection_time = last_run_time + job['interval']
        job_id = uuidutils.generate_uuid()
        next_collection_time = datetime \
            .fromtimestamp(next_collection_time) \
            .strftime('%Y-%m-%d %H:%M:%S')

        existing_job_id = job['job_id']

        scheduler_job = self.scheduler.get_job(existing_job_id)

        if not (existing_job_id and scheduler_job):
            LOG.info('JobHandler scheduling a new job')
            self.scheduler.add_job(instance,
                                   'interval',
                                   seconds=job['interval'],
                                   next_run_time=next_collection_time,
                                   id=job_id,
                                   misfire_grace_time=int(job['interval'] / 2))

            update_task_dict = {'job_id': job_id}
            db.task_update(self.ctx, self.task_id, update_task_dict)
            self.job_ids.add(job_id)
            LOG.info('Periodic collection tasks scheduled for job id: '
                     '%s' % self.task_id)

            # Check if historic collection is needed for this task.
            # If the last run time is already set, adjust start_time based on
            # last run time or history_on_reschedule, whichever is smaller.
            # If a job id was created but last run time is not yet set, adjust
            # start_time based on interval or history_on_reschedule,
            # whichever is smaller.

            end_time = current_time * 1000
            # Maximum supported history duration on restart
            history_on_reschedule = CONF.telemetry. \
                performance_history_on_reschedule
            if job['last_run_time']:
                start_time = job['last_run_time'] * 1000 \
                    if current_time - job['last_run_time'] < \
                    history_on_reschedule \
                    else (end_time - history_on_reschedule * 1000)
                self.perform_history_collection(start_time, end_time,
                                                last_run_time)
            elif existing_job_id:
                interval_in_sec = job['interval']
                start_time = (end_time - interval_in_sec * 1000) \
                    if interval_in_sec < history_on_reschedule \
                    else (end_time - history_on_reschedule * 1000)
                self.perform_history_collection(start_time, end_time,
                                                last_run_time)
        else:
            LOG.info('Job already exists with this scheduler')
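
The backfill rules in the comments above reduce to one clamping function. A hedged sketch, assuming epoch seconds in and epoch milliseconds out, and folding the elif existing_job_id branch into a plain else (the original only backfills that case when a job id already existed); all values are illustrative:

def history_window(current_time, last_run_time, interval,
                   history_on_reschedule):
    end_time = current_time * 1000  # epoch milliseconds
    if last_run_time:
        # Resume from the last run, but never further back than the cap
        start_time = (last_run_time * 1000
                      if current_time - last_run_time < history_on_reschedule
                      else end_time - history_on_reschedule * 1000)
    else:
        # No last run recorded: backfill one interval, capped the same way
        start_time = (end_time - interval * 1000
                      if interval < history_on_reschedule
                      else end_time - history_on_reschedule * 1000)
    return start_time, end_time


now = 1700000000
print(history_window(now, now - 600, 900, 1800))  # resumes from the last run
print(history_window(now, None, 900, 1800))       # one interval back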
Code example #10
    @staticmethod
    def get_instance(ctx, task_id):
        task = db.task_get(ctx, task_id)
        return JobHandler(ctx, task_id, task['storage_id'], task['args'],
                          task['interval'])