def _execute_jobs(self):
    """Process all jobs that are ready for execution.

    Runs under the non-blocking JOBS_LOCK so that only one executor
    works at a time; if the lock is already held this round is skipped.
    Always reschedules itself via the task queue when finished.
    """
    logger.info('Jobs execution started')
    try:
        logger.debug('Lock acquiring')
        # blocking=False: if another executor holds the lock, skip this run
        with sync_manager.lock(self.JOBS_LOCK, blocking=False):
            logger.debug('Lock acquired')
            ready_jobs = self._ready_jobs()
            for job in ready_jobs:
                try:
                    self.__process_job(job)
                    job.save()
                except LockError:
                    # job's resources are locked elsewhere; retry next round
                    pass
                except Exception as e:
                    logger.error('Failed to process job {0}: '
                                 '{1}\n{2}'.format(job.id, e, traceback.format_exc()))
                    continue
    except LockFailedError:
        # another executor instance is already running; nothing to do
        pass
    except Exception as e:
        logger.error('Failed to process existing jobs: {0}\n{1}'.format(
            e, traceback.format_exc()))
    finally:
        logger.info('Jobs execution finished')
        # schedule the next execution round
        self.__tq.add_task_at(self.JOBS_EXECUTE,
                              self.jobs_timer.next(),
                              self._execute_jobs)
def create_job(self, request):
    """Create a new job from a client request.

    Request layout: [job_type, params (optional, default {}),
    force (optional, default False)]. Returns the dumped job.
    Raises ValueError on a missing or invalid job type; re-raises
    LockFailedError when the jobs lock times out.
    """
    try:
        try:
            job_type = request[0]
        except IndexError:
            raise ValueError('Job type is required')

        if job_type not in (JobTypes.TYPE_MOVE_JOB,
                            JobTypes.TYPE_RECOVER_DC_JOB,
                            JobTypes.TYPE_COUPLE_DEFRAG_JOB,
                            JobTypes.TYPE_RESTORE_GROUP_JOB):
            raise ValueError('Invalid job type: {0}'.format(job_type))

        try:
            params = request[1]
        except IndexError:
            params = {}
        try:
            force = request[2]
        except IndexError:
            force = False

        with sync_manager.lock(self.JOBS_LOCK, timeout=self.JOB_MANUAL_TIMEOUT):
            job = self._create_job(job_type, params, force=force)
    except LockFailedError:
        # lock timeout is meaningful to the caller; propagate without logging as error
        raise
    except Exception as e:
        logger.error('Failed to create job: {0}\n{1}'.format(e, traceback.format_exc()))
        raise
    return job.dump()
def _try_distribute_keys(self):
    """Run key distribution unless another distributor already holds the lock."""
    try:
        # blocking=False: only one distribute task may run at a time
        with sync_manager.lock(CacheManager.DISTRIBUTE_LOCK, blocking=False):
            self.distributor.distribute(self.top_keys)
    except LockFailedError:
        # non-blocking acquire failed: a distribute task is in progress elsewhere
        logger.info('Distribute task is already running')
    except LockError:
        logger.exception('Distribute task failed to acquire lock')
def reserve_group_ids(self, count, timeout=10):
    """Atomically reserve `count` consecutive group ids.

    Bumps the cluster-wide max-group counter under a distributed lock
    and returns the range of freshly reserved ids.
    """
    with sync_manager.lock("cluster_max_group", timeout=timeout):
        session = self.node.meta_session
        try:
            result = session.read_latest(keys.MASTERMIND_MAX_GROUP_KEY)
            current_max = int(result.get()[0].data)
        except elliptics.NotFoundError:
            # the counter key has never been written yet
            current_max = 0
        session.write_data(keys.MASTERMIND_MAX_GROUP_KEY,
                           str(current_max + count)).get()
        return range(current_max + 1, current_max + count + 1)
def collect(self):
    """Runs samples collect task

    Samples collection is performed periodically each
    <DATA_COLLECT_PERIOD> seconds.
    """
    try:
        # non-blocking lock: skip this round if another collector is active
        with sync_manager.lock(CoupleFreeEffectiveSpaceMonitor.COUPLE_FREE_EFF_SPACE_DATA,
                               blocking=False):
            self.__collect_free_effective_space()
    except LockAlreadyAcquiredError:
        logger.info("Couples' effective free space is already being collected")
def reserve_group_ids(self, count, timeout=10):
    """Reserve a block of `count` new group ids under the cluster-wide lock."""
    with sync_manager.lock('cluster_max_group', timeout=timeout):
        session = self.node.meta_session
        try:
            reply = session.read_latest(keys.MASTERMIND_MAX_GROUP_KEY)
            last_used = int(reply.get()[0].data)
        except elliptics.NotFoundError:
            # no counter stored yet: start numbering from 1
            last_used = 0
        new_max = last_used + count
        session.write_data(keys.MASTERMIND_MAX_GROUP_KEY, str(new_max)).get()
        return range(last_used + 1, new_max + 1)
def _execute_jobs(self):
    """Select and process jobs that are ready for execution.

    Already-executing jobs are always processed; new jobs are admitted
    only while their type stays under its configured concurrency limit
    ('max_executing_jobs', default 3). Runs under the non-blocking
    JOBS_LOCK and reschedules itself afterwards.
    """
    logger.info('Jobs execution started')
    try:
        logger.debug('Lock acquiring')
        # blocking=False: skip this round if another executor holds the lock
        with sync_manager.lock(self.JOBS_LOCK, blocking=False):
            logger.debug('Lock acquired')

            new_jobs, executing_jobs = [], []
            type_jobs_count = {}

            # executing jobs are always ready and count toward per-type limits
            for job in self.jobs(statuses=Job.STATUS_EXECUTING):
                type_jobs_count.setdefault(job.type, 0)
                type_jobs_count[job.type] += 1
                executing_jobs.append(job)

            # admit new jobs only while the per-type concurrency limit allows
            for job in self.jobs(statuses=Job.STATUS_NEW):
                jobs_count = type_jobs_count.setdefault(job.type, 0)
                if jobs_count >= JOB_CONFIG.get(job.type, {}).get('max_executing_jobs', 3):
                    continue
                type_jobs_count[job.type] += 1
                new_jobs.append(job)

            # older jobs go first
            new_jobs.sort(key=lambda j: j.create_ts)
            ready_jobs = executing_jobs + new_jobs
            logger.debug('Ready jobs: {0}'.format(len(ready_jobs)))

            for job in ready_jobs:
                try:
                    with job.tasks_lock():
                        self.__process_job(job)
                        job.save()
                except LockError:
                    # job's tasks are locked elsewhere; retry next round
                    pass
                except Exception as e:
                    logger.error('Failed to process job {0}: '
                                 '{1}\n{2}'.format(job.id, e, traceback.format_exc()))
                    continue
    except LockFailedError:
        # another executor instance is already running
        pass
    except Exception as e:
        logger.error('Failed to process existing jobs: {0}\n{1}'.format(
            e, traceback.format_exc()))
    finally:
        logger.info('Jobs execution finished')
        # schedule the next execution round
        self.__tq.add_task_at(self.JOBS_EXECUTE,
                              self.jobs_timer.next(),
                              self._execute_jobs)
def collect(self):
    """Collect couples' free effective space samples.

    Triggered periodically (each <DATA_COLLECT_PERIOD> seconds); the
    non-blocking lock guarantees a single collector at a time.
    """
    lock_id = CoupleFreeEffectiveSpaceMonitor.COUPLE_FREE_EFF_SPACE_DATA
    try:
        with sync_manager.lock(lock_id, blocking=False):
            self.__collect_free_effective_space()
    except LockAlreadyAcquiredError:
        logger.info('Couples\' effective free space is already being collected')
def stop_jobs(self, request):
    """Stop the jobs whose uids are given in request[0].

    Returns the dumped state of the affected jobs. Raises ValueError
    when no uids are supplied; re-raises LockFailedError when the jobs
    lock cannot be acquired in time.
    """
    jobs = []
    try:
        try:
            job_uids = request[0]
        except IndexError:
            raise ValueError('Job uids is required')

        logger.debug('Lock acquiring')
        with sync_manager.lock(self.JOBS_LOCK, timeout=self.JOB_MANUAL_TIMEOUT):
            logger.debug('Lock acquired')
            jobs = self.job_finder.jobs(ids=job_uids)
            self._stop_jobs(jobs)
    except LockFailedError:
        # propagate lock timeout without masking it in the generic handler
        raise
    except Exception as e:
        logger.error('Failed to stop jobs: {0}\n{1}'.format(e, traceback.format_exc()))
        raise
    return [job.dump() for job in jobs]
def stop_jobs(self, request):
    """Stop the jobs identified by the uid list in request[0].

    Returns a list with the dumped representation of each stopped job.
    """
    found = []
    try:
        try:
            uids = request[0]
        except IndexError:
            raise ValueError('Job uids is required')
        logger.debug('Lock acquiring')
        with sync_manager.lock(self.JOBS_LOCK,
                               timeout=self.JOB_MANUAL_TIMEOUT):
            logger.debug('Lock acquired')
            found = self.job_finder.jobs(ids=uids)
            self._stop_jobs(found)
    except LockFailedError:
        raise
    except Exception as e:
        logger.error('Failed to stop jobs: {0}\n{1}'.format(
            e, traceback.format_exc()))
        raise
    return [j.dump() for j in found]
def __change_failed_task_status(self, job_id, task_id, status):
    # Reset a failed task of a pending/broken job to `status` and move the
    # job back into executing state. Returns the updated job.
    # Raises ValueError when the job/task state does not allow the reset.
    logger.debug('Lock acquiring')
    with sync_manager.lock(self.JOBS_LOCK, timeout=self.JOB_MANUAL_TIMEOUT):
        logger.debug('Lock acquired')
        job = self.__get_job(job_id)
        with job.tasks_lock():
            # only jobs that stopped on a failure may be manipulated
            if job.status not in (Job.STATUS_PENDING, Job.STATUS_BROKEN):
                raise ValueError('Job {0}: status is "{1}", should have been '
                                 '{2}|{3}'.format(job.id, job.status,
                                                  Job.STATUS_PENDING,
                                                  Job.STATUS_BROKEN))
            task = None
            for t in job.tasks:
                if t.id == task_id:
                    task = t
                    break
            else:
                # loop finished without break: no task with that id
                raise ValueError('Job {0} does not contain task '
                                 'with id {1}'.format(job_id, task_id))
            if task.status != Task.STATUS_FAILED:
                raise ValueError('Job {0}: task {1} has status {2}, should '
                                 'have been failed'.format(job.id, task.id,
                                                           task.status))
            task.status = status
            # retry counter starts over for the resurrected task
            task.attempts = 0
            job.status = Job.STATUS_EXECUTING
            job.update_ts = time.time()
            # mark the job as modified so the save is persisted
            job._dirty = True
            job.save()
            logger.info('Job {0}: task {1} status was reset to {2}, '
                        'job status was reset to {3}'.format(
                            job.id, task.id, task.status, job.status))
            return job
def _create_job(self, job_type, params, force=False):
    """Create a job of `job_type` with `params`.

    When `force` is set and the required locks are held by other jobs,
    tries to stop those jobs (subject to type-priority rules) and retry.
    Returns the created job; raises ValueError on an unknown job type,
    RuntimeError when conflicting jobs cannot be stopped.
    """
    # Forcing manual approval of newly created job
    params.setdefault('need_approving', True)

    if job_type == JobTypes.TYPE_MOVE_JOB:
        JobType = MoveJob
    elif job_type == JobTypes.TYPE_RECOVER_DC_JOB:
        JobType = RecoverDcJob
    elif job_type == JobTypes.TYPE_COUPLE_DEFRAG_JOB:
        JobType = CoupleDefragJob
    elif job_type == JobTypes.TYPE_RESTORE_GROUP_JOB:
        JobType = RestoreGroupJob
    elif job_type == JobTypes.TYPE_MAKE_LRC_GROUPS_JOB:
        JobType = MakeLrcGroupsJob
    else:
        # previously fell through and crashed with UnboundLocalError on JobType
        raise ValueError('Unknown job type: {0}'.format(job_type))

    try:
        job = JobType.new(self.session, **params)
    except LockAlreadyAcquiredError as e:
        if not force:
            raise
        job_ids = e.holders_ids
        # check job types priority
        STOP_ALLOWED_TYPES = (JobTypes.TYPE_RECOVER_DC_JOB,
                              JobTypes.TYPE_COUPLE_DEFRAG_JOB)
        if job_type not in (JobTypes.TYPE_RESTORE_GROUP_JOB,
                            JobTypes.TYPE_MOVE_JOB):
            raise
        jobs = self.job_finder.jobs(ids=job_ids)
        for existing_job in jobs:
            if self.JOB_PRIORITIES[existing_job.type] >= self.JOB_PRIORITIES[job_type]:
                raise RuntimeError('Cannot stop job {0}, type is {1} '
                                   'and has equal or higher priority'.format(
                                       existing_job.id, existing_job.type))
            if existing_job.status in (Job.STATUS_NOT_APPROVED, Job.STATUS_NEW):
                # not started yet, safe to stop regardless of type
                continue
            elif existing_job.type not in STOP_ALLOWED_TYPES:
                raise RuntimeError('Cannot stop job {0}, type is {1}'.format(
                    existing_job.id, existing_job.type))
        logger.info('Stopping jobs: {0}'.format(job_ids))
        logger.debug('Lock acquiring')
        with sync_manager.lock(self.JOBS_LOCK, timeout=self.JOB_MANUAL_TIMEOUT):
            logger.debug('Lock acquired')
            self._stop_jobs(jobs)
        logger.info('Retrying job creation')
        job = JobType.new(self.session, **params)

    job.collection = self.job_finder.collection

    inv_group_ids = job._involved_groups
    try:
        logger.info('Job {0}: updating groups {1} status'.format(job.id, inv_group_ids))
        # best effort: a status refresh failure must not abort job creation
        self.node_info_updater.update_status(
            groups=[
                storage.groups[ig]
                for ig in inv_group_ids
                if ig in storage.groups
            ]
        )
    except Exception as e:
        logger.info('Job {0}: failed to update groups status: {1}\n{2}'.format(
            job.id, e, traceback.format_exc()))

    try:
        job.create_tasks()
        job.save()
        logger.info('Job {0} created: {1}'.format(job.id, job.dump()))
    except Exception:
        # roll back acquired resources on any failure
        job.release_locks()
        job.unmark_groups(self.session)
        raise

    if 'group' in params:
        group_id = params['group']
        if group_id in storage.groups:
            group = storage.groups[group_id]
            group.set_active_job(job)
            group.update_status_recursive()

    return job
def _create_job(self, job_type, params, force=False):
    """Create a job of `job_type` with `params`.

    When `force` is set and the required locks are held by other jobs,
    tries to stop those jobs (subject to type-priority rules) and retry.
    Returns the created job; raises ValueError on an unknown job type,
    RuntimeError when conflicting jobs cannot be stopped.
    """
    # Forcing manual approval of newly created job
    params.setdefault('need_approving', True)

    if job_type == JobTypes.TYPE_MOVE_JOB:
        JobType = MoveJob
    elif job_type == JobTypes.TYPE_RECOVER_DC_JOB:
        JobType = RecoverDcJob
    elif job_type == JobTypes.TYPE_COUPLE_DEFRAG_JOB:
        JobType = CoupleDefragJob
    elif job_type == JobTypes.TYPE_RESTORE_GROUP_JOB:
        JobType = RestoreGroupJob
    else:
        # previously fell through and crashed with UnboundLocalError on JobType
        raise ValueError('Unknown job type: {0}'.format(job_type))

    try:
        job = JobType.new(self.session, **params)
    except LockAlreadyAcquiredError as e:
        if not force:
            raise
        job_ids = e.holders_ids
        # check job types priority
        STOP_ALLOWED_TYPES = (JobTypes.TYPE_RECOVER_DC_JOB,
                              JobTypes.TYPE_COUPLE_DEFRAG_JOB)
        if job_type not in (JobTypes.TYPE_RESTORE_GROUP_JOB,
                            JobTypes.TYPE_MOVE_JOB):
            raise
        jobs = self.job_finder.jobs(ids=job_ids)
        for existing_job in jobs:
            if self.JOB_PRIORITIES[
                    existing_job.type] >= self.JOB_PRIORITIES[job_type]:
                raise RuntimeError(
                    'Cannot stop job {0}, type is {1} '
                    'and has equal or higher priority'.format(
                        existing_job.id, existing_job.type))
            if existing_job.status in (Job.STATUS_NOT_APPROVED, Job.STATUS_NEW):
                # not started yet, safe to stop regardless of type
                continue
            elif existing_job.type not in STOP_ALLOWED_TYPES:
                raise RuntimeError(
                    'Cannot stop job {0}, type is {1}'.format(
                        existing_job.id, existing_job.type))
        logger.info('Stopping jobs: {0}'.format(job_ids))
        logger.debug('Lock acquiring')
        with sync_manager.lock(self.JOBS_LOCK, timeout=self.JOB_MANUAL_TIMEOUT):
            logger.debug('Lock acquired')
            self._stop_jobs(jobs)
        logger.info('Retrying job creation')
        job = JobType.new(self.session, **params)

    job.collection = self.job_finder.collection

    inv_group_ids = job._involved_groups
    try:
        logger.info('Job {0}: updating groups {1} status'.format(
            job.id, inv_group_ids))
        # best effort: a status refresh failure must not abort job creation;
        # skip ids not present in storage instead of raising KeyError
        inv_groups = [storage.groups[ig]
                      for ig in inv_group_ids
                      if ig in storage.groups]
        self.node_info_updater.update_status(groups=inv_groups)
    except Exception as e:
        logger.info(
            'Job {0}: failed to update groups status: {1}\n{2}'.format(
                job.id, e, traceback.format_exc()))

    try:
        job.create_tasks()
        job.save()
        logger.info('Job {0} created: {1}'.format(job.id, job.dump()))
    except Exception:
        # roll back acquired resources on any failure
        job.release_locks()
        job.unmark_groups(self.session)
        raise

    if 'group' in params:
        group_id = params['group']
        if group_id in storage.groups:
            group = storage.groups[group_id]
            group.set_active_job(job)
            group.update_status_recursive()

    return job