def stop_job(self, job_name):
    """Stop a registered job on its workers and forget it.

    Runs the 'stop_job' stage barrier and then the 'clear_job' stage
    barrier over the workers of the job master registered under
    ``job_name``, removes the job from the job tracker, and writes a
    debug log entry.

    :param job_name: name the job was registered under in the tracker.
    """
    job_master = self.job_tracker.get_job_master(job_name)

    # Order matters: the job is stopped before it is cleared.
    for stage_name in ('stop_job', 'clear_job'):
        Stage(job_master.workers, stage_name).barrier(True, job_name)

    self.job_tracker.remove_job(job_name)
    self.logger.debug('stop job: %s' % job_name)
def run_job(self, job_name, unzip=False, wait_for_workers=False):
    """Register a job, ship it to the workers and start it.

    Steps, in order:

    1. Optionally wait until at least one worker is known.
    2. Optionally unzip the job package via ``self._unzip``.
    3. Import the job description from ``<job_dir>/<job_name>``, build and
       initialise a ``JobMaster`` for the currently known workers, and
       register it with the job tracker.
    4. Send ``<zip_dir>/<job_name>.zip`` to every worker of the job.
    5. Run the 'prepare' stage barrier; if it reports failure, log an
       error and stop.
    6. Run the 'run_job' stage barrier; if it reports failure, log an
       error.

    :param job_name: name of the job (also its directory / zip base name).
    :param unzip: when true, call ``self._unzip(job_name)`` first.
    :param wait_for_workers: when true, block (polling every 3 seconds)
        until ``self.worker_tracker.workers`` is non-empty. If
        ``self.stopped`` is set while no worker is available, the method
        returns without starting the job.
    :return: always ``None``; failures are reported through the logger.
    """
    if wait_for_workers:
        # Loop on "no workers yet" rather than on the stop flag: if the
        # stop event is already set (or gets set between two polls) we
        # must abort instead of falling through and launching the job
        # with an empty worker list.
        while len(self.worker_tracker.workers) == 0:
            if self.stopped.wait(3):
                return

    if unzip:
        self._unzip(job_name)

    job_path = os.path.join(self.job_dir, job_name)
    job_desc = import_job_desc(job_path)
    job_master = JobMaster(self.ctx, job_name, job_desc,
                           self.worker_tracker.workers.keys())
    job_master.init()
    self.job_tracker.register_job(job_name, job_master)
    self._register_runned_job(job_name, job_desc)

    # Every worker needs the job package before it can prepare.
    zip_file = os.path.join(self.zip_dir, job_name + '.zip')
    for worker in job_master.workers:
        FileTransportClient(worker, zip_file).send_file()

    self.logger.debug(
        'entering the master prepare stage, job id: %s' % job_name)
    self.logger.debug(
        'job available workers: %s' % job_master.workers)
    stage = Stage(job_master.workers, 'prepare', logger=self.logger)
    prepared_ok = stage.barrier(True, job_name)
    if not prepared_ok:
        # Do not start the job on workers that failed to prepare.
        self.logger.error("prepare for running failed")
        return

    self.logger.debug(
        'entering the master run_job stage, job id: %s' % job_name)
    stage = Stage(job_master.workers, 'run_job', logger=self.logger)
    run_ok = stage.barrier(True, job_name)
    if not run_ok:
        self.logger.error("run job failed, job id: %s" % job_name)
def pack_job_error(self, job_name):
    """Bundle the job's error archives into a single zip file.

    Runs the 'pack_job_error' stage barrier over the job's workers
    (presumably this makes each worker deliver its own error archive into
    ``self.zip_dir`` -- confirm against the worker implementation), then
    moves every entry of ``self.zip_dir`` whose name ends with
    ``<job_name>_errors.zip`` into a temporary directory and compresses
    that directory into ``<working_dir>/errors/<job_name>_errors.zip``.

    :param job_name: name the job was registered under in the tracker.
    :return: path of the combined error archive.
    """
    job_master = self.job_tracker.get_job_master(job_name)
    Stage(job_master.workers, 'pack_job_error').barrier(True, job_name)

    error_dir = os.path.join(self.working_dir, 'errors')
    if not os.path.exists(error_dir):
        os.makedirs(error_dir)

    # The same "<job>_errors.zip" text names the result and selects inputs.
    archive_name = '%s_errors.zip' % job_name
    error_filename = os.path.join(error_dir, archive_name)

    staging_dir = tempfile.mkdtemp()
    try:
        matching = [entry for entry in os.listdir(self.zip_dir)
                    if entry.endswith(archive_name)]
        for entry in matching:
            shutil.move(os.path.join(self.zip_dir, entry), staging_dir)
        ZipHandler.compress(error_filename, staging_dir)
    finally:
        # The staging directory is removed whether or not packing worked.
        shutil.rmtree(staging_dir)

    return error_filename
def pack_job_error(self, job_name):
    """Collect the error archives of a job into one zip and return its path.

    After the 'pack_job_error' stage barrier has been run over the job's
    workers, every file in ``self.zip_dir`` whose name ends with
    ``<job_name>_errors.zip`` is moved to a scratch directory, and the
    scratch directory is compressed to
    ``<working_dir>/errors/<job_name>_errors.zip``. The scratch directory
    is always deleted afterwards.

    :param job_name: name the job was registered under in the tracker.
    :return: path of the resulting archive.
    """
    workers_owner = self.job_tracker.get_job_master(job_name)
    barrier_stage = Stage(workers_owner.workers, 'pack_job_error')
    barrier_stage.barrier(True, job_name)

    error_dir = os.path.join(self.working_dir, 'errors')
    if not os.path.exists(error_dir):
        os.makedirs(error_dir)

    suffix = '%s_errors.zip' % job_name
    error_filename = os.path.join(error_dir, suffix)

    scratch = tempfile.mkdtemp()
    try:
        for candidate in os.listdir(self.zip_dir):
            if not candidate.endswith(suffix):
                continue
            source = os.path.join(self.zip_dir, candidate)
            shutil.move(source, scratch)
        ZipHandler.compress(error_filename, scratch)
    finally:
        shutil.rmtree(scratch)

    return error_filename
def run_job(self, job_name, unzip=False, wait_for_workers=False):
    """Register a job, ship it to the workers and start it.

    Steps, in order:

    1. Optionally wait until at least one worker is known.
    2. Optionally unzip the job package via ``self._unzip``.
    3. Import the job description from ``<job_dir>/<job_name>``, build and
       initialise a ``JobMaster`` for the currently known workers, and
       register it with the job tracker.
    4. Send ``<zip_dir>/<job_name>.zip`` to every worker of the job.
    5. Run the 'prepare' stage barrier, then the 'run_job' stage barrier.
       The barrier results are not inspected here.

    :param job_name: name of the job (also its directory / zip base name).
    :param unzip: when true, call ``self._unzip(job_name)`` first.
    :param wait_for_workers: when true, block (polling every 3 seconds)
        until ``self.worker_tracker.workers`` is non-empty. If
        ``self.stopped`` is set while no worker is available, the method
        returns without starting the job.
    :return: always ``None``.
    """
    if wait_for_workers:
        # Loop on "no workers yet" rather than on the stop flag: if the
        # stop event is already set (or gets set between two polls) we
        # must abort instead of falling through and launching the job
        # with an empty worker list.
        while len(self.worker_tracker.workers) == 0:
            if self.stopped.wait(3):
                return

    if unzip:
        self._unzip(job_name)

    job_path = os.path.join(self.job_dir, job_name)
    job_desc = import_job_desc(job_path)
    job_master = JobMaster(self.ctx, job_name, job_desc,
                           self.worker_tracker.workers.keys())
    job_master.init()
    self.job_tracker.register_job(job_name, job_master)
    self._register_runned_job(job_name, job_desc)

    # Every worker needs the job package before it can prepare.
    zip_file = os.path.join(self.zip_dir, job_name + '.zip')
    for worker in job_master.workers:
        FileTransportClient(worker, zip_file).send_file()

    self.logger.debug(
        'entering the master prepare stage, job id: %s' % job_name)
    self.logger.debug(
        'job available workers: %s' % job_master.workers)
    stage = Stage(job_master.workers, 'prepare')
    stage.barrier(True, job_name)

    self.logger.debug(
        'entering the master run_job stage, job id: %s' % job_name)
    stage = Stage(job_master.workers, 'run_job')
    stage.barrier(True, job_name)
def _shutdown_all_workers(self):
    """Run the 'shutdown' stage barrier over every tracked worker."""
    all_workers = self.worker_tracker.workers.keys()
    shutdown_stage = Stage(all_workers, 'shutdown')
    shutdown_stage.barrier(True)