def __init__(self, ctx):
    """Initialize master state: directories, trackers, RPC and transfer servers.

    ``ctx`` is the running context; its ``master_rpc_server`` must already
    be constructed.
    """
    self.ctx = ctx
    self.rpc_server = self.ctx.master_rpc_server
    assert self.rpc_server is not None

    # directory layout: <ctx.working_dir>/master/{zip,jobs}
    self.working_dir = os.path.join(self.ctx.working_dir, 'master')
    self.zip_dir = os.path.join(self.working_dir, 'zip')
    self.job_dir = os.path.join(self.working_dir, 'jobs')
    for dir_ in (self.zip_dir, self.job_dir):
        if not os.path.exists(dir_):
            os.makedirs(dir_)

    self.worker_tracker = WorkerTracker()
    self.job_tracker = JobTracker()
    self.black_list = []
    self.stopped = threading.Event()

    self.logger = get_logger("cola_master")
    self._init_log_server(self.logger)

    self._register_rpc()
    self.load()
    FileTransportServer(self.rpc_server, self.zip_dir)
class Master(object):
    """Cluster coordinator.

    Tracks workers through heartbeats, distributes zipped job packages,
    drives the prepare/run/stop lifecycle of jobs across all workers,
    and collects per-job error archives.
    """

    def __init__(self, ctx):
        """Initialize master state: directories, trackers, RPC and transfer servers."""
        self.ctx = ctx
        self.rpc_server = self.ctx.master_rpc_server
        assert self.rpc_server is not None

        # directory layout: <ctx.working_dir>/master/{zip,jobs}
        self.working_dir = os.path.join(self.ctx.working_dir, 'master')
        self.zip_dir = os.path.join(self.working_dir, 'zip')
        self.job_dir = os.path.join(self.working_dir, 'jobs')
        for dir_ in (self.zip_dir, self.job_dir):
            if not os.path.exists(dir_):
                os.makedirs(dir_)

        self.worker_tracker = WorkerTracker()
        self.job_tracker = JobTracker()
        self.black_list = []
        self.stopped = threading.Event()

        self.logger = get_logger("cola_master")
        self._init_log_server(self.logger)

        self._register_rpc()
        self.load()
        FileTransportServer(self.rpc_server, self.zip_dir)

    def load(self):
        """Restore metadata of previously run jobs from disk, best effort."""
        self.runned_job_metas = {}
        job_meta_file = os.path.join(self.working_dir,
                                     JOB_META_STATUS_FILENAME)
        if os.path.exists(job_meta_file) and \
                os.path.getsize(job_meta_file) > 0:
            try:
                # pickle data is binary: must be read with 'rb'
                with open(job_meta_file, 'rb') as f:
                    self.runned_job_metas = pickle.load(f)
            except Exception:
                # a corrupt meta file must not prevent startup, but
                # swallowing it silently hid the problem -- log it
                self.logger.exception(
                    'failed to load job meta file: %s' % job_meta_file)

    def save(self):
        """Persist metadata of run jobs so it survives a master restart."""
        job_meta_file = os.path.join(self.working_dir,
                                     JOB_META_STATUS_FILENAME)
        # pickle data is binary: must be written with 'wb'
        with open(job_meta_file, 'wb') as f:
            pickle.dump(self.runned_job_metas, f)

    def _register_rpc(self):
        """Expose the master's public operations on the RPC server."""
        register = self.rpc_server.register_function
        register(self.run_job, 'run_job')
        register(self.stop_job, 'stop_job')
        register(self.pack_job_error, 'pack_job_error')
        register(self.list_runnable_jobs, 'runnable_jobs')
        # list() so the result is marshallable and detached from the live dict
        register(lambda: list(self.job_tracker.running_jobs.keys()),
                 'running_jobs')
        register(self.list_workers, 'list_workers')
        register(self.shutdown, 'shutdown')
        register(self.register_heartbeat, 'register_heartbeat')

    def register_heartbeat(self, worker):
        """Record a heartbeat from ``worker``; return all known workers."""
        self.worker_tracker.register_worker(worker)
        return list(self.worker_tracker.workers.keys())

    def _init_log_server(self, logger):
        # NOTE: the ``logger`` parameter is kept for interface compatibility;
        # the receiver is wired to self.logger (the same object at call site)
        self.log_server = LogRecordSocketReceiver(host=self.ctx.ip,
                                                  logger=self.logger)
        self.log_t = threading.Thread(target=self.log_server.serve_forever)
        self.log_t.start()

    def _shutdown_log_server(self):
        if hasattr(self, 'log_server'):
            self.log_server.shutdown()
            self.log_t.join()

    def _check_workers(self):
        """Background loop: watch heartbeats, black-list dead workers,
        and re-enlist recovered workers into running jobs."""
        while not self.stopped.is_set():
            # iterate over a snapshot: register_heartbeat may mutate the
            # workers dict concurrently from the RPC thread
            for worker, info in list(self.worker_tracker.workers.items()):
                if int(time.time()) - info.last_update \
                        > HEARTBEAT_CHECK_INTERVAL:
                    # lost connection: degrade RUNNING -> HANGUP -> STOPPED
                    info.continous_register = 0
                    if info.status == RUNNING:
                        info.status = HANGUP
                    elif info.status == HANGUP:
                        info.status = STOPPED
                        self.black_list.append(worker)
                        for job in list(self.job_tracker.running_jobs):
                            self.job_tracker.remove_worker(job, worker)
                # heartbeats arrived continuously for long enough
                elif info.continous_register >= CONTINOUS_HEARTBEAT:
                    if info.status != RUNNING:
                        info.status = RUNNING
                    if worker in self.black_list:
                        self.black_list.remove(worker)
                    for job in list(self.job_tracker.running_jobs):
                        if not client_call(worker, 'has_job'):
                            client_call(worker, 'prepare', job)
                            client_call(worker, 'run_job', job)
                            self.job_tracker.add_worker(job, worker)
            self.stopped.wait(HEARTBEAT_CHECK_INTERVAL)

    def _check_jobs(self):
        """Background loop: stop jobs whose budget is fully consumed."""
        while not self.stopped.is_set():
            # snapshot: stop_job/remove_job mutate running_jobs
            for job_master in list(self.job_tracker.running_jobs.values()):
                if job_master.budget_server.get_status() == ALLFINISHED:
                    self.stop_job(job_master.job_name)
                    self.job_tracker.remove_job(job_master.job_name)
            self.stopped.wait(JOB_CHECK_INTERVAL)

    def _unzip(self, job_name):
        """Extract the uploaded job package into the jobs directory."""
        zip_file = os.path.join(self.zip_dir, job_name + '.zip')
        if os.path.exists(zip_file):
            ZipHandler.uncompress(zip_file, self.job_dir)

    def _register_runned_job(self, job_name, job_desc):
        # remember the job so it is still known after a master restart
        self.runned_job_metas[job_name] = {'job_name': job_desc.name,
                                           'created': time.time()}

    def run(self):
        """Start the worker- and job-monitoring background threads."""
        self._worker_t = threading.Thread(target=self._check_workers)
        self._worker_t.start()
        self._job_t = threading.Thread(target=self._check_jobs)
        self._job_t.start()

    def run_job(self, job_name, unzip=False, wait_for_workers=False):
        """Distribute ``job_name`` to all registered workers and start it.

        :param unzip: extract the job's zip package before importing it
        :param wait_for_workers: block until at least one worker registers
        """
        if wait_for_workers:
            while not self.stopped.is_set():
                if len(self.worker_tracker.workers) > 0:
                    break
                stopped = self.stopped.wait(3)
                if stopped:
                    return

        if unzip:
            self._unzip(job_name)
        job_path = os.path.join(self.job_dir, job_name)
        job_desc = import_job_desc(job_path)
        job_master = JobMaster(self.ctx, job_name, job_desc,
                               list(self.worker_tracker.workers.keys()))
        job_master.init()
        self.job_tracker.register_job(job_name, job_master)
        self._register_runned_job(job_name, job_desc)

        # ship the job package to every participating worker
        zip_file = os.path.join(self.zip_dir, job_name + '.zip')
        for worker in job_master.workers:
            FileTransportClient(worker, zip_file).send_file()

        self.logger.debug(
            'entering the master prepare stage, job id: %s' % job_name)
        self.logger.debug('job available workers: %s' % job_master.workers)
        stage = Stage(job_master.workers, 'prepare')
        prepared_ok = stage.barrier(True, job_name)
        if not prepared_ok:
            self.logger.error("prepare for running failed")
            return

        self.logger.debug(
            'entering the master run_job stage, job id: %s' % job_name)
        stage = Stage(job_master.workers, 'run_job')
        run_ok = stage.barrier(True, job_name)
        if not run_ok:
            self.logger.error("run job failed, job id: %s" % job_name)

    def stop_job(self, job_name):
        """Stop and clear ``job_name`` on all of its workers."""
        job_master = self.job_tracker.get_job_master(job_name)
        if job_master is None:
            # job already gone (e.g. stopped concurrently) -- nothing to do
            return
        for stage_name in ('stop_job', 'clear_job'):
            Stage(job_master.workers, stage_name).barrier(True, job_name)
        self.job_tracker.remove_job(job_name)
        self.logger.debug('stop job: %s' % job_name)

    def pack_job_error(self, job_name):
        """Collect error archives from workers and bundle them into one zip.

        :return: path of the combined ``<job>_errors.zip`` file
        """
        job_master = self.job_tracker.get_job_master(job_name)
        Stage(job_master.workers, 'pack_job_error').barrier(True, job_name)

        error_dir = os.path.join(self.working_dir, 'errors')
        if not os.path.exists(error_dir):
            os.makedirs(error_dir)
        error_filename = os.path.join(error_dir, '%s_errors.zip' % job_name)

        suffix = '%s_errors.zip' % job_name
        temp_dir = tempfile.mkdtemp()
        try:
            # gather the per-worker archives uploaded into zip_dir
            for name in os.listdir(self.zip_dir):
                if name.endswith(suffix):
                    shutil.move(os.path.join(self.zip_dir, name), temp_dir)
            ZipHandler.compress(error_filename, temp_dir)
        finally:
            shutil.rmtree(temp_dir)
        return error_filename

    def list_runnable_jobs(self):
        """Map job directory name to human-readable job name for every
        job package present under the jobs directory."""
        jobs = {}
        for entry in os.listdir(self.job_dir):
            path = os.path.join(self.job_dir, entry)
            if os.path.isdir(path):
                jobs[entry] = import_job_desc(path).name
        return jobs

    def has_running_jobs(self):
        """Return True when at least one job is currently tracked as running."""
        return len(self.job_tracker.running_jobs) > 0

    def list_workers(self):
        """Return ``(worker, status_string)`` for every known worker."""
        return [(worker, STATUSES[info.status])
                for worker, info in list(self.worker_tracker.workers.items())]

    def _stop_all_jobs(self):
        for job_name in list(self.job_tracker.running_jobs.keys()):
            self.stop_job(job_name)

    def _shutdown_all_workers(self):
        stage = Stage(list(self.worker_tracker.workers.keys()), 'shutdown')
        stage.barrier(True)

    def shutdown(self):
        """Stop all jobs and workers, persist state, and tear everything down."""
        # nothing to do unless run() has started the monitor threads
        if not hasattr(self, '_worker_t') or not hasattr(self, '_job_t'):
            return
        self.logger.debug('master starts to shutdown')
        self.stopped.set()
        self._stop_all_jobs()
        self._shutdown_all_workers()
        self._worker_t.join()
        self._job_t.join()
        self.save()
        self.rpc_server.shutdown()
        self.logger.debug('master shutdown finished')
        self._shutdown_log_server()
class Master(object):
    """Cluster coordinator.

    Tracks workers through heartbeats, distributes zipped job packages,
    drives the prepare/run/stop lifecycle of jobs across all workers,
    and collects per-job error archives.
    """

    def __init__(self, ctx):
        """Initialize master state: directories, trackers, RPC and transfer servers."""
        self.ctx = ctx
        self.rpc_server = self.ctx.master_rpc_server
        assert self.rpc_server is not None

        # directory layout: <ctx.working_dir>/master/{zip,jobs}
        self.working_dir = os.path.join(self.ctx.working_dir, 'master')
        self.zip_dir = os.path.join(self.working_dir, 'zip')
        self.job_dir = os.path.join(self.working_dir, 'jobs')
        for dir_ in (self.zip_dir, self.job_dir):
            if not os.path.exists(dir_):
                os.makedirs(dir_)

        self.worker_tracker = WorkerTracker()
        self.job_tracker = JobTracker()
        self.black_list = []
        self.stopped = threading.Event()

        self.logger = get_logger("cola_master")
        self._init_log_server(self.logger)

        self._register_rpc()
        self.load()
        FileTransportServer(self.rpc_server, self.zip_dir)

    def load(self):
        """Restore metadata of previously run jobs from disk, best effort."""
        self.runned_job_metas = {}
        job_meta_file = os.path.join(self.working_dir,
                                     JOB_META_STATUS_FILENAME)
        if os.path.exists(job_meta_file) and \
                os.path.getsize(job_meta_file) > 0:
            try:
                # pickle data is binary: must be read with 'rb'
                with open(job_meta_file, 'rb') as f:
                    self.runned_job_metas = pickle.load(f)
            except Exception:
                # a corrupt meta file must not prevent startup, but
                # swallowing it silently hid the problem -- log it
                self.logger.exception(
                    'failed to load job meta file: %s' % job_meta_file)

    def save(self):
        """Persist metadata of run jobs so it survives a master restart."""
        job_meta_file = os.path.join(self.working_dir,
                                     JOB_META_STATUS_FILENAME)
        # pickle data is binary: must be written with 'wb'
        with open(job_meta_file, 'wb') as f:
            pickle.dump(self.runned_job_metas, f)

    def _register_rpc(self):
        """Expose the master's public operations on the RPC server."""
        register = self.rpc_server.register_function
        register(self.run_job, 'run_job')
        register(self.stop_job, 'stop_job')
        register(self.pack_job_error, 'pack_job_error')
        register(self.list_runnable_jobs, 'runnable_jobs')
        # list() so the result is marshallable and detached from the live dict
        register(lambda: list(self.job_tracker.running_jobs.keys()),
                 'running_jobs')
        register(self.list_workers, 'list_workers')
        register(self.shutdown, 'shutdown')
        register(self.register_heartbeat, 'register_heartbeat')

    def register_heartbeat(self, worker):
        """Record a heartbeat from ``worker``; return all known workers."""
        self.worker_tracker.register_worker(worker)
        return list(self.worker_tracker.workers.keys())

    def _init_log_server(self, logger):
        # NOTE: the ``logger`` parameter is kept for interface compatibility;
        # the receiver is wired to self.logger (the same object at call site)
        self.log_server = LogRecordSocketReceiver(host=self.ctx.ip,
                                                  logger=self.logger)
        self.log_t = threading.Thread(target=self.log_server.serve_forever)
        self.log_t.start()

    def _shutdown_log_server(self):
        if hasattr(self, 'log_server'):
            self.log_server.shutdown()
            self.log_t.join()

    def _check_workers(self):
        """Background loop: watch heartbeats, black-list dead workers,
        and re-enlist recovered workers into running jobs."""
        while not self.stopped.is_set():
            # iterate over a snapshot: register_heartbeat may mutate the
            # workers dict concurrently from the RPC thread
            for worker, info in list(self.worker_tracker.workers.items()):
                if int(time.time()) - info.last_update \
                        > HEARTBEAT_CHECK_INTERVAL:
                    # lost connection: degrade RUNNING -> HANGUP -> STOPPED
                    info.continous_register = 0
                    if info.status == RUNNING:
                        info.status = HANGUP
                    elif info.status == HANGUP:
                        info.status = STOPPED
                        self.black_list.append(worker)
                        for job in list(self.job_tracker.running_jobs):
                            self.job_tracker.remove_worker(job, worker)
                # heartbeats arrived continuously for long enough
                elif info.continous_register >= CONTINOUS_HEARTBEAT:
                    if info.status != RUNNING:
                        info.status = RUNNING
                    if worker in self.black_list:
                        self.black_list.remove(worker)
                    for job in list(self.job_tracker.running_jobs):
                        if not client_call(worker, 'has_job'):
                            client_call(worker, 'prepare', job)
                            client_call(worker, 'run_job', job)
                            self.job_tracker.add_worker(job, worker)
            self.stopped.wait(HEARTBEAT_CHECK_INTERVAL)

    def _check_jobs(self):
        """Background loop: stop jobs whose budget is fully consumed."""
        while not self.stopped.is_set():
            # snapshot: stop_job/remove_job mutate running_jobs
            for job_master in list(self.job_tracker.running_jobs.values()):
                if job_master.budget_server.get_status() == ALLFINISHED:
                    self.stop_job(job_master.job_name)
                    self.job_tracker.remove_job(job_master.job_name)
            self.stopped.wait(JOB_CHECK_INTERVAL)

    def _unzip(self, job_name):
        """Extract the uploaded job package into the jobs directory."""
        zip_file = os.path.join(self.zip_dir, job_name + '.zip')
        if os.path.exists(zip_file):
            ZipHandler.uncompress(zip_file, self.job_dir)

    def _register_runned_job(self, job_name, job_desc):
        # remember the job so it is still known after a master restart
        self.runned_job_metas[job_name] = {'job_name': job_desc.name,
                                           'created': time.time()}

    def run(self):
        """Start the worker- and job-monitoring background threads."""
        self._worker_t = threading.Thread(target=self._check_workers)
        self._worker_t.start()
        self._job_t = threading.Thread(target=self._check_jobs)
        self._job_t.start()

    def run_job(self, job_name, unzip=False, wait_for_workers=False):
        """Distribute ``job_name`` to all registered workers and start it.

        :param unzip: extract the job's zip package before importing it
        :param wait_for_workers: block until at least one worker registers
        """
        if wait_for_workers:
            while not self.stopped.is_set():
                if len(self.worker_tracker.workers) > 0:
                    break
                stopped = self.stopped.wait(3)
                if stopped:
                    return

        if unzip:
            self._unzip(job_name)
        job_path = os.path.join(self.job_dir, job_name)
        job_desc = import_job_desc(job_path)
        job_master = JobMaster(self.ctx, job_name, job_desc,
                               list(self.worker_tracker.workers.keys()))
        job_master.init()
        self.job_tracker.register_job(job_name, job_master)
        self._register_runned_job(job_name, job_desc)

        # ship the job package to every participating worker
        zip_file = os.path.join(self.zip_dir, job_name + '.zip')
        for worker in job_master.workers:
            FileTransportClient(worker, zip_file).send_file()

        self.logger.debug(
            'entering the master prepare stage, job id: %s' % job_name)
        self.logger.debug('job available workers: %s' % job_master.workers)
        stage = Stage(job_master.workers, 'prepare')
        # barrier results were previously ignored; abort on a failed prepare
        prepared_ok = stage.barrier(True, job_name)
        if not prepared_ok:
            self.logger.error("prepare for running failed")
            return

        self.logger.debug(
            'entering the master run_job stage, job id: %s' % job_name)
        stage = Stage(job_master.workers, 'run_job')
        run_ok = stage.barrier(True, job_name)
        if not run_ok:
            self.logger.error("run job failed, job id: %s" % job_name)

    def stop_job(self, job_name):
        """Stop and clear ``job_name`` on all of its workers."""
        job_master = self.job_tracker.get_job_master(job_name)
        if job_master is None:
            # job already gone (e.g. stopped concurrently) -- nothing to do
            return
        for stage_name in ('stop_job', 'clear_job'):
            Stage(job_master.workers, stage_name).barrier(True, job_name)
        self.job_tracker.remove_job(job_name)
        self.logger.debug('stop job: %s' % job_name)

    def pack_job_error(self, job_name):
        """Collect error archives from workers and bundle them into one zip.

        :return: path of the combined ``<job>_errors.zip`` file
        """
        job_master = self.job_tracker.get_job_master(job_name)
        Stage(job_master.workers, 'pack_job_error').barrier(True, job_name)

        error_dir = os.path.join(self.working_dir, 'errors')
        if not os.path.exists(error_dir):
            os.makedirs(error_dir)
        error_filename = os.path.join(error_dir, '%s_errors.zip' % job_name)

        suffix = '%s_errors.zip' % job_name
        temp_dir = tempfile.mkdtemp()
        try:
            # gather the per-worker archives uploaded into zip_dir
            for name in os.listdir(self.zip_dir):
                if name.endswith(suffix):
                    shutil.move(os.path.join(self.zip_dir, name), temp_dir)
            ZipHandler.compress(error_filename, temp_dir)
        finally:
            shutil.rmtree(temp_dir)
        return error_filename

    def list_runnable_jobs(self):
        """Map job directory name to human-readable job name for every
        job package present under the jobs directory."""
        jobs = {}
        for entry in os.listdir(self.job_dir):
            path = os.path.join(self.job_dir, entry)
            if os.path.isdir(path):
                jobs[entry] = import_job_desc(path).name
        return jobs

    def has_running_jobs(self):
        """Return True when at least one job is currently tracked as running."""
        return len(self.job_tracker.running_jobs) > 0

    def list_workers(self):
        """Return ``(worker, status_string)`` for every known worker."""
        return [(worker, STATUSES[info.status])
                for worker, info in list(self.worker_tracker.workers.items())]

    def _stop_all_jobs(self):
        for job_name in list(self.job_tracker.running_jobs.keys()):
            self.stop_job(job_name)

    def _shutdown_all_workers(self):
        stage = Stage(list(self.worker_tracker.workers.keys()), 'shutdown')
        stage.barrier(True)

    def shutdown(self):
        """Stop all jobs and workers, persist state, and tear everything down."""
        # nothing to do unless run() has started the monitor threads
        if not hasattr(self, '_worker_t') or not hasattr(self, '_job_t'):
            return
        self.logger.debug('master starts to shutdown')
        self.stopped.set()
        self._stop_all_jobs()
        self._shutdown_all_workers()
        self._worker_t.join()
        self._job_t.join()
        self.save()
        self.rpc_server.shutdown()
        self.logger.debug('master shutdown finished')
        self._shutdown_log_server()