Ejemplo n.º 1
0
    def __init__(self, ctx):
        """Build the master from the given context.

        Reads ``master_rpc_server`` and ``working_dir`` from ``ctx``, makes
        sure the ``zip`` and ``jobs`` directories exist below
        ``<working_dir>/master``, creates the worker/job trackers, then
        calls ``_init_log_server``, ``_register_rpc`` and ``load`` and
        finally attaches a ``FileTransportServer`` for the zip directory to
        the RPC server.

        :param ctx: context object providing ``master_rpc_server`` and
            ``working_dir``.
        """
        self.ctx = ctx
        self.rpc_server = ctx.master_rpc_server
        assert self.rpc_server is not None

        # Everything the master stores lives below <working_dir>/master.
        master_root = os.path.join(ctx.working_dir, 'master')
        self.working_dir = master_root
        self.zip_dir = os.path.join(master_root, 'zip')
        self.job_dir = os.path.join(master_root, 'jobs')
        for required_dir in (self.zip_dir, self.job_dir):
            if not os.path.exists(required_dir):
                os.makedirs(required_dir)

        self.worker_tracker = WorkerTracker()
        self.job_tracker = JobTracker()
        self.black_list = []
        self.stopped = threading.Event()

        master_logger = get_logger("cola_master")
        self.logger = master_logger
        self._init_log_server(master_logger)

        self._register_rpc()
        self.load()
        FileTransportServer(self.rpc_server, self.zip_dir)
Ejemplo n.º 2
0
    def __init__(self, ctx):
        """Initialise the master's state from ``ctx``.

        The RPC server and the base working directory are taken from the
        context. The ``zip`` and ``jobs`` sub-directories of
        ``<working_dir>/master`` are created when missing. Afterwards the
        trackers, the stop event and the logger are set up, and
        ``_init_log_server``, ``_register_rpc`` and ``load`` are invoked
        before a ``FileTransportServer`` is attached to the RPC server.

        :param ctx: context object providing ``master_rpc_server`` and
            ``working_dir``.
        """
        self.ctx = ctx
        self.rpc_server = ctx.master_rpc_server
        assert self.rpc_server is not None

        base_dir = os.path.join(ctx.working_dir, 'master')
        zip_dir = os.path.join(base_dir, 'zip')
        job_dir = os.path.join(base_dir, 'jobs')
        self.working_dir = base_dir
        self.zip_dir = zip_dir
        self.job_dir = job_dir
        # Create the storage directories on first start.
        for path in (zip_dir, job_dir):
            if not os.path.exists(path):
                os.makedirs(path)

        self.worker_tracker = WorkerTracker()
        self.job_tracker = JobTracker()
        self.black_list = []
        self.stopped = threading.Event()

        self.logger = get_logger("cola_master")
        self._init_log_server(self.logger)

        self._register_rpc()
        self.load()
        FileTransportServer(self.rpc_server, zip_dir)
Ejemplo n.º 3
0
class Master(object):
    """Coordinator that tracks workers and dispatches jobs to them.

    The master registers its public operations on the RPC server found on
    the context, keeps a worker tracker and a job tracker, runs a log-record
    receiver in a thread and, once :meth:`run` has been called, two
    background threads that periodically check worker heartbeats and job
    completion.
    """

    def __init__(self, ctx):
        """Set up directories, trackers, logging and RPC for the master.

        :param ctx: context object; ``master_rpc_server``, ``working_dir``
            and ``ip`` are read from it.
        """
        self.ctx = ctx
        self.rpc_server = self.ctx.master_rpc_server
        assert self.rpc_server is not None

        self.working_dir = os.path.join(self.ctx.working_dir, 'master')
        self.zip_dir = os.path.join(self.working_dir, 'zip')
        self.job_dir = os.path.join(self.working_dir, 'jobs')
        for directory in (self.zip_dir, self.job_dir):
            if not os.path.exists(directory):
                os.makedirs(directory)

        self.worker_tracker = WorkerTracker()
        self.job_tracker = JobTracker()

        # Workers moved to STOPPED by _check_workers are collected here.
        self.black_list = []

        # Set by shutdown(); the background loops poll it.
        self.stopped = threading.Event()

        self.logger = get_logger("cola_master")
        self._init_log_server(self.logger)

        self._register_rpc()
        self.load()
        FileTransportServer(self.rpc_server, self.zip_dir)

    def load(self):
        """Restore the metadata of previously run jobs from disk.

        Loading is best effort: when the status file is missing, empty or
        cannot be unpickled, ``runned_job_metas`` is left as an empty dict
        and the failure is logged instead of being silently discarded.
        """
        self.runned_job_metas = {}

        job_meta_file = os.path.join(self.working_dir,
                                     JOB_META_STATUS_FILENAME)
        if not os.path.exists(job_meta_file) or \
                os.path.getsize(job_meta_file) == 0:
            return

        try:
            # Pickle streams are binary data; text mode can corrupt them.
            # NOTE(review): unpickling runs arbitrary code, so this file
            # must only ever be written by the master itself.
            with open(job_meta_file, 'rb') as f:
                self.runned_job_metas = pickle.load(f)
        except Exception:
            self.logger.warning(
                'failed to load job metas from %s, starting empty',
                job_meta_file, exc_info=True)

    def save(self):
        """Persist ``runned_job_metas`` to the status file with pickle."""
        job_meta_file = os.path.join(self.working_dir,
                                     JOB_META_STATUS_FILENAME)
        # Binary mode: see load().
        with open(job_meta_file, 'wb') as f:
            pickle.dump(self.runned_job_metas, f)

    def _register_rpc(self):
        """Register the master's remotely callable functions."""
        register = self.rpc_server.register_function
        register(self.run_job, 'run_job')
        register(self.stop_job, 'stop_job')
        register(self.pack_job_error, 'pack_job_error')
        register(self.list_runnable_jobs, 'runnable_jobs')
        # Return a real list so the result is a plain sequence on every
        # Python version.
        register(lambda: list(self.job_tracker.running_jobs.keys()),
                 'running_jobs')
        register(self.list_workers, 'list_workers')
        register(self.shutdown, 'shutdown')
        register(self.register_heartbeat, 'register_heartbeat')

    def register_heartbeat(self, worker):
        """Record a heartbeat from ``worker``.

        :param worker: identifier passed on to the worker tracker.
        :returns: list of the keys of the tracker's ``workers`` mapping.
        """
        self.worker_tracker.register_worker(worker)
        return list(self.worker_tracker.workers.keys())

    def _init_log_server(self, logger):
        """Start a log-record receiver in its own thread.

        :param logger: logger handed to the receiver.
        """
        # Use the logger that was passed in rather than ignoring it.
        self.log_server = LogRecordSocketReceiver(host=self.ctx.ip,
                                                  logger=logger)
        self.log_t = threading.Thread(target=self.log_server.serve_forever)
        self.log_t.start()

    def _shutdown_log_server(self):
        """Stop the log receiver and wait for its thread, if it exists."""
        if hasattr(self, 'log_server'):
            self.log_server.shutdown()
            self.log_t.join()

    def _check_workers(self):
        """Update worker states from their heartbeats until stopped.

        A worker whose last update is older than the check interval goes
        RUNNING -> HANGUP -> STOPPED on successive checks; a STOPPED worker
        is black-listed and removed from every running job. A worker with
        enough consecutive heartbeats is marked RUNNING again, taken off
        the black list and (re)attached to the running jobs.
        """
        while not self.stopped.is_set():
            # Iterate over snapshots: heartbeats and job changes arrive on
            # other threads and may resize the dicts while we loop.
            for worker, info in list(self.worker_tracker.workers.items()):
                # connection lost
                if int(time.time()) - info.last_update \
                        > HEARTBEAT_CHECK_INTERVAL:

                    info.continous_register = 0
                    if info.status == RUNNING:
                        info.status = HANGUP
                    elif info.status == HANGUP:
                        info.status = STOPPED
                        self.black_list.append(worker)

                        for job in list(self.job_tracker.running_jobs):
                            self.job_tracker.remove_worker(job, worker)

                # registered continuously for long enough
                elif info.continous_register >= CONTINOUS_HEARTBEAT:
                    if info.status != RUNNING:
                        info.status = RUNNING
                    if worker in self.black_list:
                        self.black_list.remove(worker)

                    for job in list(self.job_tracker.running_jobs):
                        if not client_call(worker, 'has_job'):
                            client_call(worker, 'prepare', job)
                            client_call(worker, 'run_job', job)
                        self.job_tracker.add_worker(job, worker)

            self.stopped.wait(HEARTBEAT_CHECK_INTERVAL)

    def _check_jobs(self):
        """Stop every job whose budget server reports ALLFINISHED."""
        while not self.stopped.is_set():
            # Snapshot: stop_job() removes entries from the tracker.
            for job_master in list(self.job_tracker.running_jobs.values()):
                if job_master.budget_server.get_status() == ALLFINISHED:
                    # stop_job() already removes the job from the tracker,
                    # so it must not be removed a second time here.
                    self.stop_job(job_master.job_name)
            self.stopped.wait(JOB_CHECK_INTERVAL)

    def _unzip(self, job_name):
        """Extract ``<zip_dir>/<job_name>.zip`` into ``job_dir`` if present."""
        zip_file = os.path.join(self.zip_dir, job_name + '.zip')
        if os.path.exists(zip_file):
            ZipHandler.uncompress(zip_file, self.job_dir)

    def _register_runned_job(self, job_name, job_desc):
        """Remember that ``job_name`` was run, with its name and start time."""
        self.runned_job_metas[job_name] = {
            'job_name': job_desc.name,
            'created': time.time()
        }

    def run(self):
        """Start the worker-check and job-check background threads."""
        self._worker_t = threading.Thread(target=self._check_workers)
        self._worker_t.start()

        self._job_t = threading.Thread(target=self._check_jobs)
        self._job_t.start()

    def run_job(self, job_name, unzip=False, wait_for_workers=False):
        """Start the job ``job_name`` on the currently registered workers.

        :param job_name: name of the job directory below ``job_dir`` (and of
            ``<job_name>.zip`` below ``zip_dir``).
        :param unzip: when true, extract the job's zip archive first.
        :param wait_for_workers: when true, block until at least one worker
            is registered. If the master is stopped while no worker is
            available, nothing is started and ``None`` is returned.
        """
        if wait_for_workers:
            while not self.worker_tracker.workers:
                self.stopped.wait(3)
                # Test the flag itself instead of wait()'s return value so
                # an already-stopped master never falls through and starts
                # a job without workers.
                if self.stopped.is_set():
                    return

        if unzip:
            self._unzip(job_name)

        job_path = os.path.join(self.job_dir, job_name)
        job_desc = import_job_desc(job_path)
        job_master = JobMaster(self.ctx, job_name, job_desc,
                               list(self.worker_tracker.workers.keys()))
        job_master.init()
        self.job_tracker.register_job(job_name, job_master)
        self._register_runned_job(job_name, job_desc)

        zip_file = os.path.join(self.zip_dir, job_name + '.zip')
        for worker in job_master.workers:
            FileTransportClient(worker, zip_file).send_file()

        self.logger.debug('entering the master prepare stage, job id: %s',
                          job_name)
        self.logger.debug('job available workers: %s', job_master.workers)
        stage = Stage(job_master.workers, 'prepare')
        prepared_ok = stage.barrier(True, job_name)
        if not prepared_ok:
            self.logger.error("prepare for running failed")
            return

        self.logger.debug('entering the master run_job stage, job id: %s',
                          job_name)
        stage = Stage(job_master.workers, 'run_job')
        run_ok = stage.barrier(True, job_name)
        if not run_ok:
            self.logger.error("run job failed, job id: %s", job_name)

    def stop_job(self, job_name):
        """Run the 'stop_job' and 'clear_job' stages, then untrack the job."""
        job_master = self.job_tracker.get_job_master(job_name)
        stage = Stage(job_master.workers, 'stop_job')
        stage.barrier(True, job_name)

        stage = Stage(job_master.workers, 'clear_job')
        stage.barrier(True, job_name)

        self.job_tracker.remove_job(job_name)

        self.logger.debug('stop job: %s', job_name)

    def pack_job_error(self, job_name):
        """Bundle the error archives of ``job_name`` into a single zip.

        Runs the 'pack_job_error' stage on the job's workers, moves every
        file in ``zip_dir`` whose name ends with ``<job_name>_errors.zip``
        into a temporary directory and compresses that directory.

        :returns: path of ``<working_dir>/errors/<job_name>_errors.zip``.
        """
        job_master = self.job_tracker.get_job_master(job_name)
        stage = Stage(job_master.workers, 'pack_job_error')
        stage.barrier(True, job_name)

        error_dir = os.path.join(self.working_dir, 'errors')
        if not os.path.exists(error_dir):
            os.makedirs(error_dir)

        suffix = '%s_errors.zip' % job_name
        error_filename = os.path.join(error_dir, suffix)

        temp_dir = tempfile.mkdtemp()
        try:
            for name in os.listdir(self.zip_dir):
                if name.endswith(suffix):
                    shutil.move(os.path.join(self.zip_dir, name), temp_dir)
            ZipHandler.compress(error_filename, temp_dir)
        finally:
            # Always drop the scratch directory, even if compression fails.
            shutil.rmtree(temp_dir)

        return error_filename

    def list_runnable_jobs(self):
        """Map each job directory name in ``job_dir`` to its job's name."""
        job_dirs = [entry for entry in os.listdir(self.job_dir)
                    if os.path.isdir(os.path.join(self.job_dir, entry))]

        jobs = {}
        for job_dir in job_dirs:
            desc = import_job_desc(os.path.join(self.job_dir, job_dir))
            jobs[job_dir] = desc.name
        return jobs

    def has_running_jobs(self):
        """Return True when the job tracker holds at least one running job."""
        return len(self.job_tracker.running_jobs) > 0

    def list_workers(self):
        """Return ``(worker, status text)`` pairs for every tracked worker."""
        # Snapshot: heartbeats may add workers while the list is built.
        return [(worker, STATUSES[worker_info.status])
                for worker, worker_info
                in list(self.worker_tracker.workers.items())]

    def _stop_all_jobs(self):
        """Stop every job that is currently tracked as running."""
        # Snapshot: stop_job() removes entries from running_jobs.
        for job_name in list(self.job_tracker.running_jobs.keys()):
            self.stop_job(job_name)

    def _shutdown_all_workers(self):
        """Run the 'shutdown' stage on all tracked workers."""
        stage = Stage(list(self.worker_tracker.workers.keys()), 'shutdown')
        stage.barrier(True)

    def shutdown(self):
        """Stop jobs, workers, background threads and servers, then save.

        Does nothing when :meth:`run` has not been called, i.e. when the
        background threads do not exist yet.
        """
        if not hasattr(self, '_worker_t') or not hasattr(self, '_job_t'):
            return

        self.logger.debug('master starts to shutdown')

        self.stopped.set()
        self._stop_all_jobs()
        self._shutdown_all_workers()

        self._worker_t.join()
        self._job_t.join()

        self.save()
        self.rpc_server.shutdown()
        self.logger.debug('master shutdown finished')
        self._shutdown_log_server()
Ejemplo n.º 4
0
class Master(object):
    """Coordinator that tracks workers and dispatches jobs to them.

    On construction the master registers its operations on the context's
    RPC server, creates worker/job trackers and starts a log-record
    receiver thread. :meth:`run` starts two polling threads (worker
    heartbeats and job completion); :meth:`shutdown` tears everything down.
    """

    def __init__(self, ctx):
        """Set up directories, trackers, logging and RPC for the master.

        :param ctx: context object; ``master_rpc_server``, ``working_dir``
            and ``ip`` are read from it.
        """
        self.ctx = ctx
        self.rpc_server = self.ctx.master_rpc_server
        assert self.rpc_server is not None

        self.working_dir = os.path.join(self.ctx.working_dir, 'master')
        self.zip_dir = os.path.join(self.working_dir, 'zip')
        self.job_dir = os.path.join(self.working_dir, 'jobs')
        for directory in (self.zip_dir, self.job_dir):
            if not os.path.exists(directory):
                os.makedirs(directory)

        self.worker_tracker = WorkerTracker()
        self.job_tracker = JobTracker()

        # Workers moved to STOPPED by _check_workers are collected here.
        self.black_list = []

        # Set by shutdown(); the background loops poll it.
        self.stopped = threading.Event()

        self.logger = get_logger("cola_master")
        self._init_log_server(self.logger)

        self._register_rpc()
        self.load()
        FileTransportServer(self.rpc_server, self.zip_dir)

    def load(self):
        """Restore the metadata of previously run jobs from disk.

        Best effort: a missing, empty or unreadable status file leaves
        ``runned_job_metas`` as an empty dict; read failures are logged
        rather than silently swallowed.
        """
        self.runned_job_metas = {}

        job_meta_file = os.path.join(self.working_dir,
                                     JOB_META_STATUS_FILENAME)
        if not os.path.exists(job_meta_file) or \
                os.path.getsize(job_meta_file) == 0:
            return

        try:
            # Pickle streams are binary data; text mode can corrupt them.
            # NOTE(review): unpickling runs arbitrary code, so this file
            # must only ever be written by the master itself.
            with open(job_meta_file, 'rb') as f:
                self.runned_job_metas = pickle.load(f)
        except Exception:
            self.logger.warning(
                'failed to load job metas from %s, starting empty',
                job_meta_file, exc_info=True)

    def save(self):
        """Persist ``runned_job_metas`` to the status file with pickle."""
        job_meta_file = os.path.join(self.working_dir,
                                     JOB_META_STATUS_FILENAME)
        # Binary mode: see load().
        with open(job_meta_file, 'wb') as f:
            pickle.dump(self.runned_job_metas, f)

    def _register_rpc(self):
        """Register the master's remotely callable functions."""
        register = self.rpc_server.register_function
        register(self.run_job, 'run_job')
        register(self.stop_job, 'stop_job')
        register(self.pack_job_error, 'pack_job_error')
        register(self.list_runnable_jobs, 'runnable_jobs')
        # Return a real list so the result is a plain sequence on every
        # Python version.
        register(lambda: list(self.job_tracker.running_jobs.keys()),
                 'running_jobs')
        register(self.list_workers, 'list_workers')
        register(self.shutdown, 'shutdown')
        register(self.register_heartbeat, 'register_heartbeat')

    def register_heartbeat(self, worker):
        """Record a heartbeat from ``worker``.

        :param worker: identifier passed on to the worker tracker.
        :returns: list of the keys of the tracker's ``workers`` mapping.
        """
        self.worker_tracker.register_worker(worker)
        return list(self.worker_tracker.workers.keys())

    def _init_log_server(self, logger):
        """Start a log-record receiver in its own thread.

        :param logger: logger handed to the receiver.
        """
        # Use the logger that was passed in rather than ignoring it.
        self.log_server = LogRecordSocketReceiver(host=self.ctx.ip,
                                                  logger=logger)
        self.log_t = threading.Thread(target=self.log_server.serve_forever)
        self.log_t.start()

    def _shutdown_log_server(self):
        """Stop the log receiver and wait for its thread, if it exists."""
        if hasattr(self, 'log_server'):
            self.log_server.shutdown()
            self.log_t.join()

    def _check_workers(self):
        """Update worker states from their heartbeats until stopped.

        A worker whose last update is older than the check interval goes
        RUNNING -> HANGUP -> STOPPED on successive checks; a STOPPED worker
        is black-listed and removed from every running job. A worker with
        enough consecutive heartbeats is marked RUNNING again, taken off
        the black list and (re)attached to the running jobs.
        """
        while not self.stopped.is_set():
            # Iterate over snapshots: heartbeats and job changes arrive on
            # other threads and may resize the dicts while we loop.
            for worker, info in list(self.worker_tracker.workers.items()):
                # connection lost
                if int(time.time()) - info.last_update \
                        > HEARTBEAT_CHECK_INTERVAL:

                    info.continous_register = 0
                    if info.status == RUNNING:
                        info.status = HANGUP
                    elif info.status == HANGUP:
                        info.status = STOPPED
                        self.black_list.append(worker)

                        for job in list(self.job_tracker.running_jobs):
                            self.job_tracker.remove_worker(job, worker)

                # registered continuously for long enough
                elif info.continous_register >= CONTINOUS_HEARTBEAT:
                    if info.status != RUNNING:
                        info.status = RUNNING
                    if worker in self.black_list:
                        self.black_list.remove(worker)

                    for job in list(self.job_tracker.running_jobs):
                        if not client_call(worker, 'has_job'):
                            client_call(worker, 'prepare', job)
                            client_call(worker, 'run_job', job)
                        self.job_tracker.add_worker(job, worker)

            self.stopped.wait(HEARTBEAT_CHECK_INTERVAL)

    def _check_jobs(self):
        """Stop every job whose budget server reports ALLFINISHED."""
        while not self.stopped.is_set():
            # Snapshot: stop_job() removes entries from the tracker.
            for job_master in list(self.job_tracker.running_jobs.values()):
                if job_master.budget_server.get_status() == ALLFINISHED:
                    # stop_job() already removes the job from the tracker,
                    # so it must not be removed a second time here.
                    self.stop_job(job_master.job_name)
            self.stopped.wait(JOB_CHECK_INTERVAL)

    def _unzip(self, job_name):
        """Extract ``<zip_dir>/<job_name>.zip`` into ``job_dir`` if present."""
        zip_file = os.path.join(self.zip_dir, job_name + '.zip')
        if os.path.exists(zip_file):
            ZipHandler.uncompress(zip_file, self.job_dir)

    def _register_runned_job(self, job_name, job_desc):
        """Remember that ``job_name`` was run, with its name and start time."""
        self.runned_job_metas[job_name] = {
            'job_name': job_desc.name,
            'created': time.time()
        }

    def run(self):
        """Start the worker-check and job-check background threads."""
        self._worker_t = threading.Thread(target=self._check_workers)
        self._worker_t.start()

        self._job_t = threading.Thread(target=self._check_jobs)
        self._job_t.start()

    def run_job(self, job_name, unzip=False,
                wait_for_workers=False):
        """Start the job ``job_name`` on the currently registered workers.

        :param job_name: name of the job directory below ``job_dir`` (and of
            ``<job_name>.zip`` below ``zip_dir``).
        :param unzip: when true, extract the job's zip archive first.
        :param wait_for_workers: when true, block until at least one worker
            is registered. If the master is stopped while no worker is
            available, nothing is started and ``None`` is returned.
        """
        if wait_for_workers:
            while not self.worker_tracker.workers:
                self.stopped.wait(3)
                # Test the flag itself instead of wait()'s return value so
                # an already-stopped master never falls through and starts
                # a job without workers.
                if self.stopped.is_set():
                    return

        if unzip:
            self._unzip(job_name)

        job_path = os.path.join(self.job_dir, job_name)
        job_desc = import_job_desc(job_path)
        job_master = JobMaster(self.ctx, job_name, job_desc,
                               list(self.worker_tracker.workers.keys()))
        job_master.init()
        self.job_tracker.register_job(job_name, job_master)
        self._register_runned_job(job_name, job_desc)

        zip_file = os.path.join(self.zip_dir, job_name + '.zip')
        for worker in job_master.workers:
            FileTransportClient(worker, zip_file).send_file()

        self.logger.debug(
            'entering the master prepare stage, job id: %s', job_name)
        self.logger.debug(
            'job available workers: %s', job_master.workers)
        # The barrier results are not inspected in this version.
        stage = Stage(job_master.workers, 'prepare')
        stage.barrier(True, job_name)

        self.logger.debug(
            'entering the master run_job stage, job id: %s', job_name)
        stage = Stage(job_master.workers, 'run_job')
        stage.barrier(True, job_name)

    def stop_job(self, job_name):
        """Run the 'stop_job' and 'clear_job' stages, then untrack the job."""
        job_master = self.job_tracker.get_job_master(job_name)
        stage = Stage(job_master.workers, 'stop_job')
        stage.barrier(True, job_name)

        stage = Stage(job_master.workers, 'clear_job')
        stage.barrier(True, job_name)

        self.job_tracker.remove_job(job_name)

        self.logger.debug('stop job: %s', job_name)

    def pack_job_error(self, job_name):
        """Bundle the error archives of ``job_name`` into a single zip.

        Runs the 'pack_job_error' stage on the job's workers, moves every
        file in ``zip_dir`` whose name ends with ``<job_name>_errors.zip``
        into a temporary directory and compresses that directory.

        :returns: path of ``<working_dir>/errors/<job_name>_errors.zip``.
        """
        job_master = self.job_tracker.get_job_master(job_name)
        stage = Stage(job_master.workers, 'pack_job_error')
        stage.barrier(True, job_name)

        error_dir = os.path.join(self.working_dir, 'errors')
        if not os.path.exists(error_dir):
            os.makedirs(error_dir)

        suffix = '%s_errors.zip' % job_name
        error_filename = os.path.join(error_dir, suffix)

        temp_dir = tempfile.mkdtemp()
        try:
            for name in os.listdir(self.zip_dir):
                if name.endswith(suffix):
                    shutil.move(os.path.join(self.zip_dir, name), temp_dir)
            ZipHandler.compress(error_filename, temp_dir)
        finally:
            # Always drop the scratch directory, even if compression fails.
            shutil.rmtree(temp_dir)

        return error_filename

    def list_runnable_jobs(self):
        """Map each job directory name in ``job_dir`` to its job's name."""
        job_dirs = [entry for entry in os.listdir(self.job_dir)
                    if os.path.isdir(os.path.join(self.job_dir, entry))]

        jobs = {}
        for job_dir in job_dirs:
            desc = import_job_desc(os.path.join(self.job_dir, job_dir))
            jobs[job_dir] = desc.name
        return jobs

    def has_running_jobs(self):
        """Return True when the job tracker holds at least one running job."""
        return len(self.job_tracker.running_jobs) > 0

    def list_workers(self):
        """Return ``(worker, status text)`` pairs for every tracked worker."""
        # Snapshot: heartbeats may add workers while the list is built.
        return [(worker, STATUSES[worker_info.status])
                for worker, worker_info
                in list(self.worker_tracker.workers.items())]

    def _stop_all_jobs(self):
        """Stop every job that is currently tracked as running."""
        # Snapshot: stop_job() removes entries from running_jobs.
        for job_name in list(self.job_tracker.running_jobs.keys()):
            self.stop_job(job_name)

    def _shutdown_all_workers(self):
        """Run the 'shutdown' stage on all tracked workers."""
        stage = Stage(list(self.worker_tracker.workers.keys()), 'shutdown')
        stage.barrier(True)

    def shutdown(self):
        """Stop jobs, workers, background threads and servers, then save.

        Does nothing when :meth:`run` has not been called, i.e. when the
        background threads do not exist yet.
        """
        if not hasattr(self, '_worker_t') or not hasattr(self, '_job_t'):
            return

        self.logger.debug('master starts to shutdown')

        self.stopped.set()
        self._stop_all_jobs()
        self._shutdown_all_workers()

        self._worker_t.join()
        self._job_t.join()

        self.save()
        self.rpc_server.shutdown()
        self.logger.debug('master shutdown finished')
        self._shutdown_log_server()