def run_job(self, job_path, overwrite=False, init_rpc=False, settings=None):
    """Launch the job found at ``job_path`` from this context.

    :param job_path: filesystem path of the job definition directory
    :param overwrite: forwarded to the local runner when in local mode
    :param init_rpc: if True, start a ThreadedColaRPCServer on (self.ip, self.port)
    :param settings: optional settings object forwarded to the local runner
    """
    rpc_server = None
    if init_rpc:
        rpc_server = ThreadedColaRPCServer((self.ip, self.port))
    if self.is_local_mode:
        # Local mode: run the job in-process; settings are forwarded.
        self._run_local_job(job_path, overwrite=overwrite,
                            rpc_server=rpc_server, settings=settings)
    else:
        # Distributed mode: the job's unique name doubles as its id.
        job_name = import_job_desc(job_path).uniq_name

        def create_zip(working_dir):
            # NOTE(review): the ``working_dir`` parameter is never used —
            # the archive always lands in <self.working_dir>/zip. Confirm
            # whether the callers' per-role paths were meant to matter.
            zip_dir = os.path.join(self.working_dir, 'zip')
            filename = job_name + '.zip'
            zip_file = os.path.join(zip_dir, filename)
            # Skip compiled files when packaging the job source.
            ZipHandler.compress(zip_file, job_path, type_filters=('pyc', ))
            return job_name

        # Dispatch to whichever role this context hosts.
        if hasattr(self, 'master'):
            create_zip(os.path.join(self.working_dir, 'master'))
            self.master.run_job(job_name, unzip=True)
        elif hasattr(self, 'worker'):
            create_zip(os.path.join(self.working_dir, 'worker'))
            # Worker needs an explicit prepare step before running.
            self.worker.prepare(job_name, unzip=True)
            self.worker.run_job(job_name)
def prepare(self, job_name, unzip=True, overwrite=False, settings=None):
    """Prepare a job on this worker: unpack it, build the Job object and
    record it in ``self.running_jobs``.

    Returns True on success, False when the job directory does not exist.
    The job's thread is created here but NOT started — presumably a later
    run phase starts it (TODO confirm against the worker's run_job).
    """
    self.logger.debug('entering worker prepare phase, job id: %s' % job_name)
    if unzip:
        self._unzip(job_name)
    # Keep the original id: ``job_name`` is rebound below by
    # _get_name_and_dir, but running_jobs is keyed by the incoming name.
    src_job_name = job_name
    job_path = os.path.join(self.job_dir, job_name)
    if not os.path.exists(job_path):
        return False
    job_desc = import_job_desc(job_path)
    if settings is not None:
        job_desc.update_settings(settings)
    # This worker's position among the cluster ips is used as the
    # job offset so instance numbering does not collide across workers.
    job_id = self.ctx.ips.index(self.ctx.ip)
    # Only honour the "clear previous data" setting in local mode.
    clear = job_desc.settings.job.clear \
        if self.ctx.is_local_mode else False
    job_name, working_dir = self.ctx._get_name_and_dir(
        self.working_dir, job_name, overwrite=overwrite, clear=clear)
    job = Job(self.ctx, job_path, job_name, job_desc=job_desc,
              working_dir=working_dir, rpc_server=self.rpc_server,
              manager=self.ctx.manager, job_offset=job_id)
    t = threading.Thread(target=job.run, args=(True, ))
    job_info = WorkerJobInfo(job_name, working_dir)
    job_info.job = job
    job_info.thread = t
    self.running_jobs[src_job_name] = job_info
    self.logger.debug('worker prepare phase finished, job id: %s' % job_name)
    return True
def run_job(self, job_path, overwrite=False, init_rpc=False, settings=None):
    """Launch the job found at ``job_path`` from this context.

    :param job_path: filesystem path of the job definition directory
    :param overwrite: forwarded to the local runner when in local mode
    :param init_rpc: if True, start a ThreadedColaRPCServer on the
        configured worker port
    :param settings: optional settings object forwarded to the local runner
    """
    rpc_server = None
    if init_rpc:
        rpc_server = ThreadedColaRPCServer(
            (self.ip, main_conf.worker.port))
    if self.is_local_mode:
        # BUGFIX: ``settings`` was accepted but never forwarded, so any
        # caller-supplied settings were silently ignored in local mode.
        # The sibling implementation of this method does forward it.
        self._run_local_job(job_path, overwrite=overwrite,
                            rpc_server=rpc_server, settings=settings)
    else:
        # Distributed mode: the job's unique name doubles as its id.
        job_name = import_job_desc(job_path).uniq_name

        def create_zip(working_dir):
            # NOTE(review): ``working_dir`` is unused; the archive always
            # goes to <self.working_dir>/zip — confirm intent.
            zip_dir = os.path.join(self.working_dir, 'zip')
            filename = job_name + '.zip'
            zip_file = os.path.join(zip_dir, filename)
            # Skip compiled files when packaging the job source.
            ZipHandler.compress(zip_file, job_path, type_filters=('pyc', ))
            return job_name

        # Dispatch to whichever role this context hosts.
        if hasattr(self, 'master'):
            create_zip(os.path.join(self.working_dir, 'master'))
            self.master.run_job(job_name, unzip=True)
        elif hasattr(self, 'worker'):
            create_zip(os.path.join(self.working_dir, 'worker'))
            # Worker needs an explicit prepare step before running.
            self.worker.prepare(job_name, unzip=True)
            self.worker.run_job(job_name)
def list_runnable_jobs(self):
    """Return a mapping of job directory name -> job description name
    for every subdirectory of ``self.job_dir``.
    """
    runnable = {}
    for entry in os.listdir(self.job_dir):
        full_path = os.path.join(self.job_dir, entry)
        # Only directories can hold a job definition.
        if not os.path.isdir(full_path):
            continue
        runnable[entry] = import_job_desc(full_path).name
    return runnable
def run(self, args):
    """Client command dispatcher: list, kill, or upload (and optionally
    run) a job on the master addressed by ``args.master``.
    """
    master_addr = args.master
    ctx = Context(is_client=True, master_addr=master_addr)
    if args.list is True:
        jobs = ctx.list_jobs()
        self.logger.info('list jobs at master: %s' % ctx.master_addr)
        for job_id, info in jobs.iteritems():
            self.logger.info(
                '====> job id: %s, job description: %s, status: %s' % \
                (job_id, info['name'], info['status']))
        if len(jobs) == 0:
            self.logger.info('no jobs exist')
    elif args.kill is not None:
        # Resolve a possibly-partial job name before killing.
        job_id = self._get_matched_job_name(ctx, args.kill)
        if job_id is not None:
            ctx.kill_job(job_id)
            self.logger.info('killed job: %s' % job_id)
    elif args.upload is not None:
        path = os.path.abspath(args.upload)
        if not os.path.exists(path):
            self.logger.error('upload path does not exist')
            return
        job_id = None
        try:
            job_id = import_job_desc(path).uniq_name
        except Exception, e:
            self.logger.exception(e)
            self.logger.error('uploading job description failed')
            return
        # Stage a clean copy under the temp dir before zipping, so the
        # archive root is named after the job id.
        new_upload_dir = os.path.join(tempfile.gettempdir(), job_id)
        if os.path.exists(new_upload_dir):
            shutil.rmtree(new_upload_dir)
        shutil.copytree(path, new_upload_dir)
        temp_filename = os.path.join(tempfile.gettempdir(), job_id + '.zip')
        ZipHandler.compress(temp_filename, new_upload_dir,
                            type_filters=('pyc', ))
        try:
            FileTransportClient(ctx.master_addr, temp_filename).send_file()
        finally:
            # Always clean up local temp artifacts, even if the send fails.
            os.remove(temp_filename)
            shutil.rmtree(new_upload_dir)
        self.logger.info('upload job <id: %s> finished' % job_id)
        # 'U' presumably means "upload and run" — TODO confirm against
        # the argument parser definition.
        if args.run == 'U':
            client_call(ctx.master_addr, 'run_job', job_id, True)
            self.logger.info('submit job <id: %s> to the cluster' % job_id)
def __init__(self, container_id, working_dir, job_path, job_name, env, mq,
             counter_server, budget_server, speed_server, stopped,
             nonsuspend, idle_statuses, n_tasks=1, is_local=False,
             master_ip=None, logger=None, task_start_id=0):
    """Hold everything a task container needs to run ``n_tasks`` tasks of
    one job: the shared environment, message queue, the counter/budget/speed
    servers, and the control events.
    """
    # Identity and filesystem layout.
    self.container_id = container_id
    self.working_dir = working_dir
    self.job_name = job_name
    # Prefer the job description cached in the shared environment;
    # fall back to importing it from disk.
    self.env = env
    cached_desc = env['job_desc'].get(job_name)
    self.job_desc = cached_desc or import_job_desc(job_path)
    # Shared services and queues.
    self.mq = mq
    self.counter_server = counter_server
    self.budget_server = budget_server
    self.speed_server = speed_server
    # Control flags shared with the parent process.
    self.stopped = stopped
    self.nonsuspend = nonsuspend
    self.idle_statuses = idle_statuses
    # Topology / runtime configuration.
    self.n_tasks = n_tasks
    self.is_local = is_local
    self.master_ip = master_ip
    self.logger = logger
    self.task_start_id = task_start_id
    self.ip = self.env.get('ip', None) or get_ip()
    # One per-task client slot for each server; filled in lazily.
    self.counter_clients = [None] * self.n_tasks
    self.budget_clients = [None] * self.n_tasks
    self.speed_clients = [None] * self.n_tasks
    self.task_threads = []
    self.inited = False
    self.lock = multiprocessing.Lock()
def run(self, args):
    """Client command dispatcher: list, kill, or upload (and optionally
    run) a job on the master addressed by ``args.master``.
    """
    master_addr = args.master
    ctx = Context(is_client=True, master_addr=master_addr)
    if args.list is True:
        jobs = ctx.list_jobs()
        self.logger.info('list jobs at master: %s' % ctx.master_addr)
        for job_id, info in jobs.iteritems():
            self.logger.info(
                '====> job id: %s, job description: %s, status: %s' % \
                (job_id, info['name'], info['status']))
        if len(jobs) == 0:
            self.logger.info('no jobs exist')
    elif args.kill is not None:
        # Resolve a possibly-partial job name before killing.
        job_id = self._get_matched_job_name(ctx, args.kill)
        if job_id is not None:
            ctx.kill_job(job_id)
            self.logger.info('killed job: %s' % job_id)
    elif args.upload is not None:
        path = os.path.abspath(args.upload)
        if not os.path.exists(path):
            self.logger.error('upload path does not exist')
            return
        job_id = None
        try:
            job_id = import_job_desc(path).uniq_name
        except Exception, e:
            self.logger.exception(e)
            self.logger.error('uploading job description failed')
            return
        # Stage a clean copy under the temp dir before zipping, so the
        # archive root is named after the job id.
        new_upload_dir = os.path.join(tempfile.gettempdir(), job_id)
        if os.path.exists(new_upload_dir):
            shutil.rmtree(new_upload_dir)
        shutil.copytree(path, new_upload_dir)
        temp_filename = os.path.join(tempfile.gettempdir(), job_id+'.zip')
        ZipHandler.compress(temp_filename, new_upload_dir,
                            type_filters=('pyc', ))
        try:
            FileTransportClient(ctx.master_addr, temp_filename).send_file()
        finally:
            # Always clean up local temp artifacts, even if the send fails.
            os.remove(temp_filename)
            shutil.rmtree(new_upload_dir)
        self.logger.info('upload job <id: %s> finished' % job_id)
        # 'U' presumably means "upload and run" — TODO confirm against
        # the argument parser definition.
        if args.run == 'U':
            client_call(ctx.master_addr, 'run_job', job_id, True)
            self.logger.info('submit job <id: %s> to the cluster' % job_id)
def __init__(self, ctx, job_def_path, job_name, job_desc=None,
             working_dir=None, rpc_server=None, manager=None, job_offset=0):
    """Build a Job: resolve its description, size its container pool to
    min(cpu count, requested instances), create control events, and
    register its RPC handlers.
    """
    self.status = NOTSTARTED
    self.ctx = ctx
    self.shutdown_callbacks = []
    # stopped: set to request shutdown; nonsuspend: cleared to pause.
    self.stopped = multiprocessing.Event()
    self.nonsuspend = multiprocessing.Event()
    self.nonsuspend.set()
    self.job_def_path = job_def_path
    self.job_name = job_name
    self.working_dir = working_dir or os.path.join(self.ctx.working_dir,
                                                   self.job_name)
    # Timestamp in the logger name keeps per-run log channels distinct.
    self.logger = get_logger(name='cola_job' + str(time.time()))
    self.job_desc = job_desc or import_job_desc(job_def_path)
    self.settings = self.job_desc.settings
    self.is_bundle = self.settings.job.mode == 'bundle'
    self.rpc_server = rpc_server
    self.n_instances = self.job_desc.settings.job.instances
    # Never more containers than CPUs, never fewer than one.
    self.n_containers = min(get_cpu_count(), max(self.n_instances, 1))
    self.job_offset = job_offset
    self.is_multi_process = self.n_containers > 1
    self.processes = []
    # Shared across processes via the multiprocessing manager;
    # NOTE(review): a ``manager`` of None would fail here — confirm
    # callers always pass one.
    self.idle_statuses = manager.list([False] * self.n_containers)
    self.manager = manager
    if not os.path.exists(self.working_dir):
        os.makedirs(self.working_dir)
    self.inited = False
    self._register_rpc()
class Test(unittest.TestCase):
    """Fixture: stage the sample 'wiki' job under a temporary master
    directory, zip it, and shrink its yaml config for fast test runs.
    """

    def setUp(self):
        self.working_dir = tempfile.mkdtemp()
        self.job_dir = os.path.join(self.working_dir, 'master', 'jobs')
        self.zip_dir = os.path.join(self.working_dir, 'master', 'zip')
        if not os.path.exists(self.zip_dir):
            os.makedirs(self.zip_dir)
        wiki_path = os.path.join(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
            'app', 'wiki')
        try:
            shutil.copytree(wiki_path, os.path.join(self.job_dir, 'wiki'))
        except OSError as e:
            # copytree raises ENOTDIR when the source is a single file;
            # fall back to a plain copy in that case.
            if e.errno == errno.ENOTDIR:
                shutil.copy(wiki_path, os.path.join(self.job_dir, 'wiki'))
            else:
                raise
        # Rename the staged job after its unique id, as the master expects.
        self.job_name = import_job_desc(wiki_path).uniq_name
        old_wiki_path = os.path.join(self.job_dir, 'wiki')
        new_wiki_path = os.path.join(self.job_dir, self.job_name)
        os.rename(old_wiki_path, new_wiki_path)
        ZipHandler.compress(os.path.join(self.zip_dir, self.job_name+'.zip'),
                            new_wiki_path)
        config_file = os.path.join(new_wiki_path, 'wiki.yaml')
        # BUGFIX: the bare ``except: pass`` here swallowed every error
        # (including KeyboardInterrupt); only a missing file is expected.
        try:
            os.remove(os.path.join(new_wiki_path, 'test.yaml'))
        except OSError:
            pass
        with open(config_file) as f:
            # safe_load is sufficient for this plain config and avoids
            # arbitrary object construction from yaml tags.
            yaml_obj = yaml.safe_load(f)
        # Shrink the job so the test finishes quickly.
        yaml_obj['job']['size'] = 5
        yaml_obj['job']['instances'] = 1
        yaml_obj['job']['priorities'] = 1
        with open(config_file, 'w') as f:
            yaml.dump(yaml_obj, f)
def run_job(self, job_name, unzip=False, wait_for_workers=False):
    """Master side of starting a job: distribute the job archive to the
    available workers, then drive the 'prepare' and 'run_job' barrier
    stages across them.

    :param wait_for_workers: poll every 3s until at least one worker has
        registered (or the master is stopped).
    """
    if wait_for_workers:
        while not self.stopped.is_set():
            if len(self.worker_tracker.workers) > 0:
                break
            # wait() returns True when the stop event fires.
            stopped = self.stopped.wait(3)
            if stopped:
                return
    if unzip:
        self._unzip(job_name)
    job_path = os.path.join(self.job_dir, job_name)
    job_desc = import_job_desc(job_path)
    job_master = JobMaster(self.ctx, job_name, job_desc,
                           self.worker_tracker.workers.keys())
    job_master.init()
    self.job_tracker.register_job(job_name, job_master)
    self._register_runned_job(job_name, job_desc)
    # Ship the packaged job source to every participating worker.
    zip_file = os.path.join(self.zip_dir, job_name+'.zip')
    for worker in job_master.workers:
        FileTransportClient(worker, zip_file).send_file()
    self.logger.debug(
        'entering the master prepare stage, job id: %s' % job_name)
    self.logger.debug(
        'job available workers: %s' % job_master.workers)
    # Barrier: all workers must finish 'prepare' before we run.
    stage = Stage(job_master.workers, 'prepare', logger=self.logger)
    prepared_ok = stage.barrier(True, job_name)
    if not prepared_ok:
        self.logger.error("prepare for running failed")
        return
    self.logger.debug(
        'entering the master run_job stage, job id: %s' % job_name)
    stage = Stage(job_master.workers, 'run_job', logger=self.logger)
    run_ok = stage.barrier(True, job_name)
    if not run_ok:
        self.logger.error("run job failed, job id: %s" % job_name)
def run_job(self, job_name, unzip=False, wait_for_workers=False):
    """Master side of starting a job: distribute the job archive to the
    available workers, then drive the 'prepare' and 'run_job' barrier
    stages across them.

    :param wait_for_workers: poll every 3s until at least one worker has
        registered (or the master is stopped).
    """
    if wait_for_workers:
        while not self.stopped.is_set():
            if len(self.worker_tracker.workers) > 0:
                break
            # wait() returns True when the stop event fires.
            stopped = self.stopped.wait(3)
            if stopped:
                return
    if unzip:
        self._unzip(job_name)
    job_path = os.path.join(self.job_dir, job_name)
    job_desc = import_job_desc(job_path)
    job_master = JobMaster(self.ctx, job_name, job_desc,
                           self.worker_tracker.workers.keys())
    job_master.init()
    self.job_tracker.register_job(job_name, job_master)
    self._register_runned_job(job_name, job_desc)
    # Ship the packaged job source to every participating worker.
    zip_file = os.path.join(self.zip_dir, job_name + '.zip')
    for worker in job_master.workers:
        FileTransportClient(worker, zip_file).send_file()
    self.logger.debug('entering the master prepare stage, job id: %s' %
                      job_name)
    self.logger.debug('job available workers: %s' % job_master.workers)
    # CONSISTENCY FIX: pass logger=self.logger to Stage, as the sibling
    # implementation of this method does — otherwise stage-level
    # progress/errors are not logged through the master's logger.
    stage = Stage(job_master.workers, 'prepare', logger=self.logger)
    prepared_ok = stage.barrier(True, job_name)
    if not prepared_ok:
        self.logger.error("prepare for running failed")
        return
    self.logger.debug('entering the master run_job stage, job id: %s' %
                      job_name)
    stage = Stage(job_master.workers, 'run_job', logger=self.logger)
    run_ok = stage.barrier(True, job_name)
    if not run_ok:
        self.logger.error("run job failed, job id: %s" % job_name)
def __init__(self, ctx, job_def_path, job_name, job_desc=None,
             working_dir=None, rpc_server=None, manager=None, job_offset=0):
    """Build a Job: resolve its description, size the container pool to
    min(cpu count, requested instances), create the control events, and
    register the job's RPC handlers.
    """
    self.status = NOTSTARTED
    self.ctx = ctx
    self.shutdown_callbacks = []
    # Control events: ``stopped`` requests shutdown, clearing
    # ``nonsuspend`` pauses the job.
    self.stopped = multiprocessing.Event()
    self.nonsuspend = multiprocessing.Event()
    self.nonsuspend.set()
    self.job_def_path = job_def_path
    self.job_name = job_name
    if working_dir:
        self.working_dir = working_dir
    else:
        self.working_dir = os.path.join(self.ctx.working_dir, self.job_name)
    # A timestamp keeps per-run logger names distinct.
    self.logger = get_logger(name='cola_job%s' % time.time())
    if job_desc is None:
        job_desc = import_job_desc(job_def_path)
    self.job_desc = job_desc
    self.settings = self.job_desc.settings
    self.is_bundle = self.settings.job.mode == 'bundle'
    self.rpc_server = rpc_server
    self.job_offset = job_offset
    # Container pool: at least one, at most one per CPU.
    self.n_instances = self.job_desc.settings.job.instances
    self.n_containers = min(get_cpu_count(), max(self.n_instances, 1))
    self.is_multi_process = self.n_containers > 1
    self.processes = []
    # Idle flags shared across processes through the manager.
    self.idle_statuses = manager.list([False] * self.n_containers)
    self.manager = manager
    if not os.path.exists(self.working_dir):
        os.makedirs(self.working_dir)
    self.inited = False
    self._register_rpc()
def _run_local_job(self, job_path, overwrite=False, rpc_server=None,
                   settings=None):
    """Run a job in-process: start it on a thread, install SIGINT/SIGTERM
    handlers that shut it down, and poll until it finishes or idles out.
    """
    job_desc = import_job_desc(job_path)
    if settings is not None:
        job_desc.update_settings(settings)
    base_name = job_desc.uniq_name
    # Cache the description so containers can pick it up from the env.
    self.env['job_desc'][base_name] = job_desc
    # Scope the working dir by this node's address ('.'/':' are not
    # filesystem-safe, hence the substitution).
    addr_dirname = self.addr.replace('.', '_').replace(':', '_')
    working_dir = os.path.join(self.working_dir, 'worker', addr_dirname)
    clear = job_desc.settings.job.clear
    job_name, working_dir = self._get_name_and_dir(
        working_dir, base_name, overwrite=overwrite, clear=clear)

    clock = Clock()
    job = Job(self, job_path, job_name, job_desc=job_desc,
              working_dir=working_dir, rpc_server=rpc_server,
              manager=self.manager)
    t = threading.Thread(target=job.run, args=(True, ))
    t.start()

    stopped = multiprocessing.Event()

    def stop(signum, frame):
        # Signal handlers are inherited by child processes; only the
        # main process should perform the shutdown, and only once.
        if 'main' not in multiprocessing.current_process().name.lower():
            return
        if stopped.is_set():
            return
        else:
            stopped.set()
        self.logger.debug("Catch interrupt signal, start to stop")
        job.shutdown()
        if rpc_server:
            rpc_server.shutdown()
    signal.signal(signal.SIGINT, stop)
    signal.signal(signal.SIGTERM, stop)

    # Poll the job thread; break out when it finishes or stays idle for
    # more than MAX_IDLE_TIMES consecutive checks.
    idle_times = 0
    while t.is_alive():
        if job.get_status() == FINISHED:
            break
        if job.get_status() == IDLE:
            idle_times += 1
            if idle_times > MAX_IDLE_TIMES:
                break
        else:
            idle_times = 0
        try:
            t.join(5)
        except IOError:
            # join can be interrupted by a signal on some platforms.
            break

    # Decide whether we still owe the job a shutdown (i.e. the signal
    # handler did not already do it).
    need_shutdown = False
    if not job.stopped.is_set() and job.get_status() == FINISHED:
        self.logger.debug('All objects have been fetched, try to finish job')
        need_shutdown = True
    elif not stopped.is_set() and not t.is_alive():
        need_shutdown = True
    elif not job.stopped.is_set() and job.get_status() == IDLE:
        self.logger.debug('No bundle or url to perform, try to finish job')
        need_shutdown = True

    if need_shutdown is True:
        job.shutdown()
        if rpc_server:
            rpc_server.shutdown()

    self.logger.debug('Job id:%s finished, spend %.2f seconds for running' % (
        job_name, clock.clock()))
def _run_local_job(self, job_path, overwrite=False, rpc_server=None,
                   settings=None):
    """Run a job in-process: start it on a thread, install SIGINT/SIGTERM
    handlers that shut it down, and poll until it finishes or idles out.
    """
    job_desc = import_job_desc(job_path)
    if settings is not None:
        job_desc.update_settings(settings)
    base_name = job_desc.uniq_name
    # Cache the description so containers can pick it up from the env.
    self.env['job_desc'][base_name] = job_desc
    working_dir = os.path.join(self.working_dir, 'worker')
    clear = job_desc.settings.job.clear
    job_name, working_dir = self._get_name_and_dir(working_dir, base_name,
                                                   overwrite=overwrite,
                                                   clear=clear)

    clock = Clock()
    job = Job(self, job_path, job_name, job_desc=job_desc,
              working_dir=working_dir, rpc_server=rpc_server,
              manager=self.manager)
    t = threading.Thread(target=job.run, args=(True, ))
    t.start()

    stopped = multiprocessing.Event()

    def stop(signum, frame):
        # Signal handlers are inherited by child processes; only the
        # main process should perform the shutdown, and only once.
        if 'main' not in multiprocessing.current_process().name.lower():
            return
        if stopped.is_set():
            return
        else:
            stopped.set()
        self.logger.debug("Catch interrupt signal, start to stop")
        job.shutdown()
        if rpc_server:
            rpc_server.shutdown()
    signal.signal(signal.SIGINT, stop)
    signal.signal(signal.SIGTERM, stop)

    # Poll the job thread; break out when it finishes or stays idle for
    # more than MAX_IDLE_TIMES consecutive checks.
    idle_times = 0
    while t.is_alive():
        if job.get_status() == FINISHED:
            break
        if job.get_status() == IDLE:
            idle_times += 1
            if idle_times > MAX_IDLE_TIMES:
                break
        else:
            idle_times = 0
        try:
            t.join(5)
        except IOError:
            # join can be interrupted by a signal on some platforms.
            break

    # Decide whether we still owe the job a shutdown (i.e. the signal
    # handler did not already do it).
    need_shutdown = False
    if not job.stopped.is_set() and job.get_status() == FINISHED:
        self.logger.debug(
            'All objects have been fetched, try to finish job')
        need_shutdown = True
    elif not stopped.is_set() and not t.is_alive():
        need_shutdown = True
    elif not job.stopped.is_set() and job.get_status() == IDLE:
        self.logger.debug('No bundle or url to perform, try to finish job')
        need_shutdown = True

    if need_shutdown is True:
        job.shutdown()
        if rpc_server:
            rpc_server.shutdown()

    self.logger.debug(
        'Job id:%s finished, spend %.2f seconds for running' %
        (job_name, clock.clock()))