def load_job(path, nodes, context=None):
    """Load a job definition and run it as the job master.

    Creates the job's data directory, takes a lock file so that only one
    master runs per job, executes the job via ``JobLoader``, and finally
    notifies the master watcher that the job finished.

    :param path: path of the job definition (must exist)
    :param nodes: worker nodes that participate in this job
    :param context: optional context forwarded to ``JobLoader``
    :raises ValueError: if ``path`` does not exist
    :raises JobMasterRunning: if another master already holds the lock
    """
    if not os.path.exists(path):
        raise ValueError('Job definition does not exist.')

    job = import_job(path)

    # directory name is derived from the job's display name
    job_name = job.name.replace(' ', '_')
    if job.debug:
        job_name += '_debug'
    holder = os.path.join(root_dir(), 'data', 'master', 'jobs', job_name)
    if not os.path.exists(holder):
        os.makedirs(holder)

    # the lock file guards against two masters running the same job
    lock_f = os.path.join(holder, 'lock')
    if os.path.exists(lock_f):
        raise JobMasterRunning('There has been a running job master')
    open(lock_f, 'w').close()

    rpc_server = create_rpc_server(job)
    try:
        loader = JobLoader(job, nodes, rpc_server, context=context)
        loader.run()
        # notify the master watcher that the job finished
        master_watcher = '%s:%s' % (get_ip(), main_conf.master.port)
        client_call(master_watcher, 'finish_job', job.real_name)
    finally:
        # always release the lock and shut the RPC server down,
        # even when loading/running fails
        os.remove(lock_f)
        rpc_server.shutdown()
def _check_workers(self):
    """Periodically reconcile worker liveness from heartbeat data.

    Runs until ``self.stopped`` is set.  A worker whose last heartbeat
    is older than HEARTBEAT_CHECK_INTERVAL is degraded RUNNING ->
    HANGUP -> STOPPED (and blacklisted); a worker with a long streak of
    heartbeats is promoted back to RUNNING and re-assigned jobs.

    NOTE(review): the loop body was reconstructed from a flattened
    source line -- confirm the nesting of the job-assignment block
    against the original file.
    """
    while not self.stopped.is_set():
        for worker, info in self.worker_tracker.workers.iteritems():
            # if loose connection
            if int(time.time()) - info.last_update \
                    > HEARTBEAT_CHECK_INTERVAL:
                info.continous_register = 0
                if info.status == RUNNING:
                    info.status = HANGUP
                elif info.status == HANGUP:
                    # second consecutive miss: give the worker up
                    info.status = STOPPED
                    self.black_list.append(worker)
                    for job in self.job_tracker.running_jobs:
                        self.job_tracker.remove_worker(job, worker)
            # if continously connect for more than 10 min
            elif info.continous_register >= CONTINOUS_HEARTBEAT:
                if info.status != RUNNING:
                    info.status = RUNNING
                if worker in self.black_list:
                    self.black_list.remove(worker)
                # make sure the healthy worker takes part in every
                # running job
                for job in self.job_tracker.running_jobs:
                    if not client_call(worker, 'has_job'):
                        client_call(worker, 'prepare', job)
                        client_call(worker, 'run_job', job)
                    self.job_tracker.add_worker(job, worker)
        self.stopped.wait(HEARTBEAT_CHECK_INTERVAL)
def stop(self):
    """Ask every node to stop, tolerating nodes that are already dead."""
    for remote in self.nodes:
        try:
            client_call(remote, 'stop')
        except socket.error:
            # node unreachable -- nothing more we can do for it
            pass
    self.stopped = True
def add_node(self, node):
    """Register a new node with the cluster and start it.

    Broadcasts ``add_node`` to every existing node, records the
    newcomer locally, then tells it to run.

    Bug fix: the original loop reused the name ``node`` as its loop
    variable, shadowing the parameter -- existing nodes were told to
    add themselves instead of the new node, and after the loop the
    last existing node (not the new one) was appended and run.

    :param node: address of the node to add
    """
    for existing in self.nodes:
        self.logger.debug('add node %s to nodes' % node)
        client_call(existing, 'add_node', node, ignore=True)
    self.nodes.append(node)
    self.logger.debug('run new node %s ' % node)
    client_call(node, 'run', ignore=True)
def stop(self):
    """Best-effort stop of every node, then finish this loader."""
    for remote in self.nodes:
        try:
            client_call(remote, 'stop')
        except socket.error:
            # connection already lost; skip this node
            pass
    self.finish()
def _remote_or_local_batch_put(self, addr, objs):
    """Batch-put *objs* into the MQ node at *addr*.

    Uses the in-process node when *addr* is ourselves, otherwise
    pickles the batch and sends it over RPC.
    """
    if self._check_empty(objs):
        return
    if addr != self.addr_:
        client_call(addr, self.prefix + 'batch_put', pickle.dumps(objs))
    else:
        self.mq_node.batch_put(objs)
def clear_job(self, job_name):
    """Remove a job's local directory and propagate the clear to watchers.

    :param job_name: display name of the job; spaces are normalized to
        underscores to obtain the on-disk directory name
    """
    job_name = job_name.replace(' ', '_')
    path = os.path.join(self.job_dir, job_name)
    shutil.rmtree(path)
    for watcher in self.nodes_watchers:
        # NOTE(review): the job name is not forwarded here, so the
        # remote 'clear_job' appears to act without it -- confirm the
        # watcher-side signature
        client_call(watcher, 'clear_job')
def finish(self):
    """Tear down the job loader: locks, parent finishers, logging, client.

    The page total is read first because parent ``finish`` calls may
    render the nodes unreachable afterwards.
    """
    all_pages = self.pages()
    self.release_lock(self.ready_lock)
    self.release_lock(self.finish_lock)
    LimitionJobLoader.finish(self)
    JobLoader.finish(self)
    self.stop_logger_server()
    try:
        for handler in self.logger.handlers:
            handler.close()
    except:
        # best-effort close of log handlers; failures are ignored
        pass
    if self.client is not None:
        # notify the submitting client that the job is done
        rpc_client = '%s:%s' % (
            self.client.split(':')[0], main_conf.client.port)
        client_call(rpc_client, 'stop', ignore=True)
    # NOTE(review): this log call runs after the handlers were closed
    # above -- it may produce no output; confirm the intended order
    self.logger.info('All nodes finishes visiting pages size: %s' % all_pages)
    self.stopped = True
def stop(self):
    """Stop every running job and every watcher, then finish."""
    # stop all jobs
    for name in self.running_jobs.keys():
        self.stop_job(name)
    for watcher in self.nodes_watchers:
        client_call(watcher, 'stop')
    self.finish()
def add_worker(self, worker):
    """Introduce *worker* to the cluster unless it is already known."""
    if worker in self.workers:
        return
    # tell every existing worker about the newcomer before recording it
    for peer in self.workers:
        client_call(peer, 'add_node', worker)
    self.workers.append(worker)
def _remote_or_local_put(self, addr, objs, force=False, priority=0):
    """Put *objs* on the MQ node at *addr*, locally when it is ourselves."""
    if self._check_empty(objs):
        return
    if addr != self.addr_:
        # remote node: ship a pickled payload over RPC
        client_call(addr, self.prefix + 'put', pickle.dumps(objs),
                    force, priority)
    else:
        self.mq_node.put(objs, force=force, priority=priority)
def stop(self):
    """Stop all jobs, best-effort stop of every watcher, then finish."""
    # stop all jobs
    for name in self.running_jobs.keys():
        self.stop_job(name)
    # watcher errors are ignored so shutdown always proceeds
    for watcher in self.nodes_watchers:
        client_call(watcher, 'stop', ignore=True)
    self.finish()
def remove_worker(self, worker):
    """Forget *worker* and tell the remaining workers to do the same."""
    if worker not in self.workers:
        return
    # drop it locally first so we never RPC the departing worker itself
    self.workers.remove(worker)
    for peer in self.workers:
        client_call(peer, 'remove_node', worker)
def _remote_or_local_put_backup(self, addr, backup_addr, objs, force=False):
    """Store backup copies of *objs* for *backup_addr* on the node at *addr*."""
    if self._check_empty(objs):
        return
    if addr != self.addr_:
        client_call(addr, self.prefix + 'put_backup', backup_addr,
                    pickle.dumps(objs), force)
    else:
        self.mq_node.put_backup(backup_addr, objs, force=force)
def put(self, objs):
    """Distribute *objs* to their primary and backup MQ nodes."""
    primaries, backups = self.distributors.distribute(objs)
    # primary copies
    for addr, batch in primaries.iteritems():
        client_call(addr, self.prefix + 'batch_put', pickle.dumps(batch))
    # backup copies, keyed by the address they back up
    for addr, per_backup in backups.iteritems():
        for b_addr, batch in per_backup.iteritems():
            client_call(addr, self.prefix + 'put_backup', b_addr,
                        pickle.dumps(batch))
def stop(self):
    """Stop watchers first, then every job master, then mark stopped."""
    for watcher in self.nodes_watchers:
        client_call(watcher, 'stop')
    # stop all jobs
    for info in self.running_jobs.values():
        try:
            client_call(info.job_master, 'stop')
        except socket.error:
            # job master already gone; ignore
            pass
    self.stopped = True
def put(self, objs):
    """Route *objs* to primary nodes and their backups via RPC."""
    addrs_objs, addrs_backup_objs = self.distributors.distribute(objs)
    for target, group in addrs_objs.iteritems():
        client_call(target, self.prefix + 'batch_put', pickle.dumps(group))
    for target, backup_map in addrs_backup_objs.iteritems():
        for backup_addr, group in backup_map.iteritems():
            client_call(target, self.prefix + 'put_backup', backup_addr,
                        pickle.dumps(group))
def action(self, name):
    """Dispatch one interactive console command.

    Supported commands: ``stop all``, ``list jobs``, ``list workers``,
    ``list job dirs``, ``run remote job <dir>`` and
    ``run local job <path>``.

    :param name: the raw command string typed by the user
    """
    if name == 'stop all':
        print 'Trying to stop master and all workers.'
        try:
            client_call(self.master, 'stop')
        except socket.error:
            print 'Cannot connect to cola master.'
        else:
            print 'Cola cluster has been shutdown.'
    elif name == 'list jobs':
        print 'Running jobs: '
        for job in client_call(self.master, 'list_jobs'):
            print job
    elif name == 'list workers':
        print 'Cola workers: '
        for worker in client_call(self.master, 'list_workers'):
            print worker
    elif name == 'list job dirs':
        print 'Runnable job dirs: '
        for dir_ in client_call(self.master, 'list_job_dirs'):
            print dir_
    elif name.startswith('run remote job '):
        # job already resides on the master; just ask it to start
        print 'Remote job will run in background.'
        job_dir = name[len('run remote job '):]
        if job_dir not in client_call(self.master, 'list_job_dirs'):
            print 'Remote job dir not exists!'
        else:
            client_call(self.master, 'start_job', job_dir, False)
    elif name.startswith('run local job '):
        # local job: zip it up, upload to the master, then start it
        print 'Job has been committed and will run in background.'
        start = len('run local job ')
        path = name[start:].strip().strip('"').strip("'")
        if not os.path.exists(path):
            print 'Job path not exists!'
        else:
            try:
                job = import_job(path)
            except (ImportError, AttributeError):
                print 'Job path is illegal!'
                return
            dir_ = tempfile.mkdtemp()
            try:
                zip_filename = os.path.split(path)[1].replace(' ', '_') + '.zip'
                zip_file = os.path.join(dir_, zip_filename)
                ZipHandler.compress(zip_file, path, type_filters=("pyc", ))
                FileTransportClient(self.master, zip_file).send_file()
                client_call(self.master, 'start_job', zip_filename)
            finally:
                # always clean the temporary staging directory
                shutil.rmtree(dir_)
def sync(self):
    """Flush local counters to the aggregation server, then reset them."""
    with self.lock:
        if not isinstance(self.server, basestring):
            # in-process server object: merge directly
            self.server.inc_merge(self.inc_counter.container)
            self.server.acc_merge(self.acc_counter.container)
        else:
            # remote server addressed as "host:port": merge over RPC
            client_call(self.server, self.prefix + 'inc_merge',
                        self.inc_counter.container)
            client_call(self.server, self.prefix + 'acc_merge',
                        self.acc_counter.container)
        self.inc_counter.reset()
        self.acc_counter.reset()
def sync(self):
    """Merge counter containers into the server and clear local state."""
    with self.lock:
        remote = isinstance(self.server, basestring)
        if remote:
            client_call(self.server, self.prefix + 'inc_merge',
                        self.inc_counter.container)
            client_call(self.server, self.prefix + 'acc_merge',
                        self.acc_counter.container)
        else:
            self.server.inc_merge(self.inc_counter.container)
            self.server.acc_merge(self.acc_counter.container)
        # local counters restart from zero after a successful merge
        self.inc_counter.reset()
        self.acc_counter.reset()
def run(self):
    """Start the job on all nodes, then wait until it finishes.

    Blocks on ``ready_lock`` until registration completes and on
    ``finish_lock`` until the job is done, then reports completion to
    the master watcher.
    """
    # presumably released elsewhere once every node has registered --
    # confirm against the lock's release sites
    self.ready_lock.acquire()
    if not self.stopped and len(self.not_registered) == 0:
        # seed the queue and kick off every node
        self.mq_client.put(self.job.starts)
        for node in self.nodes:
            client_call(node, 'run')
    # blocks until the job has finished
    self.finish_lock.acquire()
    master_watcher = '%s:%s' % (get_ip(), main_conf.master.port)
    client_call(master_watcher, 'finish_job', self.job.real_name, ignore=True)
def run(self, args):
    """Execute the client command selected by *args*.

    Handles three mutually exclusive operations: listing jobs on the
    master, killing a job, or uploading (and optionally running) a
    local job directory.

    :param args: parsed command-line namespace with ``master``,
        ``list``, ``kill``, ``upload`` and ``run`` attributes
    """
    master_addr = args.master
    ctx = Context(is_client=True, master_addr=master_addr)
    if args.list is True:
        jobs = ctx.list_jobs()
        self.logger.info('list jobs at master: %s' % ctx.master_addr)
        for job_id, info in jobs.iteritems():
            self.logger.info(
                '====> job id: %s, job description: %s, status: %s' % \
                (job_id, info['name'], info['status']))
        if len(jobs) == 0:
            self.logger.info('no jobs exist')
    elif args.kill is not None:
        job_id = self._get_matched_job_name(ctx, args.kill)
        if job_id is not None:
            ctx.kill_job(job_id)
            self.logger.info('killed job: %s' % job_id)
    elif args.upload is not None:
        path = os.path.abspath(args.upload)
        if not os.path.exists(path):
            self.logger.error('upload path does not exist')
            return
        job_id = None
        try:
            # the job description provides the unique id used everywhere
            job_id = import_job_desc(path).uniq_name
        except Exception, e:
            self.logger.exception(e)
            self.logger.error('uploading job description failed')
            return
        # stage a clean copy under the temp dir before zipping
        new_upload_dir = os.path.join(tempfile.gettempdir(), job_id)
        if os.path.exists(new_upload_dir):
            shutil.rmtree(new_upload_dir)
        shutil.copytree(path, new_upload_dir)
        temp_filename = os.path.join(tempfile.gettempdir(), job_id + '.zip')
        ZipHandler.compress(temp_filename, new_upload_dir,
                            type_filters=('pyc', ))
        try:
            FileTransportClient(ctx.master_addr, temp_filename).send_file()
        finally:
            # remove staging artifacts whether or not the upload worked
            os.remove(temp_filename)
            shutil.rmtree(new_upload_dir)
        self.logger.info('upload job <id: %s> finished' % job_id)
        if args.run == 'U':
            # upload-and-run mode: start the job immediately
            client_call(ctx.master_addr, 'run_job', job_id, True)
            self.logger.info('submit job <id: %s> to the cluster' % job_id)
def stop_job(self, job_real_name):
    """Stop a running job everywhere; return True if the job was known."""
    if job_real_name not in self.running_jobs:
        return False
    info = self.running_jobs[job_real_name]
    try:
        client_call(info.job_master, 'stop')
    finally:
        # clean up watchers and local state even if the job master
        # could not be reached
        for watcher in self.nodes_watchers.keys():
            client_call(watcher, 'kill', job_real_name)
        self.kill(job_real_name)
    return True
def stop_job(self, job_real_name):
    """Best-effort stop of a job across the cluster.

    Returns False when the job is unknown, True otherwise.
    """
    if job_real_name not in self.running_jobs:
        return False
    info = self.running_jobs[job_real_name]
    try:
        # errors from the job master are ignored so cleanup still runs
        client_call(info.job_master, 'stop', ignore=True)
    finally:
        for watcher in self.nodes_watchers.keys():
            client_call(watcher, 'kill', job_real_name, ignore=True)
        self.kill(job_real_name)
    return True
def load_job(path, master=None):
    """Load and run a job on this worker node.

    Standalone mode (``master is None``): seeds the local MQ with the
    job's start URLs and runs to completion.  Cluster mode: fetches the
    node list from the master, registers as ready, and runs the loader
    in a background thread until stopped.

    :param path: path of the job definition (must exist)
    :param master: ``host:port`` of the master, or None for standalone
    :raises ValueError: if ``path`` does not exist
    """
    if not os.path.exists(path):
        raise ValueError('Job definition does not exist.')

    job = import_job(path)

    holder = os.path.join(root_dir(), 'data', 'worker', 'jobs', job.real_name)
    mq_holder = os.path.join(holder, 'mq')
    if not os.path.exists(mq_holder):
        os.makedirs(mq_holder)

    # Logger
    logger = get_logger(os.path.join(holder, 'job.log'))

    local_node = '%s:%s' % (get_ip(), job.context.job.port)
    nodes = [local_node]
    if master is not None:
        # cluster mode: the master knows the full node list
        nodes = client_call(master, 'get_nodes')

    # Bloom filter hook
    bloom_filter_file = os.path.join(holder, 'bloomfilter')
    bloom_filter_hook = create_bloom_filter_hook(bloom_filter_file, job)

    rpc_server = create_rpc_server(job)
    loader = JobLoader(job, rpc_server, logger=logger, master=master)
    # keep a backup copy on another node only when clustered
    loader.init_mq(nodes, local_node, mq_holder,
                   verify_exists_hook=bloom_filter_hook,
                   copies=2 if master else 1)

    if master is None:
        try:
            loader.mq.put(job.starts)
            loader.run()
        finally:
            rpc_server.shutdown()
    else:
        try:
            client_call(master, 'ready', local_node)

            def _start():
                # NOTE(review): nesting reconstructed from a flattened
                # line -- confirm whether run() belongs inside the loop
                while not loader.stopped:
                    time.sleep(TIME_SLEEP)
                    loader.run()

            thread = threading.Thread(target=_start)
            thread.start()
            thread.join()
        finally:
            rpc_server.shutdown()
def start_job(self, zip_filename, uncompress=True, client=None):
    """Start a job across the cluster from an uploaded zip file.

    Distributes the zip to all worker watchers, spawns a local
    ``loader.py`` subprocess as the job master, and tells every worker
    watcher to start the job.

    :param zip_filename: name of the job zip under ``self.zip_dir``
    :param uncompress: when True, the zip is shipped to workers and
        uncompressed; otherwise the job dir is assumed to exist already
    :param client: optional client address passed to the loader
    """
    if uncompress:
        zip_file = os.path.join(self.zip_dir, zip_filename)
        # transfer zip file to workers
        for watcher in self.nodes_watchers:
            if watcher.split(':')[0] == self.ip_address:
                continue
            file_trans_client = FileTransportClient(watcher, zip_file)
            file_trans_client.send_file()
        job_dir = ZipHandler.uncompress(zip_file, self.job_dir)
    else:
        job_dir = os.path.join(self.job_dir, zip_filename.rsplit('.', 1)[0])
    job = import_job(job_dir)

    worker_port = job.context.job.port
    port = job.context.job.master_port
    nodes = [watcher.split(':')[0] for watcher in self.nodes_watchers]

    if len(nodes) > 0:
        info = MasterJobInfo(port, nodes, worker_port)
        self.running_jobs[job.real_name] = info

        # spawn the job master loader as a child process
        dirname = os.path.dirname(os.path.abspath(__file__))
        f = os.path.join(dirname, 'loader.py')
        workers = ['%s:%s' % (node, worker_port) for node in nodes]
        cmds = ['python', f, '-j', job_dir, '-i', self.ip_address,
                '-n', ' '.join(workers)]
        if self.data_path is not None:
            cmds.extend(['-d', self.data_path])
        if self.force:
            cmds.append('-f')
        if client is not None:
            cmds.extend(['-c', client])
        popen = subprocess.Popen(cmds)
        info.popen = popen

        # call workers to start job
        for worker_watcher in self.nodes_watchers:
            client_call(worker_watcher, 'start_job', zip_filename, uncompress,
                        ignore=True)
def load_job(path, master=None):
    """Load and run a worker-side job, standalone or under a master.

    :param path: path of the job definition (must exist)
    :param master: master ``host:port`` or None for standalone mode
    :raises ValueError: if ``path`` does not exist
    """
    if not os.path.exists(path):
        raise ValueError('Job definition does not exist.')

    job = import_job(path)

    holder = os.path.join(
        root_dir(), 'data', 'worker', 'jobs', job.real_name)
    mq_holder = os.path.join(holder, 'mq')
    if not os.path.exists(mq_holder):
        os.makedirs(mq_holder)

    # Logger
    logger = get_logger(os.path.join(holder, 'job.log'))

    local_node = '%s:%s' % (get_ip(), job.context.job.port)
    nodes = [local_node]
    if master is not None:
        nodes = client_call(master, 'get_nodes')

    # Bloom filter hook
    bloom_filter_file = os.path.join(holder, 'bloomfilter')
    bloom_filter_hook = create_bloom_filter_hook(bloom_filter_file, job)

    rpc_server = create_rpc_server(job)
    loader = JobLoader(job, rpc_server, logger=logger, master=master)
    # two MQ copies when clustered so a peer holds a backup
    loader.init_mq(nodes, local_node, mq_holder,
                   verify_exists_hook=bloom_filter_hook,
                   copies=2 if master else 1)

    if master is None:
        try:
            loader.mq.put(job.starts)
            loader.run()
        finally:
            rpc_server.shutdown()
    else:
        try:
            # tell the master this node is ready to participate
            client_call(master, 'ready', local_node)

            def _start():
                # NOTE(review): nesting reconstructed from a flattened
                # line -- confirm whether run() belongs inside the loop
                while not loader.stopped:
                    time.sleep(TIME_SLEEP)
                    loader.run()

            thread = threading.Thread(target=_start)
            thread.start()
            thread.join()
        finally:
            rpc_server.shutdown()
def run(self, args):
    """Run the client sub-command chosen by *args*.

    Exactly one of ``args.list``, ``args.kill`` or ``args.upload`` is
    acted upon; upload optionally triggers an immediate run when
    ``args.run == 'U'``.

    :param args: parsed argparse namespace
    """
    master_addr = args.master
    ctx = Context(is_client=True, master_addr=master_addr)
    if args.list is True:
        jobs = ctx.list_jobs()
        self.logger.info('list jobs at master: %s' % ctx.master_addr)
        for job_id, info in jobs.iteritems():
            self.logger.info(
                '====> job id: %s, job description: %s, status: %s' % \
                (job_id, info['name'], info['status']))
        if len(jobs) == 0:
            self.logger.info('no jobs exist')
    elif args.kill is not None:
        job_id = self._get_matched_job_name(ctx, args.kill)
        if job_id is not None:
            ctx.kill_job(job_id)
            self.logger.info('killed job: %s' % job_id)
    elif args.upload is not None:
        path = os.path.abspath(args.upload)
        if not os.path.exists(path):
            self.logger.error('upload path does not exist')
            return
        job_id = None
        try:
            job_id = import_job_desc(path).uniq_name
        except Exception, e:
            self.logger.exception(e)
            self.logger.error('uploading job description failed')
            return
        # copy into a temp staging dir so the zip has a clean layout
        new_upload_dir = os.path.join(tempfile.gettempdir(), job_id)
        if os.path.exists(new_upload_dir):
            shutil.rmtree(new_upload_dir)
        shutil.copytree(path, new_upload_dir)
        temp_filename = os.path.join(tempfile.gettempdir(), job_id+'.zip')
        ZipHandler.compress(temp_filename, new_upload_dir,
                            type_filters=('pyc', ))
        try:
            FileTransportClient(ctx.master_addr, temp_filename).send_file()
        finally:
            # staging artifacts are removed regardless of success
            os.remove(temp_filename)
            shutil.rmtree(new_upload_dir)
        self.logger.info('upload job <id: %s> finished' % job_id)
        if args.run == 'U':
            client_call(ctx.master_addr, 'run_job', job_id, True)
            self.logger.info('submit job <id: %s> to the cluster' % job_id)
def get(self, size=1, priority=0):
    """Fetch up to *size* objects from the MQ nodes in random order.

    Returns a single object (or None) when size == 1, else a list.
    """
    size = max(size, 1)
    candidates = list(self.addrs)
    shuffle(candidates)
    collected = []
    for addr in candidates:
        remaining = size - len(collected)
        if remaining <= 0:
            break
        payload = client_call(addr, self.prefix + 'get', remaining, priority)
        objs = pickle.loads(payload)
        if objs is None:
            # this node had nothing; try the next one
            continue
        if not isinstance(objs, list):
            objs = [objs]
        collected.extend(objs)
    if size == 1:
        return collected[0] if collected else None
    return collected
def get(self, size=1, priority=0):
    """Collect up to *size* objects, polling nodes in shuffled order."""
    size = max(size, 1)
    pool = list(self.addrs)
    shuffle(pool)
    gathered = []
    for node_addr in pool:
        want = size - len(gathered)
        if want <= 0:
            break
        fetched = pickle.loads(
            client_call(node_addr, self.prefix + 'get', want, priority))
        if fetched is None:
            continue
        # normalize single objects to a list before extending
        if not isinstance(fetched, list):
            fetched = [fetched]
        gathered.extend(fetched)
    if size == 1:
        if not gathered:
            return
        return gathered[0]
    return gathered
def require(self, size=1):
    """Request *size* units from the budget server, locally or via RPC."""
    server = self.server
    if not isinstance(server, basestring):
        # in-process server object
        return server.require(self.addr, self.instance_id, size=size)
    # remote server addressed as "host:port"
    return client_call(server, self.prefix + 'require', self.addr,
                       self.instance_id, size)
def pages(self):
    """Sum the page counts reported by all nodes; unreachable nodes count 0."""
    total = 0
    for node in self.nodes:
        count = client_call(node, 'pages', ignore=True)
        if count is not None:
            total += int(count)
    return total
def _call(i, worker):
    """Invoke ``self.remote_func`` on *worker*; fall back to False on error.

    NOTE(review): within this visible span neither ``i`` nor ``result``
    is used after assignment -- presumably the enclosing scope stores
    ``result`` per worker index; confirm against the full file.
    """
    try:
        result = client_call(worker, self.remote_func, *args)
    except Exception, e:
        # log the failure (if a logger exists) and record a failed call
        if self.logger:
            self.logger.error(e)
        result = False
def _client_call(*args):
    """Wrapper around client_call that swallows all failures.

    Returns the RPC result, or None when the call fails.
    """
    try:
        return client_call(*args)
    except socket.error:
        logger.error('Cannot connect to single running worker.')
    except:
        # deliberately best-effort: any other failure is ignored
        pass
def list_jobs(self):
    """Return ``{job_id: {'name': ..., 'status': ...}}`` for all jobs.

    Status is 'running' when the job id appears in the running set,
    'stopped' otherwise.
    """
    if self.is_master and self.master is not None:
        # running inside the master process: query it directly
        runnable = self.master.list_runnable_jobs()
        running = self.master.job_tracker.running_jobs
    else:
        runnable = client_call(self.master_addr, 'runnable_jobs')
        running = client_call(self.master_addr, 'running_jobs')
    jobs = {}
    for job_id, job_name in runnable.iteritems():
        status = 'running' if job_id in running else 'stopped'
        jobs[job_id] = {'name': job_name, 'status': status}
    return jobs
def pages(self):
    """Total pages across nodes, logging each query; dead nodes skipped."""
    total = 0
    for node in self.nodes:
        self.logger.debug('get pages from node %s' % node)
        count = client_call(node, 'pages', ignore=True)
        if count is not None:
            total += int(count)
    return total
def _report():
    """Heartbeat loop: register with the master and refresh peer lists."""
    while not self.stopped.is_set():
        peers = client_call(self.master, 'register_heartbeat',
                            self.ctx.worker_addr)
        # the master replies with the current worker list
        self.ctx.addrs = [self.ctx.fix_addr(p) for p in peers]
        self.ctx.ips = [self.ctx.fix_ip(p) for p in peers]
        self.stopped.wait(HEARTBEAT_INTERVAL)
def run(self):
    """Start the job on every node and block until the loader stops.

    Waits for all workers to initialize, optionally starts the
    per-minute speed-limit reset, seeds the queue with the job's start
    URLs, kicks off every node, then parks in a watcher thread until
    ``self.stopped`` becomes true.
    """
    # wait until all the workers initialized.  Fix: the original used
    # `while not self.is_ready: pass`, a busy-wait that burns a full
    # CPU core; sleeping between checks preserves behavior without
    # the spin (TIME_SLEEP is already used below for the same purpose).
    while not self.is_ready:
        time.sleep(TIME_SLEEP)

    if self.limit_speed:
        self._in_minute_clear()

    self.mq_client.put(self.job.starts)
    for node in self.nodes:
        client_call(node, 'run')

    def _run():
        # park until another thread flips self.stopped
        while not self.stopped:
            time.sleep(TIME_SLEEP)

    main_thread = threading.Thread(target=_run)
    main_thread.start()
    main_thread.join()
def get_job_counter(self, job_id):
    """Fetch the global counters for *job_id* from the master."""
    if self.is_master and self.master is not None:
        # inside the master process: read the counter server directly
        return self.master.counter_server.output()
    from cola.functions.counter import FUNC_PREFIX
    from cola.core.utils import get_rpc_prefix
    # remote path: build the RPC function name and call the master
    func_name = '%s%s' % (get_rpc_prefix(job_id, FUNC_PREFIX), 'get_global')
    return client_call(self.master_addr, func_name)
def _require_budget(self):
    """Block until this worker holds budget for one more unit of work.

    No-op in standalone mode or when the job is unlimited
    (``limits == 0``).  Spends one unit when budget is on hand,
    otherwise polls the master until some is granted or we stop.
    """
    if self.master is None or self.ctx.job.limits == 0:
        return
    if self.budget > 0:
        # spend one unit of previously acquired budget
        self.budget -= 1
        return
    while self.budget == 0 and not self.stopped:
        # presumably the master returns 0 while no budget is
        # available, keeping us in this loop -- confirm server side
        self.budget = client_call(self.master, 'require', BUDGET_REQUIRE)
def finish(self):
    """Tear down the loader: locks, parent finishers, logging, client."""
    self.release_lock(self.ready_lock)
    self.release_lock(self.finish_lock)
    LimitionJobLoader.finish(self)
    JobLoader.finish(self)
    self.stop_logger_server()
    try:
        for handler in self.logger.handlers:
            handler.close()
    except:
        # best-effort close of log handlers; failures are ignored
        pass
    if self.client is not None:
        # notify the submitting client that this job is done
        rpc_client = '%s:%s' % (self.client.split(':')[0],
                                main_conf.client.port)
        client_call(rpc_client, 'stop', ignore=True)
    self.stopped = True
def load_job(job_path, data_path=None, master=None, force=False):
    """Load a job on this worker, standalone or attached to a master.

    :param job_path: path of the job definition (must exist)
    :param data_path: root data directory; defaults to <root_dir>/data
    :param master: master ``host:port``, or None for standalone mode
    :param force: forwarded to the job loader
    :raises ValueError: if ``job_path`` does not exist
    """
    if not os.path.exists(job_path):
        raise ValueError('Job definition does not exist.')

    job = import_job(job_path)

    if data_path is None:
        data_path = os.path.join(root_dir(), 'data')
    root = os.path.join(data_path, 'worker', 'jobs', job.real_name)
    if not os.path.exists(root):
        os.makedirs(root)

    if master is None:
        with StandaloneWorkerJobLoader(job, root, force=force) as job_loader:
            job_loader.run()
    else:
        nodes = client_call(master, 'get_nodes')
        local = '%s:%s' % (get_ip(), job.context.job.port)
        # NOTE(review): 'ready' is sent twice -- here and again inside
        # the with-block below.  Looks redundant; confirm whether the
        # master tolerates duplicate registrations before removing one.
        client_call(master, 'ready', local)
        with WorkerJobLoader(job, root, master, local=local,
                             nodes=nodes, force=force) \
                as job_loader:
            client_call(master, 'ready', local)
            job_loader.ready_for_run()
def start_job(self, zip_filename, uncompress=True):
    """Start a job across the cluster from an uploaded zip file.

    Ships the zip to every worker watcher, spawns a local ``loader.py``
    subprocess as the job master, and asks every worker watcher to
    start the job.

    :param zip_filename: name of the job zip under ``self.zip_dir``
    :param uncompress: when True, distribute and uncompress the zip;
        otherwise assume the job directory already exists
    """
    if uncompress:
        zip_file = os.path.join(self.zip_dir, zip_filename)
        # transfer zip file to workers
        for watcher in self.nodes_watchers:
            if watcher.split(':')[0] == self.ip_address:
                continue
            file_trans_client = FileTransportClient(watcher, zip_file)
            file_trans_client.send_file()
        job_dir = ZipHandler.uncompress(zip_file, self.job_dir)
    else:
        job_dir = os.path.join(self.job_dir, zip_filename.rsplit('.', 1)[0])
    job = import_job(job_dir)

    worker_port = job.context.job.port
    port = job.context.job.master_port
    nodes = [watcher.split(':')[0] for watcher in self.nodes_watchers]

    if len(nodes) > 0:
        info = MasterJobInfo(port, nodes, worker_port)
        self.running_jobs[job.real_name] = info

        dirname = os.path.dirname(os.path.abspath(__file__))
        f = os.path.join(dirname, 'loader.py')
        workers = ['%s:%s' % (node, worker_port) for node in nodes]
        # Fix: pass an argv list instead of a shell-style string.  With
        # shell=False (the default), Popen on POSIX treats a plain
        # string as a single executable name, so the original
        # 'python "..." ...' command could never be spawned there; the
        # list form also matches the sibling start_job implementation
        # and avoids any quoting issues in paths.
        subprocess.Popen(['python', f, job_dir] + workers)

        # call workers to start job
        for worker_watcher in self.nodes_watchers:
            client_call(worker_watcher, 'start_job', zip_filename, uncompress)
def _remote_or_local_get(self, addr, size=1, priority=0):
    """Get up to *size* objects from the MQ node at *addr*.

    Reads the local node directly when *addr* is ourselves, otherwise
    fetches a pickled batch over RPC.  Falls back to this node's cache
    for *addr* when the node itself returned nothing.
    """
    objs = None
    if addr == self.addr_:
        objs = self.mq_node.get(size=size, priority=priority)
    else:
        objs = pickle.loads(client_call(addr, self.prefix+'get',
                                        size, priority))

    addr_caches = self.caches.get(addr, [])
    if size == 1 and objs is None and len(addr_caches) > 0:
        return addr_caches.pop(0)
    elif size > 1 and len(objs) == 0 and len(addr_caches) > 0:
        # NOTE(review): if a batched get can return None rather than an
        # empty list, len(objs) raises TypeError here -- confirm the
        # get() contract for size > 1
        return addr_caches[:size]

    return objs