def runLocalJob(master, job_path):
    '''push local job to cola cluster and run'''
    if not os.path.exists(job_path):
        logger.error('Job path does not exist!')
        return

    try:
        import_job(job_path)
    except (ImportError, AttributeError):
        logger.error('Job path is illegal!')
        return

    start_log_server()
    thread = start_rpc_server()

    logger.info('Pushing job to cola cluster...')
    dir_ = tempfile.mkdtemp()
    try:
        zip_filename = os.path.split(job_path)[1].replace(' ', '_') + '.zip'
        zip_file = os.path.join(dir_, zip_filename)

        ZipHandler.compress(zip_file, job_path, type_filters=("pyc", ))
        FileTransportClient(master, zip_file).send_file()
        logger.info('Push finished.')
    finally:
        shutil.rmtree(dir_)

    logger.info('Start to run job.')
    # relies on the module-level `_client_call` helper and `client` value
    _client_call(master, 'start_job', zip_filename, True, client)
    thread.join()
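# Hypothetical usage sketch, not part of the original source: the master
# address and job path are placeholders, and runLocalJob() assumes the
# module-level logger, RPC helpers, and `client` value are already set up.
if __name__ == '__main__':
    runLocalJob('127.0.0.1:11103', '/path/to/job')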
def recover(job_path):
    job = import_job(job_path)

    data_path = os.path.join(root_dir(), 'data')
    root = os.path.join(data_path, 'worker', 'jobs', job.real_name)
    if os.path.exists(root):
        lock_path = os.path.join(root, 'lock')
        if os.path.exists(lock_path):
            os.remove(lock_path)

        def _recover_dir(dir_):
            for f in os.listdir(dir_):
                if f.endswith('.old'):
                    f_path = os.path.join(dir_, f)
                    os.remove(f_path)
            for f in os.listdir(dir_):
                if f == 'lock':
                    lock_f = os.path.join(dir_, f)
                    os.remove(lock_f)

                f_path = os.path.join(dir_, f)
                if os.path.isfile(f_path) and not f.endswith('.old'):
                    os.rename(f_path, f_path + '.old')

        mq_store_dir = os.path.join(root, 'store')
        mq_backup_dir = os.path.join(root, 'backup')
        if os.path.exists(mq_store_dir):
            _recover_dir(mq_store_dir)
        if os.path.exists(mq_backup_dir):
            _recover_dir(mq_backup_dir)
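# Hypothetical usage sketch (placeholder path): after a worker crash, drop the
# stale lock files and roll the mq store/backup files over to '.old' copies so
# the job can be restarted cleanly.
if __name__ == '__main__':
    recover('/path/to/job')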
def load_job(path, nodes, context=None):
    if not os.path.exists(path):
        raise ValueError('Job definition does not exist.')

    job = import_job(path)

    job_name = job.name.replace(' ', '_')
    if job.debug:
        job_name += '_debug'
    holder = os.path.join(root_dir(), 'data', 'master', 'jobs', job_name)
    if not os.path.exists(holder):
        os.makedirs(holder)

    lock_f = os.path.join(holder, 'lock')
    if os.path.exists(lock_f):
        raise JobMasterRunning('A job master is already running')
    open(lock_f, 'w').close()

    rpc_server = create_rpc_server(job)
    try:
        loader = JobLoader(job, nodes, rpc_server, context=context)
        loader.run()

        # notify the master watcher that the job has finished
        master_watcher = '%s:%s' % (get_ip(), main_conf.master.port)
        client_call(master_watcher, 'finish_job', job.real_name)
    finally:
        os.remove(lock_f)
        rpc_server.shutdown()
def load_job(job_path, nodes, ip_address=None, data_path=None,
             client=None, context=None, force=False):
    if not os.path.exists(job_path):
        raise ValueError('Job definition does not exist.')

    job = import_job(job_path)

    if data_path is None:
        data_path = os.path.join(root_dir(), 'data')
    root = os.path.join(data_path, 'master', 'jobs', job.real_name)
    if not os.path.exists(root):
        os.makedirs(root)

    with MasterJobLoader(job, root, nodes, local_ip=ip_address, client=client,
                         context=context, force=force) as job_loader:
        job_loader.run()
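# Hypothetical usage sketch, with placeholder addresses: nodes are passed as
# 'ip:port' worker endpoints (the format the master builds in start_job), and
# MasterJobLoader acquires and releases its own lock via the with-statement.
if __name__ == '__main__':
    load_job('/path/to/job', ['192.168.0.2:11104', '192.168.0.3:11104'])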
def action(self, name):
    if name == 'stop all':
        print 'Trying to stop master and all workers.'
        try:
            client_call(self.master, 'stop')
        except socket.error:
            print 'Cannot connect to cola master.'
        else:
            print 'Cola cluster has been shutdown.'
    elif name == 'list jobs':
        print 'Running jobs: '
        for job in client_call(self.master, 'list_jobs'):
            print job
    elif name == 'list workers':
        print 'Cola workers: '
        for worker in client_call(self.master, 'list_workers'):
            print worker
    elif name == 'list job dirs':
        print 'Runnable job dirs: '
        for dir_ in client_call(self.master, 'list_job_dirs'):
            print dir_
    elif name.startswith('run remote job '):
        print 'Remote job will run in background.'
        job_dir = name[len('run remote job '):]
        if job_dir not in client_call(self.master, 'list_job_dirs'):
            print 'Remote job dir does not exist!'
        else:
            client_call(self.master, 'start_job', job_dir, False)
    elif name.startswith('run local job '):
        start = len('run local job ')
        path = name[start:].strip().strip('"').strip("'")
        if not os.path.exists(path):
            print 'Job path does not exist!'
        else:
            try:
                job = import_job(path)
            except (ImportError, AttributeError):
                print 'Job path is illegal!'
                return

            print 'Job has been committed and will run in background.'
            dir_ = tempfile.mkdtemp()
            try:
                zip_filename = os.path.split(path)[1].replace(' ', '_') + '.zip'
                zip_file = os.path.join(dir_, zip_filename)

                ZipHandler.compress(zip_file, path, type_filters=("pyc", ))
                FileTransportClient(self.master, zip_file).send_file()

                client_call(self.master, 'start_job', zip_filename)
            finally:
                shutil.rmtree(dir_)
def load_job(path, master=None):
    if not os.path.exists(path):
        raise ValueError('Job definition does not exist.')

    job = import_job(path)

    holder = os.path.join(root_dir(), 'data', 'worker', 'jobs', job.real_name)
    mq_holder = os.path.join(holder, 'mq')
    if not os.path.exists(mq_holder):
        os.makedirs(mq_holder)

    # Logger
    logger = get_logger(os.path.join(holder, 'job.log'))

    local_node = '%s:%s' % (get_ip(), job.context.job.port)
    nodes = [local_node]
    if master is not None:
        nodes = client_call(master, 'get_nodes')

    # Bloom filter hook
    bloom_filter_file = os.path.join(holder, 'bloomfilter')
    bloom_filter_hook = create_bloom_filter_hook(bloom_filter_file, job)

    rpc_server = create_rpc_server(job)
    loader = JobLoader(job, rpc_server, logger=logger, master=master)
    loader.init_mq(nodes, local_node, mq_holder,
                   verify_exists_hook=bloom_filter_hook,
                   copies=2 if master else 1)

    if master is None:
        try:
            loader.mq.put(job.starts)
            loader.run()
        finally:
            rpc_server.shutdown()
    else:
        try:
            client_call(master, 'ready', local_node)

            def _start():
                while not loader.stopped:
                    time.sleep(TIME_SLEEP)
                loader.run()

            thread = threading.Thread(target=_start)
            thread.start()
            thread.join()
        finally:
            rpc_server.shutdown()
def start_job(self, zip_filename, uncompress=True):
    if uncompress:
        zip_file = os.path.join(self.zip_dir, zip_filename)
        job_dir = ZipHandler.uncompress(zip_file, self.job_dir)
    else:
        job_dir = os.path.join(self.job_dir, zip_filename.rsplit('.', 1)[0])

    job = import_job(job_dir)

    master_port = job.context.job.master_port
    master = '%s:%s' % (self.master.split(':')[0], master_port)

    dirname = os.path.dirname(os.path.abspath(__file__))
    f = os.path.join(dirname, 'loader.py')
    # pass args as a list so Popen works without a shell on every platform
    subprocess.Popen(['python', f, job_dir, master])
def start_job(self, zip_filename, uncompress=True, client=None):
    if uncompress:
        zip_file = os.path.join(self.zip_dir, zip_filename)
        # transfer zip file to workers
        for watcher in self.nodes_watchers:
            if watcher.split(':')[0] == self.ip_address:
                continue
            file_trans_client = FileTransportClient(watcher, zip_file)
            file_trans_client.send_file()
        job_dir = ZipHandler.uncompress(zip_file, self.job_dir)
    else:
        job_dir = os.path.join(self.job_dir, zip_filename.rsplit('.', 1)[0])

    job = import_job(job_dir)

    worker_port = job.context.job.port
    port = job.context.job.master_port
    nodes = [watcher.split(':')[0] for watcher in self.nodes_watchers]

    if len(nodes) > 0:
        info = MasterJobInfo(port, nodes, worker_port)
        self.running_jobs[job.real_name] = info

        dirname = os.path.dirname(os.path.abspath(__file__))
        f = os.path.join(dirname, 'loader.py')
        workers = ['%s:%s' % (node, worker_port) for node in nodes]

        cmds = ['python', f, '-j', job_dir, '-i', self.ip_address,
                '-n', ' '.join(workers)]
        if self.data_path is not None:
            cmds.extend(['-d', self.data_path])
        if self.force:
            cmds.append('-f')
        if client is not None:
            cmds.extend(['-c', client])
        popen = subprocess.Popen(cmds)
        info.popen = popen

        # call workers to start job
        for worker_watcher in self.nodes_watchers:
            client_call(worker_watcher, 'start_job', zip_filename, uncompress,
                        ignore=True)
def start_job(self, zip_filename, uncompress=True):
    if uncompress:
        zip_file = os.path.join(self.zip_dir, zip_filename)
        job_dir = ZipHandler.uncompress(zip_file, self.job_dir)
    else:
        job_dir = os.path.join(self.job_dir, zip_filename.rsplit('.', 1)[0])

    job = import_job(job_dir)

    master_port = job.context.job.master_port
    master = '%s:%s' % (self.master.split(':')[0], master_port)

    dirname = os.path.dirname(os.path.abspath(__file__))
    f = os.path.join(dirname, 'loader.py')

    cmds = ['python', f, '-j', job_dir, '-m', master]
    if self.data_path is not None:
        cmds.extend(['-d', self.data_path])
    if self.force:
        cmds.append('-f')
    popen = subprocess.Popen(cmds)

    self.running_jobs[job.real_name] = WorkerJobInfo(job.context.job.port, popen)
def load_job(job_path, data_path=None, master=None, force=False):
    if not os.path.exists(job_path):
        raise ValueError('Job definition does not exist.')

    job = import_job(job_path)

    if data_path is None:
        data_path = os.path.join(root_dir(), 'data')
    root = os.path.join(data_path, 'worker', 'jobs', job.real_name)
    if not os.path.exists(root):
        os.makedirs(root)

    if master is None:
        with StandaloneWorkerJobLoader(job, root, force=force) as job_loader:
            job_loader.run()
    else:
        nodes = client_call(master, 'get_nodes')
        local = '%s:%s' % (get_ip(), job.context.job.port)
        client_call(master, 'ready', local)
        with WorkerJobLoader(job, root, master, local=local,
                             nodes=nodes, force=force) as job_loader:
            client_call(master, 'ready', local)
            job_loader.ready_for_run()
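# Hypothetical usage sketch with placeholder values: run the job standalone,
# or attach to a running master by passing its 'ip:port' address.
if __name__ == '__main__':
    load_job('/path/to/job')  # standalone worker
    # load_job('/path/to/job', master='192.168.0.1:11102')  # clustered worker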
def start_job(self, zip_filename, uncompress=True):
    if uncompress:
        zip_file = os.path.join(self.zip_dir, zip_filename)
        # transfer zip file to workers
        for watcher in self.nodes_watchers:
            if watcher.split(':')[0] == self.ip_address:
                continue
            file_trans_client = FileTransportClient(watcher, zip_file)
            file_trans_client.send_file()
        job_dir = ZipHandler.uncompress(zip_file, self.job_dir)
    else:
        job_dir = os.path.join(self.job_dir, zip_filename.rsplit('.', 1)[0])

    job = import_job(job_dir)

    worker_port = job.context.job.port
    port = job.context.job.master_port
    nodes = [watcher.split(':')[0] for watcher in self.nodes_watchers]

    if len(nodes) > 0:
        info = MasterJobInfo(port, nodes, worker_port)
        self.running_jobs[job.real_name] = info

        dirname = os.path.dirname(os.path.abspath(__file__))
        f = os.path.join(dirname, 'loader.py')
        workers = ['%s:%s' % (node, worker_port) for node in nodes]
        # pass args as a list so Popen works without a shell on every platform
        subprocess.Popen(['python', f, job_dir] + workers)

        # call workers to start job
        for worker_watcher in self.nodes_watchers:
            client_call(worker_watcher, 'start_job', zip_filename, uncompress)