def start_job(self, zip_filename, uncompress=True, client=None):
    """Launch a job on this master and tell every worker watcher to start it.

    :param zip_filename: name of the job archive inside ``self.zip_dir``
    :param uncompress: when True, ship the archive to remote watchers and
        unpack it; when False, assume the job dir already exists
    :param client: optional client address forwarded to the loader via ``-c``
    """
    if uncompress:
        zip_file = os.path.join(self.zip_dir, zip_filename)
        # Ship the archive to every remote watcher; skip our own address.
        for node_watcher in self.nodes_watchers:
            if node_watcher.split(':')[0] == self.ip_address:
                continue
            FileTransportClient(node_watcher, zip_file).send_file()
        job_dir = ZipHandler.uncompress(zip_file, self.job_dir)
    else:
        # Archive already unpacked: derive the job dir from the file stem.
        job_dir = os.path.join(self.job_dir, zip_filename.rsplit('.', 1)[0])

    job = import_job(job_dir)
    worker_port = job.context.job.port
    port = job.context.job.master_port
    nodes = [w.split(':')[0] for w in self.nodes_watchers]

    if nodes:
        info = MasterJobInfo(port, nodes, worker_port)
        self.running_jobs[job.real_name] = info

        # Spawn the loader as a child process of this master.
        loader = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                              'loader.py')
        workers = ['%s:%s' % (node, worker_port) for node in nodes]
        argv = ['python', loader,
                '-j', job_dir,
                '-i', self.ip_address,
                '-n', ' '.join(workers)]
        if self.data_path is not None:
            argv.extend(['-d', self.data_path])
        if self.force:
            argv.append('-f')
        if client is not None:
            argv.extend(['-c', client])
        info.popen = subprocess.Popen(argv)

    # call workers to start job (best effort: errors on a worker are ignored)
    for worker_watcher in self.nodes_watchers:
        client_call(worker_watcher, 'start_job', zip_filename, uncompress,
                    ignore=True)
def runLocalJob(master, job_path):
    '''
    Push a local job directory to the cola cluster master and ask it to run.

    Validates that ``job_path`` exists and imports cleanly, starts the local
    log and RPC servers, zips the job into a temp dir, transfers the archive
    to ``master``, then triggers ``start_job`` remotely and blocks on the
    RPC server thread.

    :param master: address of the cluster master (host:port string,
        as used by FileTransportClient)
    :param job_path: filesystem path of the local job directory
    '''
    if not os.path.exists(job_path):
        logger.error('Job path not exists!')
        return
    # Import is only a sanity check here; the job object itself is unused.
    try:
        import_job(job_path)
    except (ImportError, AttributeError):
        logger.error('Job path is illegal!')
        return
    start_log_server()
    thread = start_rpc_server()
    logger.info('Pushing job to cola cluster...')
    # Build the zip in a throwaway temp dir so cleanup is a single rmtree.
    dir_ = tempfile.mkdtemp()
    try:
        # Spaces are replaced so the archive name is a safe single token.
        zip_filename = os.path.split(job_path)[1].replace(' ', '_') + '.zip'
        zip_file = os.path.join(dir_, zip_filename)
        ZipHandler.compress(zip_file, job_path, type_filters=("pyc", ))
        FileTransportClient(master, zip_file).send_file()
        logger.info('Push finished.')
    finally:
        shutil.rmtree(dir_)
    logger.info('Start to run job.')
    # NOTE(review): `client` and `_client_call` are not defined in this
    # function — presumably module-level names; if `client` does not exist
    # at module scope this line raises NameError. Verify against the module.
    _client_call(master, 'start_job', zip_filename, True, client)
    thread.join()
def start_job(self, zip_filename, uncompress=True, client=None):
    """Start a job on the master node and propagate the start to workers.

    When ``uncompress`` is set, the archive named ``zip_filename`` is first
    distributed to all remote watchers and unpacked locally; otherwise the
    already-unpacked job directory is located by the archive's stem.
    """
    if not uncompress:
        # Job directory already exists under self.job_dir.
        job_dir = os.path.join(self.job_dir, zip_filename.rsplit('.', 1)[0])
    else:
        zip_file = os.path.join(self.zip_dir, zip_filename)
        # Distribute the archive to every watcher except this host.
        remote_watchers = [w for w in self.nodes_watchers
                           if w.split(':')[0] != self.ip_address]
        for watcher in remote_watchers:
            transporter = FileTransportClient(watcher, zip_file)
            transporter.send_file()
        job_dir = ZipHandler.uncompress(zip_file, self.job_dir)

    job = import_job(job_dir)
    worker_port = job.context.job.port
    port = job.context.job.master_port
    nodes = [watcher.split(':')[0] for watcher in self.nodes_watchers]

    if len(nodes) > 0:
        info = MasterJobInfo(port, nodes, worker_port)
        self.running_jobs[job.real_name] = info

        here = os.path.dirname(os.path.abspath(__file__))
        loader_script = os.path.join(here, 'loader.py')
        worker_addrs = ['%s:%s' % (n, worker_port) for n in nodes]

        # Assemble the loader command line piece by piece.
        command = ['python', loader_script]
        command += ['-j', job_dir]
        command += ['-i', self.ip_address]
        command += ['-n', ' '.join(worker_addrs)]
        if self.data_path is not None:
            command += ['-d', self.data_path]
        if self.force:
            command.append('-f')
        if client is not None:
            command += ['-c', client]

        proc = subprocess.Popen(command)
        info.popen = proc

    # call workers to start job
    for worker_watcher in self.nodes_watchers:
        client_call(worker_watcher, 'start_job', zip_filename, uncompress,
                    ignore=True)
def start_job(self, zip_filename, uncompress=True, client=None):
    """Kick off a job: distribute/unpack the archive, spawn the loader
    process for this master, and notify every worker watcher.
    """
    if uncompress:
        zip_file = os.path.join(self.zip_dir, zip_filename)
        # Push the archive to each remote watcher (our own host is skipped).
        for watcher in self.nodes_watchers:
            host = watcher.split(":")[0]
            if host == self.ip_address:
                continue
            FileTransportClient(watcher, zip_file).send_file()
        job_dir = ZipHandler.uncompress(zip_file, self.job_dir)
    else:
        # No archive handling needed; resolve the existing job directory.
        stem = zip_filename.rsplit(".", 1)[0]
        job_dir = os.path.join(self.job_dir, stem)

    job = import_job(job_dir)
    worker_port = job.context.job.port
    port = job.context.job.master_port
    nodes = [watcher.split(":")[0] for watcher in self.nodes_watchers]

    if len(nodes) > 0:
        info = MasterJobInfo(port, nodes, worker_port)
        self.running_jobs[job.real_name] = info

        script_dir = os.path.dirname(os.path.abspath(__file__))
        loader = os.path.join(script_dir, "loader.py")
        workers = ["%s:%s" % (node, worker_port) for node in nodes]
        argv = ["python", loader,
                "-j", job_dir,
                "-i", self.ip_address,
                "-n", " ".join(workers)]
        if self.data_path is not None:
            argv.extend(["-d", self.data_path])
        if self.force:
            argv.append("-f")
        if client is not None:
            argv.extend(["-c", client])
        info.popen = subprocess.Popen(argv)

    # call workers to start job
    for worker_watcher in self.nodes_watchers:
        client_call(worker_watcher, "start_job", zip_filename, uncompress)
def pack_job_error(self, job_name):
    """Collect this node's error dump for ``job_name``, zip it and send the
    archive to the master node.
    """
    job_working_dir = os.path.join(self.working_dir, job_name)
    error_dir = pack_local_job_error(job_name,
                                     working_dir=job_working_dir,
                                     logger=self.logger)
    # Archive name embeds this node's IP (dots -> underscores) so the
    # master can tell apart uploads from different nodes.
    archive = os.path.join(
        self.zip_dir,
        '%s_%s_errors.zip' % (self.ctx.ip.replace('.', '_'), job_name))
    # Drop any archive left behind by a previous pack before re-creating it.
    if os.path.exists(archive):
        os.remove(archive)
    ZipHandler.compress(archive, error_dir)
    FileTransportClient(self.master, archive).send_file()
def run(self, args):
    """Dispatch the client 'job' subcommand against the cluster master.

    Exactly one of the following branches runs, chosen from ``args``:

    * ``args.list``   — list jobs known to the master.
    * ``args.kill``   — kill the job whose name matches the given pattern.
    * ``args.upload`` — zip a local job directory, upload it to the master,
      and (when ``args.run == 'U'``) submit it for execution.

    :param args: parsed command-line namespace (``master``, ``list``,
        ``kill``, ``upload``, ``run`` attributes)
    """
    master_addr = args.master
    ctx = Context(is_client=True, master_addr=master_addr)
    if args.list is True:
        jobs = ctx.list_jobs()
        self.logger.info('list jobs at master: %s' % ctx.master_addr)
        for job_id, info in jobs.iteritems():
            self.logger.info(
                '====> job id: %s, job description: %s, status: %s' % \
                (job_id, info['name'], info['status']))
        if len(jobs) == 0:
            self.logger.info('no jobs exist')
    elif args.kill is not None:
        # Resolve the (possibly partial) name to a concrete job id first.
        job_id = self._get_matched_job_name(ctx, args.kill)
        if job_id is not None:
            ctx.kill_job(job_id)
            self.logger.info('killed job: %s' % job_id)
    elif args.upload is not None:
        path = os.path.abspath(args.upload)
        if not os.path.exists(path):
            self.logger.error('upload path does not exist')
            return
        job_id = None
        try:
            job_id = import_job_desc(path).uniq_name
        # FIX: was `except Exception, e` — Python-2-only syntax that is a
        # SyntaxError on Python 3; `as` works on Python 2.6+ and 3.x alike.
        except Exception as e:
            self.logger.exception(e)
            self.logger.error('uploading job description failed')
            return
        # Stage a pristine copy in the temp dir so the zip has a stable root.
        new_upload_dir = os.path.join(tempfile.gettempdir(), job_id)
        if os.path.exists(new_upload_dir):
            shutil.rmtree(new_upload_dir)
        shutil.copytree(path, new_upload_dir)
        temp_filename = os.path.join(tempfile.gettempdir(), job_id + '.zip')
        ZipHandler.compress(temp_filename, new_upload_dir,
                            type_filters=('pyc', ))
        try:
            FileTransportClient(ctx.master_addr, temp_filename).send_file()
        finally:
            # Always clean up the staged copy and the archive.
            os.remove(temp_filename)
            shutil.rmtree(new_upload_dir)
        self.logger.info('upload job <id: %s> finished' % job_id)
        if args.run == 'U':
            # 'U' means upload-and-run: ask the master to start it now.
            client_call(ctx.master_addr, 'run_job', job_id, True)
            self.logger.info('submit job <id: %s> to the cluster' % job_id)
def start_job(self, zip_filename, uncompress=True):
    """Start a job on this master and tell every worker watcher to start it.

    :param zip_filename: name of the job archive inside ``self.zip_dir``
    :param uncompress: when True, ship the archive to remote watchers and
        unpack it locally; when False, assume the job dir already exists
    """
    if uncompress:
        zip_file = os.path.join(self.zip_dir, zip_filename)
        # transfer zip file to workers
        for watcher in self.nodes_watchers:
            if watcher.split(':')[0] == self.ip_address:
                continue
            file_trans_client = FileTransportClient(watcher, zip_file)
            file_trans_client.send_file()
        job_dir = ZipHandler.uncompress(zip_file, self.job_dir)
    else:
        job_dir = os.path.join(self.job_dir, zip_filename.rsplit('.', 1)[0])

    job = import_job(job_dir)
    worker_port = job.context.job.port
    port = job.context.job.master_port
    nodes = [watcher.split(':')[0] for watcher in self.nodes_watchers]

    if len(nodes) > 0:
        info = MasterJobInfo(port, nodes, worker_port)
        self.running_jobs[job.real_name] = info

        dirname = os.path.dirname(os.path.abspath(__file__))
        f = os.path.join(dirname, 'loader.py')
        workers = ['%s:%s' % (node, worker_port) for node in nodes]
        # FIX: the command was passed to Popen as ONE formatted string
        # without shell=True; on POSIX Popen then treats the entire string
        # as the executable name and the loader never starts. Pass an argv
        # list instead (same arguments a shell would have produced:
        # python loader.py <job_dir> <worker> [<worker> ...]).
        subprocess.Popen(['python', f, job_dir] + workers)

    # call workers to start job
    for worker_watcher in self.nodes_watchers:
        client_call(worker_watcher, 'start_job', zip_filename, uncompress)
def run_job(self, job_name, unzip=False, wait_for_workers=False):
    """Run the named job across the cluster from the master side.

    Optionally waits for at least one worker to register, unpacks the job,
    builds and registers a JobMaster, distributes the job archive to the
    workers, then drives the 'prepare' and 'run_job' stage barriers.

    :param job_name: id/name of the job (also the job dir and zip stem)
    :param unzip: when True, unpack the job archive before loading it
    :param wait_for_workers: when True, block (polling every 3s) until a
        worker appears or the tracker is stopped
    """
    if wait_for_workers:
        # Poll until at least one worker has registered; bail out early
        # if the master is stopped while we wait.
        while not self.stopped.is_set():
            if len(self.worker_tracker.workers) > 0:
                break
            stopped = self.stopped.wait(3)
            if stopped:
                return
    if unzip:
        self._unzip(job_name)

    job_path = os.path.join(self.job_dir, job_name)
    job_desc = import_job_desc(job_path)
    # Snapshot of currently-known workers becomes this job's worker set.
    job_master = JobMaster(self.ctx, job_name, job_desc,
                           self.worker_tracker.workers.keys())
    job_master.init()
    self.job_tracker.register_job(job_name, job_master)
    self._register_runned_job(job_name, job_desc)

    # Ship the job archive to every worker assigned to this job.
    zip_file = os.path.join(self.zip_dir, job_name + '.zip')
    for worker in job_master.workers:
        FileTransportClient(worker, zip_file).send_file()

    self.logger.debug(
        'entering the master prepare stage, job id: %s' % job_name)
    self.logger.debug('job available workers: %s' % job_master.workers)
    # Barrier 1: all workers must finish 'prepare' before the job runs.
    stage = Stage(job_master.workers, 'prepare')
    prepared_ok = stage.barrier(True, job_name)
    if not prepared_ok:
        self.logger.error("prepare for running failed")
        return

    self.logger.debug(
        'entering the master run_job stage, job id: %s' % job_name)
    # Barrier 2: actually start the job on every worker.
    stage = Stage(job_master.workers, 'run_job')
    run_ok = stage.barrier(True, job_name)
    if not run_ok:
        self.logger.error("run job failed, job id: %s" % job_name)