def scale_in(self, size):
    count = 0
    if not self.resources:
        print("No resources online, cannot scale down")
    else:
        for resource in self.resources[0:size]:
            print("Cancelling : ", resource['job_id'])
            retcode, stdout, stderr = execute_wait(
                "scancel {0}".format(resource['job_id']), 1)
            print(retcode, stdout, stderr)
            if retcode == 0:
                # Only count jobs that scancel accepted; the original never
                # incremented count before returning it.
                count += 1
    return count
def status(self):
    job_id_list = ','.join([j['job_id'] for j in self.resources])
    retcode, stdout, stderr = execute_wait(
        "squeue --job {0}".format(job_id_list), 1)
    for line in stdout.split('\n'):
        parts = line.split()
        if parts and parts[0] != 'JOBID':
            print("Parts : ", parts)
            job_id = parts[0]
            status = translate_table.get(parts[4], 'UNKNOWN')
            for job in self.resources:
                if job['job_id'] == job_id:
                    job['status'] = status
    print(self.resources)
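# translate_table is referenced above but not defined in this snippet. A minimal
# sketch of what such a mapping might look like for the squeue "ST" column used
# here, assuming standard Slurm state codes; the real provider's table may
# contain more entries or different target strings.
translate_table = {
    'PD': 'PENDING',    # job is awaiting resource allocation
    'R': 'RUNNING',     # job currently has an allocation
    'CA': 'CANCELLED',  # job was explicitly cancelled
    'CD': 'COMPLETED',  # job finished with exit code zero
    'F': 'FAILED',      # job terminated with a non-zero exit code
    'TO': 'TIMEOUT',    # job reached its time limit
    'NF': 'FAILED',     # job terminated due to node failure
}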
def submit(self, cmd_string, blocksize, job_name="parsl.auto"):
    ''' Submits the cmd_string onto a Local Resource Manager job of blocksize parallel elements.

    Example submit file for the complex case of multiple submits per job:

        Universe = vanilla
        output = out.$(Cluster).$(Process)
        error = err.$(Cluster).$(Process)
        log = log.$(Cluster)
        leave_in_queue = true
        executable = test.sh
        queue 5
        executable = foo
        queue 1

        $ condor_submit test.sub
        Submitting job(s)......
        5 job(s) submitted to cluster 118907.
        1 job(s) submitted to cluster 118908.
    '''
    blocksize = 1
    # job_config, script_path and template_string are assumed to be prepared
    # earlier in the provider (not shown in this snippet).
    job_config["nodes"] = 1
    job_config["condor_overrides"] = job_config.get("condor_overrides", '')
    job_config["user_script"] = cmd_string

    ret = self._write_submit_script(template_string, script_path, job_name,
                                    job_config)

    retcode, stdout, stderr = execute_wait(
        "condor_submit {0}".format(script_path), 3)
    logger.debug("Retcode:%s STDOUT:%s STDERR:%s", retcode, stdout.strip(),
                 stderr.strip())

    job_id = []
    if retcode == 0:
        for line in stdout.split('\n'):
            if re.match('^[0-9]', line) is not None:
                cluster = line.split(" ")[5]
                # We know the first job id ("process" in condor terms) within a
                # cluster is 0 and we know the total number of jobs from
                # condor_submit, so we use some list comprehensions to expand
                # the condor_submit output into job IDs
                # e.g., ['118907.0', '118907.1', '118907.2', '118907.3', '118907.4', '118908.0']
                # Parse the full leading job count rather than only its first
                # character, so counts of 10 or more are handled correctly.
                processes = [str(x) for x in range(0, int(line.split(" ")[0]))]
                job_id += [cluster + process for process in processes]

    return job_id
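# A small standalone sketch of the job-id expansion above, fed the sample
# condor_submit output from the docstring. It is illustrative only; the
# variable names mirror the method body but nothing here is part of the
# provider itself.
import re

sample_stdout = ("Submitting job(s)......\n"
                 "5 job(s) submitted to cluster 118907.\n"
                 "1 job(s) submitted to cluster 118908.\n")

job_id = []
for line in sample_stdout.split('\n'):
    if re.match('^[0-9]', line) is not None:
        cluster = line.split(" ")[5]     # e.g. '118907.' (trailing dot kept)
        count = int(line.split(" ")[0])  # e.g. 5
        job_id += [cluster + str(x) for x in range(count)]

print(job_id)
# ['118907.0', '118907.1', '118907.2', '118907.3', '118907.4', '118908.0']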
def scale_out(self, size, name=None):
    from datetime import datetime
    ipengine_json = None
    with open(
            os.path.expanduser(
                "~/.ipython/profile_default/security/ipcontroller-engine.json"
            ), 'r') as f:
        ipengine_json = f.read()

    job_name = "midway.parsl_auto.{0}".format(
        datetime.now().strftime("%Y-%m-%d_%H:%M:%S"))
    script_name = job_name + ".submit"
    submit_script = None
    with open(
            os.path.join(os.path.dirname(__file__),
                         './midway.template.submit'), 'r') as f:
        submit_script = Template(f.read()).safe_substitute(
            **self.config,
            nodes=1,
            jobname=job_name,
            ipengine_json=ipengine_json)

    with open(script_name, 'w') as f:
        f.write(submit_script)

    retcode, stdout, stderr = execute_wait(
        "sbatch {0}".format(script_name), 1)
    print("retcode : ", retcode)
    print("stdout : ", stdout)
    print("stderr : ", stderr)

    if retcode == 0:
        for line in stdout.split('\n'):
            if line.startswith("Submitted batch job"):
                job_id = line.split("Submitted batch job")[1]
                self.resources.extend([{
                    'job_id': job_id.strip(),
                    'status': 'submitted',
                    'size': size
                }])
    else:
        print("Submission of command to scale_out failed")
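# A minimal sketch of the template substitution used above, based on
# string.Template. The template text and keys here are invented for
# illustration; the real midway.template.submit will differ.
from string import Template

template_text = ("#!/bin/bash\n"
                 "#SBATCH --job-name=${jobname}\n"
                 "#SBATCH --nodes=${nodes}\n"
                 "${user_script}\n")

filled = Template(template_text).safe_substitute(
    jobname="midway.parsl_auto.example",
    nodes=1,
    user_script="echo hello")
print(filled)
# safe_substitute (unlike substitute) leaves any placeholder it cannot
# resolve untouched instead of raising KeyError.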
def cancel(self, job_ids):
    ''' Cancels the jobs specified by a list of job ids.

    Args:
        job_ids : [<job_id> ...]

    Returns:
        [True/False...] : If the cancel operation fails, the entire list will be False.
    '''
    job_id_list = ' '.join(job_ids)
    retcode, stdout, stderr = execute_wait(
        "condor_rm {0}".format(job_id_list), 3)
    rets = None
    if retcode == 0:
        for jid in job_ids:
            self.resources[jid]['status'] = translate_table['CA']  # Setting state to cancelled
        rets = [True for i in job_ids]
    else:
        rets = [False for i in job_ids]

    return rets
def _status(self):
    ''' Internal: Do not call. Returns the status list for a list of job_ids.

    Args:
        self

    Returns:
        [status...] : Status list of all jobs
    '''
    # job_id_list = ','.join(self.resources.keys())
    jobs_missing = list(self.resources.keys())

    retcode, stdout, stderr = execute_wait("qstat -u $USER", 3)
    for line in stdout.split('\n'):
        if line.startswith('='):
            continue

        parts = line.upper().split()
        if parts and parts[0] != 'JOBID':
            job_id = parts[0]
            print(parts)
            if job_id not in self.resources:
                continue

            status = translate_table.get(parts[4], 'UNKNOWN')
            self.resources[job_id]['status'] = status
            jobs_missing.remove(job_id)

    print("Jobs list : ", self.resources)

    # qstat does not report on jobs that are no longer in the queue, so we fill
    # in the blanks for missing jobs; we may lose some information about why
    # those jobs failed.
    for missing_job in jobs_missing:
        if self.resources[missing_job]['status'] in ['PENDING', 'RUNNING']:
            self.resources[missing_job]['status'] = translate_table['CD']
def _get_job_status(self, job_id):
    # The original formatted an undefined `script_name` into the command;
    # querying squeue by the job_id parameter is assumed to be the intent.
    retcode, stdout, stderr = execute_wait(
        "squeue --job {0}".format(job_id), 1)
    print("Stdout : ", stdout)
def submit(self, cmd_string, blocksize, job_name="parsl.auto"):
    ''' Submits the cmd_string onto a Local Resource Manager job of blocksize parallel elements.

    Submit returns an ID that corresponds to the task that was just submitted.

    If tasks_per_node <  1 : This is illegal; tasks_per_node must be an integer >= 1
    If tasks_per_node == 1 : blocksize number of nodes are provisioned
    If tasks_per_node >  1 : ceil(blocksize / tasks_per_node) nodes are provisioned

    Args:
        - cmd_string (String) : Commandline invocation to be made on the remote side.
        - blocksize (float) : Number of parallel task slots requested.

    Kwargs:
        - job_name (String) : Name for the job, must be unique.

    Returns:
        - None : At capacity, cannot provision more
        - job_id (string) : Identifier for the job
    '''
    if self.current_blocksize >= self.config["execution"]["options"]["max_parallelism"]:
        logger.warn("[%s] at capacity, cannot add more blocks now",
                    self.sitename)
        return None

    # Note: Fix this later to avoid confusing behavior.
    # We should always allocate blocks in integer counts of node_granularity.
    if blocksize < self.config["execution"]["options"]["node_granularity"]:
        blocksize = self.config["execution"]["options"]["node_granularity"]

    account_opt = "-A {0}".format(
        self.config["execution"]["options"].get("account", ''))

    job_name = "parsl.{0}.{1}".format(job_name, time.time())
    script_path = "{0}/{1}.submit".format(
        self.config["execution"]["options"]["submit_script_dir"], job_name)

    nodes = math.ceil(
        float(blocksize) / self.config["execution"]["options"]["tasks_per_node"])
    logger.debug("Requesting blocksize:%s tasks_per_node:%s nodes:%s",
                 blocksize,
                 self.config["execution"]["options"]["tasks_per_node"],
                 nodes)

    job_config = self.config["execution"]["options"]
    job_config["nodes"] = nodes
    job_config["slurm_overrides"] = job_config.get("slurm_overrides", '')
    job_config["user_script"] = cmd_string

    ret = self._write_submit_script(template_string, script_path, job_name,
                                    job_config)

    retcode, stdout, stderr = execute_wait(
        "qsub -n {0} -t {1} {2} {3}".format(nodes, self.max_walltime,
                                            account_opt, script_path), 3)
    logger.debug("Retcode:%s STDOUT:%s STDERR:%s", retcode, stdout.strip(),
                 stderr.strip())

    job_id = None
    if retcode == 0:
        for line in stdout.split('\n'):
            # Note: "Submitted batch job" is sbatch-style output; a qsub
            # submission is unlikely to produce it, so job_id may never be
            # set as written.
            if line.startswith("Submitted batch job"):
                job_id = line.split("Submitted batch job")[1].strip()
                self.resources[job_id] = {
                    'job_id': job_id,
                    'status': 'PENDING',
                    'blocksize': blocksize
                }
    else:
        print("Submission of command to scale_out failed")

    return job_id
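# A quick illustration of the node-count calculation used above, independent
# of the provider class. The config values are made-up examples.
import math

blocksize = 10      # number of parallel task slots requested
tasks_per_node = 4  # hypothetical value from config["execution"]["options"]

nodes = math.ceil(float(blocksize) / tasks_per_node)
print(nodes)  # 3 -> 10 tasks packed 4 per node need 3 nodes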