def get_configs(self, command, tasks_per_node):
    """Compose a dictionary with information for writing the submit script.

    Parameters
    ----------
    command : str
        Command to be launched on the remote side.
    tasks_per_node : int
        Command invocations to be launched per node.

    Returns
    -------
    dict
        Keys consumed by the submit-script template: submit_script_dir,
        nodes, walltime, scheduler_options, worker_init, user_script.
    """
    logger.debug(
        "Requesting one block with {} nodes per block and {} tasks per node"
        .format(self.nodes_per_block, tasks_per_node))

    job_config = {}
    job_config["submit_script_dir"] = self.channel.script_dir
    job_config["nodes"] = self.nodes_per_block
    job_config["walltime"] = wtime_to_minutes(self.walltime)
    job_config["scheduler_options"] = self.scheduler_options
    job_config["worker_init"] = self.worker_init
    # Wrap the command with the launcher; only the wrapped form is embedded
    # in the template. (A redundant assignment of the bare command that was
    # immediately overwritten here has been removed.)
    job_config["user_script"] = self.launcher(command,
                                              tasks_per_node,
                                              self.nodes_per_block)
    return job_config
def submit(self, command, tasks_per_node, job_name="parsl.slurm"):
    """Submit the command as a slurm job.

    Parameters
    ----------
    command : str
        Command to be made on the remote side.
    tasks_per_node : int
        Command invocations to be launched per node
    job_name : str
        Name for the job

    Returns
    -------
    None or str
        If at capacity, returns None; otherwise, a string identifier for the job
    """
    scheduler_options = self.scheduler_options
    worker_init = self.worker_init
    if self.mem_per_node is not None:
        scheduler_options += '#SBATCH --mem={}g\n'.format(self.mem_per_node)
        worker_init += 'export PARSL_MEMORY_GB={}\n'.format(self.mem_per_node)
    if self.cores_per_node is not None:
        cpus_per_task = math.floor(self.cores_per_node / tasks_per_node)
        # BUG FIX: this directive previously had no trailing newline (unlike
        # the --mem directive above), so whatever the template placed next
        # could be fused onto the same line of the rendered submit script.
        scheduler_options += '#SBATCH --cpus-per-task={}\n'.format(cpus_per_task)
        worker_init += 'export PARSL_CORES={}\n'.format(cpus_per_task)

    # Timestamp makes the job name (and hence the script file) unique.
    job_name = "{0}.{1}".format(job_name, time.time())

    script_path = "{0}/{1}.submit".format(self.script_dir, job_name)
    script_path = os.path.abspath(script_path)

    logger.debug("Requesting one block with {} nodes".format(self.nodes_per_block))

    job_config = {}
    job_config["submit_script_dir"] = self.channel.script_dir
    job_config["nodes"] = self.nodes_per_block
    job_config["tasks_per_node"] = tasks_per_node
    job_config["walltime"] = wtime_to_minutes(self.walltime)
    job_config["scheduler_options"] = scheduler_options
    job_config["worker_init"] = worker_init
    # Wrap the command with the launcher; only the wrapped form is embedded
    # in the template. (A redundant assignment of the bare command that was
    # immediately overwritten here has been removed.)
    job_config["user_script"] = self.launcher(command, tasks_per_node, self.nodes_per_block)

    logger.debug("Writing submit script")
    self._write_submit_script(template_string, script_path, job_name, job_config)

    if self.move_files:
        logger.debug("moving files")
        channel_script_path = self.channel.push_file(script_path, self.channel.script_dir)
    else:
        logger.debug("not moving files")
        channel_script_path = script_path

    retcode, stdout, stderr = self.execute_wait("sbatch {0}".format(channel_script_path))

    job_id = None
    if retcode == 0:
        for line in stdout.split('\n'):
            # Successful sbatch output: "Submitted batch job <id>"
            if line.startswith("Submitted batch job"):
                job_id = line.split("Submitted batch job")[1].strip()
                self.resources[job_id] = {'job_id': job_id, 'status': JobStatus(JobState.PENDING)}
    else:
        logger.error("Submit command failed")
        logger.error("Retcode:%s STDOUT:%s STDERR:%s", retcode, stdout.strip(), stderr.strip())
    return job_id
def submit(self, command, tasks_per_node, job_name="parsl.lsf"):
    """Submit the command as an LSF job.

    Parameters
    ----------
    command : str
        Command to be made on the remote side.
    tasks_per_node : int
        Command invocations to be launched per node
    job_name : str
        Name for the job (must be unique).

    Returns
    -------
    None or str
        If at capacity, returns None; otherwise, a string identifier for the job
    """
    # Refuse to provision beyond the configured block limit.
    if self.provisioned_blocks >= self.max_blocks:
        logger.warning("LSF provider '{}' is at capacity (no more blocks will be added)".format(self.label))
        return None

    # Timestamp makes the job name (and hence the script file) unique.
    job_name = "{0}.{1}".format(job_name, time.time())

    script_path = "{0}/{1}.submit".format(self.script_dir, job_name)
    script_path = os.path.abspath(script_path)

    logger.debug("Requesting one block with {} nodes".format(self.nodes_per_block))

    job_config = {}
    job_config["submit_script_dir"] = self.channel.script_dir
    job_config["nodes"] = self.nodes_per_block
    job_config["tasks_per_node"] = tasks_per_node
    job_config["walltime"] = wtime_to_minutes(self.walltime)
    job_config["scheduler_options"] = self.scheduler_options
    job_config["worker_init"] = self.worker_init
    job_config["project"] = self.project
    # Wrap the command with the launcher; only the wrapped form is embedded
    # in the template. (A redundant assignment of the bare command that was
    # immediately overwritten here has been removed.)
    job_config["user_script"] = self.launcher(command, tasks_per_node, self.nodes_per_block)

    logger.debug("Writing submit script")
    self._write_submit_script(template_string, script_path, job_name, job_config)

    if self.move_files:
        logger.debug("moving files")
        channel_script_path = self.channel.push_file(script_path, self.channel.script_dir)
    else:
        logger.debug("not moving files")
        channel_script_path = script_path

    retcode, stdout, stderr = super().execute_wait("bsub {0}".format(channel_script_path))

    job_id = None
    if retcode == 0:
        for line in stdout.split('\n'):
            # Successful bsub output looks like: "Job <123> is submitted to queue <q>."
            if line.lower().startswith("job") and "is submitted to" in line.lower():
                job_id = line.split()[1].strip('<>')
                self.resources[job_id] = {'job_id': job_id, 'status': JobStatus(JobState.PENDING)}
    else:
        logger.warning("Submission of command to scale_out failed")
        logger.error("Retcode:%s STDOUT:%s STDERR:%s", retcode, stdout.strip(), stderr.strip())
    return job_id
def submit(self, command, tasks_per_node, job_name="parsl.cobalt"):
    """Submit *command* to Cobalt as a job of parallel elements.

    tasks_per_node must be a positive integer: 1 provisions a single
    invocation per node; larger values launch that many invocations per node.

    Args:
        - command (String): commandline invocation to be made on the remote side.
        - tasks_per_node (int): command invocations to be launched per node.

    Kwargs:
        - job_name (String): name for the job, must be unique.

    Returns:
        - None if no job id could be obtained, otherwise the identifier
          string reported by qsub.

    Raises:
        - ScaleOutFailed if qsub exits with a nonzero return code.
    """
    # Optional qsub flags; empty strings when the setting is not configured.
    acct_flag = '' if self.account is None else '-A {}'.format(self.account)
    queue_flag = '' if self.queue is None else '-q {}'.format(self.queue)

    # NOTE(review): the default job_name already starts with "parsl.", so this
    # produces "parsl.parsl.cobalt.<ts>" -- preserved as-is; confirm intent.
    job_name = "parsl.{0}.{1}".format(job_name, time.time())
    script_path = os.path.abspath("{0}/{1}.submit".format(self.script_dir, job_name))

    job_config = {
        "scheduler_options": self.scheduler_options,
        "worker_init": self.worker_init,
    }
    logger.debug("Requesting nodes_per_block:%s tasks_per_node:%s",
                 self.nodes_per_block, tasks_per_node)
    # Wrap the command
    job_config["user_script"] = self.launcher(command, tasks_per_node, self.nodes_per_block)

    logger.debug("Writing submit script")
    self._write_submit_script(template_string, script_path, job_name, job_config)

    channel_script_path = self.channel.push_file(script_path, self.channel.script_dir)

    qsub_cmd = 'qsub -n {0} {1} -t {2} {3} {4}'.format(
        self.nodes_per_block, queue_flag,
        wtime_to_minutes(self.walltime), acct_flag, channel_script_path)

    logger.debug("Executing {}".format(qsub_cmd))
    retcode, stdout, stderr = self.execute_wait(qsub_cmd)

    # TODO : FIX this block
    if retcode != 0:
        logger.error("Failed command: {0}".format(qsub_cmd))
        logger.error("Launch failed stdout:\n{0} \nstderr:{1}\n".format(stdout, stderr))
    logger.debug("Retcode:%s STDOUT:%s STDERR:%s", retcode, stdout.strip(), stderr.strip())

    job_id = None
    if retcode == 0:
        # We should be getting only one line back: the new job's id.
        job_id = stdout.strip()
        self.resources[job_id] = {'job_id': job_id,
                                  'status': JobStatus(JobState.PENDING)}
    else:
        logger.error("Submission of command to scale_out failed: {0}".format(stderr))
        raise ScaleOutFailed(self.__class__,
                             "Request to submit job to local scheduler failed")

    logger.debug("Returning job id : {0}".format(job_id))
    return job_id