def stop_job(self, ssh_client, name, job_options, is_singularity,
             logger, workdir=None):
    """
    Stops a job from the HPC

    @type ssh_client: SshClient
    @param ssh_client: ssh client connected to an HPC login node
    @type name: string
    @param name: name of the job
    @type job_options: dictionary
    @param job_options: dictionary with the job options
    @type is_singularity: bool
    @param is_singularity: True if the job is in a container
    @type logger: logger
    @param logger: Logger object to print log messages
    @type workdir: string
    @param workdir: Path of the working directory of the job
    @rtype bool
    @return result of the cancellation call. False if the ssh client is
        not usable or the cancellation call could not be built.
    """
    if not SshClient.check_ssh_client(ssh_client, logger):
        return False

    # SPARK cancellation needs the live ssh connection to identify the
    # job; other workload managers only need the job options.
    if job_options['type'] == "SPARK":
        call = self._build_job_cancellation_call(name, ssh_client, logger)
    else:
        call = self._build_job_cancellation_call(name, job_options, logger)

    if call is None:
        return False

    # NOTE: a second, redundant check_ssh_client call on the same client
    # was removed here; the client was already validated above.
    return ssh_client.execute_shell_command(call, workdir=workdir)
def clean_job_aux_files(self, ssh_client, name, job_options,
                        is_singularity, logger, workdir=None):
    """
    Cleans no more needed job files in the HPC

    @type ssh_client: SshClient
    @param ssh_client: ssh client connected to an HPC login node
    @type name: string
    @param name: name of the job
    @type job_options: dictionary
    @param job_options: dictionary with the job options
    @type is_singularity: bool
    @param is_singularity: True if the job is in a container
    @type logger: logger
    @param logger: Logger object to print log messages
    @type workdir: string
    @param workdir: Path of the working directory of the job
    @rtype bool
    @return True when there is nothing to clean; otherwise the result of
        the remote removal. False if the ssh client is not usable.
    """
    if not SshClient.check_ssh_client(ssh_client, logger):
        return False

    # Only containerized (Singularity) jobs leave an auxiliary script
    # behind that needs removal.
    if not is_singularity:
        return True
    return ssh_client.execute_shell_command("rm " + name + ".script",
                                            workdir=workdir)
def publish(self, ssh_client, logger, workdir=None):
    """
    Publish the local file in the external repository

    @type ssh_client: SshClient
    @param ssh_client: ssh client connected to an HPC login node
    @type logger: logger
    @param logger: Logger object to print log messages
    @type workdir: string
    @param workdir: Path of the working directory of the job
    @rtype bool
    @return False if something went wrong
    """
    if not SshClient.check_ssh_client(ssh_client, logger):
        return False

    publish_call = self._build_publish_call(logger)
    if publish_call is None:
        return False

    # Fire-and-forget: the publish command is launched without waiting
    # for its result.
    return ssh_client.execute_shell_command(publish_call,
                                            workdir=workdir,
                                            wait_result=False)
def submit_job(self, ssh_client, name, job_settings, is_singularity,
               logger, workdir=None, context=None):
    """
    Sends a job to the HPC

    @type ssh_client: SshClient
    @param ssh_client: ssh client connected to an HPC login node
    @type name: string
    @param name: name of the job
    @type job_settings: dictionary
    @param job_settings: dictionary with the job options
    @type is_singularity: bool
    @param is_singularity: True if the job is in a container
    @type logger: logger
    @param logger: Logger object to print log messages
    @type workdir: string
    @param workdir: Path of the working directory of the job
    @type context: dictionary of strings
    @param context: Dictionary containing context env vars
    @rtype bool
    @return True if the job was submitted. False otherwise.
    """
    if not SshClient.check_ssh_client(ssh_client, logger):
        return False

    # A shell script is generated when none was provided, and always for
    # Singularity (containerized) jobs.
    if 'script' not in job_settings or is_singularity:
        if is_singularity:
            script_content = self._build_container_script(
                name, job_settings, logger)
        else:
            script_content = self._build_script(name, job_settings, logger)

        if script_content is None:
            return False

        if not self._create_shell_script(ssh_client,
                                         name + ".script",
                                         script_content,
                                         logger,
                                         workdir=workdir):
            return False

        # @TODO: use more general type names (e.g., BATCH/INLINE, etc)
        settings = {"script": name + ".script"}
        # Carry over the optional scheduling keys from the user settings.
        for option in ('arguments', 'scale', 'scale_max_in_parallel'):
            if option in job_settings:
                settings[option] = job_settings[option]
    else:
        settings = job_settings

    # build the call to submit the job
    response = self._build_job_submission_call(name, settings)
    if 'error' in response:
        logger.error("Couldn't build the call to send the job: " +
                     response['error'])
        return False

    # map the scale environment variables before submitting, if any
    if 'scale_env_mapping_call' in response:
        scale_env_mapping_call = response['scale_env_mapping_call']
        output, exit_code = ssh_client.execute_shell_command(
            scale_env_mapping_call,
            workdir=workdir,
            wait_result=True)
        if exit_code != 0:
            logger.error("Scale env vars mapping '" +
                         scale_env_mapping_call +
                         "' failed with code " +
                         str(exit_code) + ":\n" + output)
            return False

    # submit the job
    call = response['call']
    output, exit_code = ssh_client.execute_shell_command(call,
                                                         env=context,
                                                         workdir=workdir,
                                                         wait_result=True)
    if exit_code != 0:
        logger.error("Job submission '" + call + "' exited with code " +
                     str(exit_code) + ":\n" + output)
        return False
    return True
def submit_job(self, ssh_client, name, job_settings, is_singularity,
               logger, workdir=None, context=None):
    """
    Sends a job to the HPC

    @type ssh_client: SshClient
    @param ssh_client: ssh client connected to an HPC login node
    @type name: string
    @param name: name of the job
    @type job_settings: dictionary
    @param job_settings: dictionary with the job options
    @type is_singularity: bool
    @param is_singularity: True if the job is in a container
    @type logger: logger
    @param logger: Logger object to print log messages
    @type workdir: string
    @param workdir: Path of the working directory of the job
    @type context: dictionary of strings
    @param context: Dictionary containing context env vars
    @rtype bool
    @return True if the job was submitted. False otherwise.
    """
    if not SshClient.check_ssh_client(ssh_client, logger):
        return False

    if is_singularity:
        # generate script content for singularity
        script_content = self._build_container_script(
            name, job_settings, logger)
        if script_content is None:
            return False

        if not self._create_shell_script(ssh_client,
                                         name + ".script",
                                         script_content,
                                         logger,
                                         workdir=workdir):
            return False

        # @TODO: use more general type names (e.g., BATCH/INLINE, etc)
        settings = {"type": "SBATCH",
                    "command": name + ".script"}
        if 'scale' in job_settings:
            settings['scale'] = job_settings['scale']
            if 'scale_max_in_parallel' in job_settings:
                settings['scale_max_in_parallel'] = \
                    job_settings['scale_max_in_parallel']
    else:
        settings = job_settings

    # build the call to submit the job
    response = self._build_job_submission_call(name, settings, logger)
    if 'error' in response:
        logger.error("Couldn't build the call to send the job: " +
                     response['error'])
        return False

    # prepare the scale env variables
    if 'scale_env_mapping_call' in response:
        scale_env_mapping_call = response['scale_env_mapping_call']
        output, exit_code = ssh_client.execute_shell_command(
            scale_env_mapping_call,
            workdir=workdir,
            wait_result=True)
        if exit_code != 0:
            logger.error("Scale env vars mapping '" +
                         scale_env_mapping_call +
                         "' failed with code " +
                         str(exit_code) + ":\n" + output)
            return False

    # submit the job
    call = response['call']
    if settings['type'] == 'SPARK':
        # Asynchronous (fire-and-forget) submission: with
        # wait_result=False execute_shell_command reports success as a
        # boolean rather than an (output, exit_code) pair.
        # BUGFIX: `output` was previously never assigned in this branch,
        # so the failure log below raised NameError.
        output = ""
        result = ssh_client.execute_shell_command(call,
                                                  env=context,
                                                  workdir=workdir,
                                                  wait_result=False)
        if result is True:
            exit_code = 0
        elif result is False:
            # BUGFIX: False == 0 in Python, so a failed submission
            # previously slipped past the `exit_code != 0` check and was
            # silently treated as success. Map it to a real error code.
            exit_code = 1
        else:
            exit_code = result
        logger.debug("Job execution with exit code : " + str(exit_code))
        # Give the asynchronous Spark submission time to start before
        # any follow-up operation queries its state.
        import time
        time.sleep(30)
    else:
        output, exit_code = ssh_client.execute_shell_command(
            call, env=context, workdir=workdir, wait_result=True)

    if exit_code != 0:
        logger.error("Job submission '" + call + "' exited with code " +
                     str(exit_code) + ":\n" + output)
        return False

    # Job is successfully submitted.
    # @TODO: for SPARK jobs, parse the submission output to obtain the
    # framework ID so the job can be managed later.
    return True