# Standard Cloudify plugin-API imports; the SshClient and WorkloadManager
# module paths are plugin-internal and assumed here.
from cloudify import ctx
from cloudify.exceptions import NonRecoverableError

from ssh import SshClient
from workload_managers.workload_manager import WorkloadManager


def ssh():
    """ Yields a connected SSH client and closes it on teardown """
    # Written as a generator so it can be registered as a pytest-style
    # fixture: the consumer receives the client, and the code after
    # ``yield`` runs once the consumer is done. ``load_config`` is assumed
    # to be a test helper that parses the JSON file.
    ssh_config = load_config("credentials.json")["ssh"]
    client = SshClient(hostname=ssh_config["hostname"],
                       username=ssh_config["username"],
                       password=ssh_config["password"],
                       port=ssh_config["port"])
    yield client
    # Teardown: release the SSH connection
    client.destroy()

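# A minimal sketch of the "credentials.json" layout the fixture above
# assumes; the keys mirror the lookups in ssh(), the values are
# placeholders:
#
#     {
#         "ssh": {
#             "hostname": "login.example-hpc.org",
#             "username": "testuser",
#             "password": "secret",
#             "port": 22
#         }
#     }
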
def stop_job(job_options, **kwargs):  # pylint: disable=W0613
    """ Stops a job in the HPC """
    try:
        simulate = ctx.instance.runtime_properties['simulate']
        credentials = ctx.instance.runtime_properties['credentials']
        name = kwargs['name']
        is_singularity = 'hpc.nodes.singularity_job' in ctx.node.\
            type_hierarchy

        if not simulate:
            workdir = ctx.instance.runtime_properties['workdir']
            wm_type = ctx.instance.runtime_properties['workload_manager']
            client = SshClient(credentials['host'],
                               credentials['user'],
                               credentials['password'])

            # TODO(emepetres): manage errors
            wm = WorkloadManager.factory(wm_type)
            if not wm:
                raise NonRecoverableError(
                    "Workload Manager '" + wm_type + "' not supported.")
            is_stopped = wm.stop_job(client,
                                     name,
                                     job_options,
                                     is_singularity,
                                     ctx.logger,
                                     workdir=workdir)
            client.close_connection()
        else:
            ctx.logger.warning('Instance ' + ctx.instance.id + ' simulated')
            is_stopped = True

        if is_stopped:
            ctx.logger.info(
                'Job ' + name + ' (' + ctx.instance.id + ') stopped.')
        else:
            ctx.logger.error(
                'Job ' + name + ' (' + ctx.instance.id + ') not stopped.')
            raise NonRecoverableError(
                'Job ' + name + ' (' + ctx.instance.id + ') not stopped.')
    except KeyError:
        # The job wasn't configured properly, so there is nothing to stop
        ctx.logger.warning('Job was not stopped as it was not configured.')

def cleanup_job(job_options, skip, **kwargs):  # pylint: disable=W0613
    """ Cleans the aux files of the job in the HPC """
    if skip:
        return

    try:
        simulate = ctx.instance.runtime_properties['simulate']
        name = kwargs['name']
        if not simulate:
            is_singularity = 'hpc.nodes.singularity_job' in ctx.node.\
                type_hierarchy
            credentials = ctx.instance.runtime_properties['credentials']
            workdir = ctx.instance.runtime_properties['workdir']
            wm_type = ctx.instance.runtime_properties['workload_manager']
            client = SshClient(credentials['host'],
                               credentials['user'],
                               credentials['password'])

            # TODO(emepetres): manage errors
            wm = WorkloadManager.factory(wm_type)
            if not wm:
                raise NonRecoverableError(
                    "Workload Manager '" + wm_type + "' not supported.")
            is_clean = wm.clean_job_aux_files(client,
                                              name,
                                              job_options,
                                              is_singularity,
                                              ctx.logger,
                                              workdir=workdir)
            client.close_connection()
        else:
            ctx.logger.warning('Instance ' + ctx.instance.id + ' simulated')
            is_clean = True

        if is_clean:
            ctx.logger.info(
                'Job ' + name + ' (' + ctx.instance.id + ') cleaned.')
        else:
            ctx.logger.error(
                'Job ' + name + ' (' + ctx.instance.id + ') not cleaned.')
    except KeyError:
        # The job wasn't configured properly, so no cleanup is needed
        ctx.logger.warning('Job was not cleaned up as it was not configured.')

def send_job(job_options, **kwargs):  # pylint: disable=W0613
    """ Sends a job to the HPC """
    simulate = ctx.instance.runtime_properties['simulate']
    credentials = ctx.instance.runtime_properties['credentials']
    name = kwargs['name']
    is_singularity = 'hpc.nodes.singularity_job' in ctx.node.\
        type_hierarchy

    if not simulate:
        workdir = ctx.instance.runtime_properties['workdir']
        wm_type = ctx.instance.runtime_properties['workload_manager']
        client = SshClient(credentials['host'],
                           credentials['user'],
                           credentials['password'])

        wm = WorkloadManager.factory(wm_type)
        if not wm:
            raise NonRecoverableError(
                "Workload Manager '" + wm_type + "' not supported.")
        is_submitted = wm.submit_job(client,
                                     name,
                                     job_options,
                                     is_singularity,
                                     ctx.logger,
                                     workdir=workdir)
        client.close_connection()
    else:
        ctx.logger.warning('Instance ' + ctx.instance.id + ' simulated')
        is_submitted = True

    if is_submitted:
        ctx.logger.info('Job ' + name + ' (' + ctx.instance.id + ') sent.')
    else:
        ctx.logger.error(
            'Job ' + name + ' (' + ctx.instance.id + ') not sent.')
        raise NonRecoverableError(
            'Job ' + name + ' (' + ctx.instance.id + ') not sent.')

    ctx.instance.runtime_properties['job_name'] = name

def prepare_hpc(config,
                base_dir,
                workdir_prefix,
                simulate,
                **kwargs):  # pylint: disable=W0613
    """ Tries to connect to a login node """
    ctx.logger.info('Connecting to login node..')
    if not simulate:
        wm_type = config['workload_manager']
        wm = WorkloadManager.factory(wm_type)
        if not wm:
            raise NonRecoverableError(
                "Workload Manager '" + wm_type + "' not supported.")

        credentials = config['credentials']
        client = SshClient(credentials['host'],
                           credentials['user'],
                           credentials['password'])
        _, exit_code = wm._execute_shell_command(client,
                                                 'uname',
                                                 wait_result=True)
        if exit_code != 0:
            raise NonRecoverableError(
                "failed to connect to HPC: exit code " + str(exit_code))
        ctx.instance.runtime_properties['login'] = exit_code == 0

        prefix = workdir_prefix
        if workdir_prefix == "":
            prefix = ctx.blueprint.id

        workdir = wm.create_new_workdir(client, base_dir, prefix)
        client.close_connection()
        if workdir is None:
            raise NonRecoverableError(
                "failed to create the working directory, base dir: " +
                base_dir)
        ctx.instance.runtime_properties['workdir'] = workdir
        ctx.logger.info('..HPC ready on ' + workdir)
    else:
        ctx.instance.runtime_properties['login'] = True
        ctx.instance.runtime_properties['workdir'] = "simulation"
        ctx.logger.warning('HPC login connection simulated')

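# A minimal sketch of the ``config`` mapping prepare_hpc() and
# cleanup_hpc() expect, inferred from the lookups above; the workload
# manager name and the credential values are placeholders:
#
#     config = {
#         'workload_manager': 'SLURM',
#         'credentials': {
#             'host': 'login.example-hpc.org',
#             'user': 'testuser',
#             'password': 'secret',
#         },
#     }
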
def cleanup_hpc(config, skip, simulate, **kwargs):  # pylint: disable=W0613
    """ Removes the working directory created in the HPC """
    if skip:
        return

    ctx.logger.info('Cleaning up...')
    if not simulate:
        workdir = ctx.instance.runtime_properties['workdir']
        wm_type = config['workload_manager']
        wm = WorkloadManager.factory(wm_type)
        if not wm:
            raise NonRecoverableError(
                "Workload Manager '" + wm_type + "' not supported.")

        credentials = config['credentials']
        client = SshClient(credentials['host'],
                           credentials['user'],
                           credentials['password'])
        _, exit_code = wm._execute_shell_command(client,
                                                 'rm -r ' + workdir,
                                                 wait_result=True)
        client.close_connection()
        ctx.logger.info('..all clean.')
    else:
        ctx.logger.warning('HPC clean up simulated.')

def get_states(monitor_jobs, logger):
    """ Retrieves the status of every job by asking the monitors """
    states = {}

    for host, settings in monitor_jobs.items():
        if settings['type'] == "PROMETHEUS":  # external monitor
            partial_states = _get_prometheus(host,
                                             settings['config'],
                                             settings['names'])
        else:  # internal monitor (workload manager)
            wm = WorkloadManager.factory(settings['type'])
            if wm:
                credentials = settings['config']
                client = SshClient(credentials['host'],
                                   credentials['user'],
                                   credentials['password'])
                partial_states = wm.get_states(client,
                                               settings['names'],
                                               logger)
                client.close_connection()
            else:
                partial_states = _no_states(host,
                                            settings['type'],
                                            settings['names'],
                                            logger)
        states.update(partial_states)

    return states

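# A minimal sketch of the ``monitor_jobs`` mapping get_states() iterates
# over, inferred from the lookups above; hosts, job names and credential
# values are placeholders:
#
#     monitor_jobs = {
#         'hpc.example.org': {
#             'type': 'SLURM',  # or "PROMETHEUS" for an external monitor
#             'config': {'host': 'hpc.example.org',
#                        'user': 'testuser',
#                        'password': 'secret'},
#             'names': ['job_1', 'job_2'],
#         },
#     }
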
def deploy_job(script,
               inputs,
               credentials,
               wm_type,
               workdir,
               name,
               logger,
               skip_cleanup):  # pylint: disable=W0613
    """ Executes a deployment job script that receives SSH credentials
    as input """
    wm = WorkloadManager.factory(wm_type)
    if not wm:
        raise NonRecoverableError(
            "Workload Manager '" + wm_type + "' not supported.")

    # Execute the script and manage the output
    client = SshClient(credentials['host'],
                       credentials['user'],
                       credentials['password'])
    exit_code = -1  # reported if the script could not be created
    if wm._create_shell_script(client,
                               name,
                               ctx.get_resource(script),
                               logger,
                               workdir=workdir):
        call = "./" + name
        for dinput in inputs:
            call += ' ' + dinput
        _, exit_code = wm._execute_shell_command(client,
                                                 call,
                                                 workdir=workdir,
                                                 wait_result=True)
        if exit_code != 0:
            logger.warning("failed to deploy job: call '" + call +
                           "', exit code " + str(exit_code))

        if not skip_cleanup:
            if not wm._execute_shell_command(client,
                                             "rm " + name,
                                             workdir=workdir):
                logger.warning("failed removing bootstrap script")

    client.close_connection()

    return exit_code == 0
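
# A hedged usage sketch of deploy_job(); every value below is a
# placeholder, and ``ctx`` must be bound to a Cloudify operation context
# so that ctx.get_resource() can fetch the script:
#
#     ok = deploy_job(script='scripts/bootstrap.sh',
#                     inputs=['input_a', 'input_b'],
#                     credentials={'host': 'login.example-hpc.org',
#                                  'user': 'testuser',
#                                  'password': 'secret'},
#                     wm_type='SLURM',
#                     workdir='/home/testuser/workdir',
#                     name='bootstrap_job.sh',
#                     logger=ctx.logger,
#                     skip_cleanup=False)
#     # ``ok`` is True only when the script exited with code 0.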