def _get_states_detailed(self, job_names):
    """
    Get job states by job names

    This function uses the `qstat` command to query PBSPro. Please don't
    launch this call very frequently. Polling it frequently, especially
    across all users on the cluster, will slow down response times and may
    bring scheduling to a crawl.

    It allows a precise mapping of PBSPro states to Slurm states by taking
    `exit_code` into account. Unlike `get_states_tabular` it parses the
    output on the host and uses several SSH commands.
    """
    # identify job ids
    # Read environment, required by some HPC (e.g. HLRS Hawk)
    read_environment = "source /etc/profile > /dev/null 2>&1; "
    call = read_environment + "echo {} | xargs -n 1 qselect -x -N".format(
        shlex_quote(' '.join(map(shlex_quote, job_names))))

    client = SshClient(self.credentials)

    output, exit_code = client.execute_shell_command(call,
                                                     workdir=self.workdir,
                                                     wait_result=True)

    job_ids = Pbspro._parse_qselect(output)
    if not job_ids:
        return {}, {}

    # get detailed information about jobs
    call = read_environment + "qstat -x -f {}".format(
        ' '.join(map(str, job_ids)))

    output, exit_code = client.execute_shell_command(call,
                                                     workdir=self.workdir,
                                                     wait_result=True)
    client.close_connection()

    try:
        job_states, audits = Pbspro._parse_qstat_detailed(output)
    except SyntaxError as e:
        self.logger.warning(
            "cannot parse state response for job ids=[{}]".format(
                ','.join(map(str, job_ids))))
        self.logger.warning(
            "{err}\n`qstat -x -f` output to parse:\n\\[\n{text}\n\\]".format(
                err=str(e), text=output))
        # TODO: think whether error ignoring is better
        # for the correct lifecycle
        raise e

    return job_states, audits

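# A minimal sketch of the `_parse_qselect` helper used above, assuming that
# `qselect -x` prints one full job id per line in the form "<number>.<server>"
# (the actual helper is not shown in this section and may differ):
@staticmethod
def _parse_qselect(qselect_output):
    """Parse `qselect` output into a list of numeric job ids."""
    jobs = qselect_output.splitlines()
    if not jobs or (len(jobs) == 1 and jobs[0] == ''):
        return []
    # keep only the sequence number, dropping the server suffix
    return [int(job.split('.')[0]) for job in jobs]
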
def get_states(self, credentials, job_names):
    monitor_start_time_str = start_time_tostr(self.monitor_start_time,
                                              self.timezone)
    call = "sacct -n -o JobName,State -X -P --name=" + \
        ','.join(job_names) + " -S " + monitor_start_time_str

    client = SshClient(credentials)

    output, exit_code = client.execute_shell_command(call,
                                                     workdir=self.workdir,
                                                     wait_result=True)
    states = {}
    if exit_code == 0:
        states = _parse_states(output, self.logger)
    else:
        self.logger.error("Failed to get job states: " + output)

    # Get job execution audits for monitoring metrics
    audits = {}
    for name in job_names:
        if name in states:
            if states[name] != 'PENDING':
                audits[name] = get_job_metrics(name, client, self.workdir,
                                               monitor_start_time_str,
                                               self.logger)
        else:
            self.logger.warning("Could not parse the state of job: " + name +
                                ". Parsed dict: " + str(states))

    client.close_connection()

    return states, audits

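# A minimal sketch of the `_parse_states` helper assumed above: with `-P`,
# sacct prints pipe-delimited "JobName|State" rows (one per job, thanks to
# -X), and qualified states such as "CANCELLED by 1234" are reduced to their
# first token. The real helper may differ.
def _parse_states(sacct_output, logger):
    states = {}
    for line in sacct_output.splitlines():
        if not line.strip():
            continue
        try:
            name, state = line.split('|', 1)
        except ValueError:
            logger.warning("Unparsable sacct line: " + line)
            continue
        states[name] = state.split()[0]
    return states
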
def _upload_data(self):
    if not self.dataset_info['package_id']:
        self._create_dataset()
    ssh_credentials = self.from_infra['credentials']
    filepath = self.dt_config['from_source']['filepath']
    workdir = self.from_infra['workdir']

    action = 'update' if self._resource_exists() else 'create'
    command = 'curl {0}/api/action/resource_{1}'.format(self.endpoint, action)
    command += ' --form upload=@{0}'.format(filepath)
    command += ' --form package_id={0}'.format(self.dataset_info['package_id'])
    for arg in self.ckan_resource:
        if self.ckan_resource[arg]:
            command += ' --form {0}={1}'.format(arg, self.ckan_resource[arg])
    if self.apikey:
        command += " -H 'Authorization: {0}'".format(self.apikey)

    ssh_client = SshClient(ssh_credentials)
    exit_msg, exit_code = ssh_client.execute_shell_command(
        command, workdir, wait_result=True)
    if exit_code != 0:
        self.logger.error(
            'There was a problem publishing the results in CKAN ({0}):\n{1}'
            .format(exit_code, exit_msg))
    else:
        self.logger.info('Data published in CKAN')

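# For reference, the command composed above takes roughly this shape
# (endpoint, file and form values are illustrative, not real defaults):
#   curl https://ckan.example.org/api/action/resource_create \
#       --form upload=@results.csv \
#       --form package_id=<package_id> \
#       --form name=results \
#       -H 'Authorization: <apikey>'
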
def configure_execution(config,
                        credentials,
                        base_dir,
                        workdir_prefix,
                        simulate,
                        **kwargs):  # pylint: disable=W0613
    """ Creates the working directory for the execution """
    ctx.logger.info('Connecting to infrastructure interface..')
    if not simulate:
        if 'infrastructure_interface' not in config:
            raise NonRecoverableError(
                "'infrastructure_interface' key missing on config")
        interface_type = config['infrastructure_interface']
        ctx.logger.info(' - manager: {interface_type}'.format(
            interface_type=interface_type))

        wm = InfrastructureInterface.factory(interface_type)
        if not wm:
            raise NonRecoverableError(
                "Infrastructure Interface '" + interface_type +
                "' not supported.")

        if 'credentials' in ctx.instance.runtime_properties:
            credentials = ctx.instance.runtime_properties['credentials']
        try:
            client = SshClient(credentials)
        except Exception as exp:
            raise NonRecoverableError(
                "Failed trying to connect to infrastructure interface: " +
                str(exp))

        # TODO: use command according to wm
        _, exit_code = client.execute_shell_command('uname', wait_result=True)

        if exit_code != 0:
            client.close_connection()
            raise NonRecoverableError(
                "Failed executing on the infrastructure: exit code " +
                str(exit_code))

        ctx.instance.runtime_properties['login'] = exit_code == 0

        prefix = workdir_prefix
        if workdir_prefix == "":
            prefix = ctx.blueprint.id

        workdir = wm.create_new_workdir(client, base_dir, prefix, ctx.logger)
        client.close_connection()
        if workdir is None:
            raise NonRecoverableError(
                "failed to create the working directory, base dir: " +
                base_dir)
        ctx.instance.runtime_properties['workdir'] = workdir
        ctx.logger.info('..infrastructure ready to be used on ' + workdir)
    else:
        ctx.logger.info(' - [simulation]..')
        ctx.instance.runtime_properties['login'] = True
        ctx.instance.runtime_properties['workdir'] = "simulation"
        ctx.logger.warning('Infrastructure Interface connection simulated')

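# Illustrative shape of the inputs consumed above (the interface name and the
# base directory are only examples; whatever InfrastructureInterface.factory
# recognizes applies):
#   config = {'infrastructure_interface': 'SLURM'}
#   workdir_prefix = ''   # empty string falls back to ctx.blueprint.id
#   base_dir = '$HOME'    # hypothetical base directory on the frontend
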
def deploy_job(script,
               inputs,
               credentials,
               wm_type,
               workdir,
               name,
               logger,
               skip_cleanup):  # pylint: disable=W0613
    """ Exec a deployment job script that receives SSH credentials as input """
    wm = WorkloadManager.factory(wm_type)
    if not wm:
        raise NonRecoverableError("Workload Manager '" + wm_type +
                                  "' not supported.")

    # Execute the script and manage the output
    success = False
    client = SshClient(credentials)
    if wm._create_shell_script(client,
                               name,
                               ctx.get_resource(script),
                               logger,
                               workdir=workdir):
        call = "./" + name
        for dinput in inputs:
            str_input = str(dinput)
            if ('\n' in str_input or ' ' in str_input) and \
                    str_input[0] != '"':
                call += ' "' + str_input + '"'
            else:
                call += ' ' + str_input
        _, exit_code = client.execute_shell_command(call,
                                                    workdir=workdir,
                                                    wait_result=True)
        if exit_code != 0:
            logger.warning("failed to deploy job: call '" + call +
                           "', exit code " + str(exit_code))
        else:
            success = True

        if not skip_cleanup:
            if not client.execute_shell_command("rm " + name,
                                                workdir=workdir):
                logger.warning("failed removing bootstrap script")

    client.close_connection()

    return success

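# Illustration of the quoting rule above (hypothetical values): with
# inputs = ['plain', 'two words', '"pre-quoted"'] and name = 'bootstrap.sh',
# the call becomes
#   ./bootstrap.sh plain "two words" "pre-quoted"
# i.e. only inputs containing spaces or newlines, and not already starting
# with a double quote, get wrapped in quotes.
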
def cleanup_execution(config,
                      credentials,
                      skip,
                      simulate,
                      **kwargs):  # pylint: disable=W0613
    """ Cleans execution working directory """
    if skip:
        return

    ctx.logger.info('Cleaning up...')
    if not simulate:
        workdir = ctx.instance.runtime_properties['workdir']
        wm_type = config['workload_manager']
        wm = WorkloadManager.factory(wm_type)
        if not wm:
            raise NonRecoverableError("Workload Manager '" + wm_type +
                                      "' not supported.")

        if 'credentials' in ctx.instance.runtime_properties:
            credentials = ctx.instance.runtime_properties['credentials']
        client = SshClient(credentials)
        client.execute_shell_command('rm -r ' + workdir, wait_result=True)
        client.close_connection()
        ctx.logger.info('..all clean.')
    else:
        ctx.logger.warning('clean up simulated.')

def get_states(self, workdir, credentials, job_names, logger):
    # TODO: set the start time of the query
    # (sacct only checks the current day)
    call = "cat croupier-monitor.data"

    client = SshClient(credentials)

    output, exit_code = client.execute_shell_command(
        call, workdir=workdir, wait_result=True)

    client.close_connection()

    states = {}
    if exit_code == 0:
        states = self._parse_states(output, logger)

    return states

def get_states(self, credentials, job_names):
    call = "cat croupier-monitor.dat"

    client = SshClient(credentials)

    output, exit_code = client.execute_shell_command(call,
                                                     workdir=self.workdir,
                                                     wait_result=True)

    client.close_connection()

    states = {}
    audits = {}
    if exit_code == 0:
        states = self._parse_states(output)
    for job_name in job_names:
        audits[job_name] = {}

    return states, audits

def get_states(self, workdir, credentials, job_names, logger):
    # TODO: set the start time of the query
    # (sacct only checks the current day)
    call = "sacct -n -o JobName,State -X -P --name=" + ','.join(job_names)

    client = SshClient(credentials)

    output, exit_code = client.execute_shell_command(call,
                                                     workdir=workdir,
                                                     wait_result=True)

    client.close_connection()

    states = {}
    if exit_code == 0:
        states = self._parse_states(output, logger)
    else:
        logger.warning("Failed to get states")

    return states

def get_states(self, workdir, credentials, job_names, logger):
    states = {}
    frameinfo = getframeinfo(currentframe())
    logger.debug("{2}: {0} - {1}".format(frameinfo.filename,
                                         frameinfo.lineno,
                                         frameinfo.function))
    call = "curl http://{0}:`cat /security/secrets/{0}.mesos" + \
           "`@localhost:5050/frameworks"

    for i in range(5):
        try:
            client = SshClient(credentials)
            user = client._user
        except AuthenticationException as ae:
            logger.debug(ae)
            import time
            time.sleep(5)
            continue
        call_format = call.format(user)
        logger.debug("{2}: call_fmt: {0}, usr: {1}".format(
            call_format, user, frameinfo.function))
        output, exit_code = client.execute_shell_command(call_format,
                                                         workdir=workdir,
                                                         wait_result=True)
        if exit_code == 0:
            json_output = json.loads(output)
            states = self._parse_frameworks_states(json_output,
                                                   job_names[0],
                                                   logger)
        else:
            logger.warning("failed to get states from {0}".format(
                call_format))
        logger.debug("{0}: job_state:{1}".format(frameinfo.function, states))
        client.close_connection()
        return states

    # all connection attempts failed: return the empty state dict
    return states

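# A hedged sketch of the `_parse_frameworks_states` helper assumed above. It
# relies on the standard Mesos master /frameworks JSON layout (top-level
# "frameworks"/"completed_frameworks" lists, each framework carrying "name"
# and task entries with a "state" such as TASK_RUNNING or TASK_FINISHED) and
# on the assumption that each Croupier job maps to a framework of the same
# name; the real helper and its state mapping may differ.
def _parse_frameworks_states(self, frameworks_json, job_name, logger):
    states = {}
    frameworks = frameworks_json.get('frameworks', []) + \
        frameworks_json.get('completed_frameworks', [])
    for framework in frameworks:
        if framework.get('name') != job_name:
            continue
        tasks = framework.get('tasks', []) + \
            framework.get('completed_tasks', [])
        if tasks:
            # report the state of the most recent task for this framework
            states[job_name] = tasks[-1].get('state')
        logger.debug("tasks for framework {0}: {1}".format(job_name, tasks))
    return states
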
def process_http_transfer(self):
    try:
        ctx.logger.info(
            'Processing http data transfer from source {} to target {}'.format(
                self.dt_config['from_source']['name'],
                self.dt_config['to_target']['name']))
        # Copy source data into target data by invoking a wget command at the
        # target data infrastructure:
        # - Create wget command
        # - Invoke command in target infrastructure

        # Source DS
        resource = self.dt_config['from_source']['resource']
        endpoint = self.dt_config['from_source']['located_at']['endpoint']
        url = resource if resource.startswith('http') else \
            '{endpoint}/{resource}'.format(
                endpoint=endpoint[:-1] if endpoint.endswith('/') else endpoint,
                resource=resource[1:] if resource.startswith('/') else resource)

        # Target DS
        to_target_type = self.dt_config['to_target']['type']
        to_target_data_url = None
        if 'FileDataSource' in to_target_type:
            to_target_data_url = self.dt_config['to_target']['filepath']
        workdir = self.dt_config['to_target']['located_at']['workdir']
        to_target_infra_credentials = self.dt_config['to_target'][
            'located_at']['credentials']

        target_is_file = isFile(to_target_data_url)

        # Specifying target to copy using wget
        if target_is_file:
            wget_command = 'wget {url} -O {ds_target}'.format(
                url=url, ds_target=to_target_data_url)
            curl_command = 'curl {url} -o {ds_target}'.format(
                url=url, ds_target=to_target_data_url)
        else:
            wget_command = 'wget {url} -P {ds_target}'.format(
                url=url, ds_target=to_target_data_url)
            curl_command = 'cd {ds_target} && curl -O {url}'.format(
                url=url, ds_target=to_target_data_url)

        source_credentials = self.dt_config['from_source']['located_at'][
            'credentials']
        if 'user' in source_credentials and 'password' in source_credentials \
                and source_credentials['user'] and source_credentials['password']:
            user = source_credentials['user']
            password = source_credentials['password']
            wget_command += ' --user {0} --password {1}'.format(user, password)
            curl_command += ' -u {0}:{1}'.format(user, password)
        elif 'auth-header' in source_credentials and \
                source_credentials['auth-header']:
            auth_header = ' --header \'' + \
                source_credentials['auth-header-label'] + ': ' + \
                source_credentials['auth-header'] + '\''
            wget_command += auth_header
            curl_command += auth_header

        ssh_client = SshClient(to_target_infra_credentials)

        # Execute data transfer command
        exit_msg, exit_code = ssh_client.execute_shell_command(
            wget_command, workdir=workdir, wait_result=True)

        if exit_code != 0:
            error_msg = 'Could not download using wget, trying with curl ' \
                        '(exit code: {0}, error: {1})\n'.format(
                            str(exit_code), exit_msg)
            ctx.logger.warning(error_msg)
            exit_msg, exit_code = ssh_client.execute_shell_command(
                curl_command, workdir=workdir, wait_result=True)
            if exit_code != 0:
                error_msg = 'Could not download using curl ' \
                            '(exit code: {0}, error: {1})\n'.format(
                                str(exit_code), exit_msg)
                raise CommandExecutionError(error_msg)
            else:
                ctx.logger.info("Data downloaded successfully with curl")
        else:
            ctx.logger.info("Data downloaded successfully with wget")
    except Exception as exp:
        ctx.logger.error(
            "There was a problem executing the data transfer: " + str(exp))
        raise
    finally:
        if 'ssh_client' in locals():
            ssh_client.close_connection()

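# For reference, with header-based authentication the two fallback commands
# composed above look roughly like this (host, paths and token illustrative):
#   wget https://data.example.org/results.csv -O /scratch/results.csv \
#       --header 'X-Auth-Token: <token>'
#   curl https://data.example.org/results.csv -o /scratch/results.csv \
#       --header 'X-Auth-Token: <token>'
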
def process_rsync_transfer(self, rsync_source_to_target):
    ssh_client = None
    ftp_client = None
    try:
        ctx.logger.info('Processing rsync data transfer')
        # Copy source data into target data by invoking an rsync command at
        # the target data infrastructure. Build the rsync command according to
        # the credentials available for the target data infrastructure:
        # - user/password credentials:
        #   rsync -ratlz --rsh="/usr/bin/sshpass -p <passwd> ssh
        #     -o StrictHostKeyChecking=no -o IdentitiesOnly=yes -l <user>"
        #     <source files to copy> <HPC remote server>:<target folder>
        # - user/key credentials:
        #   rsync -ratlz -e "ssh -o IdentitiesOnly=yes -i <key_file>"
        #     <files to copy> <user>@<HPC remote server>:<target folder>
        #   The key is copied into a temporary file that is destroyed after
        #   usage (or failure).
        # Finally, invoke the command on the target infrastructure.
        dt_command = None

        # Source DS
        from_source_type = self.dt_config['from_source']['type']
        from_source_data_url = None
        if 'FileDataSource' in from_source_type:
            from_source_data_url = self.dt_config['from_source']['filepath']
        from_source_infra_endpoint = self.dt_config['from_source'][
            'located_at']['endpoint']
        from_source_infra_credentials = self.dt_config['from_source'][
            'located_at']['credentials']

        # Target DS
        to_target_type = self.dt_config['to_target']['type']
        to_target_data_url = None
        if 'FileDataSource' in to_target_type:
            to_target_data_url = self.dt_config['to_target']['filepath']
        to_target_infra_endpoint = self.dt_config['to_target'][
            'located_at']['endpoint']
        to_target_infra_credentials = self.dt_config['to_target'][
            'located_at']['credentials']

        if rsync_source_to_target:
            credentials = from_source_infra_credentials
        else:
            credentials = to_target_infra_credentials

        ssh_client = SshClient(credentials)
        ftp_client = SFtpClient(credentials)

        if rsync_source_to_target:
            if "user" in to_target_infra_credentials and \
                    "password" in to_target_infra_credentials:
                # NOTE rsync authentication with username/password requires
                # sshpass, which is not installed on some HPC frontends
                target_username = to_target_infra_credentials['user']
                target_password = to_target_infra_credentials['password']
                dt_command = \
                    'rsync -ratlz --rsh="/usr/bin/sshpass -p {password} ssh ' \
                    '-o StrictHostKeyChecking=no -o IdentitiesOnly=yes ' \
                    '-l {username}" {ds_source} ' \
                    '{target_endpoint}:{ds_target}'.format(
                        username=target_username,
                        password=target_password,
                        target_endpoint=to_target_infra_endpoint,
                        ds_source=from_source_data_url,
                        ds_target=to_target_data_url
                    )
            elif "user" in to_target_infra_credentials and \
                    "private_key" in to_target_infra_credentials:
                target_username = to_target_infra_credentials['user']
                target_key = to_target_infra_credentials['private_key']
                # Save key in temporary file
                with tempfile.NamedTemporaryFile() as key_file:
                    key_file.write(bytes(target_key, 'utf-8'))
                    key_file.flush()
                    key_filepath = key_file.name
                    target_key_filepath = key_file.name.split('/')[-1]
                    # Transfer key_file
                    ftp_client.sendKeyFile(ssh_client, key_filepath,
                                           target_key_filepath)
                dt_command = \
                    'rsync -ratlz -e "ssh -o IdentitiesOnly=yes ' \
                    '-o StrictHostKeyChecking=no -i ~/{key_file}" ' \
                    '{ds_source} ' \
                    '{username}@{target_endpoint}:{ds_target}'.format(
                        username=target_username,
                        key_file=target_key_filepath,
                        target_endpoint=to_target_infra_endpoint,
                        ds_source=from_source_data_url,
                        ds_target=to_target_data_url
                    )
        else:
            if "user" in from_source_infra_credentials and \
                    "password" in from_source_infra_credentials:
                # NOTE rsync authentication with username/password requires
                # sshpass, which is not installed on some HPC frontends
                source_username = from_source_infra_credentials['user']
                source_password = from_source_infra_credentials['password']
                dt_command = \
                    'rsync -ratlz --rsh="/usr/bin/sshpass -p {password} ssh ' \
                    '-o StrictHostKeyChecking=no -o IdentitiesOnly=yes ' \
                    '-l {username}" {source_endpoint}:{ds_source} ' \
                    '{ds_target}'.format(
                        username=source_username,
                        password=source_password,
                        source_endpoint=from_source_infra_endpoint,
                        ds_source=from_source_data_url,
                        ds_target=to_target_data_url
                    )
            elif "user" in from_source_infra_credentials and \
                    "private_key" in from_source_infra_credentials:
                source_username = from_source_infra_credentials['user']
                source_key = from_source_infra_credentials['private_key']
                # Save key in temporary file
                with tempfile.NamedTemporaryFile() as key_file:
                    key_file.write(bytes(source_key, 'utf-8'))
                    key_file.flush()
                    key_filepath = key_file.name
                    source_key_filepath = key_file.name.split('/')[-1]
                    # Transfer key_file
                    ftp_client.sendKeyFile(ssh_client, key_filepath,
                                           source_key_filepath)
                dt_command = \
                    'rsync -ratlz -e "ssh -o IdentitiesOnly=yes ' \
                    '-o StrictHostKeyChecking=no -i ~/{key_file}" ' \
                    '{username}@{source_endpoint}:{ds_source} ' \
                    '{ds_target}'.format(
                        username=source_username,
                        key_file=source_key_filepath,
                        source_endpoint=from_source_infra_endpoint,
                        ds_source=from_source_data_url,
                        ds_target=to_target_data_url
                    )

        # Execute data transfer command
        ctx.logger.info(
            'rsync data transfer: executing command: {}'.format(dt_command))
        exit_msg, exit_code = ssh_client.execute_shell_command(
            dt_command, wait_result=True)
        if exit_code != 0:
            raise CommandExecutionError(
                "Failed executing rsync data transfer: exit code " +
                str(exit_code) + " and msg: " + exit_msg)
    except Exception as exp:
        raise CommandExecutionError(
            "Failed trying to connect to data source infrastructure: " +
            str(exp))
    finally:
        if ftp_client is not None:
            ftp_client.close_connection()
        if ssh_client is not None:
            ssh_client.close_connection()

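# For reference, the key-based pull variant composed above expands to
# something like this (user, host and paths are illustrative):
#   rsync -ratlz -e "ssh -o IdentitiesOnly=yes -o StrictHostKeyChecking=no \
#       -i ~/tmp_key" jdoe@hpc.example.org:/scratch/jdoe/out/ /local/data/out/
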