def transfer_files(self):
    JobDeploymentBase.transfer_files(self)
    # Here we transfer any input files to the relevant directory on the
    # target platform.
    # Use SAGA-Python to handle the file transfer.
    LOG.debug('Transfer files...')
    job_dir = self.platform_config.storage_job_directory
    host = self.platform_config.platform_service_host
    try:
        directory = Directory('sftp://%s%s' % (host, job_dir),
                              session=self.session)
    except saga.BadParameter as e:
        LOG.error('The specified job directory does not exist on PBS '
                  'submission node <%s> (%s).' % (host, str(e)))
        raise JobError('The specified job directory does not exist on PBS '
                       'submission node <%s> (%s)' % (host, str(e)))
    try:
        # directory.make_dir() does not return a handle to the new
        # directory so we need to create the directory URL manually.
        directory.make_dir(self.job_config.job_id)
        job_data_dir = os.path.join(str(directory.url),
                                    self.job_config.job_id)
    except saga.NoSuccess as e:
        LOG.error('The specified job data directory already exists on '
                  'PBS submission node <%s> (%s).' % (host, str(e)))
        raise JobError('The specified job directory already exists on PBS '
                       'submission node <%s> (%s)' % (host, str(e)))

    # Now upload the file(s) to the job data directory and create an input
    # file list containing the resulting locations of the files.
    # There are some cases where jobs may not have input files (they may,
    # for example, pull the input files from a remote location as part of
    # the job process) so we first check whether there are any input files
    # to process; if not, we return from this function.
    if not self.job_config.input_files:
        LOG.debug('There are no input files to transfer for this job...')
        return

    self.transferred_input_files = []
    for f in self.job_config.input_files:
        try:
            f_obj = File('file://%s' % f, session=self.session)
            f_obj.copy(job_data_dir)
            dest_dir = os.path.join(directory.url.path,
                                    self.job_config.job_id)
            self.transferred_input_files.append(
                os.path.join(dest_dir, os.path.basename(f_obj.url.path)))
        except Exception:
            # Avoid a bare except so that exit signals are not swallowed;
            # any copy failure is re-raised as a JobError.
            LOG.error('Error copying the input file <%s> to the remote '
                      'platform.' % f)
            raise JobError('Error copying the input file <%s> to the '
                           'remote platform.' % f)
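# The deployers in this section all assume that self.session has already been
# populated with an SSH security context before transfer_files() is called.
# Below is a minimal sketch of how such a session might be built with
# SAGA-Python; the helper name and its parameters are illustrative
# assumptions, not part of this module.
import saga  # already imported at module level in the real code


def build_ssh_session(user_id, key_path):
    """Build a SAGA session holding a single SSH security context."""
    ctx = saga.Context('ssh')
    ctx.user_id = user_id    # remote account to connect as
    ctx.user_key = key_path  # path to the private key file
    session = saga.Session(default=False)  # start with no default contexts
    session.add_context(ctx)
    return session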
def collect_output(self, destination):
    # We're using the default implementation of the file transfer code.
    # This doesn't take into account a different port for the remote host
    # connection. To work around this, we temporarily set the host property
    # to include the port and then revert to the original value after the
    # file transfer is complete. The revert is done in a finally block so
    # the host is restored even if collection or deletion raises an error.
    host_tmp = self.host
    self.host = '%s:%s' % (self.host, self.port)
    try:
        # Using the base implementation of job output file collection...
        JobDeploymentBase.collect_output(self, destination)

        # If job_config delete_job_files is True, we can now delete the
        # job files on the remote platform.
        if self.job_config.delete_job_files:
            jobs_dir = self.platform_config.storage_job_directory
            # Open the job storage directory so that this job's
            # sub-directory can be removed.
            try:
                LOG.debug('URL for file job directory: sftp://%s%s'
                          % (self.host, jobs_dir))
                directory = Directory('sftp://%s%s' % (self.host, jobs_dir),
                                      session=self.session)
            except saga.BadParameter as e:
                LOG.error('The specified job directory does not exist on '
                          'resource <%s> (%s).' % (self.host, str(e)))
                raise JobError('The specified job directory does not exist '
                               'on resource <%s> (%s)'
                               % (self.host, str(e)))
            try:
                LOG.debug('Deleting job directory after job completion '
                          '<sftp://%s%s/%s>' % (self.host, jobs_dir,
                                                self.job_config.job_id))
                directory.remove(self.job_config.job_id, RECURSIVE)
            except saga.NoSuccess as e:
                LOG.error('The specified job data directory couldn\'t be '
                          'removed <%s> (%s).' % (self.job_config.job_id,
                                                  str(e)))
                raise JobError('The specified job data directory couldn\'t '
                               'be removed <%s> (%s)'
                               % (self.job_config.job_id, str(e)))
    finally:
        # Set the host value back to its original value
        self.host = host_tmp
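# The temporary host:port rewrite above could equally be expressed as a
# context manager, which makes the restore guarantee explicit. This is an
# illustrative sketch, not part of the deployer API.
from contextlib import contextmanager


@contextmanager
def host_with_port(deployer):
    """Temporarily rewrite deployer.host to include deployer.port."""
    original_host = deployer.host
    deployer.host = '%s:%s' % (deployer.host, deployer.port)
    try:
        yield deployer.host
    finally:
        # Restored even if the body raises.
        deployer.host = original_host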
def transfer_files(self):
    JobDeploymentBase.transfer_files(self)
    LOG.debug('SSH Deployer: Transfer files...')
    # Here we transfer any input files to the relevant directory on the
    # target platform.
    # Use SAGA-Python to handle the file transfer.
    job_dir = self.platform_config.storage_job_directory
    # Check that the job storage directory exists and then create a
    # sub-directory specifically for this job.
    try:
        LOG.debug('URL for file transfer: <sftp://%s:%s%s>'
                  % (self.host, self.port, job_dir))
        directory = Directory('sftp://%s:%s%s'
                              % (self.host, self.port, job_dir),
                              session=self.session)
    except saga.BadParameter as e:
        LOG.error('Error setting up connection to resource directory.')
        if 'connection refused' in str(e).lower():
            raise ConnectionError('Unable to connect to remote resource '
                                  'to set up connection to directory.')
        raise StorageDirectoryNotFoundError(
            'The specified job data base directory does not exist on '
            'resource <%s> (%s)' % (self.host, str(e)))
    try:
        # directory.make_dir() does not return a handle to the new
        # directory so we need to create the directory URL manually.
        directory.make_dir(self.job_config.job_id)
        job_data_dir = os.path.join(str(directory.url),
                                    self.job_config.job_id)
    except saga.NoSuccess as e:
        LOG.error('The specified job data directory already exists on '
                  'resource <%s> (%s).' % (self.host, str(e)))
        raise DirectoryExistsError('The specified job directory already '
                                   'exists on resource <%s> (%s)'
                                   % (self.host, str(e)))

    # Now upload the file(s) to the job data directory and create an input
    # file list containing the resulting locations of the files.
    # There are some cases where jobs may not have input files (they may,
    # for example, pull the input files from a remote location as part of
    # the job process) so we first check whether there are any input files
    # to process; if not, we return from this function.
    if not self.job_config.input_files:
        LOG.debug('There are no input files to transfer for this job...')
        return

    self.transferred_input_files = []
    for f in self.job_config.input_files:
        try:
            f_obj = File('file://%s' % f, session=self.session)
            f_obj.copy(job_data_dir)
            dest_dir = os.path.join(directory.url.path,
                                    self.job_config.job_id)
            self.transferred_input_files.append(
                os.path.join(dest_dir, os.path.basename(f_obj.url.path)))
        except Exception:
            # Avoid a bare except; re-raise copy failures as JobError.
            LOG.error('Error copying the input file <%s> to the remote '
                      'platform.' % f)
            raise JobError('Error copying the input file <%s> to the '
                           'remote platform.' % f)
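# ConnectionError, StorageDirectoryNotFoundError and DirectoryExistsError are
# imported from elsewhere in the package and their definitions are not shown
# in this section. A plausible minimal form, assuming they derive from
# JobError, would be:
class ConnectionError(JobError):
    """Raised when the remote resource refuses or drops the connection."""


class StorageDirectoryNotFoundError(JobError):
    """Raised when the configured job storage directory is missing."""


class DirectoryExistsError(JobError):
    """Raised when the per-job directory already exists on the resource."""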
def _setup_job_account(self, pty_conn, platform_config):
    user_id = platform_config.user_id
    user_home = platform_config.user_home
    public_key = platform_config.user_public_key
    admin_user = platform_config.image_unconfigured_admin_key_user

    # Creating the job user on the remote node
    LOG.debug('Creating job user account for user <%s> on remote node '
              '<%s>' % (user_id, pty_conn.url))

    # First check whether the user home directory already exists.
    # 'test -d' exits with 0 when the directory exists, so an exit code
    # of 0 means we cannot proceed.
    cmd = 'sudo test -d %s'
    result, out, err = pty_conn.run_sync(cmd % user_home)
    if result == 0:
        raise JobError('The specified user home directory <%s> for the '
                       'job user <%s> already exists. Unable to proceed '
                       'with resource configuration.'
                       % (user_home, user_id))

    cmd = 'useradd -d %s -m %s'
    if admin_user != 'root':
        cmd = 'sudo ' + cmd
    result, out, err = pty_conn.run_sync(cmd % (user_home, user_id))
    LOG.debug('useradd command completed - Exit code: <%s>, '
              'StdOut: <%s>, StdErr:\n<%s>' % (result, out, err))

    # Check whether the user home was created during user account
    # creation. If the account already existed, we may need to create the
    # directory here.
    try:
        home_dir = Directory(pty_conn.url + user_home,
                             session=pty_conn.session)
    except BadParameter:
        # Assume the home directory doesn't exist and create it here.
        rootdir = Directory(pty_conn.url + '/', session=pty_conn.session)
        rootdir.make_dir(user_home)
        home_dir = Directory(pty_conn.url + user_home,
                             session=pty_conn.session)
    try:
        home_dir.make_dir(os.path.join(user_home, '.ssh'))
    except saga.NoSuccess as e:
        if 'exists' in str(e):
            LOG.debug('Directory <%s> already exists...'
                      % os.path.join(user_home, '.ssh'))
        else:
            raise JobError('Unable to create the SSH directory in user '
                           'home <%s>...'
                           % os.path.join(user_home, '.ssh'))
    try:
        home_dir.make_dir(platform_config.storage_job_directory)
    except saga.NoSuccess as e:
        if 'exists' in str(e):
            LOG.debug('Job data directory <%s> already exists...'
                      % platform_config.storage_job_directory)
        else:
            raise JobError('Unable to create platform data directory '
                           '<%s>.' % platform_config.storage_job_directory)

    # Write the public key to the authorized keys file on the remote node
    pty_conn.write_to_remote(
        public_key, os.path.join(user_home, '.ssh', 'authorized_keys'))

    # Change ownership of all created directories/files to the job user
    pty_conn.run_sync('chown -R %s:%s %s' % (user_id, user_id, user_home))
def _wait_for_node_accessbility_saga(self, node_ip_list, user_id, key_file,
                                     port=22, retries=5,
                                     pre_check_delay=10):
    # Use SAGA to check whether the remote resources are accessible.
    attempts_made = 0
    connection_successful = False
    LOG.debug('Waiting <%s> seconds to check for resource accessibility.'
              % pre_check_delay)
    time.sleep(pre_check_delay)

    # Create an empty session with no contexts
    self.session = saga.Session(default=False)
    if self.admin_ctx:
        self.session.add_context(self.admin_ctx)
    else:
        self.session.add_context(self.job_ctx)

    # TODO: Shouldn't try other security contexts until we've tried one
    # context with all nodes; at present the connection fails because we
    # switch contexts before checking each node...
    while attempts_made < retries and not connection_successful:
        nodes_ok = []
        for ip in node_ip_list:
            try:
                LOG.debug('Attempt <%s> to connect to remote resource '
                          '<%s> using SAGA...' % (attempts_made + 1, ip))
                dir_obj = Directory('sftp://%s/' % ip,
                                    session=self.session)
                LOG.debug('Triggering connection to remote node by '
                          'attempting root dir list...')
                dir_obj.list()
                LOG.debug('Connected to remote node successfully...')
                dir_obj.close()
                LOG.debug('Closed connection to remote node...')
                nodes_ok.append(ip)
            except socket.timeout:
                LOG.debug('Timed out trying to connect to <%s>...' % ip)
            except OSError as e:
                LOG.debug('OSError trying to connect to <%s>: %s'
                          % (ip, str(e)))
            except NoSuccess as e:
                LOG.debug('NoSuccess making connection to resource '
                          '<%s>: %s' % (ip, str(e)))
            except BadParameter as e:
                LOG.debug('BadParameter making connection to resource '
                          '<%s>: %s' % (ip, str(e)))
            except AuthenticationFailed as e:
                LOG.debug('Authentication failure when making connection '
                          'to resource <%s>: %s' % (ip, str(e)))
                raise NoSuccess('No valid security context for '
                                'connection to resource <%s>.' % ip)

        # Remove nodes we reached; retry only the ones still unreachable.
        node_ip_list = [item for item in node_ip_list
                        if item not in nodes_ok]
        # If the node list is now empty, all nodes are reachable.
        if not node_ip_list:
            connection_successful = True
        attempts_made += 1
        if not connection_successful and attempts_made < retries:
            wait_time = 10 * attempts_made
            LOG.debug('Waiting <%s> seconds before retrying '
                      'connection...' % wait_time)
            time.sleep(wait_time)

    if not connection_successful:
        LOG.debug('ERROR: Unable to connect to remote node...')
    else:
        LOG.debug('**** SAGA CONNECTION TO REMOTE NODE(S) SUCCESSFUL ****')
    return connection_successful
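# Illustrative usage of the accessibility check above, assuming a deployer
# instance with admin_ctx/job_ctx already configured. The node addresses and
# key path are hypothetical placeholders.
nodes = ['203.0.113.10', '203.0.113.11']
if not deployer._wait_for_node_accessbility_saga(
        nodes, user_id='cloud-user', key_file='/home/user/.ssh/id_rsa',
        retries=5, pre_check_delay=10):
    raise JobError('Remote nodes did not become reachable in time.')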
def transfer_files(self):
    JobDeploymentBase.transfer_files(self)
    # Here we transfer any input files to the relevant directory on the
    # target platform.
    # Use SAGA-Python to handle the file transfer.
    LOG.debug('Transfer files...')
    job_dir = self.platform_config.storage_job_directory

    # At this point we need to switch back to using the job security
    # context. If we were using unconfigured resources, these will have
    # been configured using an admin context by now.
    self.session = saga.Session(default=False)
    self.session.add_context(self.job_ctx)

    # Begin by checking whether we're working with more than one instance;
    # if so, we have a master and one or more slave nodes. We'll push the
    # data to the master and then direct the master to distribute it to
    # the slave nodes.
    master_node = self.running_nodes[0][0]
    slave_nodes = []
    if len(self.running_nodes) > 1:
        slave_nodes = [node[0] for node in self.running_nodes[1:]]

    # On the master node: check that the job storage directory exists and
    # then create a sub-directory specifically for this job.
    # Each entry in running_nodes is a tuple of two items, the node object
    # and an IP list. For now we work with the node object directly.
    node_ip = master_node.public_ips[0]
    try:
        directory = Directory('sftp://%s%s' % (node_ip, job_dir),
                              session=self.session)
    except saga.BadParameter as e:
        # Unlike the other deployers, a missing directory is logged here
        # but not treated as fatal.
        LOG.error('The specified job directory does not exist on node '
                  '<%s> (%s).' % (node_ip, str(e)))
    try:
        # directory.make_dir() does not return a handle to the new
        # directory so we need to create the directory URL manually.
        directory.make_dir(self.job_config.job_id)
    except saga.NoSuccess as e:
        # An existing job directory is logged but not treated as fatal.
        LOG.warning('The specified job data directory already exists on '
                    'node <%s> (%s).' % (node_ip, str(e)))
    job_data_dir = os.path.join(str(directory.url), self.job_config.job_id)

    # Now upload the file(s) to the job data directory and create an input
    # file list containing the resulting locations of the files.
    # There are some cases where jobs may not have input files (they may,
    # for example, pull the input files from a remote location as part of
    # the job process) so we first check whether there are any input files
    # to process; if not, we return from this function.
    if not self.job_config.input_files:
        LOG.debug('There are no input files to transfer for this job...')
        return

    self.transferred_input_files = []
    for f in self.job_config.input_files:
        try:
            f_obj = File('file://%s' % f, session=self.session)
            f_obj.copy(job_data_dir)
            dest_dir = os.path.join(directory.url.path,
                                    self.job_config.job_id)
            self.transferred_input_files.append(
                os.path.join(dest_dir, os.path.basename(f_obj.url.path)))
        except Exception:
            # Avoid a bare except; re-raise copy failures as JobError.
            LOG.error('Error copying the input file <%s> to the remote '
                      'platform.' % f)
            raise JobError('Error copying the input file <%s> to the '
                           'remote platform.' % f)

    # At this point input files have been successfully transferred to
    # the master node. We now direct the master node to send the files
    # to each of the slave nodes:
    if slave_nodes:
        slave_private_ips = [node.private_ips[0] for node in slave_nodes]
        self._distribute_job_data(master_node.public_ips[0],
                                  slave_private_ips,
                                  self.platform_config.user_id,
                                  self.platform_config.user_key_file,
                                  job_dir, self.job_config.job_id)
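# _distribute_job_data is not shown in this section. A hypothetical sketch of
# what it might do - log in to the master node and run scp from there so the
# data moves to each slave over the private network - is given below. The
# function name suffix, the use of subprocess/ssh/scp and all parameters are
# assumptions for illustration only.
import os
import subprocess


def _distribute_job_data_sketch(master_ip, slave_ips, user_id, key_file,
                                job_dir, job_id):
    """Copy the job directory from the master node to each slave node."""
    src = os.path.join(job_dir, job_id)
    for slave_ip in slave_ips:
        # Run scp on the master itself; this assumes the job key is also
        # available on the master (or that agent forwarding is enabled).
        cmd = ['ssh', '-i', key_file, '%s@%s' % (user_id, master_ip),
               'scp -r %s %s@%s:%s' % (src, user_id, slave_ip, job_dir)]
        subprocess.check_call(cmd)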
def _setup_job_account(self, pty_conn, platform_config):
    user_id = platform_config.user_id
    user_home = platform_config.user_home
    public_key = platform_config.user_public_key
    admin_user = platform_config.image_unconfigured_admin_key_user

    # Creating the job user on the remote node
    LOG.debug('Creating job user account for user <%s> on remote node '
              '<%s>' % (user_id, pty_conn.url))

    # First check whether the user home directory already exists.
    # 'test -d' exits with 0 when the directory exists, so an exit code
    # of 0 means we cannot proceed.
    cmd = 'sudo test -d %s'
    result, out, err = pty_conn.run_sync(cmd % user_home)
    if result == 0:
        raise JobError('The specified user home directory <%s> for the '
                       'job user <%s> already exists. Unable to proceed '
                       'with resource configuration.'
                       % (user_home, user_id))

    cmd = 'useradd -d %s -m %s'
    if admin_user != 'root':
        cmd = 'sudo ' + cmd
    result, out, err = pty_conn.run_sync(cmd % (user_home, user_id))
    LOG.debug('useradd command completed - Exit code: <%s>, '
              'StdOut: <%s>, StdErr:\n<%s>' % (result, out, err))

    # Check whether the user home was created during user account
    # creation. If the account already existed, we may need to create the
    # directory here.
    try:
        home_dir = Directory(pty_conn.url + user_home,
                             session=pty_conn.session)
    except BadParameter:
        # Assume the home directory doesn't exist and create it here.
        cmd = 'mkdir -p %s' % user_home
        if pty_conn.session.contexts[0].user_id != 'root':
            cmd = 'sudo ' + cmd
        res, out, err = pty_conn.run_sync(cmd)
        LOG.debug('Make directory <%s> result <%s>, out <%s>, err <%s>'
                  % (user_home, res, out, err))
        if res != 0:
            raise JobError('Unable to create the user home directory '
                           '<%s>...' % user_home)
        home_dir = Directory(pty_conn.url + user_home,
                             session=pty_conn.session)

    cmd = 'mkdir -p %s' % os.path.join(user_home, '.ssh')
    if pty_conn.session.contexts[0].user_id != 'root':
        cmd = 'sudo ' + cmd
    res, out, err = pty_conn.run_sync(cmd)
    LOG.debug('Make directory <%s> result <%s>, out <%s>, err <%s>'
              % (os.path.join(user_home, '.ssh'), res, out, err))
    if res != 0:
        raise JobError('Unable to create the SSH directory in user '
                       'home <%s>...' % os.path.join(user_home, '.ssh'))

    cmd = 'mkdir -p %s' % platform_config.storage_job_directory
    if pty_conn.session.contexts[0].user_id != 'root':
        cmd = 'sudo ' + cmd
    res, out, err = pty_conn.run_sync(cmd)
    LOG.debug('Make directory <%s> result <%s>, out <%s>, err <%s>'
              % (platform_config.storage_job_directory, res, out, err))
    if res != 0:
        raise JobError('Unable to create platform data directory '
                       '<%s>.' % platform_config.storage_job_directory)

    # TODO: Need a much nicer way of handling this. Since
    # write_to_remote might upload the file as a non-root user into a
    # directory created with admin rights and hence owned by root, if the
    # user used by pty_conn is not root, we temporarily change ownership
    # of the home directory to the current user, write the public key and
    # then apply the chown of all files to the libhpc user...
    if pty_conn.session.contexts[0].user_id != 'root':
        current_user = pty_conn.session.contexts[0].user_id
        cmd = 'sudo chown -R %s:%s %s' % (current_user, current_user,
                                          user_home)
        pty_conn.run_sync(cmd)

    # Write the public key to the authorized keys file on the remote node
    pty_conn.write_to_remote(
        public_key, os.path.join(user_home, '.ssh', 'authorized_keys'))

    # Change ownership of the authorised keys file just created...
    cmd = 'chown -R %s:%s %s' % (user_id, user_id, user_home)
    if pty_conn.session.contexts[0].user_id != 'root':
        cmd = 'sudo ' + cmd
    pty_conn.run_sync(cmd)
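# _setup_job_account receives an already-established PTY connection. A minimal
# sketch of how such a connection might be opened with SAGA-Python's PTYShell
# is shown below; the helper name, host URL and session construction are
# assumptions for illustration.
import saga
from saga.utils.pty_shell import PTYShell


def open_admin_shell(host, session):
    """Open an interactive SSH shell on the target host."""
    shell = PTYShell(saga.Url('ssh://%s/' % host), session)
    # run_sync returns the command's exit code, stdout and stderr.
    ret, out, err = shell.run_sync('whoami')
    LOG.debug('Connected to <%s> as <%s>' % (host, out.strip()))
    return shell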