def transfer_files(self):
    JobDeploymentBase.transfer_files(self)
    # Here we transfer any input files to the relevant directory on the
    # target platform.
    # Use SAGA-Python to handle the file transfer.
    LOG.debug('Transfer files...')
    job_dir = self.platform_config.storage_job_directory
    host = self.platform_config.platform_service_host
    try:
        directory = Directory('sftp://%s%s' % (host, job_dir),
                              session=self.session)
    except saga.BadParameter as e:
        LOG.error('The specified job directory does not exist on PBS '
                  'submission node <%s> (%s).' % (host, str(e)))
        raise JobError('The specified job directory does not exist on PBS '
                       'submission node <%s> (%s)' % (host, str(e)))
    try:
        # directory.make_dir() does not return a handle to the new
        # directory so we need to create the directory URL manually.
        directory.make_dir(self.job_config.job_id)
        job_data_dir = os.path.join(str(directory.url),
                                    self.job_config.job_id)
    except saga.NoSuccess as e:
        LOG.error('The specified job data directory already exists on '
                  'PBS submission node <%s> (%s).' % (host, str(e)))
        raise JobError('The specified job directory already exists on PBS '
                       'submission node <%s> (%s)' % (host, str(e)))

    # Now upload the file(s) to the job data directory and create an
    # input file list containing the resulting locations of the files.
    # There are some cases where jobs may not have input files (they may,
    # for example, pull the input files from a remote location as part of
    # the job process) so we first check whether there are any input
    # files to process; if not, return from this function.
    if not self.job_config.input_files:
        LOG.debug('There are no input files to transfer for this job...')
        return
    self.transferred_input_files = []
    for f in self.job_config.input_files:
        try:
            f_obj = File('file://%s' % f, session=self.session)
            f_obj.copy(job_data_dir)
            dest_dir = os.path.join(directory.url.path,
                                    self.job_config.job_id)
            self.transferred_input_files.append(
                os.path.join(dest_dir, os.path.basename(f_obj.url.path)))
        except Exception:
            LOG.error('Error copying the input file <%s> to the remote '
                      'platform.' % f)
            raise JobError('Error copying the input file <%s> to the '
                           'remote platform.' % f)
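
# --- Illustrative sketch (not part of the deployer class) ---
# The PBS transfer_files() method above relies on SAGA-Python's
# saga.filesystem API to create a per-job directory over SFTP and copy
# local files into it. The minimal sketch below shows that pattern in
# isolation; the host name, paths and key file are placeholders, not
# values used by the real code.
import os
import saga

def _sketch_sftp_upload(host='pbs.example.org', job_dir='/data/jobs',
                        job_id='job-0001', local_file='/tmp/input.dat'):
    ctx = saga.Context('ssh')
    ctx.user_key = os.path.expanduser('~/.ssh/id_rsa')   # placeholder key
    session = saga.Session(default=False)
    session.add_context(ctx)
    # Open the base job directory and create a sub-directory for this job.
    base = saga.filesystem.Directory('sftp://%s%s' % (host, job_dir),
                                     session=session)
    base.make_dir(job_id)
    target = os.path.join(str(base.url), job_id)
    # Copy a local file into the newly created job directory.
    f = saga.filesystem.File('file://%s' % local_file, session=session)
    f.copy(target)
    return os.path.join(base.url.path, job_id, os.path.basename(local_file))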
def collect_output(self, destination):
    # Here we collect the output from the remote cloud nodes when a job
    # has completed. The output data is transferred to the specified
    # location for storage so that it is available for users to collect
    # after the cloud resources have been shut down.
    LOG.debug('Collect output...')
    # TODO: Need to bundle the output files into a tar or similar archive
    # to pull them back to the host. There may be a large number of files
    # so this is preferable to pulling each file back separately.
    # For now we pull back files individually.
    #=======================================================================
    # ### Looking at running a separate SSH job to bundle the output
    # ### files into an archive that can then be transferred back.
    # ### TODO: Need to find a cross-platform way of handling this.
    #=======================================================================

    # Work out whether we have an array of running nodes (e.g. cloud
    # nodes) or whether we're dealing with a single host. If the former
    # is true then we get the IP/hostname of the target resource from the
    # running_nodes array, otherwise we can just use the host variable.
    remote_host = (self.host if not getattr(self, 'running_nodes', None)
                   else self.running_nodes[0][0].public_ips[0])
    LOG.debug('Remote host for file transfer source: %s' % remote_host)

    LOG.debug('Preparing output archiving job...')
    archive_file = self.job_config.job_id + '.tar.gz'
    jd = Description()
    jd.environment = getattr(self.job_config, 'environment', {})
    jd.executable = 'touch'
    jd.arguments = ['.', ';', 'tar', 'zcvf', archive_file, '*']
    jd.working_directory = getattr(self.job_config, 'working_dir', None)
    self.svc = Service('ssh://%s/' % remote_host, session=self.session)
    self.job = self.svc.create_job(jd)
    LOG.debug('Running output archiving job...')
    self.job.run()
    self.job.wait()
    LOG.debug('Output archiving job complete...')

    working_dir = getattr(self.job_config, 'working_dir', None)
    if not working_dir:
        raise ValueError('There is no working directory set. Unable to '
                         'retrieve output files.')

    # Get a list of the directories to pull the output files back from.
    # TODO: For now we just pull the archive file from the master node
    # but assume that we also need to consider output generated on other
    # nodes.
    output_files = []
    #output_file_dirs = []
    #for node in self.running_nodes:
    #    node_ip = node.public_ips[0]
    #    output_file_dirs.append('sftp://%s%s' % (node_ip, working_dir))
    output_file_archive = 'sftp://%s%s' % (
        remote_host, os.path.join(working_dir, archive_file))
    LOG.debug('Output file archive: %s' % output_file_archive)
    output_files.append(output_file_archive)
    LOG.debug('Got output files: %s' % output_files)

    parsed_destination = urlparse.urlparse(destination)
    if parsed_destination.scheme == '':
        destination = 'file://' + destination
    for output_file in output_files:
        of = File(output_file, session=self.session)
        of.copy(destination)
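
# --- Illustrative sketch (not part of the deployer class) ---
# collect_output() runs a small SAGA job over SSH to tar up the job's
# working directory before pulling the archive back over SFTP. The
# sketch below shows just that archiving step in isolation, using a
# simplified tar invocation rather than the touch/';' trick above; the
# host, working directory and job id are placeholders.
import saga

def _sketch_remote_archive(remote_host='203.0.113.10',
                           working_dir='/home/jobuser/job-0001',
                           job_id='job-0001', session=None):
    jd = saga.job.Description()
    jd.executable = 'tar'
    jd.arguments = ['zcf', job_id + '.tar.gz', '.']
    jd.working_directory = working_dir
    svc = saga.job.Service('ssh://%s/' % remote_host, session=session)
    job = svc.create_job(jd)
    job.run()
    job.wait()
    # The archive can now be fetched from
    # sftp://<remote_host><working_dir>/<job_id>.tar.gz
    return job.exit_code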
def transfer_files(self):
    JobDeploymentBase.transfer_files(self)
    LOG.debug('SSH Deployer: Transfer files...')
    # Here we transfer any input files to the relevant directory on the
    # target platform.
    # Use SAGA-Python to handle the file transfer.
    job_dir = self.platform_config.storage_job_directory
    # Check that the job storage directory exists and then create a
    # sub-directory specifically for this job.
    try:
        LOG.debug('URL for file transfer: <sftp://%s:%s%s>'
                  % (self.host, self.port, job_dir))
        directory = Directory('sftp://%s:%s%s' % (self.host, self.port,
                                                  job_dir),
                              session=self.session)
    except saga.BadParameter as e:
        LOG.error('Error setting up connection to resource directory.')
        if 'connection refused' in str(e).lower():
            raise ConnectionError('Unable to connect to remote resource '
                                  'to set up connection to directory.')
        raise StorageDirectoryNotFoundError(
            'The specified job data base directory does not exist on '
            'resource <%s> (%s)' % (self.host, str(e)))
    try:
        # directory.make_dir() does not return a handle to the new
        # directory so we need to create the directory URL manually.
        directory.make_dir(self.job_config.job_id)
        job_data_dir = os.path.join(str(directory.url),
                                    self.job_config.job_id)
    except saga.NoSuccess as e:
        LOG.error('The specified job data directory already exists on '
                  'resource <%s> (%s).' % (self.host, str(e)))
        raise DirectoryExistsError('The specified job directory already '
                                   'exists on resource <%s> (%s)'
                                   % (self.host, str(e)))

    # Now upload the file(s) to the job data directory and create an
    # input file list containing the resulting locations of the files.
    # There are some cases where jobs may not have input files (they may,
    # for example, pull the input files from a remote location as part of
    # the job process) so we first check whether there are any input
    # files to process; if not, return from this function.
    if not self.job_config.input_files:
        LOG.debug('There are no input files to transfer for this job...')
        return
    self.transferred_input_files = []
    for f in self.job_config.input_files:
        try:
            f_obj = File('file://%s' % f, session=self.session)
            f_obj.copy(job_data_dir)
            dest_dir = os.path.join(directory.url.path,
                                    self.job_config.job_id)
            self.transferred_input_files.append(
                os.path.join(dest_dir, os.path.basename(f_obj.url.path)))
        except Exception:
            LOG.error('Error copying the input file <%s> to the remote '
                      'platform.' % f)
            raise JobError('Error copying the input file <%s> to the '
                           'remote platform.' % f)
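
# --- Illustrative sketch (not part of the deployer class) ---
# The deployers above raise JobError, ConnectionError,
# StorageDirectoryNotFoundError and DirectoryExistsError. Those classes
# are defined elsewhere in the package; the declarations below are only
# a plausible minimal sketch of the assumed hierarchy, not the project's
# actual definitions.
class JobError(Exception):
    """Raised when a job cannot be deployed or run."""

class ConnectionError(JobError):
    """Raised when the remote resource cannot be reached."""

class StorageDirectoryNotFoundError(JobError):
    """Raised when the configured job storage directory is missing."""

class DirectoryExistsError(JobError):
    """Raised when the per-job directory already exists on the resource."""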
def transfer_files(self):
    JobDeploymentBase.transfer_files(self)
    # Here we transfer any input files to the relevant directory on the
    # target platform.
    # Use SAGA-Python to handle the file transfer.
    LOG.debug('Transfer files...')
    job_dir = self.platform_config.storage_job_directory

    # At this point we need to switch back to using the job security
    # context. If we were using unconfigured resources, these will have
    # been configured using an admin context by now.
    self.session = saga.Session(default=False)
    self.session.add_context(self.job_ctx)

    # Begin by checking if we're working with more than one instance; if
    # so we have a master and one or more slave nodes. We'll push the
    # data to the master and then direct the master to distribute it to
    # the slave nodes.
    master_node = self.running_nodes[0][0]
    slave_nodes = []
    if len(self.running_nodes) > 1:
        slave_nodes = [node[0] for node in self.running_nodes[1:]]

    # On the master node: check that the job storage directory exists and
    # then create a sub-directory specifically for this job.
    # Node is a tuple consisting of two items, the node object and an
    # IP list. For now we work with the node object directly.
    node_ip = master_node.public_ips[0]
    try:
        directory = Directory('sftp://%s%s' % (node_ip, job_dir),
                              session=self.session)
    except saga.BadParameter as e:
        LOG.error('The specified job directory does not exist on node '
                  '<%s> (%s).' % (node_ip, str(e)))
        #raise JobError('The specified job directory does not exist '
        #               'on node <%s> (%s)' % (node_ip, str(e)))
    try:
        # directory.make_dir() does not return a handle to the new
        # directory so we need to create the directory URL manually.
        directory.make_dir(self.job_config.job_id)
    except saga.NoSuccess as e:
        LOG.warning('The specified job data directory already exists on '
                    'node <%s> (%s).' % (node_ip, str(e)))
        #raise JobError('The specified job directory already exists '
        #               'on node <%s> (%s)' % (node_ip, str(e)))
    job_data_dir = os.path.join(str(directory.url), self.job_config.job_id)

    # Now upload the file(s) to the job data directory and create an
    # input file list containing the resulting locations of the files.
    # There are some cases where jobs may not have input files (they may,
    # for example, pull the input files from a remote location as part of
    # the job process) so we first check whether there are any input
    # files to process; if not, return from this function.
    if not self.job_config.input_files:
        LOG.debug('There are no input files to transfer for this job...')
        return
    self.transferred_input_files = []
    for f in self.job_config.input_files:
        try:
            f_obj = File('file://%s' % f, session=self.session)
            f_obj.copy(job_data_dir)
            dest_dir = os.path.join(directory.url.path,
                                    self.job_config.job_id)
            self.transferred_input_files.append(
                os.path.join(dest_dir, os.path.basename(f_obj.url.path)))
        except Exception:
            LOG.error('Error copying the input file <%s> to the remote '
                      'platform.' % f)
            raise JobError('Error copying the input file <%s> to the '
                           'remote platform.' % f)

    # At this point input files have been successfully transferred to
    # the master node. We now direct the master node to send the files
    # to each of the slave nodes.
    if slave_nodes:
        slave_private_ips = [node.private_ips[0] for node in slave_nodes]
        self._distribute_job_data(master_node.public_ips[0],
                                  slave_private_ips,
                                  self.platform_config.user_id,
                                  self.platform_config.user_key_file,
                                  job_dir, self.job_config.job_id)
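
# --- Illustrative sketch (not part of the deployer class) ---
# _distribute_job_data() is implemented elsewhere in this class; the
# function below is only one plausible shape for it, shown to illustrate
# the master-to-slave fan-out that transfer_files() triggers. It assumes
# passwordless SSH from the master node to each slave using the job key
# copied during deployment; all names and paths are placeholders.
from saga.utils.pty_shell import PTYShell

def _sketch_distribute_job_data(master_ip, slave_private_ips, user_id,
                                user_key_file, job_dir, job_id, session):
    # Open a shell on the master node and push the job directory to each
    # slave with scp (quietly accepting unknown host keys).
    shell = PTYShell('ssh://%s' % master_ip, session=session)
    src = '%s/%s' % (job_dir.rstrip('/'), job_id)
    for ip in slave_private_ips:
        cmd = ('scp -o StrictHostKeyChecking=no -r %s %s@%s:%s/'
               % (src, user_id, ip, job_dir.rstrip('/')))
        exit_code, out, err = shell.run_sync(cmd)
        if exit_code != 0:
            raise JobError('Failed to copy job data to slave node <%s>: %s'
                           % (ip, err))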
def deploy_software(self, software_config=None):
    JobDeploymentBase.deploy_software(self)
    # Here we undertake transfer of the code to the remote platform if
    # this is required. In many cases, the software is likely to already
    # be deployed on the target platform or may have been configured via
    # a tool such as cloud-init, puppet, etc. at resource initialisation
    # time.
    LOG.debug('Deploy software...')
    # If we're not using an unconfigured image, we don't need to run the
    # deploy software function.
    if not self.use_unconfigured:
        LOG.info('Using a pre-configured image so skipping the software '
                 'deployment process...')
        return

    # Software deployment requires root access to the target node(s).
    # This should be possible using the key that has been passed to start
    # the resource(s).

    # If no software configuration is provided, we ignore this function
    # call and return. If a configuration is provided, we check that the
    # configuration is for the right type of platform and then deploy
    # the software.
    if not software_config:
        return
    if not isinstance(software_config, list):
        software_config = [software_config]
    LOG.debug('Received a request to deploy the following software '
              'configuration IDs to the target platforms: <%s>...'
              % software_config)

    # Check that we have an admin security context available. If we
    # don't, we can't connect to the remote resource(s) to do the
    # required configuration.
    if not self.admin_ctx:
        raise JobError('deploy_software: There is no admin context '
                       'available so it will not be possible to connect '
                       'to remote resources to configure them. Ensure '
                       'that the admin access details for the '
                       'unconfigured image are provided in the platform '
                       'configuration.')

    # Check that we can get each of the software configurations and that
    # each one supports the target deployment platform.
    scm = SoftwareConfigManager.get_instance()
    scm.init_configuration()
    os_name = self.platform_config.image_unconfigured_os
    flavour = self.platform_config.image_unconfigured_flavour
    admin_key_user = self.platform_config.image_unconfigured_admin_key_user
    admin_key_file = self.platform_config.image_unconfigured_admin_key_file
    sc_dict = {}
    for sc in software_config:
        try:
            conf = scm.get_software_configuration(sc)
            sc_dict[sc] = conf
        except ValueError as e:
            raise JobError('Job error - no software could be found for '
                           'the configuration id <%s>: %s' % (sc, str(e)))
        if not ((os_name == conf.software_os_type) and
                (flavour == conf.software_os_flavour)):
            LOG.error('The OS <%s> and flavour <%s> in the provided '
                      'software configuration don\'t match the target '
                      'platform with OS <%s> and flavour <%s>.'
                      % (conf.software_os_type, conf.software_os_flavour,
                         os_name, flavour))
            raise JobError('The OS <%s> and flavour <%s> in the provided '
                           'software configuration don\'t match the '
                           'target platform with OS <%s> and flavour '
                           '<%s>.' % (conf.software_os_type,
                                      conf.software_os_flavour,
                                      os_name, flavour))

    # If we reach this point we assume that each of the software
    # configurations has been found and they are for the right target
    # platform.
    for sc_key in sc_dict.keys():
        sc_obj = sc_dict[sc_key]
        install_commands = sc_obj.get_install_commands()
        # Now run each of the install commands synchronously on all of
        # the target machines to get the software installed.
        node_ips = [node[0].public_ips[0] for node in self.running_nodes]
        LOG.debug('Deploying to the following list of nodes: %s'
                  % node_ips)
        # Set up a new session using the admin user and key provided for
        # the unconfigured image.
        adm_session = saga.Session(default=False)
        adm_ctx = saga.Context("ssh")
        adm_ctx.user_id = admin_key_user
        adm_ctx.user_key = admin_key_file
        adm_session.add_context(adm_ctx)
        # Open shell connections to each of the machines.
        shell_conns = []
        opts = {'ssh_options': {'StrictHostKeyChecking': 'no'}}
        for node_ip in node_ips:
            conn = PTYShell('ssh://%s' % node_ip, session=adm_session,
                            opts=opts)
            shell_conns.append(conn)
            if conf.software_os_type == 'linux':
                self._setup_job_account(conn, self.platform_config)
            else:
                LOG.warning('Support for creation of job accounts on '
                            'platforms other than linux is not yet '
                            'supported...')
        # Copy the job account key to the master node.
        job_session = saga.Session(default=False)
        job_session.add_context(self.job_ctx)
        keyfile = File('file://%s' % self.platform_config.user_key_file,
                       session=job_session)
        keyfile_target = shell_conns[0].url + os.path.join(
            self.platform_config.user_home, '.ssh', 'id_rsa')
        LOG.debug('Copying job key to target directory <%s>'
                  % keyfile_target)
        keyfile.copy(keyfile_target)
        for cmd in install_commands:
            for shell_connection in shell_conns:
                if isinstance(cmd, SoftwareConfigFile):
                    LOG.debug('Software deployment: About to write data '
                              'to remote file <%s> on node <%s>'
                              % (cmd.filename, shell_connection.url))
                    shell_connection.write_to_remote(cmd.data,
                                                     cmd.filename)
                else:
                    LOG.debug('Software deployment: About to run command '
                              '<%s> on resource <%s>...'
                              % (cmd, shell_connection.url))
                    if admin_key_user != 'root':
                        cmd = 'sudo ' + cmd
                    result, out, err = shell_connection.run_sync(cmd)
                    LOG.debug('Command completed - Exit code: <%s>, '
                              'StdOut: <%s>, StdErr:\n<%s>'
                              % (result, out, err))
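
# --- Illustrative sketch (not part of the deployer class) ---
# The install loop above handles two kinds of items returned by
# get_install_commands(): plain shell command strings that are run via
# PTYShell, and SoftwareConfigFile objects whose data is written to a
# remote file. The sketch below shows a hypothetical configuration
# producing such a mixed list; the real SoftwareConfig and
# SoftwareConfigFile classes live elsewhere in the package and may
# differ in detail.
class _SketchSoftwareConfigFile(object):
    def __init__(self, filename, data):
        self.filename = filename
        self.data = data

def _sketch_install_commands():
    return [
        'apt-get update -y',
        'apt-get install -y openmpi-bin libopenmpi-dev',
        _SketchSoftwareConfigFile('/etc/myapp/app.conf',
                                  'threads=4\nlog_level=info\n'),
    ]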
def initialise_resources(self, prefer_unconfigured=True, num_processes=1,
                         processes_per_node=1, node_type='m1.small',
                         job_id=None, retries=3, software_config=None):
    JobDeploymentBase.initialise_resources(self)
    # Start up the cloud resources here and wait for them to reach the
    # running state. We need to know the image ID that we're starting;
    # the image ID is available from the job configuration.
    image_id = None
    image_preconfigured_id = self.platform_config.image_preconfigured_id
    image_unconfigured_id = self.platform_config.image_unconfigured_id
    # Store whether or not we're using an unconfigured image - this
    # determines whether we end up running the deploy software function
    # or not.
    self.use_unconfigured = False
    if image_preconfigured_id and not image_unconfigured_id:
        image_id = image_preconfigured_id
        LOG.debug('Only a configured image identifier has been provided, '
                  'using image ID <%s>.' % image_id)
    elif (not image_preconfigured_id) and image_unconfigured_id:
        image_id = image_unconfigured_id
        self.use_unconfigured = True
        LOG.debug('Only an unconfigured image identifier has been '
                  'provided, using image ID <%s>.' % image_id)
        if not software_config:
            raise JobError('Only an unconfigured image identifier has '
                           'been provided but no software config has '
                           'been specified. Unable to continue...')
    elif image_preconfigured_id and image_unconfigured_id:
        LOG.debug('Both configured and unconfigured images provided...')
        if prefer_unconfigured:
            image_id = image_unconfigured_id
            self.use_unconfigured = True
            LOG.debug('Using unconfigured image ID <%s>.' % image_id)
            if not software_config:
                raise JobError('An unconfigured image identifier has '
                               'been chosen but no software config has '
                               'been specified. Unable to continue...')
        else:
            image_id = image_preconfigured_id
            LOG.debug('Using pre-configured image ID <%s>.' % image_id)
    else:
        raise ResourceInitialisationError(
            'ERROR: No image information available in the platform '
            'configuration, unable to initialise resources.')

    # If we're using an unconfigured image, we need to prepare the admin
    # security context based on the information that should be provided
    # in the YAML file with the unconfigured image details.
    if self.use_unconfigured:
        self.admin_ctx = saga.Context("ssh")
        self.admin_ctx.user_id = \
            self.platform_config.image_unconfigured_admin_key_user
        self.admin_ctx.user_key = \
            self.platform_config.image_unconfigured_admin_key_file

    # Check that the image is present and then use the libcloud driver to
    # start the resources and return once they're running.
    # TODO: This is currently synchronous but could also be done
    # asynchronously using a callback to notify the caller when the nodes
    # are ready.
    #images = self.driver.list_images()
    #img = next((i for i in images if i.id == image_id), None)
    #if not img:
    img = None
    try:
        #img = self.driver.get_image(image_id)
        images = self.driver.list_images()
        for image in images:
            if image.id == image_id:
                img = image
                break
        if img is None:
            raise ResourceInitialisationError('The specified image <%s> '
                                              'could not be found'
                                              % image_id)
    except socket.error as e:
        img = None
        raise ResourceInitialisationError(
            'ERROR contacting the remote cloud platform. Do you have an '
            'active network connection? - <%s>' % str(e))
    except Exception as e:
        LOG.debug('ERROR STRING: %s' % str(e))
        img = None
        if str(e).startswith('Unauthorized:'):
            raise InvalidCredentialsError(
                'ERROR: Access to the cloud platform at <%s> was not '
                'authorised. Are your credentials correct?'
                % (self.platform_config.platform_service_host + ':' +
                   str(self.platform_config.platform_service_port)))
        else:
            raise ResourceInitialisationError(
                'ERROR: The specified image <%s> is not present on the '
                'target platform, unable to start resources.' % image_id)

    sizes = self.driver.list_sizes()
    size = next((s for s in sizes if s.id == node_type), None)
    if not size:
        raise ResourceInitialisationError(
            'ERROR: The specified resource size (node_type) <%s> is not '
            'present on the target platform. Unable to start resources. '
            'Have you set the node_type parameter in your job spec?'
            % node_type)

    # Get the keypair name from the configuration.
    # If we're using an unconfigured resource, we use the admin key pair
    # name if provided.
    if (self.use_unconfigured and
            self.platform_config.image_unconfigured_admin_key_name):
        keypair_name = self.platform_config.image_unconfigured_admin_key_name
    else:
        keypair_name = self.platform_config.user_key_name

    # Get the number of resources from the job configuration.
    # TODO: Fix this to obtain the number of cores per node from the
    # cloud platform. For now use the specified processes_per_node in the
    # job specification.
    cores_per_node = processes_per_node
    #cores_per_node = self.RESOURCE_TYPE_CORES[node_type]
    #if cores_per_node < processes_per_node:
    #    LOG.debug('A processes_per_node value <%s> greater than the '
    #              'number of cores in a node <%s> has been specified. '
    #              'Altering processes per node to the maximum available '
    #              'on this node type <%s>.' % (processes_per_node,
    #                                           cores_per_node, node_type))
    #    processes_per_node = cores_per_node
    num_nodes = int(ceil(float(num_processes) / float(processes_per_node)))

    # At this point we know that the image is available and the specified
    # resource type is valid so we can request to start the instance(s).
    LOG.debug('About to start <%s> resources of type <%s> based on image '
              '<%s (%s)> with keypair <%s>.' % (num_nodes, size.name,
                                                img.id, img.name,
                                                keypair_name))

    # When starting a resource we need the name, image, type, keypair,
    # configuration data and details of the number of resources to start.
    name = job_id
    if not name:
        name = generate_instance_id()
    self.nodes = self.driver.create_node(name=name, image=img, size=size,
                                         ex_keyname=keypair_name,
                                         ex_mincount=num_nodes,
                                         ex_maxcount=num_nodes)
    if not isinstance(self.nodes, list):
        self.nodes = [self.nodes]
    self.running_nodes = self.driver.wait_until_running(self.nodes)

    # Before we return details of the running nodes, we need to check
    # that they're accessible - it takes some time for the nodes to boot
    # and become available. We do this by setting up a handle to a
    # directory - we assume all nodes have a '/' directory - and then
    # trying to list that directory. If an exception is thrown, we assume
    # that the nodes are not yet available.
    # TODO: Need to replace this wait with a reliable check as to whether
    # the server is up and running. Looks like, for now, this will need
    # to use Paramiko while awaiting updates on saga-python.
    #LOG.debug('Waiting 60 seconds for node to boot...')
    #time.sleep(60)
    # Replaced 60 second wait with check using Paramiko to see if
    # resource is accessible...
    LOG.debug('Checking node is available...')
    nodes_to_check = []
    for node in self.running_nodes:
        nodes_to_check.append(node[0].public_ips[0])
    res = self._wait_for_node_accessbility(
        nodes_to_check, self.platform_config.user_id,
        self.platform_config.user_key_file, retries=retries)
    if not res:
        # We still have nodes that are not available so assume there's a
        # problem and throw a job error.
        raise JobError('After <%s> retries, the following nodes are '
                       'still not accessible <%s>. Cancelling job.'
                       % (retries, nodes_to_check))

    # If we have multiple nodes, now is the time to create the
    # machinefile for MPI job runs.
    # For the machinefile we need the private IP of each node and the
    # number of cores.
    machinefile = tempfile.NamedTemporaryFile('w', delete=True)
    machinefile.write("# Machine file for MPI job runs\n")
    for node in self.running_nodes:
        machinefile.write('%s slots=%s max_slots=%s\n'
                          % (node[0].private_ips[0], cores_per_node,
                             cores_per_node))
    machinefile.flush()
    LOG.debug('The following machinefile has been created:\n\n%s\n'
              % machinefile.name)
    # The master node is always considered to be node 0 in
    # the self.running_nodes list.
    LOG.debug('Copying machinefile to master node...')
    saga_machinefile = File('file://%s' % machinefile.name,
                            session=self.session)
    saga_machinefile.copy('sftp://%s/tmp/machinefile'
                          % self.running_nodes[0][0].public_ips[0])
    machinefile.close()
    LOG.debug('machinefile copied to master node...')
    conn = PTYShell('ssh://%s' % self.running_nodes[0][0].public_ips[0],
                    session=self.session)
    conn.run_sync('chmod 644 /tmp/machinefile')
    LOG.debug('Set permissions on /tmp/machinefile on master node to 644.')
    return self.running_nodes
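
# --- Illustrative sketch (not part of the deployer class) ---
# End-to-end, the methods above are intended to be called in sequence:
# initialise_resources() starts and verifies the nodes, deploy_software()
# configures unconfigured images, transfer_files() stages the input data
# and collect_output() retrieves the results. The driver below is a
# hypothetical sketch: the CloudJobDeployer name, its constructor
# arguments, the 'openmpi' configuration id and the run_job()/
# shutdown_resources() calls are placeholders, since the surrounding
# class and its instantiation are not shown in this excerpt.
def _sketch_run_cloud_job(job_config, platform_config):
    deployer = CloudJobDeployer(job_config, platform_config)  # placeholder
    try:
        deployer.initialise_resources(num_processes=4,
                                      processes_per_node=2,
                                      node_type='m1.small',
                                      job_id=job_config.job_id,
                                      software_config=['openmpi'])
        deployer.deploy_software(software_config=['openmpi'])
        deployer.transfer_files()
        deployer.run_job()                       # placeholder method
        deployer.collect_output('/data/results/%s' % job_config.job_id)
    finally:
        deployer.shutdown_resources()            # placeholder method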