コード例 #1
0
    def shutdown_resources(self):
        """Destroy all cloud nodes for this job and block until they're gone.

        Issues a destroy request for every node in ``self.nodes`` and then
        polls the driver, removing node IDs from the wait list as they stop
        appearing in the platform's node listing.
        """
        JobDeploymentBase.shutdown_resources(self)

        # Number of seconds between checking for shutdown of resources.
        # Named constant for consistency with the other shutdown_resources
        # implementation in this module.
        SHUTDOWN_POLL_DELAY = 2

        # Here we terminate the running resources for this job and
        # wait until they have been shut down.
        res_ids = [node.id for node in self.nodes]
        LOG.debug('About to shut down the following nodes: %s' % res_ids)

        LOG.debug('Shutdown resources...')
        for node in self.nodes:
            self.driver.destroy_node(node)

        while res_ids:
            nodes_to_wait_for = self.driver.list_nodes(res_ids)
            still_running = [node.id for node in nodes_to_wait_for]
            new_res_ids = []
            # Now go through res_ids and delete the nodes that don't appear
            # in still_running.
            for res_id in res_ids:
                if res_id not in still_running:
                    LOG.debug('Resource <%s> has terminated...' % res_id)
                else:
                    new_res_ids.append(res_id)
            res_ids = new_res_ids
            if res_ids:
                LOG.debug('Still waiting for termination of resources %s...' %
                          res_ids)
                time.sleep(SHUTDOWN_POLL_DELAY)

        LOG.debug('All resources terminated.')
コード例 #2
0
 def deploy_software(self):
     """Hook for transferring application code to the remote platform.

     In most deployments the software is already present on the target —
     either pre-installed or set up at resource initialisation time by a
     tool such as cloud-init or puppet — so only the base hook runs here.
     """
     JobDeploymentBase.deploy_software(self)
     LOG.debug('SSH Deployer: Deploy software...')
コード例 #3
0
    def run_job(self, job_details=None):
        """Submit the configured job to PBS via the SAGA-Python job service.

        Builds a ``saga.job.Description`` from the stored job configuration
        and runs it through the previously initialised job service
        (``self.svc``). The resulting job handle is stored on ``self.job``
        so that separate monitoring functionality can track it.

        Keyword arguments:
        job_details -- optional additional job details (currently unused,
                       retained for interface compatibility).
        """
        JobDeploymentBase.run_job(self)
        # This function uses the SAGA-Python library to run the job. Separate
        # functionality in the library is used for monitoring the job process.

        # TODO: Add modules to PBS job configuration

        # Copy the configured arguments so that appending the transferred
        # input files below does not mutate job_config.args in place
        # (getattr returns the live list).
        job_arguments = list(getattr(self.job_config, 'args', []))
        input_files = getattr(self, 'transferred_input_files', [])
        job_arguments += input_files

        jd = saga.job.Description()
        jd.environment = getattr(self.job_config, 'environment', {})
        jd.executable = getattr(self.job_config, 'executable', None)
        jd.arguments = job_arguments
        jd.working_directory = getattr(self.job_config, 'working_dir', None)
        jd.output = getattr(self.job_config, 'stdout', None)
        jd.error = getattr(self.job_config, 'stderr', None)
        jd.wall_time_limit = getattr(self.job_config, 'time_limit_mins', 0)
        jd.total_cpu_count = getattr(self.job_config, 'num_processes', 1)

        self.job = self.svc.create_job(jd)
        self.job.run()
コード例 #4
0
    def wait_for_job_completion(self):
        """Block until the submitted SAGA job finishes.

        Returns a ``(state, exit_code)`` tuple read from the job handle.
        """
        JobDeploymentBase.wait_for_job_completion(self)
        self.job.wait()
        return (self.job.state, self.job.exit_code)
コード例 #5
0
    def run_job(self):
        """Run the configured job on this job's cloud resources via SSH.

        Builds a SAGA job description from ``self.job_config`` — wrapping
        the executable in an ``mpirun`` command when more than one process
        is requested — then creates an SSH job service against the first
        running node and submits the job. The handle is stored on
        ``self.job`` for later monitoring.
        """
        JobDeploymentBase.run_job(self)
        # This function uses the libhpc resource daemon client to talk to the
        # resource daemon that is installed on cloud resources. It uses this
        # interface to run jobs and monitor their state to see when they are
        # complete.
        # TODO: Should this be running/managing the job remotely via a SAGA
        # SSH session or should we be expecting to communicate with a remote
        # resource management service to handle this?
        LOG.debug('Run job...')

        # Copy the configured arguments so that appending the transferred
        # input files below does not mutate job_config.args in place
        # (getattr returns the live list).
        job_arguments = list(getattr(self.job_config, 'args', []))
        input_files = getattr(self, 'transferred_input_files', [])
        job_arguments += input_files

        # Check if we have a JOB_ID variable in the arguments or input files.
        # If so, replace this variable with the actual job ID.
        job_arguments_tmp = job_arguments
        job_arguments = []
        for item in job_arguments_tmp:
            # Can't do a replace on items that are not string types!
            if isinstance(item, basestring):
                job_arguments.append(
                    item.replace('$JOB_ID', self.job_config.job_id))
            else:
                job_arguments.append(item)

        LOG.debug('Modified job arguments: %s' % job_arguments)

        jd = saga.job.Description()
        jd.environment = getattr(self.job_config, 'environment', {})
        if self.job_config.num_processes > 1:
            # Multi-process runs are launched through mpirun using the
            # machinefile created at resource initialisation time; the
            # configured executable becomes the first mpirun argument.
            jd.executable = ('mpirun -np %s -machinefile /tmp/machinefile' %
                             (self.job_config.num_processes))
            executable = getattr(self.job_config, 'executable', None)
            if executable:
                job_arguments.insert(0, executable)
        else:
            jd.executable = getattr(self.job_config, 'executable', None)
        jd.arguments = job_arguments
        jd.working_directory = getattr(self.job_config, 'working_dir', None)
        jd.output = getattr(self.job_config, 'stdout', None)
        jd.error = getattr(self.job_config, 'stderr', None)
        jd.wall_time_limit = getattr(self.job_config, 'time_limit_mins', 0)

        # Fall back to default stdout/stderr file names if none configured.
        if not jd.output:
            jd.output = 'std.out'
        if not jd.error:
            jd.error = 'std.err'

        self.svc = saga.job.Service('ssh://%s/' %
                                    self.running_nodes[0][0].public_ips[0],
                                    session=self.session)
        self.job = self.svc.create_job(jd)
        self.job.run()
コード例 #6
0
    def initialise_resources(self, *args, **kwargs):
        """Set up the SAGA-Python connection to the PBS platform.

        No resources need to be started directly; this simply creates the
        PBS-over-SSH job service used for subsequent job submission and
        stores it on ``self.svc``.
        """
        JobDeploymentBase.initialise_resources(self)
        service_url = ('pbs+ssh://%s/' %
                       self.platform_config.platform_service_host)
        self.svc = saga.job.Service(service_url, session=self.session)
        return None
コード例 #7
0
    def transfer_files(self):
        """Transfer the job's input files to the PBS submission host.

        Creates a per-job directory under the configured storage job
        directory on the remote host (via SAGA-Python SFTP) and copies each
        configured input file into it. The remote paths of the transferred
        files are recorded in ``self.transferred_input_files``.

        Raises JobError if the storage job directory is missing, the
        per-job directory already exists, or a file copy fails.
        """
        JobDeploymentBase.transfer_files(self)
        # Use SAGA-Python to handle the file transfer.
        LOG.debug('Transfer files...')
        job_dir = self.platform_config.storage_job_directory
        host = self.platform_config.platform_service_host

        try:
            directory = Directory('sftp://%s%s' % (host, job_dir),
                                  session=self.session)
        except saga.BadParameter as e:
            LOG.error('The specified job directory does not exist on PBS '
                      'submission node <%s> (%s).' % (host, str(e)))
            # Fixed: implicit string concatenation was missing a space
            # ("...on PBSsubmission node...").
            raise JobError('The specified job directory does not exist on PBS '
                           'submission node <%s> (%s)' % (host, str(e)))

        try:
            # directory.make_dir() does not return a handle to the new directory
            # so need to create the directory URL manually.
            directory.make_dir(self.job_config.job_id)
            job_data_dir = os.path.join(str(directory.url),
                                        self.job_config.job_id)
        except saga.NoSuccess as e:
            LOG.error('The specified job data directory already exists on '
                      'PBS submission node <%s> (%s).' % (host, str(e)))
            raise JobError('The specified job directory already exists on PBS '
                           'submission node <%s> (%s)' % (host, str(e)))

        # There are some cases where jobs may not have input files (they may,
        # for example pull the input files from a remote location as part of
        # the job process) so we first check whether there are any input files
        # to process, if not, then return from this function.
        if not self.job_config.input_files:
            LOG.debug('There are no input files to transfer for this job...')
            return

        # Upload the file(s) to the job data directory and build a list of
        # the resulting remote file locations.
        self.transferred_input_files = []
        for f in self.job_config.input_files:
            try:
                f_obj = File('file://%s' % f, session=self.session)
                f_obj.copy(job_data_dir)
                dest_dir = os.path.join(directory.url.path,
                                        self.job_config.job_id)
                self.transferred_input_files.append(
                    os.path.join(dest_dir, os.path.basename(f_obj.url.path)))
            # Catch Exception rather than using a bare except so that
            # SystemExit/KeyboardInterrupt are not swallowed here.
            except Exception as e:
                LOG.error('Error copying the input file <%s> to the remote '
                          'platform (%s).' % (f, str(e)))
                raise JobError('Error copying the input file <%s> to the '
                               'remote platform.' % f)
コード例 #8
0
    def run_job(self):
        """Run the configured job on ``self.host`` via an SSH job service.

        Builds a SAGA job description from ``self.job_config``, substitutes
        any ``$JOB_ID`` placeholders in the arguments, and submits the job
        over SSH. The job handle is stored on ``self.job`` for later
        monitoring.
        """
        JobDeploymentBase.run_job(self)
        # TODO: Should this be running/managing the job remotely via a SAGA
        # SSH session or should we be expecting to communicate with a remote
        # resource management service to handle this?

        # This function could use the libhpc resource daemon client to talk to
        # a resource daemon that is installed on the target resource, however,
        # at present we simply use SSH (via SAGA Python) to trigger job
        # execution and handle compressing and returning the output files.
        LOG.debug('SSH Deployer: Run job...')

        # Copy the configured arguments so that appending the transferred
        # input files below does not mutate job_config.args in place
        # (getattr returns the live list).
        job_arguments = list(getattr(self.job_config, 'args', []))
        input_files = getattr(self, 'transferred_input_files', [])
        job_arguments += input_files

        # Check if we have a JOB_ID variable in the arguments or input files.
        # If so, replace this variable with the actual job ID.
        job_arguments_tmp = job_arguments
        job_arguments = []
        for item in job_arguments_tmp:
            # Can't do a replace on items that are not string types!
            if isinstance(item, basestring):
                job_arguments.append(
                    item.replace('$JOB_ID', self.job_config.job_id))
            else:
                job_arguments.append(item)

        LOG.debug('Modified job arguments: %s' % job_arguments)

        jd = saga.job.Description()
        jd.environment = getattr(self.job_config, 'environment', {})
        jd.executable = getattr(self.job_config, 'executable', None)
        jd.arguments = job_arguments
        jd.working_directory = getattr(self.job_config, 'working_dir', None)
        jd.output = getattr(self.job_config, 'stdout', None)
        jd.error = getattr(self.job_config, 'stderr', None)
        jd.wall_time_limit = getattr(self.job_config, 'time_limit_mins', 0)

        # Fall back to default stdout/stderr file names if none configured.
        if not jd.output:
            jd.output = 'std.out'
        if not jd.error:
            jd.error = 'std.err'

        self.svc = saga.job.Service('ssh://%s/' % self.host,
                                    session=self.session)
        self.job = self.svc.create_job(jd)
        self.job.run()
コード例 #9
0
    def shutdown_resources(self):
        """Destroy this job's cloud nodes and wait until all have terminated."""
        JobDeploymentBase.shutdown_resources(self)

        # Seconds to sleep between polls for resource shutdown.
        SHUTDOWN_POLL_DELAY = 4

        # Request destruction of every node, then poll until none of the
        # destroyed IDs reports a non-terminated state any more.
        pending = [node.id for node in self.nodes]
        LOG.debug('About to shut down the following nodes: %s' % pending)

        LOG.debug('Shutdown resources...')
        for node in self.nodes:
            self.driver.destroy_node(node)

        while pending:
            # TODO: Find a better approach to remove nodes that have vanished
            # from the system; for now, fall back to querying node info one
            # node at a time when the bulk listing fails.
            try:
                node_list = self._get_node_list(pending)
            except Exception as e:
                LOG.debug('Exception <%s> getting node list, getting node info'
                          ' individually.' % str(e))
                node_list = self._get_node_list(pending, manual=True)

            active = [info.id for info in node_list
                      if info.state != NodeState.TERMINATED]

            # Drop IDs that are no longer active; keep the rest for the
            # next polling round.
            remaining = []
            for res_id in pending:
                if res_id in active:
                    remaining.append(res_id)
                else:
                    LOG.debug('Resource <%s> has terminated...' % res_id)
            pending = remaining

            if pending:
                LOG.debug('Still waiting for termination of resources %s...' %
                          pending)
                time.sleep(SHUTDOWN_POLL_DELAY)

        LOG.debug('All resources terminated.')
コード例 #10
0
    def run_job(self):
        """Run the configured job on this job's cloud resources via SSH.

        Builds a job description from ``self.job_config`` — wrapping the
        executable in an ``mpirun`` command when more than one process is
        requested — then creates an SSH job service against the first
        running node and submits the job. The handle is stored on
        ``self.job`` for later monitoring.
        """
        JobDeploymentBase.run_job(self)
        # This function uses the libhpc resource daemon client to talk to the
        # resource daemon that is installed on cloud resources. It uses this
        # interface to run jobs and monitor their state to see when they are
        # complete.
        # TODO: Should this be running/managing the job remotely via a SAGA
        # SSH session or should we be expecting to communicate with a remote
        # resource management service to handle this?
        LOG.debug('Run job...')

        # Copy the configured arguments so that appending the transferred
        # input files below does not mutate job_config.args in place
        # (getattr returns the live list).
        job_arguments = list(getattr(self.job_config, 'args', []))
        input_files = getattr(self, 'transferred_input_files', [])
        job_arguments += input_files

        jd = Description()
        jd.environment = getattr(self.job_config, 'environment', {})
        if self.job_config.num_processes > 1:
            # Multi-process runs are launched through mpirun using the
            # machinefile created at resource initialisation time; the
            # configured executable becomes the first mpirun argument.
            jd.executable = ('mpirun -np %s -machinefile /tmp/machinefile' %
                             (self.job_config.num_processes))
            executable = getattr(self.job_config, 'executable', None)
            if executable:
                job_arguments.insert(0, executable)
        else:
            jd.executable = getattr(self.job_config, 'executable', None)
        jd.arguments = job_arguments
        jd.working_directory = getattr(self.job_config, 'working_dir', None)
        jd.output = getattr(self.job_config, 'stdout', None)
        jd.error = getattr(self.job_config, 'stderr', None)
        jd.wall_time_limit = getattr(self.job_config, 'time_limit_mins', 0)

        # Fall back to default stdout/stderr file names if none configured.
        if not jd.output:
            jd.output = 'std.out'
        if not jd.error:
            jd.error = 'std.err'

        self.svc = Service('ssh://%s/' %
                           self.running_nodes[0][0].public_ips[0],
                           session=self.session)
        self.job = self.svc.create_job(jd)
        self.job.run()
コード例 #11
0
    def collect_output(self, destination):
        """Collect the job's output files to *destination*.

        The base implementation's file transfer does not understand a
        separate port for the remote host, so ``self.host`` is temporarily
        rewritten to ``host:port`` for the duration of the transfer. The
        restore now happens in a ``finally`` block so that a failure during
        collection or cleanup cannot leave the modified value behind.

        If ``job_config.delete_job_files`` is set, the job's directory on
        the remote platform is removed after the output is collected.

        Raises JobError if the remote job directory is missing or cannot
        be deleted.
        """
        host_tmp = self.host
        self.host = ('%s:%s' % (self.host, self.port))
        try:
            # Using the base implementation of job output file collection...
            JobDeploymentBase.collect_output(self, destination)

            # If job_config delete_job_files is True, we can now delete the
            # job files on the remote platform.
            if self.job_config.delete_job_files:
                jobs_dir = self.platform_config.storage_job_directory
                try:
                    LOG.debug('URL for file job directory: sftp://%s%s' %
                              (self.host, jobs_dir))
                    directory = Directory('sftp://%s%s' % (self.host, jobs_dir),
                                          session=self.session)
                except saga.BadParameter as e:
                    LOG.error('The specified job directory does not exist on '
                              'resource <%s> (%s).' % (self.host, str(e)))
                    raise JobError('The specified job directory does not exist '
                                   'on resource <%s> (%s)' % (self.host, str(e)))
                try:
                    LOG.debug('Deleting job directory after job completion '
                              '<sftp://%s%s/%s>' %
                              (self.host, jobs_dir, self.job_config.job_id))
                    directory.remove(self.job_config.job_id, RECURSIVE)
                except saga.NoSuccess as e:
                    LOG.error('The specified job data directory couldn\'t be '
                              'removed <%s> (%s).' %
                              (self.job_config.job_id, str(e)))
                    raise JobError('The specified job data directory couldn\'t '
                                   'be removed <%s> (%s)' %
                                   (self.job_config.job_id, str(e)))
        finally:
            # Set the host value back to its original value, even on error.
            self.host = host_tmp
コード例 #12
0
    def collect_output(self, destination):
        """Collect job output to *destination*, gathering slave results first.

        Output produced on any slave nodes is consolidated onto the master
        node before the base implementation pulls the files back.
        """
        LOG.debug('Gather files from slave nodes to master...')
        job_dir = self.platform_config.storage_job_directory

        # The master is always the first entry in self.running_nodes; every
        # later entry is a slave whose results need gathering. Single-node
        # runs have no slaves and skip the gather step entirely.
        master_node = self.running_nodes[0][0]
        slave_nodes = [entry[0] for entry in self.running_nodes[1:]]

        if slave_nodes:
            slave_private_ips = [node.private_ips[0] for node in slave_nodes]
            self._gather_results_data(master_node.public_ips[0],
                                      slave_private_ips,
                                      self.platform_config.user_id,
                                      self.platform_config.user_key_file,
                                      job_dir, self.job_config.job_id)

        # Using the base implementation of job output file collection...
        JobDeploymentBase.collect_output(self, destination)
コード例 #13
0
 def transfer_files(self):
     """Transfer input files to the target platform (base behaviour only)."""
     JobDeploymentBase.transfer_files(self)
コード例 #14
0
 def deploy_software(self):
     """Deploy software to the target platform (base behaviour only)."""
     JobDeploymentBase.deploy_software(self)
コード例 #15
0
 def initialise_resources(self, resource_config=None, num_resources=1,
                          resource_type='m1.small', job_id=None):
     """Start cloud resources for this job via the libcloud driver.

     Selects an image ID from the job configuration — preferring the
     unconfigured image when a resource configuration is supplied —
     validates the image and resource size against the target platform,
     and then requests the node via ``driver.create_node``.

     Keyword arguments:
     resource_config -- configuration data for setting up an unconfigured
                        image (required when only an unconfigured image ID
                        is available).
     num_resources   -- requested number of resources (currently only used
                        in the log message).
     resource_type   -- name of the platform resource size to use.
     job_id          -- used as the instance name; a generated instance ID
                        is used when not provided.

     Raises ResourceInitialisationError if no usable image can be
     identified, the platform cannot be contacted, or the image/size is
     not available on the target platform.
     """
     JobDeploymentBase.initialise_resources(self)
     # Start up the cloud resources here and wait for them to reach the
     # running state. Need to know the image ID that we're starting. The
     # image ID is available from the job configuration.
     image_id = None
     image_id_configured = self.job_config.image_id_pre_configured
     image_id_unconfigured = self.job_config.image_id_unconfigured

     if image_id_configured and not image_id_unconfigured:
         image_id = image_id_configured
         LOG.debug('Only a configured image identifier has been provided, '
                   'using image ID <%s>.' % image_id)
     elif (not image_id_configured) and image_id_unconfigured:
         # An unconfigured image is unusable without configuration data.
         if not resource_config:
             LOG.error('Only an unconfigured image ID provided but '
                       'no resource configuration has been provided.')
             raise ResourceInitialisationError('ERROR: Only an unconfigured '
                                     'image type is available but no image '
                                     'configuration has been provided.')
         image_id = image_id_unconfigured
         LOG.debug('Only an unconfigured image identifier has been '
                   'provided, using image ID <%s>.' % image_id)
     elif image_id_configured and image_id_unconfigured:
         image_id = image_id_unconfigured if resource_config else image_id_configured
         LOG.debug('Both configured and unconfigured images provided, '
                   'using image ID <%s>.' % image_id)
     else:
         raise ResourceInitialisationError('ERROR: No image information '
                          'available in the platform configuration, unable '
                          'to initialise resources.')

     # Check that the image is present and then use the libcloud driver to
     # start the resources and return once they're running.
     # TODO: This is currently synchronous but could also be done
     # asynchronously using a callback to notify the caller when the nodes
     # are ready.
     try:
         img = self.driver.get_image(image_id)
     except socket.error as e:
         raise ResourceInitialisationError('ERROR contacting the remote '
                          'cloud platform. Do you have an active network '
                          'connection? - <%s>' % str(e))
     # Catch Exception rather than using a bare except so that
     # SystemExit/KeyboardInterrupt are not swallowed here.
     except Exception:
         raise ResourceInitialisationError('ERROR: The specified image <%s> '
                          'is not present on the target platform, unable '
                          'to start resources.' % image_id)

     sizes = self.driver.list_sizes()
     size = next((s for s in sizes if s.name == resource_type), None)
     if not size:
         raise ResourceInitialisationError('ERROR: The specified resource '
                          'size <%s> is not present on the target platform. '
                          'Unable to start resources.' % resource_type)

     # Get the keypair name from the configuration.
     keypair_name = self.job_config.key_name

     # At this point we know that the image is available and the specified
     # resource type is valid so we can request to start the instance(s).
     LOG.debug('About to start <%s> resources of type <%s> based on image '
               '<%s (%s)> with keypair <%s>.' % (num_resources, size.name,
               img.id, img.name, keypair_name))

     # When starting a resource we need the name, image, type, keypair,
     # configuration data and details of the number of resources to start.
     name = job_id
     if not name:
         name = generate_instance_id()

     self.driver.create_node(name=name, image=img, size=size,
                             ex_keyname=keypair_name)
     return
コード例 #16
0
    def initialise_resources(self,
                             prefer_unconfigured=True,
                             num_processes=1,
                             processes_per_node=1,
                             node_type='m1.small',
                             job_id=None,
                             retries=3,
                             software_config=None):
        JobDeploymentBase.initialise_resources(self)
        # Start up the cloud resources here and wait for them to reach the
        # running state. Need to know the image ID that we're starting. The
        # image ID is available from the job configuration
        image_id = None
        image_preconfigured_id = self.platform_config.image_preconfigured_id
        image_unconfigured_id = self.platform_config.image_unconfigured_id

        # Store whether or not we're using an unconfigured image - this
        # determines whether we end up running the deploy software function
        # or not.
        self.use_unconfigured = False
        if image_preconfigured_id and not image_unconfigured_id:
            image_id = image_preconfigured_id
            LOG.debug('Only a configured image identifier has been provided, '
                      'using image ID <%s>.' % image_id)
        elif (not image_preconfigured_id) and image_unconfigured_id:
            image_id = image_unconfigured_id
            self.use_unconfigured = True
            LOG.debug('Only an unconfigured image identifier has been '
                      'provided, using image ID <%s>.' % image_id)
            if not software_config:
                raise JobError(
                    'Only an unconfigured image identifier has been '
                    'provided but no software config has been specified. '
                    'Unable to continue...')
        elif image_preconfigured_id and image_unconfigured_id:
            LOG.debug('Both configured and unconfigured images provided...')
            if prefer_unconfigured:
                image_id = image_unconfigured_id
                self.use_unconfigured = True
                LOG.debug('Using unconfigured image ID <%s>.' % image_id)
                if not software_config:
                    raise JobError(
                        'An unconfigured image identifier has been '
                        'chosen but no software config has been specified. '
                        'Unable to continue...')
            else:
                image_id = image_preconfigured_id
                LOG.debug('Using pre-configured image ID <%s>.' % image_id)
        else:
            raise ResourceInitialisationError(
                'ERROR: No image information '
                'available in the platform configuration, unable '
                'to initialise resources.')

        # If we're using an unconfigured image, we need to prepare the admin
        # security context based on the information that should be provided
        # in the YAML file with the unconfigured image details.
        if self.use_unconfigured:
            self.admin_ctx = saga.Context("ssh")
            self.admin_ctx.user_id = self.platform_config.image_unconfigured_admin_key_user
            self.admin_ctx.user_key = self.platform_config.image_unconfigured_admin_key_file

        # Check that the image is present and then use the libcloud driver to
        # start the resources and return once they're running.
        # TODO: This is currently synchronous but could also be done
        # asynchronously using a callback to notify the caller when the nodes
        # are ready.

        #images = self.driver.list_images()
        #img = next((i for i in images if i.id == image_id), None)
        #if not img:

        img = None
        try:
            #img = self.driver.get_image(image_id)
            images = self.driver.list_images()
            for image in images:
                if image.id == image_id:
                    img = image
                    break
            if img == None:
                raise ResourceInitialisationError('The specified image <%s> '
                                                  'could not be found' %
                                                  image_id)
        except socket.error as e:
            img = None
            raise ResourceInitialisationError(
                'ERROR contacting the remote '
                'cloud platform. Do you have an active network '
                'connection? - <%s>' % str(e))
        except Exception as e:
            LOG.debug('ERROR STRING: %s' % str(e))
            img = None
            if str(e).startswith('Unauthorized:'):
                raise InvalidCredentialsError(
                    'ERROR: Access to the cloud '
                    'platform at <%s> was not authorised. Are your '
                    'credentials correct?' %
                    (self.platform_config.platform_service_host + ':' +
                     str(self.platform_config.platform_service_port)))
            else:
                raise ResourceInitialisationError(
                    'ERROR: The specified image <%s> '
                    'is not present on the target platform, unable '
                    'to start resources.' % image_id)

        sizes = self.driver.list_sizes()
        size = next((s for s in sizes if s.id == node_type), None)
        if not size:
            raise ResourceInitialisationError(
                'ERROR: The specified resource '
                'size (node_type) <%s> is not present on the '
                'target platform. Unable to start resources. Have '
                'you set the node_type parameter in your job spec?' %
                node_type)

        # Get the keypair name from the configuration
        # If we're using an unconfigured resource, we use the admin key pair
        # name if provided.
        if self.use_unconfigured and self.platform_config.image_unconfigured_admin_key_name:
            keypair_name = self.platform_config.image_unconfigured_admin_key_name
        else:
            keypair_name = self.platform_config.user_key_name

        # Get the number of resources from the job configuration
        # TODO: Fix this to obtain number of cores per node from the cloud
        # cloud platform. For now use the specified processes_per_node in the
        # job specification.
        cores_per_node = processes_per_node
        #cores_per_node = self.RESOURCE_TYPE_CORES[node_type]
        #if cores_per_node < processes_per_node:
        #    LOG.debug('A processes_per_node value <%s> greater than the number '
        #              'of cores in a node <%s> has been specified. Altering '
        #              'processes per node to the maximum available on this '
        #              'node type <%s>.' % (processes_per_node, cores_per_node,
        #                                   node_type))
        #    processes_per_node = cores_per_node
        num_nodes = int(ceil(float(num_processes) / float(processes_per_node)))

        # At this point we know that the image is available and the specified
        # resource type is valid so we can request to start the instance(s)
        LOG.debug('About to start <%s> resources of type <%s> based on image '
                  '<%s (%s)> with keypair <%s>.' %
                  (num_nodes, size.name, img.id, img.name, keypair_name))

        # When starting a resource we need the name, image, type, keypair,
        # configuration data and details of the number of resources to start.
        name = job_id
        if not name:
            name = generate_instance_id()

        self.nodes = self.driver.create_node(name=name,
                                             image=img,
                                             size=size,
                                             ex_keyname=keypair_name,
                                             ex_mincount=num_nodes,
                                             ex_maxcount=num_nodes)

        if type(self.nodes) != type([]):
            self.nodes = [self.nodes]

        self.running_nodes = self.driver.wait_until_running(self.nodes)

        # Before we return details of the running nodes, we need to check
        # that they're accessible - it takes some time for the nodes to boot
        # and become available. We do this by setting up a handle to a
        # directory - we assume all nodes have a '/' directory - and then
        # trying to list that directory. If an exception is thrown, we assume
        # that the nodes are not yet available.

        # TODO: Need to replace this wait with a reliable check as to whether
        # the server is up and running. Looks like, for now, this will need to
        # use Paramiko while awaiting updates on saga-python.
        #LOG.debug('Waiting 60 seconds for node to boot...')
        #time.sleep(60)
        # Replaced 60 second wait with check using Paramiko to see if
        # resource is accessible...
        LOG.debug('Checking node is available...')

        nodes_to_check = []
        for node in self.running_nodes:
            nodes_to_check.append(node[0].public_ips[0])

        res = self._wait_for_node_accessbility(
            nodes_to_check,
            self.platform_config.user_id,
            self.platform_config.user_key_file,
            retries=retries)
        if not res:
            # We still have nodes that are not avialable so assume there's a
            # problem and throw a job error.
            raise JobError('After <%s> retries, the following nodes are '
                           'still not accessible <%s>. Cancelling job.' %
                           (retries, nodes_to_check))

        # If we have multiple nodes, now is the time to create the machinefile
        # for MPI job runs
        # For the machinefile we need the private IP of each node and the
        # number of cores.
        machinefile = tempfile.NamedTemporaryFile('w', delete=True)
        machinefile.write("# Machine file for MPI job runs\n")
        for node in self.running_nodes:
            machinefile.write(
                '%s slots=%s max_slots=%s\n' %
                (node[0].private_ips[0], cores_per_node, cores_per_node))
        machinefile.flush()
        LOG.debug('The following machinefile has been created:\n\n%s\n' %
                  machinefile.name)

        # The master node is always considered to be node 0 in
        # the self.running_nodes list.
        LOG.debug('Copying machinefile to master node...')
        saga_machinefile = File('file://%s' % machinefile.name,
                                session=self.session)
        saga_machinefile.copy('sftp://%s/tmp/machinefile' %
                              self.running_nodes[0][0].public_ips[0])
        machinefile.close()
        LOG.debug('machinefile copied to master node...')

        conn = PTYShell('ssh://%s' % self.running_nodes[0][0].public_ips[0],
                        session=self.session)
        conn.run_sync('chmod 644 /tmp/machinefile')
        LOG.debug('Set permissions on /tmp/machinefile on master node to 644.')

        return self.running_nodes
コード例 #17
0
 def collect_output(self, destination):
     """Collect the job's output into *destination* by delegating to the
     base-class implementation; no deployer-specific handling is added."""
     JobDeploymentBase.collect_output(self, destination)
コード例 #18
0
    def deploy_software(self, software_config=None):
        """Deploy the requested software configuration(s) to the running nodes.

        Deployment only takes place when an unconfigured image is in use;
        pre-configured images are assumed to have the software installed.

        :param software_config: a software configuration ID, or a list of
            IDs, known to the SoftwareConfigManager. If None, nothing is
            deployed.
        :raises JobError: if no admin security context is available, a
            configuration ID cannot be resolved, or a configuration targets
            a different OS/flavour than the unconfigured image in use.
        """
        JobDeploymentBase.deploy_software(self)
        # Here we undertake transfer of the code to the remote platform if this
        # is required. In many cases, the software is likely to already be
        # deployed on the target platform or may have been configured via a
        # tool such as cloud-init, puppet, etc at resource initialisation time.
        LOG.debug('Deploy software...')

        # If we're not using an unconfigured image, we don't need to run the
        # deploy software function
        if not self.use_unconfigured:
            # FIX: the message previously said "running" although this
            # branch skips the deployment process.
            LOG.info('Using a pre-configured image so skipping software '
                     'deployment process...')
            return

        # Software deployment requires root access to the target node(s). This
        # should be possible using the key that has been passed to start the
        # resource(s).

        # If no software configuration is provided, we ignore this function
        # call and return. If a configuration is provided, we check that the
        # configuration is for the right type of platform and then deploy
        # the software.
        if not software_config:
            return

        if type(software_config) != type([]):
            software_config = [software_config]

        LOG.debug('Received a request to deploy the following software '
                  'configuration IDs to the target platforms: <%s>...' %
                  software_config)

        # Check that we have an admin security context available. If we don't
        # we can't connect to the remote resource(s) to do the required
        # configuration
        if not self.admin_ctx:
            # FIX: the original error message was truncated after
            # "Ensure that ".
            raise JobError(
                'deploy_software: There is no admin context '
                'available so it will not be possible to connect '
                'to remote resources to configure them. Ensure that '
                'an admin security context is configured for the '
                'unconfigured image on this platform.')

        # Check that we can get each of the software configurations and that
        # each one supports the target deployment platform.
        scm = SoftwareConfigManager.get_instance()
        scm.init_configuration()

        os_name = self.platform_config.image_unconfigured_os
        flavour = self.platform_config.image_unconfigured_flavour
        admin_key_user = self.platform_config.image_unconfigured_admin_key_user
        admin_key_file = self.platform_config.image_unconfigured_admin_key_file

        sc_dict = {}
        for sc in software_config:
            try:
                conf = scm.get_software_configuration(sc)
                sc_dict[sc] = conf
            except ValueError as e:
                raise JobError('Job error - no software could be found for '
                               'the configuration id <%s>: %s' % (sc, str(e)))

            if not ((os_name == conf.software_os_type) and
                    (flavour == conf.software_os_flavour)):
                LOG.error(
                    'The OS <%s> and flavour <%s> in the provided software '
                    'configuration don\'t match the target platform with '
                    'OS <%s> and flavour <%s>.' %
                    (conf.software_os_type, conf.software_os_flavour, os_name,
                     flavour))
                raise JobError(
                    'The OS <%s> and flavour <%s> in the provided '
                    'software configuration don\'t match the target '
                    'platform with OS <%s> and flavour <%s>.' %
                    (conf.software_os_type, conf.software_os_flavour, os_name,
                     flavour))

        # If we reach this point we assume that each of the software
        # configurations has been found and they are for the right target
        # platform.
        for sc_key in sc_dict.keys():
            sc_obj = sc_dict[sc_key]
            install_commands = sc_obj.get_install_commands()

            # Now run each of the install commands synchronously on all of the
            # target machines to get the software installed.
            node_ips = [node[0].public_ips[0] for node in self.running_nodes]
            LOG.debug('Deploying to the following list of nodes: %s' %
                      node_ips)

            # Set up a new session using the admin user and key provided for
            # the unconfigured image.
            adm_session = saga.Session(default=False)
            adm_ctx = saga.Context("ssh")
            adm_ctx.user_id = admin_key_user
            adm_ctx.user_key = admin_key_file
            adm_session.add_context(adm_ctx)
            # Open shell connections to each of the machines
            shell_conns = []
            opts = {}
            opts['ssh_options'] = {'StrictHostKeyChecking': 'no'}
            for node_ip in node_ips:
                conn = PTYShell('ssh://%s' % node_ip,
                                session=adm_session,
                                opts=opts)
                shell_conns.append(conn)
                # FIX: test the configuration being deployed (sc_obj) rather
                # than the stale loop variable 'conf' left over from the
                # validation loop above.
                if sc_obj.software_os_type == 'linux':
                    self._setup_job_account(conn, self.platform_config)
                else:
                    LOG.warning(
                        'Support for creation of job accounts on '
                        'platforms other than linux is not yet supported...')
            # Copy the job account key to the master node
            job_session = saga.Session(default=False)
            job_session.add_context(self.job_ctx)

            keyfile = File('file://%s' % self.platform_config.user_key_file,
                           session=job_session)
            keyfile_target = shell_conns[0].url + os.path.join(
                self.platform_config.user_home, '.ssh', 'id_rsa')
            LOG.debug('Copying job key to target directory <%s>' %
                      keyfile_target)
            keyfile.copy(keyfile_target)
            for cmd in install_commands:
                for shell_connection in shell_conns:
                    if isinstance(cmd, SoftwareConfigFile):
                        LOG.debug(
                            'Software deployment: About to write data to '
                            'remote file <%s> on node <%s>' %
                            (cmd.filename, shell_connection.url))
                        shell_connection.write_to_remote(
                            cmd.data, cmd.filename)
                    else:
                        LOG.debug('Software deployment: About to run command '
                                  '<%s> on resource <%s>...' %
                                  (cmd, shell_connection.url))
                        # FIX: build the command to run in a local variable
                        # so that 'sudo ' is not prepended repeatedly when
                        # the same command is run on several nodes (cmd was
                        # previously mutated inside this inner loop).
                        run_cmd = cmd
                        if admin_key_user != 'root':
                            run_cmd = 'sudo ' + cmd
                        result, out, err = shell_connection.run_sync(run_cmd)
                        LOG.debug('Command completed - Exit code: <%s>, '
                                  'StdOut: <%s>, StdErr:\n<%s>' %
                                  (result, out, err))
コード例 #19
0
    def transfer_files(self):
        """Transfer any input files to the job directory on the target nodes.

        Files are pushed via SFTP to the master node (node 0 of
        self.running_nodes); if slave nodes are present, the master is then
        directed to distribute the data to them.

        :raises JobError: if the job storage directory does not exist on the
            master node, or an input file cannot be copied to it.
        """
        JobDeploymentBase.transfer_files(self)
        # Here we transfer any input files to the relevant directory on the
        # target platform.
        # Use SAGA-Python to handle the file transfer.
        LOG.debug('Transfer files...')
        job_dir = self.platform_config.storage_job_directory

        # At this point we need to switch back to using the job security
        # context. If we were using unconfigured resources, these will have
        # been configured using an admin context by now.
        self.session = saga.Session(default=False)
        self.session.add_context(self.job_ctx)

        # Begin by checking if we're working with more than one instance, if
        # so we have a master and one or more slave nodes. We'll push the data
        # to the master and then direct the master to distribute it to the
        # slave nodes.
        master_node = self.running_nodes[0][0]
        slave_nodes = []
        if len(self.running_nodes) > 1:
            slave_nodes = [node[0] for node in self.running_nodes[1:]]

        # On the master node: Check that the job storage directory exists and
        # then create a sub-directory specifically for this job.

        # Node is a tuple consisting of two items, the node object and an
        # IP list. For now we work with the node object directly.
        node_ip = master_node.public_ips[0]
        try:
            directory = Directory('sftp://%s%s' % (node_ip, job_dir),
                                  session=self.session)
        except saga.BadParameter as e:
            LOG.error('The specified job directory does not exist on node '
                      '<%s> (%s).' % (node_ip, str(e)))
            # FIX: previously the raise was commented out and execution
            # continued with 'directory' undefined, producing a NameError
            # below. Fail explicitly instead.
            raise JobError('The specified job directory does not exist '
                           'on node <%s> (%s)' % (node_ip, str(e)))
        try:
            # directory.make_dir() does not return a handle to the new directory
            # so need to create the directory URL manually.
            directory.make_dir(self.job_config.job_id)
        except saga.NoSuccess as e:
            # A pre-existing job data directory is tolerated; the directory
            # URL is constructed manually below either way.
            LOG.warning('The specified job data directory already exists on '
                        'node <%s> (%s).' % (node_ip, str(e)))

        job_data_dir = os.path.join(str(directory.url), self.job_config.job_id)

        # Now upload the file(s) to the job data directory
        # and create an input file list containing the resulting locations
        # of the files.
        # There are some cases where jobs may not have input files (they may,
        # for example pull the input files from a remote location as part of
        # the job process) so we first check whether there are any input files
        # to process, if not, then return from this function
        if not self.job_config.input_files:
            LOG.debug('There are no input files to transfer for this job...')
            return

        self.transferred_input_files = []
        for f in self.job_config.input_files:
            try:
                f_obj = File('file://%s' % f, session=self.session)
                f_obj.copy(job_data_dir)
                dest_dir = os.path.join(directory.url.path,
                                        self.job_config.job_id)
                self.transferred_input_files.append(
                    os.path.join(dest_dir, os.path.basename(f_obj.url.path)))
            except Exception:
                # FIX: narrowed from a bare 'except:' so that
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                LOG.error('Error copying the input file <%s> to the remote '
                          'platform.' % f)
                raise JobError('Error copying the input file <%s> to the '
                               'remote platform.' % f)

        # At this point input files have been successfully transferred to
        # the master node. We now direct the master node to send the files
        # to each of the slave nodes:
        if slave_nodes:
            slave_private_ips = [node.private_ips[0] for node in slave_nodes]
            self._distribute_job_data(master_node.public_ips[0],
                                      slave_private_ips,
                                      self.platform_config.user_id,
                                      self.platform_config.user_key_file,
                                      job_dir, self.job_config.job_id)
コード例 #20
0
 def deploy_software(self, *args, **kwargs):
     """Delegate software deployment to the base-class implementation.

     Any positional or keyword arguments are accepted for interface
     compatibility but are not used here.
     """
     JobDeploymentBase.deploy_software(self)
コード例 #21
0
    def initialise_resources(self, *args, **kwargs):
        """Initialise resources for the job.

        Beyond the base-class call there is nothing for the SSH deployer to
        set up; any extra arguments are accepted and ignored. Always returns
        None.
        """
        JobDeploymentBase.initialise_resources(self)
        LOG.debug('SSH Deployer: Initialise resources - Nothing to do here...')

        return None
コード例 #22
0
    def transfer_files(self):
        """Transfer any input files to the job directory on the remote host.

        A per-job sub-directory is created beneath the platform's configured
        job storage directory and each input file is copied into it via SFTP;
        the resulting remote paths are recorded in
        self.transferred_input_files.

        :raises ConnectionError: if the remote resource refuses the
            connection.
        :raises StorageDirectoryNotFoundError: if the base job storage
            directory does not exist on the resource.
        :raises DirectoryExistsError: if the per-job directory already
            exists.
        :raises JobError: if an input file cannot be copied.
        """
        JobDeploymentBase.transfer_files(self)
        LOG.debug('SSH Deployer: Transfer files...')
        # Here we transfer any input files to the relevant directory on the
        # target platform.
        # Use SAGA-Python to handle the file transfer.
        job_dir = self.platform_config.storage_job_directory
        # Check that the job storage directory exists and then create a
        # sub-directory specifically for this job.
        try:
            LOG.debug('URL for file transfer: <sftp://%s:%s%s>' %
                      (self.host, self.port, job_dir))
            directory = Directory('sftp://%s:%s%s' %
                                  (self.host, self.port, job_dir),
                                  session=self.session)
        except saga.BadParameter as e:
            LOG.error('Error setting up connection to resource directory.')
            if 'connection refused' in str(e).lower():
                raise ConnectionError('Unable to connect to remote resource '
                                      'to set up connection to directory.')

            raise StorageDirectoryNotFoundError(
                'The specified job data base '
                'directory does not exist on resource <%s> (%s)' %
                (self.host, str(e)))
        try:
            # directory.make_dir() does not return a handle to the new directory
            # so need to create the directory URL manually.
            directory.make_dir(self.job_config.job_id)
            job_data_dir = os.path.join(str(directory.url),
                                        self.job_config.job_id)
        except saga.NoSuccess as e:
            LOG.error('The specified job data directory already exists on '
                      'resource <%s> (%s).' % (self.host, str(e)))
            raise DirectoryExistsError('The specified job directory already '
                                       'exists on resource <%s> (%s)' %
                                       (self.host, str(e)))

        # Now upload the file(s) to the job data directory
        # and create an input file list containing the resulting locations
        # of the files.
        # There are some cases where jobs may not have input files (they may,
        # for example pull the input files from a remote location as part of
        # the job process) so we first check whether there are any input files
        # to process, if not, then return from this function
        if not self.job_config.input_files:
            LOG.debug('There are no input files to transfer for this job...')
            return

        self.transferred_input_files = []
        for f in self.job_config.input_files:
            try:
                f_obj = File('file://%s' % f, session=self.session)
                f_obj.copy(job_data_dir)
                dest_dir = os.path.join(directory.url.path,
                                        self.job_config.job_id)
                self.transferred_input_files.append(
                    os.path.join(dest_dir, os.path.basename(f_obj.url.path)))
            except Exception:
                # FIX: narrowed from a bare 'except:' so that
                # KeyboardInterrupt/SystemExit are no longer swallowed.
                LOG.error('Error copying the input file <%s> to the remote '
                          'platform.' % f)
                raise JobError('Error copying the input file <%s> to the '
                               'remote platform.' % f)
コード例 #23
0
 def run_job(self, job_details=None):
     """Run the job by delegating to the base-class implementation.

     Note: *job_details* is accepted for interface compatibility but is not
     passed on to the base class.
     """
     JobDeploymentBase.run_job(self)
コード例 #24
0
 def collect_output(self):
     """Collect the job's output by delegating to the base-class
     implementation; no deployer-specific handling is added."""
     JobDeploymentBase.collect_output(self)
コード例 #25
0
 def shutdown_resources(self):
     """Shut down resources associated with the job - a no-op for the SSH
     deployer beyond the base-class call."""
     JobDeploymentBase.shutdown_resources(self)
     # This is where running resources for the job would be terminated and
     # waited on; the SSH deployer has none of its own to manage.
     LOG.debug('SSH Deployer: Shutdown resources - nothing to do here.')