コード例 #1
0
    def initialise_resources(self, *args, **kwargs):
        """Open the SAGA job service for the configured PBS host.

        No resources are started here. The call is used to set up the
        connection to the PBS platform through the SAGA-Python library; the
        resulting service handle is stored on ``self.svc``.

        Extra positional and keyword arguments are accepted and ignored.
        Always returns ``None``.
        """
        JobDeploymentBase.initialise_resources(self)

        # The service is reached over SSH on the host named in the platform
        # configuration, using this deployer's existing session.
        host = self.platform_config.platform_service_host
        self.svc = saga.job.Service('pbs+ssh://%s/' % host,
                                    session=self.session)
        return None
コード例 #2
0
 def initialise_resources(self, resource_config=None, num_resources=1,
                          resource_type='m1.small', job_id=None):
     """Choose an image and node size, then ask the driver to start a node.

     The image identifier is taken from the job configuration, which may
     hold a pre-configured image ID, an unconfigured image ID, or both:

     * only pre-configured: that image is used;
     * only unconfigured: that image is used, but ``resource_config`` must
       be supplied;
     * both: the unconfigured image is used when ``resource_config`` is
       supplied, otherwise the pre-configured one.

     :param resource_config: configuration for an unconfigured image; only
         its truthiness is inspected here.
     :param num_resources: number of resources requested. It is reported
         in the debug log only.
     :param resource_type: name of the node size, matched against the
         ``name`` of the sizes listed by the driver.
     :param job_id: name for the new node; an instance ID is generated
         when this is falsy.
     :returns: ``None``.
     :raises ResourceInitialisationError: if no usable image ID is
         configured, the image lookup fails, or the node size is unknown.
     """
     JobDeploymentBase.initialise_resources(self)

     # Work out which image to start from the identifiers available in
     # the job configuration.
     image_id_configured = self.job_config.image_id_pre_configured
     image_id_unconfigured = self.job_config.image_id_unconfigured

     if image_id_configured and not image_id_unconfigured:
         image_id = image_id_configured
         LOG.debug('Only a configured image identifier has been provided, '
                   'using image ID <%s>.' % image_id)
     elif (not image_id_configured) and image_id_unconfigured:
         if not resource_config:
             LOG.error('Only an unconfigured image ID provided but '
                       'no resource configuration has been provided.')
             raise ResourceInitialisationError('ERROR: Only an unconfigured '
                                     'image type is available but no image '
                                     'configuration has been provided.')
         image_id = image_id_unconfigured
         LOG.debug('Only an unconfigured image identifier has been '
                   'provided, using image ID <%s>.' % image_id)
     elif image_id_configured and image_id_unconfigured:
         if resource_config:
             image_id = image_id_unconfigured
         else:
             image_id = image_id_configured
         LOG.debug('Both configured and unconfigured images provided, '
                   'using image ID <%s>.' % image_id)
     else:
         raise ResourceInitialisationError('ERROR: No image information '
                          'available in the platform configuration, unable '
                          'to initialise resources.')

     # Check that the image is present before trying to start anything.
     # TODO: This is currently synchronous but could also be done
     # asynchronously using a callback to notify the caller when the nodes
     # are ready.
     try:
         img = self.driver.get_image(image_id)
     except socket.error as e:
         raise ResourceInitialisationError('ERROR contacting the remote '
                          'cloud platform. Do you have an active network '
                          'connection? - <%s>' % str(e))
     except Exception:
         # Deliberately not a bare ``except:`` so that KeyboardInterrupt
         # and SystemExit are not reported as a missing image.
         raise ResourceInitialisationError('ERROR: The specified image <%s> '
                          'is not present on the target platform, unable '
                          'to start resources.' % image_id)

     sizes = self.driver.list_sizes()
     size = next((s for s in sizes if s.name == resource_type), None)
     if not size:
         raise ResourceInitialisationError('ERROR: The specified resource '
                          'size <%s> is not present on the target platform. '
                          'Unable to start resources.' % resource_type)

     # Get the keypair name from the configuration
     keypair_name = self.job_config.key_name

     # At this point we know that the image is available and the specified
     # resource type is valid so we can request to start the instance(s)
     LOG.debug('About to start <%s> resources of type <%s> based on image '
               '<%s (%s)> with keypair <%s>.' % (num_resources, size.name,
               img.id, img.name, keypair_name))

     # Fall back to a generated name when the caller gave no job ID.
     name = job_id
     if not name:
         name = generate_instance_id()

     # NOTE(review): num_resources is not passed to create_node, so this
     # single call presumably starts one node regardless of the requested
     # count - confirm whether that is intended.
     self.driver.create_node(name=name, image=img, size=size,
                             ex_keyname=keypair_name)
     return
コード例 #3
0
    def initialise_resources(self, *args, **kwargs):
        """Run the base-class initialisation and log that nothing is needed.

        Extra positional and keyword arguments are accepted and ignored.
        Always returns ``None``.
        """
        JobDeploymentBase.initialise_resources(self)
        message = 'SSH Deployer: Initialise resources - Nothing to do here...'
        LOG.debug(message)
        return None
コード例 #4
0
    def initialise_resources(self,
                             prefer_unconfigured=True,
                             num_processes=1,
                             processes_per_node=1,
                             node_type='m1.small',
                             job_id=None,
                             retries=3,
                             software_config=None):
        """Start the cloud nodes for a job and prepare them for MPI runs.

        The image to boot is chosen from the platform configuration, enough
        nodes of ``node_type`` are started to host ``num_processes``
        processes, and the call blocks until the driver reports them running
        and the accessibility check succeeds. A machinefile listing every
        node is then copied to ``/tmp/machinefile`` on the first node and
        made world-readable.

        Side effects: sets ``self.use_unconfigured``, ``self.nodes`` and
        ``self.running_nodes``, and ``self.admin_ctx`` when an unconfigured
        image is used.

        :param prefer_unconfigured: when both a pre-configured and an
            unconfigured image ID are configured, selects the unconfigured
            one if true and the pre-configured one otherwise.
        :param num_processes: total number of processes the job will run.
        :param processes_per_node: processes to place on each node; also
            written to the machinefile as the slot count of every node.
        :param node_type: node size identifier, matched against the ``id``
            of the sizes listed by the driver.
        :param job_id: name for the started node(s); an instance ID is
            generated when this is falsy.
        :param retries: retry count passed to the node accessibility check.
        :param software_config: must be truthy whenever the unconfigured
            image ends up being selected.
        :returns: ``self.running_nodes`` as returned by the driver's
            ``wait_until_running``.
        :raises JobError: if an unconfigured image is selected without a
            ``software_config``, or the nodes are still not accessible
            after the accessibility check.
        :raises InvalidCredentialsError: if listing images fails with an
            error whose text starts with ``Unauthorized:``.
        :raises ResourceInitialisationError: if no image ID is configured,
            the image cannot be looked up or found, or ``node_type`` does
            not match any size.
        """
        JobDeploymentBase.initialise_resources(self)

        # The image ID(s) come from the platform configuration.
        image_preconfigured_id = self.platform_config.image_preconfigured_id
        image_unconfigured_id = self.platform_config.image_unconfigured_id

        # Store whether or not we're using an unconfigured image - this
        # determines whether we end up running the deploy software function
        # or not.
        self.use_unconfigured = False
        if image_preconfigured_id and not image_unconfigured_id:
            image_id = image_preconfigured_id
            LOG.debug('Only a configured image identifier has been provided, '
                      'using image ID <%s>.' % image_id)
        elif (not image_preconfigured_id) and image_unconfigured_id:
            image_id = image_unconfigured_id
            self.use_unconfigured = True
            LOG.debug('Only an unconfigured image identifier has been '
                      'provided, using image ID <%s>.' % image_id)
            if not software_config:
                raise JobError(
                    'Only an unconfigured image identifier has been '
                    'provided but no software config has been specified. '
                    'Unable to continue...')
        elif image_preconfigured_id and image_unconfigured_id:
            LOG.debug('Both configured and unconfigured images provided...')
            if prefer_unconfigured:
                image_id = image_unconfigured_id
                self.use_unconfigured = True
                LOG.debug('Using unconfigured image ID <%s>.' % image_id)
                if not software_config:
                    raise JobError(
                        'An unconfigured image identifier has been '
                        'chosen but no software config has been specified. '
                        'Unable to continue...')
            else:
                image_id = image_preconfigured_id
                LOG.debug('Using pre-configured image ID <%s>.' % image_id)
        else:
            raise ResourceInitialisationError(
                'ERROR: No image information '
                'available in the platform configuration, unable '
                'to initialise resources.')

        # If we're using an unconfigured image, we need to prepare the admin
        # security context based on the information that should be provided
        # in the YAML file with the unconfigured image details.
        if self.use_unconfigured:
            self.admin_ctx = saga.Context("ssh")
            self.admin_ctx.user_id = (
                self.platform_config.image_unconfigured_admin_key_user)
            self.admin_ctx.user_key = (
                self.platform_config.image_unconfigured_admin_key_file)

        # Check that the image is present and then use the libcloud driver to
        # start the resources and return once they're running.
        # TODO: This is currently synchronous but could also be done
        # asynchronously using a callback to notify the caller when the nodes
        # are ready.
        image_missing_msg = ('ERROR: The specified image <%s> '
                             'is not present on the target platform, unable '
                             'to start resources.' % image_id)
        try:
            images = self.driver.list_images()
            img = next((i for i in images if i.id == image_id), None)
        except socket.error as e:
            raise ResourceInitialisationError(
                'ERROR contacting the remote '
                'cloud platform. Do you have an active network '
                'connection? - <%s>' % str(e))
        except Exception as e:
            LOG.debug('ERROR STRING: %s' % str(e))
            # Authorisation failures are recognised by the error text.
            if str(e).startswith('Unauthorized:'):
                raise InvalidCredentialsError(
                    'ERROR: Access to the cloud '
                    'platform at <%s> was not authorised. Are your '
                    'credentials correct?' %
                    (self.platform_config.platform_service_host + ':' +
                     str(self.platform_config.platform_service_port)))
            raise ResourceInitialisationError(image_missing_msg)

        # Checked outside the try block so that the "not found" error is
        # raised directly rather than being caught and re-wrapped above.
        if img is None:
            raise ResourceInitialisationError(image_missing_msg)

        sizes = self.driver.list_sizes()
        size = next((s for s in sizes if s.id == node_type), None)
        if not size:
            raise ResourceInitialisationError(
                'ERROR: The specified resource '
                'size (node_type) <%s> is not present on the '
                'target platform. Unable to start resources. Have '
                'you set the node_type parameter in your job spec?' %
                node_type)

        # Get the keypair name from the configuration
        # If we're using an unconfigured resource, we use the admin key pair
        # name if provided.
        admin_key_name = self.platform_config.image_unconfigured_admin_key_name
        if self.use_unconfigured and admin_key_name:
            keypair_name = admin_key_name
        else:
            keypair_name = self.platform_config.user_key_name

        # TODO: Obtain the number of cores per node from the cloud platform
        # and cap processes_per_node at that value. For now the
        # processes_per_node given in the job specification is trusted.
        cores_per_node = processes_per_node
        # Round up so a partially filled node is still started.
        num_nodes = int(ceil(float(num_processes) / float(processes_per_node)))

        # At this point we know that the image is available and the specified
        # resource type is valid so we can request to start the instance(s)
        LOG.debug('About to start <%s> resources of type <%s> based on image '
                  '<%s (%s)> with keypair <%s>.' %
                  (num_nodes, size.name, img.id, img.name, keypair_name))

        # Fall back to a generated name when the caller gave no job ID.
        name = job_id
        if not name:
            name = generate_instance_id()

        self.nodes = self.driver.create_node(name=name,
                                             image=img,
                                             size=size,
                                             ex_keyname=keypair_name,
                                             ex_mincount=num_nodes,
                                             ex_maxcount=num_nodes)

        # The driver may hand back a single node rather than a list.
        if not isinstance(self.nodes, list):
            self.nodes = [self.nodes]

        # NOTE(review): entries of running_nodes are indexed with [0] below
        # to reach the node object - presumably (node, ip_addresses) tuples
        # as returned by libcloud's wait_until_running; confirm for the
        # driver in use.
        self.running_nodes = self.driver.wait_until_running(self.nodes)

        # Being reported as running does not mean a node can be reached
        # yet - it takes some time for the nodes to boot and become
        # available, so check accessibility before using them.
        # TODO: Need to replace this wait with a reliable check as to whether
        # the server is up and running. Looks like, for now, this will need to
        # use Paramiko while awaiting updates on saga-python.
        LOG.debug('Checking node is available...')

        nodes_to_check = [node[0].public_ips[0]
                          for node in self.running_nodes]

        res = self._wait_for_node_accessbility(
            nodes_to_check,
            self.platform_config.user_id,
            self.platform_config.user_key_file,
            retries=retries)
        if not res:
            # We still have nodes that are not available so assume there's a
            # problem and throw a job error.
            raise JobError('After <%s> retries, the following nodes are '
                           'still not accessible <%s>. Cancelling job.' %
                           (retries, nodes_to_check))

        # Create the machinefile for MPI job runs. For each node it needs
        # the private IP and the number of cores. The master node is always
        # considered to be node 0 in the self.running_nodes list.
        master_ip = self.running_nodes[0][0].public_ips[0]

        # The context manager closes (and so deletes) the temporary file even
        # if writing it or copying it to the master node fails.
        with tempfile.NamedTemporaryFile('w', delete=True) as machinefile:
            machinefile.write("# Machine file for MPI job runs\n")
            for node in self.running_nodes:
                machinefile.write(
                    '%s slots=%s max_slots=%s\n' %
                    (node[0].private_ips[0], cores_per_node, cores_per_node))
            # Flush so the full content is on disk before it is copied.
            machinefile.flush()
            LOG.debug('The following machinefile has been created:\n\n%s\n' %
                      machinefile.name)

            LOG.debug('Copying machinefile to master node...')
            saga_machinefile = File('file://%s' % machinefile.name,
                                    session=self.session)
            saga_machinefile.copy('sftp://%s/tmp/machinefile' % master_ip)
        LOG.debug('machinefile copied to master node...')

        conn = PTYShell('ssh://%s' % master_ip, session=self.session)
        conn.run_sync('chmod 644 /tmp/machinefile')
        LOG.debug('Set permissions on /tmp/machinefile on master node to 644.')

        return self.running_nodes