Ejemplo n.º 1
0
    def __init__(
            self,
            name,
            # these parameters are inherited from the `LRMS` class
            architecture,
            max_cores,
            max_cores_per_job,
            max_memory_per_core,
            max_walltime,
            # these are specific of the EC2Lrms class
            ec2_region,
            keypair_name,
            public_key,
            vm_auth,
            image_id=None,
            image_name=None,
            ec2_url=None,
            instance_type=None,
            auth=None,
            vm_pool_max_size=None,
            user_data=None,
            **extra_args):
        LRMS.__init__(self, name, architecture, max_cores, max_cores_per_job,
                      max_memory_per_core, max_walltime, auth, **extra_args)

        self.free_slots = int(max_cores)
        self.user_run = 0
        self.user_queued = 0
        self.queued = 0
        self.vm_pool_max_size = vm_pool_max_size
        if vm_pool_max_size is not None:
            try:
                self.vm_pool_max_size = int(self.vm_pool_max_size)
            except ValueError:
                raise ConfigurationError(
                    "Value for `vm_pool_max_size` must be an integer,"
                    " was %s instead." % vm_pool_max_size)

        self.subresource_type = self.type.split('+', 1)[1]
        if self.subresource_type not in available_subresource_types:
            raise UnrecoverableError("Invalid resource type: %s" % self.type)

        self.region = ec2_region

        # Mapping of job.execution._lrms_vm_id => LRMS
        self.subresources = {}

        auth = self._auth_fn()
        self.ec2_access_key = auth.ec2_access_key
        self.ec2_secret_key = auth.ec2_secret_key
        if ec2_url is None:
            ec2_url = os.getenv('EC2_URL')
        if ec2_url is None:
            raise gc3libs.exceptions.InvalidArgument(
                "Cannot connect to the EC2 API:"
                " No 'EC2_URL' environment variable defined,"
                " and no 'ec2_url' argument passed to the EC2 backend.")
        self.ec2_url = gc3libs.url.Url(ec2_url)

        # Keypair names can only contain alphanumeric chars!
        if re.match(r'.*\W.*', keypair_name):
            raise ConfigurationError(
                "Keypair name `%s` is invalid: keypair names can only contain "
                "alphanumeric chars: [a-zA-Z0-9_]" % keypair_name)
        self.keypair_name = keypair_name
        self.public_key = os.path.expanduser(
            os.path.expandvars(public_key.strip()))
        self.image_id = image_id
        self.image_name = image_name
        self.instance_type = instance_type
        self._instance_type_specs = {}
        self.user_data = user_data

        self._parse_security_group()
        self._conn = None

        # `self.subresource_args` is used to create subresources
        self.subresource_args = extra_args
        self.subresource_args['type'] = self.subresource_type
        self.subresource_args['architecture'] = self['architecture']
        self.subresource_args['max_cores'] = self['max_cores']
        self.subresource_args['max_cores_per_job'] = self['max_cores_per_job']
        self.subresource_args['max_memory_per_core'] = \
            self['max_memory_per_core']
        self.subresource_args['max_walltime'] = self['max_walltime']
        # SSH-specific configuration
        self.subresource_args['transport'] = 'ssh'
        self.subresource_args['auth'] = vm_auth
        self.subresource_args['ssh_timeout'] = 7  # FIXME: hard-coded!
        self.subresource_args['ignore_ssh_host_keys'] = True
        self.subresource_args['keyfile'] = self.public_key
        if self.subresource_args['keyfile'].endswith('.pub'):
            self.subresource_args['keyfile'] = \
              self.subresource_args['keyfile'][:-len('.pub')]
        # ShellcmdLrms by default trusts the configuration, instead of
        # checking the real amount of memory and number of cpus, but
        # we need the real values instead.
        if self.subresource_type == gc3libs.Default.SHELLCMD_LRMS:
            self.subresource_args['override'] = 'True'

        if not image_name and not image_id:
            raise ConfigurationError(
                "No `image_id` or `image_name` has been specified in the"
                " configuration file.")
Ejemplo n.º 2
0
import hashlib
import os
import re
import paramiko

# EC2 APIs
try:
    import boto
    import boto.ec2.regioninfo
    import boto.exception
except ImportError as err:
    from gc3libs.exceptions import ConfigurationError
    raise ConfigurationError(
        "The EC2 backend is used but the `boto` module"
        " cannot be used: {err}. Please, either install it with"
        "`pip install boto` and verify that it works"
        " by running `python -c 'import boto'`,"
        " then try again, or update your configuration file and"
        " disable any EC2-based resources.".format(err=err))

import Crypto.PublicKey.DSA
import Crypto.PublicKey.RSA

# GC3Pie imports
import gc3libs
from gc3libs.exceptions import UnrecoverableError, \
    ConfigurationError, MaximumCapacityReached, ResourceNotReady, \
    UnrecoverableAuthError, TransportError
import gc3libs.url
from gc3libs import Run
from gc3libs.utils import same_docstring_as, insert_char_every_n_chars
Ejemplo n.º 3
0
    def __init__(self, name,
                 # these parameters are inherited from the `LRMS` class
                 architecture, max_cores, max_cores_per_job,
                 max_memory_per_core, max_walltime,
                 # these are specific of the OpenStackLrms class
                 keypair_name, public_key, vm_auth,
                 os_region=None, image_id=None, os_auth_url=None,
                 instance_type=None, auth=None,
                 vm_pool_max_size=None, user_data=None,
                 vm_os_overhead=gc3libs.Default.VM_OS_OVERHEAD,
                 # extra args are used to instanciate "sub-resources"
                 **extra_args):

        # Note: this creates attributes from key/value pairs given in the
        # `extra_args` parameters. In particular, the `self.type` attribute
        # (referenced below) is set in this chained constructor...
        LRMS.__init__(
            self, name,
            architecture, max_cores, max_cores_per_job,
            max_memory_per_core, max_walltime, auth, **extra_args)

        self.free_slots = int(max_cores)
        self.user_run = 0
        self.user_queued = 0
        self.queued = 0
        self._flavors = []
        self.vm_pool_max_size = vm_pool_max_size
        if vm_pool_max_size is not None:
            try:
                self.vm_pool_max_size = int(self.vm_pool_max_size)
            except ValueError:
                raise ConfigurationError(
                    "Value for `vm_pool_max_size` must be an integer,"
                    " was %s instead." % vm_pool_max_size)

        # pylint: disable=no-member
        self.subresource_type = self.type.split('+', 1)[1]
        if self.subresource_type not in available_subresource_types:
            raise UnrecoverableError("Invalid resource type: %s" % self.type)

        # Mapping of job.execution.instance_id => LRMS
        self.subresources = {}

        auth = self._auth_fn()
        if os_auth_url is None:
            os_auth_url = os.getenv('OS_AUTH_URL')
        if os_auth_url is None:
            raise gc3libs.exceptions.InvalidArgument(
                "Cannot connect to the OpenStack API:"
                " No 'OS_AUTH_URL' environment variable defined,"
                " and no 'os_auth_url' argument passed"
                " to the OpenStack backend.")
        self.os_auth_url = os_auth_url
        self.os_username = auth.os_username
        self.os_password = auth.os_password
        self.os_tenant_name = auth.os_project_name
        self.os_region_name = os_region
        if self.os_auth_url is None:
            raise gc3libs.exceptions.InvalidArgument(
                "Cannot connect to the OpenStack API:"
                " No 'os_auth_url' argument passed to the OpenStack backend.")
        # Keypair names can only contain alphanumeric chars!
        if not set(keypair_name).issubset(set(ascii_letters + digits + '_')):
            raise ConfigurationError(
                "Keypair name `%s` is invalid: keypair names can only contain "
                "alphanumeric chars: [a-zA-Z0-9_]" % keypair_name)
        self.keypair_name = keypair_name
        self.public_key = os.path.expanduser(
            os.path.expandvars(public_key.strip()))
        self.image_id = image_id
        self.instance_type = instance_type
        self.user_data = user_data
        self.vm_os_overhead = gc3libs.quantity.Memory(vm_os_overhead)
        self._parse_security_group()
        self._conn = None

        # `*_instance_type` config items should be consumed here,
        # not in any sub-resource
        for key, value in extra_args.items():
            if key.endswith('_instance_type'):
                self[key] = value
                extra_args.pop(key)

        # `self.subresource_args` is used to create subresources
        self.subresource_args = extra_args
        self.subresource_args['type'] = self.subresource_type
        self.subresource_args['architecture'] = self['architecture']
        self.subresource_args['max_cores'] = self['max_cores']
        self.subresource_args['max_cores_per_job'] = self['max_cores_per_job']
        self.subresource_args['max_memory_per_core'] = \
            self['max_memory_per_core']
        self.subresource_args['max_walltime'] = self['max_walltime']
        # SSH-specific configuration
        self.subresource_args['transport'] = 'ssh'
        self.subresource_args['auth'] = vm_auth
        self.subresource_args['ssh_timeout'] = 7  # FIXME: hard-coded!
        self.subresource_args['ignore_ssh_host_keys'] = True
        self.subresource_args['keyfile'] = self.public_key
        if self.subresource_args['keyfile'].endswith('.pub'):
            self.subresource_args['keyfile'] = \
              self.subresource_args['keyfile'][:-len('.pub')]
        # ShellcmdLrms by default trusts the configuration, instead of
        # checking the real amount of memory and number of cpus, but
        # we need the real values instead.
        if self.subresource_type == gc3libs.Default.SHELLCMD_LRMS:
            self.subresource_args['override'] = 'True'

        if image_id is None:
            raise ConfigurationError(
                "No `image_id` specified in the configuration file.")

        # "Connect" to the cloud (connection is actually performed
        # only when needed by the `Client` class.

        self.client = self._new_client()

        # Set up the VMPool persistent class. This has been delayed
        # until here because otherwise self._conn is None
        pooldir = os.path.join(os.path.expandvars(OpenStackLrms.RESOURCE_DIR),
                               'vmpool', self.name)
        self._vmpool = OpenStackVMPool(pooldir, self.client)
Ejemplo n.º 4
0
    def submit_job(self, job):
        """
        Submission on an OpenStack resource will usually happen in
        multiple steps, since creating a VM and attaching a resource
        to it will take some time.

        In order to return as soon as possible, the backend will raise
        a `RecoverableError` whenever submission is delayed.

        In case a permanent error is found (for instance, we cannot
        create VMs on the cloud), a `UnrecoverableError` is raised.

        More in detail, when during submission the following will
        happen:

        * First of all, the backend will try to submit the job to one
          of the already available subresources.

        * If none of them is able to sbmit the job, the backend will
          check if there is a VM in pending state, and in case there
          is one it will raise a `RecoverableError`, thus delaying
          submission.

        * If no VM in pending state is found, the `vm_pool_max_size`
          configuration option is checked. If we already reached the
          maximum number of VM, a `UnrecoverableError` is raised.

        * If no VM in pending state is found but `vm_pool_max_size` is
          still lesser than the number of VM currently created (or it
          is None, which for us means no limit), then a new VM is
          created, and `RecoverableError` is raised.

        """
        # Updating resource is needed to update the subresources. This
        # is not always done before the submit_job because of issue
        # nr.  386:
        #     https://github.com/uzh/gc3pie/issues/386
        self.get_resource_status()
        pending_vms = set(vm.id for vm in self._vmpool.get_all_vms()
                          if vm.status in PENDING_STATES)

        image_id = self.get_image_id_for_job(job)
        # Check if the image id is valid
        if image_id not in [img.id for img in self._get_available_images()]:
            raise ConfigurationError("Image ID %s not found in cloud "
                                     "%s" % (image_id, self.os_auth_url))

        instance_type = self.get_instance_type_for_job(job)
        if not instance_type:
            raise RuntimeError(
                "Unable to find a suitable instance type for "
                "application %s" % job)

        # First of all, try to submit to one of the subresources.
        for vm_id, subresource in self.subresources.items():
            if not subresource.updated:
                # The VM is probably still booting, let's skip to the
                # next one and add it to the list of "pending" VMs.
                pending_vms.add(vm_id)
                continue
            try:
                # Check that the required image id and instance type
                # are correct
                vm = self._get_vm(vm_id)
                if vm.image['id'] != image_id:
                    continue
                subresource.submit_job(job)
                job.execution._lrms_vm_id = vm_id
                job.changed = True
                gc3libs.log.info(
                    "Job successfully submitted to remote resource %s.",
                    subresource.name)
                return job
            except (LRMSSubmitError, InstanceNotFound) as ex:
                if gc3libs.error_ignored(
                        # context:
                        # - module
                        'openstack',
                        # - class
                        'OpenStackLrms',
                        # - method
                        'submit_job',
                        # - actual error class
                        ex.__class__.__name__,
                        # - additional keywords
                        'submit',
                ):
                    gc3libs.log.debug(
                        "Ignoring error in submitting to resource '%s': %s",
                        subresource.name, ex)
                else:
                    # propagate error back to caller
                    raise

        # Couldn't submit to any resource.
        if not pending_vms:
            # No pending VM, and no resource available. Create a new VM
            if not self.vm_pool_max_size \
                    or len(self._vmpool) < self.vm_pool_max_size:
                user_data = self.get_user_data_for_job(job)
                vm = self._create_instance(
                    image_id,
                    name="GC3Pie_%s_%d" % (self.name, (len(self._vmpool) + 1)),
                    instance_type=instance_type,
                    user_data=user_data)
                pending_vms.add(vm.id)

                self._vmpool.add_vm(vm)
            else:
                raise MaximumCapacityReached(
                    "Already running the maximum number of VM on resource %s:"
                    " %d VMs started, but max %d allowed by configuration."
                    % (self.name, len(self._vmpool), self.vm_pool_max_size),
                    do_log=True)

        # If we reached this point, we are waiting for a VM to be
        # ready, so delay the submission until we wither can submit to
        # one of the available resources or until all the VMs are
        # ready.
        gc3libs.log.debug(
            "No available resource was found, but some VM is still in"
            " `pending` state. Waiting until the next iteration before"
            " creating a new VM. Pending VM ids: %s", pending_vms)
        raise LRMSSkipSubmissionToNextIteration(
            "Delaying submission until one of the VMs currently pending"
            " is ready. Pending VM ids: %s"
            % str.join(', ', pending_vms))
Ejemplo n.º 5
0
if sys.version_info < (2, 7):
    raise ImportError("GC3Pie's OpenStack backend requires Python 2.7+")

import os
import paramiko
from string import ascii_letters, digits

# OpenStack APIs
try:
    import os_client_config
except ImportError as err:
    from gc3libs.exceptions import ConfigurationError
    raise ConfigurationError(
        "The OpenStack backend is used but the `os_client_config` module"
        " cannot be used: {err}. Please, either install it with"
        "`pip install os-client-config` and verify that it works"
        " by running `python -c 'import os_client_config'`,"
        " then try again, or update your configuration file and"
        " disable any OpenStack-based resources."
        .format(err=err))

try:
    from novaclient.exceptions import NotFound
except ImportError as err:
    from gc3libs.exceptions import ConfigurationError
    raise ConfigurationError(
        "The OpenStack backend is used but the `novaclient` module"
        " cannot be used: {err}. Please, either install it with"
        "`pip install python-novaclient` and verify that it works"
        " by running `python -c 'import novaclient'`,"
        " then try again, or update your configuration file and"
        " disable any OpenStack-based resources."