Example #1
    def _createSecurityGroup(self):
        assert self._ctx

        def groupNotFound(e):
            retry = (e.status == 400
                     and 'does not exist in default VPC' in e.body)
            return retry

        vpcId = None
        if self._vpcSubnet:
            conn = boto.connect_vpc(region=self._ctx.ec2.region)
            subnets = conn.get_all_subnets(subnet_ids=[self._vpcSubnet])
            if len(subnets) > 0:
                vpcId = subnets[0].vpc_id
        # Create or fetch the security group: allow SSH from anywhere and all ports within the group
        try:
            web = self._ctx.ec2.create_security_group(
                self.clusterName,
                'Toil appliance security group',
                vpc_id=vpcId)
        except EC2ResponseError as e:
            if e.status == 400 and 'already exists' in e.body:
                pass  # the group already exists - nothing to do
            else:
                raise
        else:
            for attempt in retry(predicate=groupNotFound, timeout=300):
                with attempt:
                    # open port 22 for ssh-ing
                    web.authorize(ip_protocol='tcp',
                                  from_port=22,
                                  to_port=22,
                                  cidr_ip='0.0.0.0/0')
            for attempt in retry(predicate=groupNotFound, timeout=300):
                with attempt:
                    # the following authorizes all TCP access within the web security group
                    web.authorize(ip_protocol='tcp',
                                  from_port=0,
                                  to_port=65535,
                                  src_group=web)
            for attempt in retry(predicate=groupNotFound, timeout=300):
                with attempt:
                    # We also want to open up UDP, both for user code and for the RealtimeLogger
                    web.authorize(ip_protocol='udp',
                                  from_port=0,
                                  to_port=65535,
                                  src_group=web)
        out = []
        for sg in self._ctx.ec2.get_all_security_groups():
            if sg.name == self.clusterName and (vpcId is None
                                                or sg.vpc_id == vpcId):
                out.append(sg)
        return out
Example #2
def _dockerKill(containerName, action):
    """
    Deprecated.  Kills the specified container.
    :param str containerName: The name of the container created by docker_call
    :param int action: What action should be taken on the container?
    """
    running = containerIsRunning(containerName)
    if running is None:
        # This means that the container doesn't exist.  We will see this if the
        # container was run with --rm and has already exited before this call.
        logger.debug(
            'The container with name "%s" appears to have already been '
            'removed.  Nothing to do.', containerName)
    else:
        if action in (None, FORGO):
            logger.debug(
                'The container with name %s continues to exist as we '
                'were asked to forgo a '
                'post-job action on it.', containerName)
        else:
            logger.debug(
                'The container with name %s exists. Running '
                'user-specified defer functions.', containerName)
            if running and action >= STOP:
                logger.debug('Stopping container "%s".', containerName)
                for attempt in retry(predicate=dockerPredicate):
                    with attempt:
                        subprocess.check_call(
                            ['docker', 'stop', containerName])
            else:
                logger.debug('The container "%s" was not found to be running.',
                             containerName)
            if action >= RM:
                # If the container was run with --rm, then stop will most likely
                # remove the container.  We first check if it is running then
                # remove it.
                running = containerIsRunning(containerName)
                if running is not None:
                    logger.debug('Removing container "%s".', containerName)
                    for attempt in retry(predicate=dockerPredicate):
                        with attempt:
                            subprocess.check_call(
                                ['docker', 'rm', '-f', containerName])
                else:
                    logger.debug(
                        'Container "%s" was not found on the system. '
                        'Nothing to remove.', containerName)
Example #3
    def _getProfileArn(self):
        assert self._ctx
        policy = dict(iam_full=self.full_policy('iam'), ec2_full=self.full_policy('ec2'),
                      s3_full=self.full_policy('s3'), sdb_full=self.full_policy('sdb'))
        iamRoleName = self._ctx.setup_iam_ec2_role(role_name=_INSTANCE_PROFILE_ROLE_NAME, policies=policy)

        try:
            profile = self._ctx.iam.get_instance_profile(iamRoleName)
        except BotoServerError as e:
            if e.status == 404:
                profile = self._ctx.iam.create_instance_profile(iamRoleName)
                profile = profile.create_instance_profile_response.create_instance_profile_result
            else:
                raise
        else:
            profile = profile.get_instance_profile_response.get_instance_profile_result
        profile = profile.instance_profile
        profile_arn = profile.arn

        if len(profile.roles) > 1:
            raise RuntimeError('Did not expect profile to contain more than one role')
        elif len(profile.roles) == 1:
            # this should be profile.roles[0].role_name
            if profile.roles.member.role_name == iamRoleName:
                return profile_arn
            else:
                self._ctx.iam.remove_role_from_instance_profile(iamRoleName,
                                                                profile.roles.member.role_name)
        for attempt in retry(predicate=lambda err: err.status == 404):
            with attempt:
                self._ctx.iam.add_role_to_instance_profile(iamRoleName, iamRoleName)
        return profile_arn
Example #4
def _fixPermissions(tool, workDir):
    """
    Deprecated.

    Fix the permissions of a mounted Docker directory by reusing the tool to
    change ownership.  Docker natively runs as root inside the container, so
    files written to the mounted directory are implicitly owned by root.

    :param str tool: Name of tool
    :param str workDir: Path of work directory to recursively chown
    """
    if os.geteuid() == 0:
        # we're running as root so this chown is redundant
        return

    baseDockerCall = [
        'docker', 'run', '--log-driver=none', '-v',
        os.path.abspath(workDir) + ':/data', '--rm', '--entrypoint=chown'
    ]
    stat = os.stat(workDir)
    command = baseDockerCall + [tool] + [
        '-R', '{}:{}'.format(stat.st_uid, stat.st_gid), '/data'
    ]
    for attempt in retry(predicate=dockerPredicate):
        with attempt:
            subprocess.check_call(command)
Example #5
def retry_ec2(retry_after=a_short_time,
              retry_for=10 * a_short_time,
              retry_while=not_found):
    t = retry_after
    return retry(delays=(t, t, t * 2, t * 4),
                 timeout=retry_for,
                 predicate=retry_while)
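
For context, these retry helpers return a generator of context managers; a minimal usage sketch follows (the ec2_connection object, the boto call, and instance_id are illustrative assumptions, not part of the example above):

# Each attempt re-runs the body; errors matching the not_found predicate
# are retried until retry_for seconds have elapsed.
for attempt in retry_ec2():
    with attempt:
        instances = ec2_connection.get_only_instances(instance_ids=[instance_id])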
Example #6
    def _getProfileARN(self):
        assert self._ctx
        def addRoleErrors(e):
            return e.status == 404
        roleName = 'toil'
        policy = dict(iam_full=iamFullPolicy, ec2_full=ec2FullPolicy,
                      s3_full=s3FullPolicy, sdb_full=sdbFullPolicy)
        iamRoleName = self._ctx.setup_iam_ec2_role(role_name=roleName, policies=policy)

        try:
            profile = self._ctx.iam.get_instance_profile(iamRoleName)
        except BotoServerError as e:
            if e.status == 404:
                profile = self._ctx.iam.create_instance_profile(iamRoleName)
                profile = profile.create_instance_profile_response.create_instance_profile_result
            else:
                raise
        else:
            profile = profile.get_instance_profile_response.get_instance_profile_result
        profile = profile.instance_profile
        profile_arn = profile.arn

        if len(profile.roles) > 1:
            raise RuntimeError('Did not expect profile to contain more than one role')
        elif len(profile.roles) == 1:
            # this should be profile.roles[0].role_name
            if profile.roles.member.role_name == iamRoleName:
                return profile_arn
            else:
                self._ctx.iam.remove_role_from_instance_profile(iamRoleName,
                                                                profile.roles.member.role_name)
        for attempt in retry(predicate=addRoleErrors):
            with attempt:
                self._ctx.iam.add_role_to_instance_profile(iamRoleName, iamRoleName)
        return profile_arn
Example #7
    def _try_kubernetes_expecting_gone(self, method, *args, **kwargs):
        """
        Same as _try_kubernetes, but raises 404 errors as soon as they are
        encountered (because we are waiting for them) instead of retrying on
        them.
        """

        for attempt in retry(predicate=retryable_kubernetes_errors_expecting_gone):
            with attempt:
                return method(*args, **kwargs)
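
The predicate retryable_kubernetes_errors_expecting_gone is defined elsewhere in the codebase; as a rough, assumed sketch (not the project's actual definition), it would keep retrying transient API failures while letting 404s propagate immediately:

import urllib3
from kubernetes.client.rest import ApiException

def retryable_kubernetes_errors_expecting_gone(e):
    # Retry transport-level problems and non-404 API errors, but let
    # 404s escape right away since the caller is waiting for them.
    if isinstance(e, urllib3.exceptions.HTTPError):
        return True
    if isinstance(e, ApiException):
        return e.status != 404
    return False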
Example #8
    def _download(self, dstFile):
        """
        Download this resource from its URL to the given file object.

        :type dstFile: io.BytesIO|io.FileIO
        """
        for attempt in retry(predicate=lambda e: isinstance(e, HTTPError) and e.code == 400):
            with attempt:
                with closing(urlopen(self.url)) as content:
                    buf = content.read()
        contentHash = hashlib.md5(buf)
        assert contentHash.hexdigest() == self.contentHash
        dstFile.write(buf)
Example #9
    def _discoverAMI(self):
        """
        :return: The AMI ID (a string like 'ami-0a9a5d2b65cce04eb') for CoreOS
                 or a compatible replacement like Flatcar.
        :rtype: str
        """

        # Take a user override
        ami = os.environ.get('TOIL_AWS_AMI')
        if ami is not None:
            return ami

        # CoreOS is dead, long live Flatcar

        # Flatcar images, however, only live for 9 months.
        # Rather than hardcode a list of AMIs by region that will die, we use
        # their JSON feed of the current ones.
        JSON_FEED_URL = 'https://stable.release.flatcar-linux.net/amd64-usr/current/flatcar_production_ami_all.json'

        # What region do we care about?
        region = zoneToRegion(self._zone)

        for attempt in retry(predicate=lambda e: True):
            # Until we get parseable JSON
            # TODO: What errors do we get for timeout, JSON parse failure, etc?
            with attempt:
                # Try to get the JSON and parse it.
                feed = json.loads(urllib.request.urlopen(JSON_FEED_URL).read())

        try:
            for ami_record in feed['amis']:
                # Scan the list of regions
                if ami_record['name'] == region:
                    # When we find ours
                    # Save the AMI ID
                    ami = ami_record['hvm']
                    # And stop scanning
                    break
        except KeyError:
            # We didn't see a field we need
            raise RuntimeError(
                'Flatcar image feed at {} does not have expected format'.
                format(JSON_FEED_URL))

        if ami is None:
            # We didn't find it
            raise RuntimeError(
                'Flatcar image feed at {} does not have an image for region {}'
                .format(JSON_FEED_URL, region))

        return ami
Example #10
    def _try_kubernetes(self, method, *args, **kwargs):
        """
        The Kubernetes API can fail abruptly in situations where backing off
        and retrying dynamically would succeed.

        For example, when calling self._api('batch').create_namespaced_job(self.namespace, job)
        with a large job, Kubernetes can behave inconsistently and fail. See
        https://github.com/DataBiosphere/toil/issues/2884 .

        This function retries the given API call until it succeeds or the retry
        predicate stops matching the raised error.
        """

        for attempt in retry(predicate=retryable_kubernetes_errors):
            with attempt:
                return method(*args, **kwargs)
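
A usage sketch based on the call mentioned in the docstring (self._api('batch'), self.namespace, and job come from the surrounding class and are assumed here):

# Retry the namespaced-job creation until it succeeds or the
# retryable-error predicate gives up.
created_job = self._try_kubernetes(
    self._api('batch').create_namespaced_job, self.namespace, job)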
Example #11
    def _discoverAMI(self):
        def descriptionMatches(ami):
            return ami.description is not None and 'stable 1855.5.0' in ami.description
        coreOSAMI = os.environ.get('TOIL_AWS_AMI')
        if coreOSAMI is not None:
            return coreOSAMI

        for attempt in retry(predicate=lambda e: isinstance(e, SSLError)):
            # SSLError is thrown when get_all_images times out
            with attempt:
                # 679593333241 is the aws marketplace account
                amis = self._ctx.ec2.get_all_images(owners=['679593333241'], filters={'name': 'CoreOS-stable-1855.5.0-hvm-0d1e0bd0-eaea-4397-9a3a-c56f861d2a14-ami-0f74e41ea6c13f74b.4'})

        coreOSAMI = [ami for ami in amis if descriptionMatches(ami)]
        logger.debug('Found the following matching AMIs: %s', coreOSAMI)
        assert len(coreOSAMI) == 1, coreOSAMI
        return coreOSAMI.pop().id
Example #12
        def _obtain_credentials_from_boto3(self):
            """
            We know the current cached credentials are not good, and that we
            need to get them from Boto 3. Fill in our credential fields
            (_access_key, _secret_key, _security_token,
            _credential_expiry_time) from Boto 3.
            """

            # We get a Credentials object
            # <https://github.com/boto/botocore/blob/8d3ea0e61473fba43774eb3c74e1b22995ee7370/botocore/credentials.py#L227>
            # or a RefreshableCredentials, or None on failure.
            creds = None
            for attempt in retry(timeout=10, predicate=lambda _: True):
                with attempt:
                    creds = self._boto3_resolver.load_credentials()

                    if creds is None:
                        try:
                            resolvers = str(self._boto3_resolver.providers)
                        except Exception:
                            resolvers = "(Resolvers unavailable)"
                        raise RuntimeError(
                            "Could not obtain AWS credentials from Boto3. Resolvers tried: "
                            + resolvers)

            # Make sure the credentials object actually has some credentials in it, if it is lazy
            creds.get_frozen_credentials()

            # Get when the credentials will expire, if ever
            if isinstance(creds, RefreshableCredentials):
                # Credentials may expire.
                # Get a naive UTC datetime like boto 2 uses from the boto 3 time.
                self._credential_expiry_time = creds._expiry_time.astimezone(
                    timezone('UTC')).replace(tzinfo=None)
            else:
                # Credentials never expire
                self._credential_expiry_time = None

            # Then, atomically get all the credentials bits. They may be newer than we think they are, but never older.
            frozen = creds.get_frozen_credentials()

            # Copy them into us
            self._access_key = frozen.access_key
            self._secret_key = frozen.secret_key
            self._security_token = frozen.token
Example #13
    def destroyCluster(self):
        """
        Terminate instances and delete the profile and security group.
        """
        assert self._ctx

        def expectedShutdownErrors(e):
            return e.status == 400 and 'dependent object' in e.body

        instances = self._getNodesInCluster(nodeType=None, both=True)
        spotIDs = self._getSpotRequestIDs()
        if spotIDs:
            self._ctx.ec2.cancel_spot_instance_requests(request_ids=spotIDs)
        instancesToTerminate = awsFilterImpairedNodes(instances, self._ctx.ec2)
        vpcId = None
        if instancesToTerminate:
            vpcId = instancesToTerminate[0].vpc_id
            self._deleteIAMProfiles(instances=instancesToTerminate)
            self._terminateInstances(instances=instancesToTerminate)
        if len(instances) == len(instancesToTerminate):
            logger.info('Deleting security group...')
            removed = False
            for attempt in retry(timeout=300,
                                 predicate=expectedShutdownErrors):
                with attempt:
                    for sg in self._ctx.ec2.get_all_security_groups():
                        if sg.name == self.clusterName and vpcId and sg.vpc_id == vpcId:
                            try:
                                self._ctx.ec2.delete_security_group(
                                    group_id=sg.id)
                                removed = True
                            except BotoServerError as e:
                                if e.error_code == 'InvalidGroup.NotFound':
                                    pass
                                else:
                                    raise
            if removed:
                logger.info('... Successfully deleted security group')
        else:
            assert len(instances) > len(instancesToTerminate)
            # the security group can't be deleted until all nodes are terminated
            logger.warning(
                'The TOIL_AWS_NODE_DEBUG environment variable is set and some nodes '
                'have failed health checks. As a result, the security group & IAM '
                'roles will not be deleted.')
Example #14
    def _startMesos(self, numCores=None):
        if numCores is None:
            numCores = cpu_count()
        shutil.rmtree('/tmp/mesos', ignore_errors=True)
        self.master = self.MesosMasterThread(numCores)
        self.master.start()
        self.agent = self.MesosAgentThread(numCores)
        self.agent.start()

        # Wait for the master to come up.
        # Bad Things will happen if the master is not yet ready when Toil tries to use it.
        for attempt in retry(predicate=lambda e: True):
            with attempt:
                log.info('Checking if Mesos is ready...')
                with closing(urlopen('http://127.0.0.1:5050/version')) as content:
                    content.read()

        log.info('Mesos is ready! Running test.')
Example #15
    def _discoverAMI(self):
        def descriptionMatches(ami):
            return ami.description is not None and 'stable 1632.2.1' in ami.description

        coreOSAMI = os.environ.get('TOIL_AWS_AMI')
        if coreOSAMI is not None:
            return coreOSAMI
        # The owner ID used below (679593333241) corresponds to CoreOS

        for attempt in retry(predicate=lambda e: isinstance(e, SSLError)):
            # SSLError is thrown when get_all_images times out
            with attempt:
                amis = self._ctx.ec2.get_all_images(owners=['679593333241'])

        coreOSAMI = [ami for ami in amis if descriptionMatches(ami)]
        logger.debug('Found the following matching AMIs: %s', coreOSAMI)
        assert len(coreOSAMI) == 1
        return coreOSAMI.pop().id
Example #16
def subprocessDockerCall(job,
                         tool,
                         parameters=None,
                         workDir=None,
                         dockerParameters=None,
                         checkOutput=True,
                         outfile=None,
                         errfile=None,
                         defer=None):
    """
    Deprecated.  Calls Docker using subprocess.check_output().

    Assumes `docker` is on the PATH.  Uses Toil's defer functionality to ensure
    containers are shut down even in case of job or pipeline failure.

    Example of using dockerCall in toil to index a FASTA file with SAMtools:
        def toil_job(job):
            work_dir = job.fileStore.getLocalTempDir()
            path = job.fileStore.readGlobalFile(ref_id,
                                                os.path.join(work_dir, 'ref.fasta'))
            parameters = ['faidx', path]
            dockerCall(job, tool='quay.io/ucsc_cgl/samtools:latest',
                       work_dir=work_dir, parameters=parameters)

    :param toil.Job.job job: The Job instance for the calling function.
    :param str tool: Name of the Docker image to be used
                     (e.g. quay.io/ucsc_cgl/samtools).
    :param list[str] parameters: Command line arguments to be passed.
           If list of lists: list[list[str]], then treat as successive commands
           chained with pipe.
    :param str workDir: Directory to mount into the container via `-v`.
                        Destination convention is '/data'.
    :param list[str] dockerParameters: Parameters to pass to Docker.
             Default parameters are `--rm`, `--log-driver none`, and the
             mountpoint `-v work_dir:/data` where /data is the destination
             convention.
             These defaults are removed if dockerParameters is passed,
             so be sure to pass them if they are desired.
    :param file outfile: Pipe output of Docker call to file handle
    :param file errfile: Pipe standard error of Docker call to file handle
    :param int defer: What action should be taken on the container upon job
                      completion?
           FORGO (0) will leave the container untouched.
           STOP (1) will attempt to stop the container with `docker stop`
           (useful for debugging).
           RM (2) will stop the container and then forcefully remove it from the
           system using `docker rm -f`. This is the default behavior if defer is
           set to None.
    """
    if parameters is None:
        parameters = []
    if workDir is None:
        workDir = os.getcwd()

    # Setup the outgoing subprocess call for docker
    baseDockerCall = ['docker', 'run']
    if dockerParameters:
        baseDockerCall += dockerParameters
    else:
        baseDockerCall += [
            '--rm', '--log-driver', 'none', '-v',
            os.path.abspath(workDir) + ':/data'
        ]

    # Ensure the user has passed a valid value for defer
    assert defer in (None, FORGO, STOP, RM)

    # Get container name which is needed for _dockerKill
    try:
        if any('--name' in x for x in baseDockerCall):
            if any('--name=' in x for x in baseDockerCall):
                containerName = [
                    x.split('=')[1] for x in baseDockerCall if '--name' in x
                ][0]
            else:
                containerName = baseDockerCall[baseDockerCall.index('--name') +
                                               1]
        else:
            containerName = getContainerName(job)
            baseDockerCall.extend(['--name', containerName])
    except ValueError:
        containerName = getContainerName(job)
        baseDockerCall.extend(['--name', containerName])
    except IndexError:
        raise RuntimeError(
            "Couldn't parse Docker's `--name=` option, check parameters: " +
            str(dockerParameters))

    # Defer the container on-exit action
    if '--rm' in baseDockerCall and defer is None:
        defer = RM
    if '--rm' in baseDockerCall and defer != RM:
        logger.warning('--rm being passed to docker call but defer not set to '
                       'dockerCall.RM, defer set to: ' + str(defer))
    job.defer(_dockerKill, containerName, action=defer)
    # Defer the permission fixing function which will run after this job.
    # We call this explicitly later on in this function,
    # but we defer it as well to handle unexpected job failure.
    job.defer(_fixPermissions, tool, workDir)

    # Make subprocess call

    # If parameters is list of lists, treat each list as separate command and chain with pipes
    if len(parameters) > 0 and type(parameters[0]) is list:
        # When piping, all arguments now get merged into a single string to bash -c.
        # We try to support spaces in paths by wrapping them all in quotes first.
        chain_params = [
            ' '.join(p)
            for p in [list(map(pipes.quote, q)) for q in parameters]
        ]
        # Use bash's set -eo pipefail to detect and abort on a failure in any command in the chain
        call = baseDockerCall + [
            '--entrypoint', '/bin/bash', tool, '-c',
            'set -eo pipefail && {}'.format(' | '.join(chain_params))
        ]
    else:
        call = baseDockerCall + [tool] + parameters
    logger.info("Calling docker with " + repr(call))

    params = {}
    if outfile:
        params['stdout'] = outfile
    if errfile:
        params['stderr'] = errfile
    if checkOutput:
        callMethod = subprocess.check_output
    else:
        callMethod = subprocess.check_call

    for attempt in retry(predicate=dockerPredicate):
        with attempt:
            out = callMethod(call, **params)

    _fixPermissions(tool=tool, workDir=workDir)
    return out
Example #17
    def wrapper(*args, **kwargs):
        for attempt in retry(delays=truncExpBackoff(),
                             timeout=300,
                             predicate=awsRetryPredicate):
            with attempt:
                return f(*args, **kwargs)
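
This wrapper reads like the inner function of a retrying decorator; a minimal sketch of how such a decorator could be assembled follows (the decorator name and the functools.wraps usage are assumptions; truncExpBackoff and awsRetryPredicate are taken from the example above):

from functools import wraps

def awsRetry(f):
    """
    Decorate f so that calls raising errors matched by awsRetryPredicate
    are retried with truncated exponential backoff for up to 300 seconds.
    """
    @wraps(f)
    def wrapper(*args, **kwargs):
        for attempt in retry(delays=truncExpBackoff(),
                             timeout=300,
                             predicate=awsRetryPredicate):
            with attempt:
                return f(*args, **kwargs)
    return wrapper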
Example #18
def retry_kubernetes(retry_while=retryable_kubernetes_errors):
    """
    A wrapper that passes a predicate for retryable Kubernetes errors into
    retry(), producing context managers that keep retrying the wrapped call
    until it succeeds or the predicate returns False.
    """
    return retry(predicate=retry_while)
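
A usage sketch for this helper (the CoreV1Api client object, pod name, and namespace are illustrative assumptions):

for attempt in retry_kubernetes():
    with attempt:
        pod = core_api.read_namespaced_pod(pod_name, namespace)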
Example #19
    def addNodes(self, nodeType, numNodes, preemptable, spotBid=None):
        assert self._leaderPrivateIP
        if preemptable and not spotBid:
            if self._spotBidsMap and nodeType in self._spotBidsMap:
                spotBid = self._spotBidsMap[nodeType]
            else:
                raise RuntimeError(
                    "No spot bid given for a preemptable node request.")
        instanceType = E2Instances[nodeType]
        bdm = self._getBlockDeviceMapping(instanceType,
                                          rootVolSize=self._nodeStorage)

        keyPath = self._sseKey if self._sseKey else None
        userData = self._getCloudConfigUserData('worker',
                                                self._masterPublicKey, keyPath,
                                                preemptable)
        if isinstance(userData, text_type):
            # Spot-market provisioning requires bytes for user data.
            userData = userData.encode('utf-8')
        sgs = [
            sg for sg in self._ctx.ec2.get_all_security_groups()
            if sg.name in self._leaderSecurityGroupNames
        ]
        kwargs = {
            'key_name': self._keyName,
            'security_group_ids': [sg.id for sg in sgs],
            'instance_type': instanceType.name,
            'user_data': userData,
            'block_device_map': bdm,
            'instance_profile_arn': self._leaderProfileArn,
            'placement': self._zone,
            'subnet_id': self._subnetID
        }

        instancesLaunched = []

        for attempt in retry(predicate=awsRetryPredicate):
            with attempt:
                # after we start launching instances we want to ensure the full setup is done
                # the biggest obstacle is AWS request throttling, so we retry on these errors at
                # every request in this method
                if not preemptable:
                    logger.debug('Launching %s non-preemptable nodes',
                                 numNodes)
                    instancesLaunched = create_ondemand_instances(
                        self._ctx.ec2,
                        image_id=self._discoverAMI(),
                        spec=kwargs,
                        num_instances=numNodes)
                else:
                    logger.debug('Launching %s preemptable nodes', numNodes)
                    kwargs['placement'] = getSpotZone(spotBid,
                                                      instanceType.name,
                                                      self._ctx)
                    # force generator to evaluate
                    instancesLaunched = list(
                        create_spot_instances(
                            ec2=self._ctx.ec2,
                            price=spotBid,
                            image_id=self._discoverAMI(),
                            tags={'clusterName': self.clusterName},
                            spec=kwargs,
                            num_instances=numNodes,
                            tentative=True))
                    # flatten the list
                    instancesLaunched = [
                        item for sublist in instancesLaunched
                        for item in sublist
                    ]

        for attempt in retry(predicate=awsRetryPredicate):
            with attempt:
                wait_instances_running(self._ctx.ec2, instancesLaunched)

        self._tags[_TOIL_NODE_TYPE_TAG_KEY] = 'worker'
        AWSProvisioner._addTags(instancesLaunched, self._tags)
        if self._sseKey:
            for i in instancesLaunched:
                self._waitForIP(i)
                node = Node(publicIP=i.ip_address,
                            privateIP=i.private_ip_address,
                            name=i.id,
                            launchTime=i.launch_time,
                            nodeType=i.instance_type,
                            preemptable=preemptable,
                            tags=i.tags)
                node.waitForNode('toil_worker')
                node.coreRsync([self._sseKey, ':' + self._sseKey],
                               applianceName='toil_worker')
        logger.debug('Launched %s new instance(s)', numNodes)
        return len(instancesLaunched)
Example #20
    def destroyCluster(self):
        """
        Terminate instances and delete the profile and security group.
        """
        assert self._ctx

        def expectedShutdownErrors(e):
            return e.status == 400 and 'dependent object' in e.body

        def destroyInstances(instances):
            """
            Similar to _terminateInstances, except that it also cleans up any
            resources associated with the instances (e.g. IAM profiles).
            """
            self._deleteIAMProfiles(instances)
            self._terminateInstances(instances)

        # We should terminate the leader first in case a workflow is still running in the cluster.
        # The leader may create more instances while we're terminating the workers.
        vpcId = None
        try:
            leader = self.getLeader(returnRawInstance=True)
            vpcId = leader.vpc_id
            logger.info('Terminating the leader first ...')
            destroyInstances([leader])
            logger.info('Now terminating any remaining workers ...')
        except (NoSuchClusterException, InvalidClusterStateException):
            # It's ok if the leader is not found. We'll terminate any remaining
            # instances below anyway.
            pass

        instances = self._getNodesInCluster(nodeType=None, both=True)
        spotIDs = self._getSpotRequestIDs()
        if spotIDs:
            self._ctx.ec2.cancel_spot_instance_requests(request_ids=spotIDs)
        instancesToTerminate = awsFilterImpairedNodes(instances, self._ctx.ec2)
        if instancesToTerminate:
            vpcId = vpcId or instancesToTerminate[0].vpc_id
            destroyInstances(instancesToTerminate)
        if len(instances) == len(instancesToTerminate):
            logger.debug('Deleting security group...')
            removed = False
            for attempt in retry(timeout=300,
                                 predicate=expectedShutdownErrors):
                with attempt:
                    for sg in self._ctx.ec2.get_all_security_groups():
                        if sg.name == self.clusterName and vpcId and sg.vpc_id == vpcId:
                            try:
                                self._ctx.ec2.delete_security_group(
                                    group_id=sg.id)
                                removed = True
                            except BotoServerError as e:
                                if e.error_code == 'InvalidGroup.NotFound':
                                    pass
                                else:
                                    raise
            if removed:
                logger.debug('... Successfully deleted security group')
        else:
            assert len(instances) > len(instancesToTerminate)
            # the security group can't be deleted until all nodes are terminated
            logger.warning(
                'The TOIL_AWS_NODE_DEBUG environment variable is set and some nodes '
                'have failed health checks. As a result, the security group & IAM '
                'roles will not be deleted.')
Example #21
    def setNodeCount(self, nodeType, numNodes, preemptable=False, force=False):
        """
        Attempt to grow or shrink the number of preemptable or non-preemptable worker nodes in
        the cluster to the given value, or as close a value as possible, and, after performing
        the necessary additions or removals of worker nodes, return the resulting number of
        preemptable or non-preemptable nodes currently in the cluster.

        :param str nodeType: The node type to add or remove.

        :param int numNodes: Desired size of the cluster

        :param bool preemptable: whether the added nodes will be preemptable, i.e. whether they
               may be removed spontaneously by the underlying platform at any time.

        :param bool force: If False, the provisioner is allowed to deviate from the given number
               of nodes. For example, when downsizing a cluster, a provisioner might leave nodes
               running if they have active jobs running on them.

        :return: the number of worker nodes in the cluster after making the necessary
                adjustments. This value should be, but is not guaranteed to be, close or equal to
                the `numNodes` argument. It represents the closest possible approximation of the
                actual cluster size at the time this method returns.
        :rtype: int
        """
        for attempt in retry(predicate=self.provisioner.retryPredicate):
            with attempt:
                workerInstances = self.getNodes(preemptable=preemptable)
                logger.debug("Cluster contains %i instances" %
                             len(workerInstances))
                # Reduce to nodes of the correct type
                workerInstances = {
                    node: workerInstances[node]
                    for node in workerInstances if node.nodeType == nodeType
                }
                ignoredNodes = [
                    node for node in workerInstances
                    if node.privateIP in self.ignoredNodes
                ]
                numIgnoredNodes = len(ignoredNodes)
                numCurrentNodes = len(workerInstances)
                logger.debug(
                    "Cluster contains %i instances of type %s (%i ignored and draining jobs until "
                    "they can be safely terminated)" %
                    (numCurrentNodes, nodeType, numIgnoredNodes))
                if not force:
                    delta = numNodes - (numCurrentNodes - numIgnoredNodes)
                else:
                    delta = numNodes - numCurrentNodes
                if delta > 0 and numIgnoredNodes > 0:
                    # We can un-ignore a few nodes to compensate for the additional nodes we want.
                    numNodesToUnignore = min(delta, numIgnoredNodes)
                    logger.debug(
                        'Unignoring %i nodes because we want to scale back up again.'
                        % numNodesToUnignore)
                    delta -= numNodesToUnignore
                    for node in ignoredNodes[:numNodesToUnignore]:
                        self.ignoredNodes.remove(node.privateIP)
                        self.leader.batchSystem.unignoreNode(node.privateIP)
                if delta > 0:
                    logger.info(
                        'Adding %i %s nodes to get to desired cluster size of %i.',
                        delta,
                        'preemptable' if preemptable else 'non-preemptable',
                        numNodes)
                    numNodes = numCurrentNodes + self._addNodes(
                        nodeType, numNodes=delta, preemptable=preemptable)
                elif delta < 0:
                    logger.info(
                        'Removing %i %s nodes to get to desired cluster size of %i.',
                        -delta,
                        'preemptable' if preemptable else 'non-preemptable',
                        numNodes)
                    numNodes = numCurrentNodes - self._removeNodes(
                        workerInstances,
                        nodeType=nodeType,
                        numNodes=-delta,
                        preemptable=preemptable,
                        force=force)
                else:
                    if not force:
                        logger.debug(
                            'Cluster (minus ignored nodes) already at desired size of %i. Nothing to do.',
                            numNodes)
                    else:
                        logger.debug(
                            'Cluster already at desired size of %i. Nothing to do.',
                            numNodes)
        return numNodes
Example #22
def retry_azure(delays=(0, 1, 1, 4, 16, 64), timeout=300, predicate=defaultRetryPredicate):
    return retry(delays=delays, timeout=timeout, predicate=predicate)
Example #23
def retry_s3(delays=default_delays,
             timeout=default_timeout,
             predicate=retryable_s3_errors):
    return retry(delays=delays, timeout=timeout, predicate=predicate)