Example #1
    def _createSecurityGroup(self):
        assert self._ctx

        def groupNotFound(e):
            retry = (e.status == 400
                     and 'does not exist in default VPC' in e.body)
            return retry

        vpcId = None
        if self._vpcSubnet:
            conn = boto.connect_vpc(region=self._ctx.ec2.region)
            subnets = conn.get_all_subnets(subnet_ids=[self._vpcSubnet])
            if len(subnets) > 0:
                vpcId = subnets[0].vpc_id
        # Create or fetch the security group: SSH from anywhere, plus all ports open within the group
        try:
            web = self._ctx.ec2.create_security_group(
                self.clusterName,
                'Toil appliance security group',
                vpc_id=vpcId)
        except EC2ResponseError as e:
            if e.status == 400 and 'already exists' in e.body:
                pass  # group already exists - nothing to do
            else:
                raise
        else:
            for attempt in old_retry(predicate=groupNotFound, timeout=300):
                with attempt:
                    # open port 22 for ssh-ing
                    web.authorize(ip_protocol='tcp',
                                  from_port=22,
                                  to_port=22,
                                  cidr_ip='0.0.0.0/0')
            for attempt in old_retry(predicate=groupNotFound, timeout=300):
                with attempt:
                    # the following authorizes all TCP access within the web security group
                    web.authorize(ip_protocol='tcp',
                                  from_port=0,
                                  to_port=65535,
                                  src_group=web)
            for attempt in old_retry(predicate=groupNotFound, timeout=300):
                with attempt:
                    # We also want to open up UDP, both for user code and for the RealtimeLogger
                    web.authorize(ip_protocol='udp',
                                  from_port=0,
                                  to_port=65535,
                                  src_group=web)
        out = []
        for sg in self._ctx.ec2.get_all_security_groups():
            if sg.name == self.clusterName and (vpcId is None
                                                or sg.vpc_id == vpcId):
                out.append(sg)
        return out
Example #2
def retry_s3(
    delays: Iterable[float] = DEFAULT_DELAYS,
    timeout: float = DEFAULT_TIMEOUT,
    predicate: Callable[[Exception], bool] = retryable_s3_errors
) -> Iterator[ContextManager[None]]:
    """
    Retry iterator of context managers specifically for S3 operations.
    """
    return old_retry(delays=delays, timeout=timeout, predicate=predicate)
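
The docstring above only describes the return value, so here is a minimal usage sketch of the iterate-and-`with` pattern seen throughout these examples. It assumes retry_s3 from the snippet above is in scope and uses a boto3 client with a hypothetical bucket and key; neither name comes from the example itself.

import boto3

s3 = boto3.client('s3')  # assumes AWS credentials are already configured

# retry_s3 yields one context manager per attempt: exceptions accepted by the
# predicate are swallowed and the next delay is slept through, anything else
# propagates immediately, and the loop ends after the first clean pass.
for attempt in retry_s3():
    with attempt:
        obj = s3.get_object(Bucket='example-bucket', Key='example-key')  # hypothetical names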
Example #3
    def _discoverAMI(self):
        """
        :return: The AMI ID (a string like 'ami-0a9a5d2b65cce04eb') for CoreOS
                 or a compatible replacement like Flatcar.
        :rtype: str
        """

        # Take a user override
        ami = os.environ.get('TOIL_AWS_AMI')
        if ami is not None:
            return ami

        # CoreOS is dead, long live Flatcar

        # Flatcar images, however, only live for 9 months.
        # Rather than hardcode a list of AMIs by region that will die, we use
        # their JSON feed of the current ones.
        JSON_FEED_URL = 'https://stable.release.flatcar-linux.net/amd64-usr/current/flatcar_production_ami_all.json'

        # What region do we care about?
        region = zoneToRegion(self._zone)

        for attempt in old_retry(predicate=lambda e: True):
            # Until we get parseable JSON
            # TODO: What errors do we get for timeout, JSON parse failure, etc?
            with attempt:
                # Try to get the JSON and parse it.
                feed = json.loads(urllib.request.urlopen(JSON_FEED_URL).read())

        try:
            for ami_record in feed['amis']:
                # Scan the list of regions
                if ami_record['name'] == region:
                    # When we find ours
                    # Save the AMI ID
                    ami = ami_record['hvm']
                    # And stop scanning
                    break
        except KeyError:
            # We didn't see a field we need
            raise RuntimeError(
                'Flatcar image feed at {} does not have expected format'.
                format(JSON_FEED_URL))

        if ami is None:
            # We didn't find it
            raise RuntimeError(
                'Flatcar image feed at {} does not have an image for region {}'
                .format(JSON_FEED_URL, region))

        return ami
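
For reference, the loop above expects the Flatcar feed to be a JSON object with an 'amis' list whose records carry a region 'name' and an 'hvm' AMI ID. Here is a tiny self-contained illustration of that parse; the region and AMI ID below are made-up values, not taken from the real feed.

import json

example_feed = json.loads("""
{"amis": [{"name": "us-west-2", "hvm": "ami-0123456789abcdef0"}]}
""")

region = 'us-west-2'
# Mirror the scan in _discoverAMI: find the record for our region and take its AMI ID.
ami = next((rec['hvm'] for rec in example_feed['amis'] if rec['name'] == region), None)
print(ami)  # ami-0123456789abcdef0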
Example #4
    def _getProfileArn(self):
        assert self._ctx
        policy = dict(iam_full=self.full_policy('iam'),
                      ec2_full=self.full_policy('ec2'),
                      s3_full=self.full_policy('s3'),
                      sbd_full=self.full_policy('sdb'))
        iamRoleName = self._ctx.setup_iam_ec2_role(
            role_name=_INSTANCE_PROFILE_ROLE_NAME, policies=policy)

        try:
            profile = self._ctx.iam.get_instance_profile(iamRoleName)
        except BotoServerError as e:
            if e.status == 404:
                profile = self._ctx.iam.create_instance_profile(iamRoleName)
                profile = profile.create_instance_profile_response.create_instance_profile_result
            else:
                raise
        else:
            profile = profile.get_instance_profile_response.get_instance_profile_result
        profile = profile.instance_profile
        profile_arn = profile.arn

        if len(profile.roles) > 1:
            raise RuntimeError(
                'Did not expect profile to contain more than one role')
        elif len(profile.roles) == 1:
            # this should be profile.roles[0].role_name
            if profile.roles.member.role_name == iamRoleName:
                return profile_arn
            else:
                self._ctx.iam.remove_role_from_instance_profile(
                    iamRoleName, profile.roles.member.role_name)
        for attempt in old_retry(predicate=lambda err: err.status == 404):
            with attempt:
                self._ctx.iam.add_role_to_instance_profile(
                    iamRoleName, iamRoleName)
        return profile_arn
Example #5
def wrapper(*args, **kwargs):
    for attempt in old_retry(delays=truncExpBackoff(),
                             timeout=300,
                             predicate=googleRetryPredicate):
        with attempt:
            return f(*args, **kwargs)
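
This snippet is only the inner wrapper of a retry decorator: it closes over the wrapped function f, and old_retry re-runs it until googleRetryPredicate stops accepting the raised exception or the 300-second timeout expires. A sketch of the enclosing decorator follows, assuming old_retry, truncExpBackoff, and googleRetryPredicate are in scope; the name googleRetry is an assumption, since the page only shows the wrapper.

import functools

def googleRetry(f):  # assumed name; only the inner wrapper appears above
    @functools.wraps(f)  # preserve the wrapped function's name and docstring
    def wrapper(*args, **kwargs):
        for attempt in old_retry(delays=truncExpBackoff(),
                                 timeout=300,
                                 predicate=googleRetryPredicate):
            with attempt:
                return f(*args, **kwargs)
    return wrapper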
Example #6
    def setNodeCount(self, nodeType, numNodes, preemptable=False, force=False):
        """
        Attempt to grow or shrink the number of preemptable or non-preemptable worker nodes in
        the cluster to the given value, or as close a value as possible, and, after performing
        the necessary additions or removals of worker nodes, return the resulting number of
        preemptable or non-preemptable nodes currently in the cluster.

        :param str nodeType: The node type to add or remove.

        :param int numNodes: Desired size of the cluster

        :param bool preemptable: whether the added nodes will be preemptable, i.e. whether they
               may be removed spontaneously by the underlying platform at any time.

        :param bool force: If False, the provisioner is allowed to deviate from the given number
               of nodes. For example, when downsizing a cluster, a provisioner might leave nodes
               running if they have active jobs running on them.

        :return: the number of worker nodes in the cluster after making the necessary
                 adjustments. This value should be, but is not guaranteed to be, close or equal to
                 the `numNodes` argument. It represents the closest possible approximation of the
                 actual cluster size at the time this method returns.
        :rtype: int
        """
        for attempt in old_retry(predicate=self.provisioner.retryPredicate):
            with attempt:
                workerInstances = self.getNodes(preemptable=preemptable)
                logger.debug("Cluster contains %i instances" %
                             len(workerInstances))
                # Reduce to nodes of the correct type
                workerInstances = {
                    node: workerInstances[node]
                    for node in workerInstances if node.nodeType == nodeType
                }
                ignoredNodes = [
                    node for node in workerInstances
                    if node.privateIP in self.ignoredNodes
                ]
                numIgnoredNodes = len(ignoredNodes)
                numCurrentNodes = len(workerInstances)
                logger.debug(
                    "Cluster contains %i instances of type %s (%i ignored and draining jobs until "
                    "they can be safely terminated)" %
                    (numCurrentNodes, nodeType, numIgnoredNodes))
                if not force:
                    delta = numNodes - (numCurrentNodes - numIgnoredNodes)
                else:
                    delta = numNodes - numCurrentNodes
                if delta > 0 and numIgnoredNodes > 0:
                    # We can un-ignore a few nodes to compensate for the additional nodes we want.
                    numNodesToUnignore = min(delta, numIgnoredNodes)
                    logger.debug(
                        'Unignoring %i nodes because we want to scale back up again.'
                        % numNodesToUnignore)
                    delta -= numNodesToUnignore
                    for node in ignoredNodes[:numNodesToUnignore]:
                        self.ignoredNodes.remove(node.privateIP)
                        self.leader.batchSystem.unignoreNode(node.privateIP)
                if delta > 0:
                    logger.info(
                        'Adding %i %s nodes to get to desired cluster size of %i.',
                        delta,
                        'preemptable' if preemptable else 'non-preemptable',
                        numNodes)
                    numNodes = numCurrentNodes + self._addNodes(
                        nodeType, numNodes=delta, preemptable=preemptable)
                elif delta < 0:
                    logger.info(
                        'Removing %i %s nodes to get to desired cluster size of %i.',
                        -delta,
                        'preemptable' if preemptable else 'non-preemptable',
                        numNodes)
                    numNodes = numCurrentNodes - self._removeNodes(
                        workerInstances,
                        nodeType=nodeType,
                        numNodes=-delta,
                        preemptable=preemptable,
                        force=force)
                else:
                    if not force:
                        logger.debug(
                            'Cluster (minus ignored nodes) already at desired size of %i. Nothing to do.',
                            numNodes)
                    else:
                        logger.debug(
                            'Cluster already at desired size of %i. Nothing to do.',
                            numNodes)
        return numNodes
Example #7
def retry_s3(delays=default_delays, timeout=default_timeout, predicate=retryable_s3_errors):
    return old_retry(delays=delays, timeout=timeout, predicate=predicate)
Example #8
def retry_ec2(t=a_short_time,
              retry_for=10 * a_short_time,
              retry_while=not_found):
    return old_retry(delays=(t, t, t * 2, t * 4),
                     timeout=retry_for,
                     predicate=retry_while)
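
Example #8 spells out its backoff schedule by hand (t, t, 2*t, 4*t), while Example #5 passes a truncExpBackoff() generator that is not shown on this page. Below is a minimal, self-contained sketch of what such a truncated exponential backoff generator could look like; the function name and constants are illustrative, not Toil's actual implementation.

import random

def trunc_exp_backoff(base=1.0, cap=60.0):
    """Yield delays that roughly double per attempt, capped, with jitter."""
    delay = base
    while True:
        # Jitter spreads out retries from many clients hitting the same limit.
        yield delay * random.uniform(0.5, 1.0)
        delay = min(delay * 2, cap)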
Example #9
    def addNodes(self, nodeType, numNodes, preemptable, spotBid=None):
        assert self._leaderPrivateIP
        if preemptable and not spotBid:
            if self._spotBidsMap and nodeType in self._spotBidsMap:
                spotBid = self._spotBidsMap[nodeType]
            else:
                raise RuntimeError(
                    "No spot bid given for a preemptable node request.")
        instanceType = E2Instances[nodeType]
        bdm = self._getBlockDeviceMapping(
            instanceType,
            rootVolSize=self._nodeStorageOverrides.get(nodeType,
                                                       self._nodeStorage))

        keyPath = self._sseKey if self._sseKey else None
        userData = self._getCloudConfigUserData('worker',
                                                self._masterPublicKey, keyPath,
                                                preemptable)
        if isinstance(userData, text_type):
            # Spot-market provisioning requires bytes for user data.
            userData = userData.encode('utf-8')
        sgs = [
            sg for sg in self._ctx.ec2.get_all_security_groups()
            if sg.name in self._leaderSecurityGroupNames
        ]
        kwargs = {
            'key_name': self._keyName,
            'security_group_ids': [sg.id for sg in sgs],
            'instance_type': instanceType.name,
            'user_data': userData,
            'block_device_map': bdm,
            'instance_profile_arn': self._leaderProfileArn,
            'placement': self._zone,
            'subnet_id': self._subnetID
        }

        instancesLaunched = []

        for attempt in old_retry(predicate=awsRetryPredicate):
            with attempt:
                # After we start launching instances, we want to ensure the full setup completes.
                # The biggest obstacle is AWS request throttling, so we retry on those errors for
                # every request in this method.
                if not preemptable:
                    logger.debug('Launching %s non-preemptable nodes',
                                 numNodes)
                    instancesLaunched = create_ondemand_instances(
                        self._ctx.ec2,
                        image_id=self._discoverAMI(),
                        spec=kwargs,
                        num_instances=numNodes)
                else:
                    logger.debug('Launching %s preemptable nodes', numNodes)
                    kwargs['placement'] = getSpotZone(spotBid,
                                                      instanceType.name,
                                                      self._ctx)
                    # force generator to evaluate
                    instancesLaunched = list(
                        create_spot_instances(
                            ec2=self._ctx.ec2,
                            price=spotBid,
                            image_id=self._discoverAMI(),
                            tags={'clusterName': self.clusterName},
                            spec=kwargs,
                            num_instances=numNodes,
                            tentative=True))
                    # flatten the list
                    instancesLaunched = [
                        item for sublist in instancesLaunched
                        for item in sublist
                    ]

        for attempt in old_retry(predicate=awsRetryPredicate):
            with attempt:
                wait_instances_running(self._ctx.ec2, instancesLaunched)

        self._tags[_TOIL_NODE_TYPE_TAG_KEY] = 'worker'
        AWSProvisioner._addTags(instancesLaunched, self._tags)
        if self._sseKey:
            for i in instancesLaunched:
                self._waitForIP(i)
                node = Node(publicIP=i.ip_address,
                            privateIP=i.private_ip_address,
                            name=i.id,
                            launchTime=i.launch_time,
                            nodeType=i.instance_type,
                            preemptable=preemptable,
                            tags=i.tags)
                node.waitForNode('toil_worker')
                node.coreRsync([self._sseKey, ':' + self._sseKey],
                               applianceName='toil_worker')
        logger.debug('Launched %s new instance(s)', numNodes)
        return len(instancesLaunched)
Example #10
    def destroyCluster(self):
        """
        Terminate instances and delete the profile and security group.
        """
        assert self._ctx

        def expectedShutdownErrors(e):
            return e.status == 400 and 'dependent object' in e.body

        def destroyInstances(instances):
            """
            Similar to _terminateInstances, except that it also cleans up any
            resources associated with the instances (e.g. IAM profiles).
            """
            self._deleteIAMProfiles(instances)
            self._terminateInstances(instances)

        # We should terminate the leader first in case a workflow is still running in the cluster.
        # The leader may create more instances while we're terminating the workers.
        vpcId = None
        try:
            leader = self.getLeader(returnRawInstance=True)
            vpcId = leader.vpc_id
            logger.info('Terminating the leader first ...')
            destroyInstances([leader])
            logger.info('Now terminating any remaining workers ...')
        except (NoSuchClusterException, InvalidClusterStateException):
            # It's ok if the leader is not found. We'll terminate any remaining
            # instances below anyway.
            pass

        instances = self._getNodesInCluster(nodeType=None, both=True)
        spotIDs = self._getSpotRequestIDs()
        if spotIDs:
            self._ctx.ec2.cancel_spot_instance_requests(request_ids=spotIDs)
        instancesToTerminate = awsFilterImpairedNodes(instances, self._ctx.ec2)
        if instancesToTerminate:
            vpcId = vpcId or instancesToTerminate[0].vpc_id
            destroyInstances(instancesToTerminate)
        if len(instances) == len(instancesToTerminate):
            logger.debug('Deleting security group...')
            removed = False
            for attempt in old_retry(timeout=300,
                                     predicate=expectedShutdownErrors):
                with attempt:
                    for sg in self._ctx.ec2.get_all_security_groups():
                        if sg.name == self.clusterName and vpcId and sg.vpc_id == vpcId:
                            try:
                                self._ctx.ec2.delete_security_group(
                                    group_id=sg.id)
                                removed = True
                            except BotoServerError as e:
                                if e.error_code == 'InvalidGroup.NotFound':
                                    pass
                                else:
                                    raise
            if removed:
                logger.debug('... Successfully deleted security group')
        else:
            assert len(instances) > len(instancesToTerminate)
            # the security group can't be deleted until all nodes are terminated
            logger.warning(
                'The TOIL_AWS_NODE_DEBUG environment variable is set and some nodes '
                'have failed health checks. As a result, the security group & IAM '
                'roles will not be deleted.')
Example #11
def retry_sdb(delays=DEFAULT_DELAYS,
              timeout=DEFAULT_TIMEOUT,
              predicate=retryable_sdb_errors):
    return old_retry(delays=delays, timeout=timeout, predicate=predicate)