def _createSecurityGroup(self):
    """
    Create this cluster's security group (if it does not exist yet) and look it up.

    The group is named after ``self.clusterName``. When ``self._vpcSubnet`` is set, the VPC of
    that subnet is looked up and the group is created in (and the result filtered by) that
    VPC. A freshly created group gets three ingress rules: SSH (TCP port 22) from anywhere,
    and all TCP and all UDP traffic originating from the group itself. If creation fails
    because the group already exists, no rules are added.

    :return: every security group whose name equals ``self.clusterName`` and, when a VPC was
             resolved, whose ``vpc_id`` matches it.
    :rtype: list
    :raise EC2ResponseError: if creating the group fails for any reason other than the group
           already existing.
    """
    assert self._ctx

    def groupNotFound(e):
        # Retry only while EC2 reports that the just-created group is not visible yet.
        # getattr() keeps the predicate from raising AttributeError (and thereby hiding the
        # real error) when it is handed an exception that has no status/body.
        # NOTE(review): old_retry is defined elsewhere; presumably it may pass any exception
        # raised inside the attempt to this predicate - confirm.
        return (getattr(e, 'status', None) == 400
                and 'does not exist in default VPC' in (getattr(e, 'body', None) or ''))

    vpcId = None
    if self._vpcSubnet:
        conn = boto.connect_vpc(region=self._ctx.ec2.region)
        subnets = conn.get_all_subnets(subnet_ids=[self._vpcSubnet])
        if subnets:
            vpcId = subnets[0].vpc_id

    # security group create/get. ssh + all ports open within the group
    try:
        web = self._ctx.ec2.create_security_group(
            self.clusterName, 'Toil appliance security group', vpc_id=vpcId)
    except EC2ResponseError as e:
        if e.status == 400 and 'already exists' in e.body:
            pass  # group exists- nothing to do
        else:
            raise
    else:
        ingressRules = (
            # open port 22 for ssh-ing
            dict(ip_protocol='tcp', from_port=22, to_port=22, cidr_ip='0.0.0.0/0'),
            # the following authorizes all TCP access within the web security group
            dict(ip_protocol='tcp', from_port=0, to_port=65535, src_group=web),
            # We also want to open up UDP, both for user code and for the RealtimeLogger
            dict(ip_protocol='udp', from_port=0, to_port=65535, src_group=web),
        )
        for rule in ingressRules:
            # Every rule is retried on its own, each with a fresh 300 second timeout.
            for attempt in old_retry(predicate=groupNotFound, timeout=300):
                with attempt:
                    web.authorize(**rule)

    return [
        sg for sg in self._ctx.ec2.get_all_security_groups()
        if sg.name == self.clusterName and (vpcId is None or sg.vpc_id == vpcId)
    ]
def retry_s3(
    delays: Iterable[float] = DEFAULT_DELAYS,
    timeout: float = DEFAULT_TIMEOUT,
    predicate: Callable[[Exception], bool] = retryable_s3_errors,
) -> Iterator[ContextManager[None]]:
    """
    Build the retry iterator of context managers used around S3 operations.

    This is :func:`old_retry` with S3-specific defaults; all three arguments are
    forwarded to it unchanged.

    :param delays: forwarded as ``old_retry``'s ``delays`` argument.
    :param timeout: forwarded as ``old_retry``'s ``timeout`` argument.
    :param predicate: forwarded as ``old_retry``'s ``predicate`` argument.
    :return: whatever ``old_retry`` returns for these arguments.
    """
    retry_options = dict(delays=delays, timeout=timeout, predicate=predicate)
    return old_retry(**retry_options)
def _discoverAMI(self):
    """
    Work out which AMI to boot.

    The ``TOIL_AWS_AMI`` environment variable, when set, wins outright. Otherwise the Flatcar
    stable-channel JSON feed is downloaded (retrying on any error) and searched for the entry
    whose ``name`` equals the region of ``self._zone``.

    :return: The AMI ID (a string like 'ami-0a9a5d2b65cce04eb') for CoreOS or a compatible
             replacement like Flatcar.
    :rtype: str
    :raise RuntimeError: if the feed lacks an expected field, or has no image for the region.
    """
    # Take a user override
    ami = os.environ.get('TOIL_AWS_AMI')
    if ami is not None:
        return ami

    # CoreOS is dead, long live Flatcar

    # Flatcar images, however, only live for 9 months.
    # Rather than hardcode a list of AMIs by region that will die, we use
    # their JSON feed of the current ones.
    JSON_FEED_URL = 'https://stable.release.flatcar-linux.net/amd64-usr/current/flatcar_production_ami_all.json'

    # What region do we care about?
    region = zoneToRegion(self._zone)

    for attempt in old_retry(predicate=lambda e: True):
        # Until we get parseable JSON
        # TODO: What errors do we get for timeout, JSON parse failure, etc?
        with attempt:
            # Try to get the JSON and parse it. Close the HTTP response explicitly so the
            # connection is released even when reading or parsing fails and we retry.
            response = urllib.request.urlopen(JSON_FEED_URL)
            try:
                feed = json.loads(response.read())
            finally:
                response.close()

    try:
        for ami_record in feed['amis']:
            # Scan the list of regions
            if ami_record['name'] == region:
                # When we find ours
                # Save the AMI ID
                ami = ami_record['hvm']
                # And stop scanning
                break
    except KeyError:
        # We didn't see a field we need
        raise RuntimeError(
            'Flatcar image feed at {} does not have expected format'.
            format(JSON_FEED_URL))

    if ami is None:
        # We didn't find it
        raise RuntimeError(
            'Flatcar image feed at {} does not have an image for region {}'
            .format(JSON_FEED_URL, region))

    return ami
def _getProfileArn(self):
    """
    Make sure the instance profile for the cluster's IAM role exists and return its ARN.

    The role is set up through ``self._ctx.setup_iam_ec2_role`` with full-access policies
    for IAM, EC2, S3 and SDB. The instance profile named after that role is fetched, or
    created when the lookup answers 404. If the profile already carries exactly the expected
    role its ARN is returned right away; a single different role is removed first. The
    expected role is then attached, retrying while IAM answers 404.

    :return: the ARN of the instance profile.
    :raise RuntimeError: if the profile contains more than one role.
    :raise BotoServerError: if looking up the profile fails with a status other than 404.
    """
    assert self._ctx
    # NOTE(review): the 'sbd_full' key looks like a transposition of 'sdb_full'. It is a
    # runtime policy name, so it is deliberately left untouched here.
    policy = dict(iam_full=self.full_policy('iam'),
                  ec2_full=self.full_policy('ec2'),
                  s3_full=self.full_policy('s3'),
                  sbd_full=self.full_policy('sdb'))
    iamRoleName = self._ctx.setup_iam_ec2_role(
        role_name=_INSTANCE_PROFILE_ROLE_NAME, policies=policy)

    try:
        profile = self._ctx.iam.get_instance_profile(iamRoleName)
    except BotoServerError as e:
        if e.status == 404:
            # No such profile yet: create it and unwrap the creation response.
            profile = self._ctx.iam.create_instance_profile(iamRoleName)
            profile = profile.create_instance_profile_response.create_instance_profile_result
        else:
            raise
    else:
        profile = profile.get_instance_profile_response.get_instance_profile_result
    profile = profile.instance_profile
    profile_arn = profile.arn

    if len(profile.roles) > 1:
        raise RuntimeError(
            'Did not expect profile to contain more than one role')
    elif len(profile.roles) == 1:
        # this should be profile.roles[0].role_name
        if profile.roles.member.role_name == iamRoleName:
            return profile_arn
        else:
            self._ctx.iam.remove_role_from_instance_profile(
                iamRoleName, profile.roles.member.role_name)

    def roleOrProfileNotFound(err):
        # getattr() keeps the predicate from raising AttributeError (and thereby hiding the
        # real error) when it is handed an exception that has no ``status`` attribute.
        # NOTE(review): old_retry is defined elsewhere; presumably it may pass any exception
        # raised inside the attempt to this predicate - confirm.
        return getattr(err, 'status', None) == 404

    for attempt in old_retry(predicate=roleOrProfileNotFound):
        with attempt:
            self._ctx.iam.add_role_to_instance_profile(
                iamRoleName, iamRoleName)
    return profile_arn
def wrapper(*args, **kwargs):
    # Call the wrapped callable ``f`` (a name from the enclosing scope, not visible here)
    # under old_retry, using truncExpBackoff() delays, a 300 second timeout and
    # googleRetryPredicate to decide which errors are retried.
    attempts = old_retry(delays=truncExpBackoff(),
                         timeout=300,
                         predicate=googleRetryPredicate)
    for attempt in attempts:
        with attempt:
            # The first attempt that does not raise provides the return value.
            return f(*args, **kwargs)
def setNodeCount(self, nodeType, numNodes, preemptable=False, force=False):
    """
    Grow or shrink the set of preemptable or non-preemptable worker nodes of one node type
    towards the given size, and report the size that was actually reached.

    Nodes whose private IP is listed in ``self.ignoredNodes`` are un-ignored first when the
    cluster has to grow; only the remaining difference is added or removed.

    :param str nodeType: The node type to add or remove.

    :param int numNodes: Desired size of the cluster

    :param bool preemptable: whether the added nodes will be preemptable, i.e. whether they
           may be removed spontaneously by the underlying platform at any time.

    :param bool force: If False, the provisioner is allowed to deviate from the given number
           of nodes. For example, when downsizing a cluster, a provisioner might leave nodes
           running if they have active jobs running on them.

    :rtype: int
    :return: the number of worker nodes in the cluster after making the necessary
             adjustments. This value should be, but is not guaranteed to be, close or equal
             to the `numNodes` argument. It represents the closest possible approximation of
             the actual cluster size at the time this method returns.
    """
    kindLabel = 'preemptable' if preemptable else 'non-preemptable'

    for attempt in old_retry(predicate=self.provisioner.retryPredicate):
        with attempt:
            allInstances = self.getNodes(preemptable=preemptable)
            logger.debug("Cluster contains %i instances" % len(allInstances))

            # Keep only the nodes of the requested type.
            workerInstances = {}
            for candidate in allInstances:
                if candidate.nodeType == nodeType:
                    workerInstances[candidate] = allInstances[candidate]

            # Nodes that are being drained and are therefore currently ignored.
            ignoredNodes = []
            for candidate in workerInstances:
                if candidate.privateIP in self.ignoredNodes:
                    ignoredNodes.append(candidate)

            numCurrentNodes = len(workerInstances)
            numIgnoredNodes = len(ignoredNodes)
            logger.debug(
                "Cluster contains %i instances of type %s (%i ignored and draining jobs until "
                "they can be safely terminated)" % (numCurrentNodes, nodeType, numIgnoredNodes))

            # Without force, ignored nodes do not count towards the current size.
            if force:
                countedNodes = numCurrentNodes
            else:
                countedNodes = numCurrentNodes - numIgnoredNodes
            delta = numNodes - countedNodes

            if delta > 0 and numIgnoredNodes > 0:
                # We can un-ignore a few nodes to compensate for the additional nodes we want.
                numNodesToUnignore = min(delta, numIgnoredNodes)
                logger.debug(
                    'Unignoring %i nodes because we want to scale back up again.' % numNodesToUnignore)
                delta -= numNodesToUnignore
                for revived in ignoredNodes[:numNodesToUnignore]:
                    self.ignoredNodes.remove(revived.privateIP)
                    self.leader.batchSystem.unignoreNode(revived.privateIP)

            if delta > 0:
                logger.info(
                    'Adding %i %s nodes to get to desired cluster size of %i.',
                    delta, kindLabel, numNodes)
                numAdded = self._addNodes(nodeType, numNodes=delta, preemptable=preemptable)
                numNodes = numCurrentNodes + numAdded
            elif delta < 0:
                logger.info(
                    'Removing %i %s nodes to get to desired cluster size of %i.',
                    -delta, kindLabel, numNodes)
                numRemoved = self._removeNodes(workerInstances,
                                               nodeType=nodeType,
                                               numNodes=-delta,
                                               preemptable=preemptable,
                                               force=force)
                numNodes = numCurrentNodes - numRemoved
            elif not force:
                logger.debug(
                    'Cluster (minus ignored nodes) already at desired size of %i. Nothing to do.',
                    numNodes)
            else:
                logger.debug(
                    'Cluster already at desired size of %i. Nothing to do.', numNodes)
    return numNodes
def retry_s3(delays=default_delays, timeout=default_timeout, predicate=retryable_s3_errors):
    """
    Return the ``old_retry`` iterator configured with the S3 defaults.

    All three arguments are forwarded to ``old_retry`` unchanged.
    """
    retry_options = dict(delays=delays, timeout=timeout, predicate=predicate)
    return old_retry(**retry_options)
def retry_ec2(t=a_short_time, retry_for=10 * a_short_time, retry_while=not_found):
    """
    Return the ``old_retry`` iterator configured for EC2 calls.

    :param t: base delay; the delays handed to ``old_retry`` are ``t, t, 2*t, 4*t``.
    :param retry_for: forwarded as ``old_retry``'s ``timeout``.
    :param retry_while: forwarded as ``old_retry``'s ``predicate``.
    """
    backoff = (t, t, t * 2, t * 4)
    return old_retry(delays=backoff, timeout=retry_for, predicate=retry_while)
def addNodes(self, nodeType, numNodes, preemptable, spotBid=None):
    """
    Launch worker instances of the given type and return how many were launched.

    On-demand instances are requested for non-preemptable nodes, spot instances otherwise.
    Once the launch request has gone through, the instances are tagged as workers and, if an
    SSE key is configured, that key is copied onto every new node.

    :param nodeType: key into ``E2Instances`` selecting the instance type.
    :param numNodes: number of instances to request.
    :param preemptable: truthy to request spot instances, falsy for on-demand ones.
    :param spotBid: spot bid for preemptable nodes; when missing, the bid is looked up in
           ``self._spotBidsMap`` by node type.
    :return: the number of instances that were launched, which is what the final debug
             message reports as well.
    :rtype: int
    :raise RuntimeError: if a preemptable node is requested and no spot bid is available.
    """
    assert self._leaderPrivateIP
    if preemptable and not spotBid:
        if self._spotBidsMap and nodeType in self._spotBidsMap:
            spotBid = self._spotBidsMap[nodeType]
        else:
            raise RuntimeError(
                "No spot bid given for a preemptable node request.")
    instanceType = E2Instances[nodeType]
    bdm = self._getBlockDeviceMapping(
        instanceType,
        rootVolSize=self._nodeStorageOverrides.get(nodeType, self._nodeStorage))

    keyPath = self._sseKey if self._sseKey else None
    userData = self._getCloudConfigUserData('worker', self._masterPublicKey,
                                            keyPath, preemptable)
    if isinstance(userData, text_type):
        # Spot-market provisioning requires bytes for user data.
        userData = userData.encode('utf-8')
    sgs = [
        sg for sg in self._ctx.ec2.get_all_security_groups()
        if sg.name in self._leaderSecurityGroupNames
    ]
    kwargs = {
        'key_name': self._keyName,
        'security_group_ids': [sg.id for sg in sgs],
        'instance_type': instanceType.name,
        'user_data': userData,
        'block_device_map': bdm,
        'instance_profile_arn': self._leaderProfileArn,
        'placement': self._zone,
        'subnet_id': self._subnetID
    }

    instancesLaunched = []

    for attempt in old_retry(predicate=awsRetryPredicate):
        with attempt:
            # after we start launching instances we want to ensure the full setup is done
            # the biggest obstacle is AWS request throttling, so we retry on these errors at
            # every request in this method
            if not preemptable:
                logger.debug('Launching %s non-preemptable nodes', numNodes)
                instancesLaunched = create_ondemand_instances(
                    self._ctx.ec2,
                    image_id=self._discoverAMI(),
                    spec=kwargs,
                    num_instances=numNodes)
            else:
                logger.debug('Launching %s preemptable nodes', numNodes)
                kwargs['placement'] = getSpotZone(spotBid, instanceType.name, self._ctx)
                # force generator to evaluate
                instancesLaunched = list(
                    create_spot_instances(
                        ec2=self._ctx.ec2,
                        price=spotBid,
                        image_id=self._discoverAMI(),
                        tags={'clusterName': self.clusterName},
                        spec=kwargs,
                        num_instances=numNodes,
                        tentative=True))
                # flatten the list
                instancesLaunched = [
                    item for sublist in instancesLaunched for item in sublist
                ]

    for attempt in old_retry(predicate=awsRetryPredicate):
        with attempt:
            wait_instances_running(self._ctx.ec2, instancesLaunched)

    self._tags[_TOIL_NODE_TYPE_TAG_KEY] = 'worker'
    AWSProvisioner._addTags(instancesLaunched, self._tags)

    if self._sseKey:
        for i in instancesLaunched:
            self._waitForIP(i)
            node = Node(publicIP=i.ip_address,
                        privateIP=i.private_ip_address,
                        name=i.id,
                        launchTime=i.launch_time,
                        nodeType=i.instance_type,
                        preemptable=preemptable,
                        tags=i.tags)
            node.waitForNode('toil_worker')
            node.coreRsync([self._sseKey, ':' + self._sseKey],
                           applianceName='toil_worker')

    # Report the number that was actually launched rather than the number requested, so
    # the log line agrees with the return value.
    numLaunched = len(instancesLaunched)
    logger.debug('Launched %s new instance(s)', numLaunched)
    return numLaunched
def destroyCluster(self):
    """
    Terminate instances and delete the profile and security group.

    The leader (if one is found) is destroyed first, then pending spot requests are
    cancelled and the remaining instances that pass ``awsFilterImpairedNodes`` are
    destroyed. The security group named after the cluster is only deleted when every
    instance was selected for termination; otherwise a warning is logged and the group is
    left in place.
    """
    assert self._ctx

    def expectedShutdownErrors(e):
        # Retry while EC2 still reports dependent objects on the group.
        # getattr() keeps the predicate from raising AttributeError (and thereby hiding the
        # real error) when it is handed an exception that has no status/body.
        # NOTE(review): old_retry is defined elsewhere; presumably it may pass any exception
        # raised inside the attempt to this predicate - confirm.
        return (getattr(e, 'status', None) == 400
                and 'dependent object' in (getattr(e, 'body', None) or ''))

    def destroyInstances(instances):
        """
        Similar to _terminateInstances, except that it also cleans up any
        resources associated with the instances (e.g. IAM profiles).
        """
        self._deleteIAMProfiles(instances)
        self._terminateInstances(instances)

    # We should terminate the leader first in case a workflow is still running in the cluster.
    # The leader may create more instances while we're terminating the workers.
    vpcId = None
    try:
        leader = self.getLeader(returnRawInstance=True)
        vpcId = leader.vpc_id
        logger.info('Terminating the leader first ...')
        destroyInstances([leader])
        logger.info('Now terminating any remaining workers ...')
    except (NoSuchClusterException, InvalidClusterStateException):
        # It's ok if the leader is not found. We'll terminate any remaining
        # instances below anyway.
        pass

    instances = self._getNodesInCluster(nodeType=None, both=True)
    spotIDs = self._getSpotRequestIDs()
    if spotIDs:
        self._ctx.ec2.cancel_spot_instance_requests(request_ids=spotIDs)
    instancesToTerminate = awsFilterImpairedNodes(instances, self._ctx.ec2)
    if instancesToTerminate:
        # Fall back to the VPC of the first remaining instance when no leader was found.
        vpcId = vpcId or instancesToTerminate[0].vpc_id
        destroyInstances(instancesToTerminate)

    if len(instances) == len(instancesToTerminate):
        logger.debug('Deleting security group...')
        removed = False
        for attempt in old_retry(timeout=300, predicate=expectedShutdownErrors):
            with attempt:
                for sg in self._ctx.ec2.get_all_security_groups():
                    if sg.name == self.clusterName and vpcId and sg.vpc_id == vpcId:
                        try:
                            self._ctx.ec2.delete_security_group(group_id=sg.id)
                            removed = True
                        except BotoServerError as e:
                            if e.error_code == 'InvalidGroup.NotFound':
                                # Already gone; nothing left to delete.
                                pass
                            else:
                                raise
        if removed:
            logger.debug('... Succesfully deleted security group')
    else:
        assert len(instances) > len(instancesToTerminate)
        # the security group can't be deleted until all nodes are terminated
        logger.warning(
            'The TOIL_AWS_NODE_DEBUG environment variable is set and some nodes '
            'have failed health checks. As a result, the security group & IAM '
            'roles will not be deleted.')
def retry_sdb(delays=DEFAULT_DELAYS, timeout=DEFAULT_TIMEOUT, predicate=retryable_sdb_errors):
    """
    Return the ``old_retry`` iterator configured with the SDB defaults.

    All three arguments are forwarded to ``old_retry`` unchanged.
    """
    retry_options = dict(delays=delays, timeout=timeout, predicate=predicate)
    return old_retry(**retry_options)