def _createSecurityGroup(cls, ctx, name):
    def groupNotFound(e):
        retry = (e.status == 400 and 'does not exist in default VPC' in e.body)
        return retry
    # security group create/get. ssh + all ports open within the group
    try:
        web = ctx.ec2.create_security_group(name, 'Toil appliance security group')
    except EC2ResponseError as e:
        if e.status == 400 and 'already exists' in e.body:
            pass  # group exists - nothing to do
        else:
            raise
    else:
        for attempt in retry(predicate=groupNotFound, timeout=300):
            with attempt:
                # open port 22 for ssh-ing
                web.authorize(ip_protocol='tcp', from_port=22, to_port=22, cidr_ip='0.0.0.0/0')
        for attempt in retry(predicate=groupNotFound, timeout=300):
            with attempt:
                # the following authorizes all port access within the web security group
                web.authorize(ip_protocol='tcp', from_port=0, to_port=65535, src_group=web)
        for attempt in retry(predicate=groupNotFound, timeout=300):
            with attempt:
                # open ports 5050-5051 for the mesos web interface
                web.authorize(ip_protocol='tcp', from_port=5050, to_port=5051, cidr_ip='0.0.0.0/0')
def _createSecurityGroup(cls, ctx, name, vpcSubnet=None):
    def groupNotFound(e):
        retry = (e.status == 400 and 'does not exist in default VPC' in e.body)
        return retry
    vpcId = None
    if vpcSubnet:
        conn = boto.connect_vpc(region=ctx.ec2.region)
        subnets = conn.get_all_subnets(subnet_ids=[vpcSubnet])
        if len(subnets) > 0:
            vpcId = subnets[0].vpc_id
    # security group create/get. ssh + all ports open within the group
    try:
        web = ctx.ec2.create_security_group(name, 'Toil appliance security group', vpc_id=vpcId)
    except EC2ResponseError as e:
        if e.status == 400 and 'already exists' in e.body:
            pass  # group exists - nothing to do
        else:
            raise
    else:
        for attempt in retry(predicate=groupNotFound, timeout=300):
            with attempt:
                # open port 22 for ssh-ing
                web.authorize(ip_protocol='tcp', from_port=22, to_port=22, cidr_ip='0.0.0.0/0')
        for attempt in retry(predicate=groupNotFound, timeout=300):
            with attempt:
                # the following authorizes all port access within the web security group
                web.authorize(ip_protocol='tcp', from_port=0, to_port=65535, src_group=web)
    out = []
    for sg in ctx.ec2.get_all_security_groups():
        # parentheses added: only return groups with the requested name, and, when a VPC
        # was specified, only those in that VPC
        if sg.name == name and (vpcId is None or sg.vpc_id == vpcId):
            out.append(sg)
    return out
def _getNodesInCluster(cls, ctx, clusterName, preemptable=False, both=False):
    for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
        with attempt:
            pendingInstances = ctx.ec2.get_only_instances(
                filters={'instance.group-name': clusterName,
                         'instance-state-name': 'pending'})
    for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
        with attempt:
            runningInstances = ctx.ec2.get_only_instances(
                filters={'instance.group-name': clusterName,
                         'instance-state-name': 'running'})
    instances = set(pendingInstances)
    if not preemptable and not both:
        return [x for x in instances.union(set(runningInstances)) if x.spot_instance_request_id is None]
    elif preemptable and not both:
        return [x for x in instances.union(set(runningInstances)) if x.spot_instance_request_id is not None]
    elif both:
        return [x for x in instances.union(set(runningInstances))]
def _getProfileARN(cls, ctx):
    def addRoleErrors(e):
        return e.status == 404

    def throttleError(e):
        return isinstance(e, BotoServerError) and e.status == 400 and e.error_code == 'Throttling'

    def truncExpBackoff():
        # as recommended here https://forums.aws.amazon.com/thread.jspa?messageID=406788#406788
        yield 0
        t = 1
        while t < 1024:
            yield t
            t *= 2
        while True:
            yield t

    for attempt in retry(delays=truncExpBackoff(), predicate=throttleError):
        with attempt:
            roleName = 'toil'
            policy = dict(iam_full=iamFullPolicy, ec2_full=ec2FullPolicy,
                          s3_full=s3FullPolicy, sbd_full=sdbFullPolicy)
            iamRoleName = ctx.setup_iam_ec2_role(role_name=roleName, policies=policy)

            try:
                profile = ctx.iam.get_instance_profile(iamRoleName)
            except BotoServerError as e:
                if e.status == 404:
                    profile = ctx.iam.create_instance_profile(iamRoleName)
                    profile = profile.create_instance_profile_response.create_instance_profile_result
                else:
                    raise
            else:
                profile = profile.get_instance_profile_response.get_instance_profile_result
            profile = profile.instance_profile
            profile_arn = profile.arn

            if len(profile.roles) > 1:
                raise RuntimeError('Did not expect profile to contain more than one role')
            elif len(profile.roles) == 1:
                # this should be profile.roles[0].role_name
                if profile.roles.member.role_name == iamRoleName:
                    return profile_arn
                else:
                    ctx.iam.remove_role_from_instance_profile(iamRoleName,
                                                              profile.roles.member.role_name)
            for attempt in retry(predicate=addRoleErrors):
                with attempt:
                    ctx.iam.add_role_to_instance_profile(iamRoleName, iamRoleName)
            return profile_arn
def addNodes(self, nodeType, numNodes, preemptable):
    instanceType = ec2_instance_types[nodeType]
    bdm = self._getBlockDeviceMapping(instanceType, rootVolSize=self.nodeStorage)
    arn = self._getProfileARN(self.ctx)
    keyPath = '' if not self.config or not self.config.sseKey else self.config.sseKey
    entryPoint = 'mesos-slave' if not self.config or not self.config.sseKey else "waitForKey.sh"
    workerData = dict(role='worker',
                      image=applianceSelf(),
                      entrypoint=entryPoint,
                      sshKey=self.masterPublicKey,
                      args=workerArgs.format(ip=self.leaderIP, preemptable=preemptable, keyPath=keyPath))
    userData = awsUserData.format(**workerData)
    sgs = [sg for sg in self.ctx.ec2.get_all_security_groups() if sg.name == self.clusterName]
    kwargs = {'key_name': self.keyName,
              'security_group_ids': [sg.id for sg in sgs],
              'instance_type': instanceType.name,
              'user_data': userData,
              'block_device_map': bdm,
              'instance_profile_arn': arn,
              'placement': getCurrentAWSZone()}
    kwargs["subnet_id"] = self.subnetID if self.subnetID else self._getClusterInstance(self.instanceMetaData).subnet_id

    instancesLaunched = []

    for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
        with attempt:
            # after we start launching instances we want to insure the full setup is done
            # the biggest obstacle is AWS request throttling, so we retry on these errors at
            # every request in this method
            if not preemptable:
                logger.info('Launching %s non-preemptable nodes', numNodes)
                instancesLaunched = create_ondemand_instances(self.ctx.ec2,
                                                              image_id=self._discoverAMI(self.ctx),
                                                              spec=kwargs, num_instances=numNodes)
            else:
                logger.info('Launching %s preemptable nodes', numNodes)
                kwargs['placement'] = getSpotZone(self.spotBids[nodeType], instanceType.name, self.ctx)
                # force generator to evaluate
                instancesLaunched = list(create_spot_instances(ec2=self.ctx.ec2,
                                                               price=self.spotBids[nodeType],
                                                               image_id=self._discoverAMI(self.ctx),
                                                               tags={'clusterName': self.clusterName},
                                                               spec=kwargs,
                                                               num_instances=numNodes,
                                                               tentative=True))
                # flatten the list
                instancesLaunched = [item for sublist in instancesLaunched for item in sublist]

    for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
        with attempt:
            wait_instances_running(self.ctx.ec2, instancesLaunched)

    # request throttling retry happens internally to these two methods to insure proper granularity
    AWSProvisioner._addTags(instancesLaunched, self.tags)
    self._propagateKey(instancesLaunched)

    logger.info('Launched %s new instance(s)', numNodes)
    return len(instancesLaunched)
def addNodes(self, numNodes, preemptable):
    instanceType = self._getInstanceType(preemptable)
    bdm = self._getBlockDeviceMapping(instanceType, rootVolSize=self.nodeStorage)
    arn = self._getProfileARN(self.ctx)
    keyPath = '' if not self.config or not self.config.sseKey else self.config.sseKey
    entryPoint = 'mesos-slave' if not self.config or not self.config.sseKey else "waitForKey.sh"
    workerData = dict(role='worker',
                      image=applianceSelf(),
                      entrypoint=entryPoint,
                      sshKey=self.masterPublicKey,
                      args=workerArgs.format(ip=self.leaderIP, preemptable=preemptable, keyPath=keyPath))
    userData = awsUserData.format(**workerData)
    sgs = [sg for sg in self.ctx.ec2.get_all_security_groups() if sg.name == self.clusterName]
    kwargs = {'key_name': self.keyName,
              'security_group_ids': [sg.id for sg in sgs],
              'instance_type': instanceType.name,
              'user_data': userData,
              'block_device_map': bdm,
              'instance_profile_arn': arn,
              'placement': getCurrentAWSZone()}
    kwargs["subnet_id"] = self.subnetID if self.subnetID else self._getClusterInstance(self.instanceMetaData).subnet_id

    instancesLaunched = []

    for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
        with attempt:
            # after we start launching instances we want to insure the full setup is done
            # the biggest obstacle is AWS request throttling, so we retry on these errors at
            # every request in this method
            if not preemptable:
                logger.info('Launching %s non-preemptable nodes', numNodes)
                instancesLaunched = create_ondemand_instances(self.ctx.ec2,
                                                              image_id=self._discoverAMI(self.ctx),
                                                              spec=kwargs, num_instances=numNodes)
            else:
                logger.info('Launching %s preemptable nodes', numNodes)
                kwargs['placement'] = getSpotZone(self.spotBid, instanceType.name, self.ctx)
                # force generator to evaluate
                instancesLaunched = list(create_spot_instances(ec2=self.ctx.ec2,
                                                               price=self.spotBid,
                                                               image_id=self._discoverAMI(self.ctx),
                                                               tags={'clusterName': self.clusterName},
                                                               spec=kwargs,
                                                               num_instances=numNodes,
                                                               tentative=True))
                # flatten the list
                instancesLaunched = [item for sublist in instancesLaunched for item in sublist]

    for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
        with attempt:
            wait_instances_running(self.ctx.ec2, instancesLaunched)

    # request throttling retry happens internally to these two methods to insure proper granularity
    AWSProvisioner._addTags(instancesLaunched, self.tags)
    self._propagateKey(instancesLaunched)

    logger.info('Launched %s new instance(s)', numNodes)
    return len(instancesLaunched)
def _getProfileARN(cls, ctx):
    def addRoleErrors(e):
        return e.status == 404

    def throttleError(e):
        return isinstance(e, BotoServerError) and e.status == 400 and e.error_code == 'Throttling'

    def truncExpBackoff():
        # as recommended here https://forums.aws.amazon.com/thread.jspa?messageID=406788#406788
        yield 0
        t = 1
        while t < 1024:
            yield t
            t *= 2
        while True:
            yield t

    for attempt in retry(delays=truncExpBackoff(), predicate=throttleError):
        with attempt:
            roleName = 'toil'
            policy = dict(iam_full=iamFullPolicy, ec2_full=ec2FullPolicy,
                          s3_full=s3FullPolicy, sbd_full=sdbFullPolicy)
            iamRoleName = ctx.setup_iam_ec2_role(role_name=roleName, policies=policy)

            try:
                profile = ctx.iam.get_instance_profile(iamRoleName)
            except BotoServerError as e:
                if e.status == 404:
                    profile = ctx.iam.create_instance_profile(iamRoleName)
                    profile = profile.create_instance_profile_response.create_instance_profile_result
                else:
                    raise
            else:
                profile = profile.get_instance_profile_response.get_instance_profile_result
            profile = profile.instance_profile
            profile_arn = profile.arn

            if len(profile.roles) > 1:
                raise RuntimeError('Did not expect profile to contain more than one role')
            elif len(profile.roles) == 1:
                # this should be profile.roles[0].role_name
                if profile.roles.member.role_name == iamRoleName:
                    return profile_arn
                else:
                    ctx.iam.remove_role_from_instance_profile(iamRoleName,
                                                              profile.roles.member.role_name)
            for attempt in retry(predicate=addRoleErrors):
                with attempt:
                    ctx.iam.add_role_to_instance_profile(iamRoleName, iamRoleName)
            return profile_arn
def _dockerKill(containerName, action):
    """
    Deprecated. Kills the specified container.

    :param str containerName: The name of the container created by docker_call
    :param int action: What action should be taken on the container?
    """
    running = containerIsRunning(containerName)
    if running is None:
        # This means that the container doesn't exist. We will see this if the
        # container was run with --rm and has already exited before this call.
        logger.info('The container with name "%s" appears to have already been '
                    'removed. Nothing to do.', containerName)
    else:
        if action in (None, FORGO):
            logger.info('The container with name %s continues to exist as we '
                        'were asked to forgo a post-job action on it.', containerName)
        else:
            logger.info('The container with name %s exists. Running '
                        'user-specified defer functions.', containerName)
            if running and action >= STOP:
                logger.info('Stopping container "%s".', containerName)
                for attempt in retry(predicate=dockerPredicate):
                    with attempt:
                        subprocess.check_call(['docker', 'stop', containerName])
            else:
                logger.info('The container "%s" was not found to be running.', containerName)
            if action >= RM:
                # If the container was run with --rm, then stop will most likely
                # remove the container. We first check if it is running then
                # remove it.
                running = containerIsRunning(containerName)
                if running is not None:
                    logger.info('Removing container "%s".', containerName)
                    for attempt in retry(predicate=dockerPredicate):
                        with attempt:
                            subprocess.check_call(['docker', 'rm', '-f', containerName])
                else:
                    logger.info('Container "%s" was not found on the system. '
                                'Nothing to remove.', containerName)
def _getProfileARN(cls, ctx):
    def addRoleErrors(e):
        return e.status == 404

    roleName = 'toil'
    policy = dict(iam_full=iamFullPolicy, ec2_full=ec2FullPolicy,
                  s3_full=s3FullPolicy, sbd_full=sdbFullPolicy)
    iamRoleName = ctx.setup_iam_ec2_role(role_name=roleName, policies=policy)

    try:
        profile = ctx.iam.get_instance_profile(iamRoleName)
    except BotoServerError as e:
        if e.status == 404:
            profile = ctx.iam.create_instance_profile(iamRoleName)
            profile = profile.create_instance_profile_response.create_instance_profile_result
        else:
            raise
    else:
        profile = profile.get_instance_profile_response.get_instance_profile_result
    profile = profile.instance_profile
    profile_arn = profile.arn

    if len(profile.roles) > 1:
        raise RuntimeError('Did not expect profile to contain more than one role')
    elif len(profile.roles) == 1:
        # this should be profile.roles[0].role_name
        if profile.roles.member.role_name == iamRoleName:
            return profile_arn
        else:
            ctx.iam.remove_role_from_instance_profile(iamRoleName,
                                                      profile.roles.member.role_name)
    for attempt in retry(predicate=addRoleErrors):
        with attempt:
            ctx.iam.add_role_to_instance_profile(iamRoleName, iamRoleName)
    return profile_arn
def _getNodesInCluster(cls, ctx, clusterName, preemptable=False, both=False):
    for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
        with attempt:
            pendingInstances = ctx.ec2.get_only_instances(filters={'instance.group-name': clusterName,
                                                                   'instance-state-name': 'pending'})
    for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
        with attempt:
            runningInstances = ctx.ec2.get_only_instances(filters={'instance.group-name': clusterName,
                                                                   'instance-state-name': 'running'})
    instances = set(pendingInstances)
    if not preemptable and not both:
        return [x for x in instances.union(set(runningInstances)) if x.spot_instance_request_id is None]
    elif preemptable and not both:
        return [x for x in instances.union(set(runningInstances)) if x.spot_instance_request_id is not None]
    elif both:
        return [x for x in instances.union(set(runningInstances))]
def _getClusterInstance(self, md):
    zone = getCurrentAWSZone()
    region = Context.availability_zone_re.match(zone).group(1)
    conn = boto.ec2.connect_to_region(region)
    for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
        with attempt:
            return conn.get_all_instances(instance_ids=[md["instance-id"]])[0].instances[0]
def _containerIsRunning(container_name):
    """
    Checks whether the container is running or not.

    :param container_name: Name of the container being checked.
    :returns: True if running, False if not running, None if the container doesn't exist.
    :rtype: bool
    """
    try:
        for attempt in retry(predicate=dockerPredicate):
            with attempt:
                output = subprocess.check_output(['docker', 'inspect', '--format',
                                                  '{{.State.Running}}', container_name]).strip()
    except subprocess.CalledProcessError:
        # This will be raised if the container didn't exist.
        _logger.debug("'docker inspect' failed. Assuming container %s doesn't exist.",
                      container_name, exc_info=True)
        return None
    if output == 'true':
        return True
    elif output == 'false':
        return False
    else:
        raise RuntimeError("Got unexpected value for State.Running (%s)" % output)
def _fixPermissions(tool, workDir):
    """
    Fix permission of a mounted Docker directory by reusing the tool to change ownership.
    Docker natively runs as a root inside the container, and files written to the
    mounted directory are implicitly owned by root.

    :param list baseDockerCall: Docker run parameters
    :param str tool: Name of tool
    :param str workDir: Path of work directory to recursively chown
    """
    if os.geteuid() == 0:
        # we're running as root so this chown is redundant
        return

    baseDockerCall = ['docker', 'run', '--log-driver=none',
                      '-v', os.path.abspath(workDir) + ':/data', '--rm', '--entrypoint=chown']
    stat = os.stat(workDir)
    command = baseDockerCall + [tool] + ['-R', '{}:{}'.format(stat.st_uid, stat.st_gid), '/data']
    for attempt in retry(predicate=dockerPredicate):
        with attempt:
            subprocess.check_call(command)
def destroyCluster(cls, clusterName, zone=None):
    def expectedShutdownErrors(e):
        return e.status == 400 and 'dependent object' in e.body

    ctx = cls._buildContext(clusterName=clusterName, zone=zone)
    instances = cls._getNodesInCluster(ctx, clusterName, both=True)
    spotIDs = cls._getSpotRequestIDs(ctx, clusterName)
    if spotIDs:
        ctx.ec2.cancel_spot_instance_requests(request_ids=spotIDs)
    instancesToTerminate = awsFilterImpairedNodes(instances, ctx.ec2)
    if instancesToTerminate:
        cls._deleteIAMProfiles(instances=instancesToTerminate, ctx=ctx)
        cls._terminateInstances(instances=instancesToTerminate, ctx=ctx)
    if len(instances) == len(instancesToTerminate):
        logger.info('Deleting security group...')
        for attempt in retry(timeout=300, predicate=expectedShutdownErrors):
            with attempt:
                try:
                    ctx.ec2.delete_security_group(name=clusterName)
                except BotoServerError as e:
                    if e.error_code == 'InvalidGroup.NotFound':
                        pass
                    else:
                        raise
        logger.info('... Successfully deleted security group')
    else:
        assert len(instances) > len(instancesToTerminate)
        # the security group can't be deleted until all nodes are terminated
        logger.warning('The TOIL_AWS_NODE_DEBUG environment variable is set and some nodes '
                       'have failed health checks. As a result, the security group & IAM '
                       'roles will not be deleted.')
def _addTags(cls, instances, tags):
    for instance in instances:
        for key, value in iteritems(tags):
            for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
                with attempt:
                    instance.add_tag(key, value)
def retry_ec2(retry_after=a_short_time, retry_for=10 * a_short_time, retry_while=not_found):
    t = retry_after
    return retry(delays=(t, t, t * 2, t * 4), timeout=retry_for, predicate=retry_while)
def _propagateKey(self, instances):
    if not self.config or not self.config.sseKey:
        return
    for node in instances:
        for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
            with attempt:
                # since we're going to be rsyncing into the appliance we need the appliance to be running first
                ipAddress = self._waitForNode(node, 'toil_worker')
                self._rsyncNode(ipAddress,
                                [self.config.sseKey, ':' + self.config.sseKey],
                                applianceName='toil_worker')
def setNodeCount(self, numNodes, preemptable=False, force=False):
    """
    Attempt to grow or shrink the number of preemptable or non-preemptable worker nodes in
    the cluster to the given value, or as close a value as possible, and, after performing
    the necessary additions or removals of worker nodes, return the resulting number of
    preemptable or non-preemptable nodes currently in the cluster.

    :param int numNodes: Desired size of the cluster
    :param bool preemptable: whether the added nodes will be preemptable, i.e. whether they
           may be removed spontaneously by the underlying platform at any time.
    :param bool force: If False, the provisioner is allowed to deviate from the given number
           of nodes. For example, when downsizing a cluster, a provisioner might leave nodes
           running if they have active jobs running on them.

    :rtype: int
    :return: the number of nodes in the cluster after making the necessary adjustments. This
             value should be, but is not guaranteed to be, close or equal to the `numNodes`
             argument. It represents the closest possible approximation of the actual cluster
             size at the time this method returns.
    """
    for attempt in retry(predicate=self.retryPredicate):
        with attempt:
            workerInstances = self._getWorkersInCluster(preemptable)
            numCurrentNodes = len(workerInstances)
            delta = numNodes - numCurrentNodes
            if delta > 0:
                log.info('Adding %i %s nodes to get to desired cluster size of %i.',
                         delta,
                         'preemptable' if preemptable else 'non-preemptable',
                         numNodes)
                numNodes = numCurrentNodes + self._addNodes(workerInstances,
                                                            numNodes=delta,
                                                            preemptable=preemptable)
            elif delta < 0:
                log.info('Removing %i %s nodes to get to desired cluster size of %i.',
                         -delta,
                         'preemptable' if preemptable else 'non-preemptable',
                         numNodes)
                numNodes = numCurrentNodes - self._removeNodes(workerInstances,
                                                               numNodes=-delta,
                                                               preemptable=preemptable,
                                                               force=force)
            else:
                log.info('Cluster already at desired size of %i. Nothing to do.', numNodes)
    return numNodes
def _dockerKill(containerName, action):
    """
    Kills the specified container.

    :param str containerName: The name of the container created by docker_call
    :param int action: What action should be taken on the container? See `defer=` in
           :func:`docker_call`
    """
    running = _containerIsRunning(containerName)
    if running is None:
        # This means that the container doesn't exist. We will see this if the container was run
        # with --rm and has already exited before this call.
        _logger.info('The container with name "%s" appears to have already been removed. Nothing to '
                     'do.', containerName)
    else:
        if action in (None, FORGO):
            _logger.info('The container with name %s continues to exist as we were asked to forgo a '
                         'post-job action on it.', containerName)
        else:
            _logger.info('The container with name %s exists. Running user-specified defer functions.',
                         containerName)
            if running and action >= STOP:
                _logger.info('Stopping container "%s".', containerName)
                for attempt in retry(predicate=dockerPredicate):
                    with attempt:
                        subprocess.check_call(['docker', 'stop', containerName])
            else:
                _logger.info('The container "%s" was not found to be running.', containerName)
            if action >= RM:
                # If the container was run with --rm, then stop will most likely remove the
                # container. We first check if it is running then remove it.
                running = _containerIsRunning(containerName)
                if running is not None:
                    _logger.info('Removing container "%s".', containerName)
                    for attempt in retry(predicate=dockerPredicate):
                        with attempt:
                            subprocess.check_call(['docker', 'rm', '-f', containerName])
                else:
                    _logger.info('The container "%s" was not found on the system. Nothing to remove.',
                                 containerName)
def _createSecurityGroup(cls, ctx, name):
    def groupNotFound(e):
        retry = (e.status == 400 and 'does not exist in default VPC' in e.body)
        return retry
    # security group create/get. ssh + all ports open within the group
    try:
        web = ctx.ec2.create_security_group(name, 'Toil appliance security group')
    except EC2ResponseError as e:
        if e.status == 400 and 'already exists' in e.body:
            pass  # group exists - nothing to do
        else:
            raise
    else:
        for attempt in retry(predicate=groupNotFound, timeout=300):
            with attempt:
                # open port 22 for ssh-ing
                web.authorize(ip_protocol='tcp', from_port=22, to_port=22, cidr_ip='0.0.0.0/0')
        for attempt in retry(predicate=groupNotFound, timeout=300):
            with attempt:
                # the following authorizes all port access within the web security group
                web.authorize(ip_protocol='tcp', from_port=0, to_port=65535, src_group=web)
def _download(self, dstFile):
    """
    Download this resource from its URL to the given file object.

    :type dstFile: io.BytesIO|io.FileIO
    """
    for attempt in retry(predicate=lambda e: isinstance(e, HTTPError) and e.code == 400):
        with attempt:
            with closing(urlopen(self.url)) as content:
                buf = content.read()
    contentHash = hashlib.md5(buf)
    assert contentHash.hexdigest() == self.contentHash
    dstFile.write(buf)
def _discoverAMI(cls, ctx):
    def descriptionMatches(ami):
        return ami.description is not None and 'stable 1235.4.0' in ami.description

    coreOSAMI = os.environ.get('TOIL_AWS_AMI')
    if coreOSAMI is not None:
        return coreOSAMI
    # that ownerID corresponds to coreOS
    for attempt in retry(predicate=lambda e: isinstance(e, SSLError)):
        # SSLError is thrown when get_all_images times out
        with attempt:
            amis = ctx.ec2.get_all_images(owners=['679593333241'])
            coreOSAMI = [ami for ami in amis if descriptionMatches(ami)]
    logger.debug('Found the following matching AMIs: %s', coreOSAMI)
    assert len(coreOSAMI) == 1
    return coreOSAMI.pop().id
def destroyCluster(self):
    """
    Terminate instances and delete the profile and security group.
    """
    assert self._ctx

    def expectedShutdownErrors(e):
        return e.status == 400 and 'dependent object' in e.body

    instances = self._getNodesInCluster(nodeType=None, both=True)
    spotIDs = self._getSpotRequestIDs()
    if spotIDs:
        self._ctx.ec2.cancel_spot_instance_requests(request_ids=spotIDs)
    instancesToTerminate = awsFilterImpairedNodes(instances, self._ctx.ec2)
    vpcId = None
    if instancesToTerminate:
        vpcId = instancesToTerminate[0].vpc_id
        self._deleteIAMProfiles(instances=instancesToTerminate)
        self._terminateInstances(instances=instancesToTerminate)
    if len(instances) == len(instancesToTerminate):
        logger.info('Deleting security group...')
        removed = False
        for attempt in retry(timeout=300, predicate=expectedShutdownErrors):
            with attempt:
                for sg in self._ctx.ec2.get_all_security_groups():
                    if sg.name == self.clusterName and vpcId and sg.vpc_id == vpcId:
                        try:
                            self._ctx.ec2.delete_security_group(group_id=sg.id)
                            removed = True
                        except BotoServerError as e:
                            if e.error_code == 'InvalidGroup.NotFound':
                                pass
                            else:
                                raise
        if removed:
            logger.info('... Successfully deleted security group')
    else:
        assert len(instances) > len(instancesToTerminate)
        # the security group can't be deleted until all nodes are terminated
        logger.warning('The TOIL_AWS_NODE_DEBUG environment variable is set and some nodes '
                       'have failed health checks. As a result, the security group & IAM '
                       'roles will not be deleted.')
def _fixPermissions(tool, workDir):
    """
    Fix permission of a mounted Docker directory by reusing the tool to change ownership.
    Docker natively runs as a root inside the container, and files written to the
    mounted directory are implicitly owned by root.

    :param list baseDockerCall: Docker run parameters
    :param str tool: Name of tool
    :param str workDir: Path of work directory to recursively chown
    """
    if os.geteuid() == 0:
        # we're running as root so this chown is redundant
        return

    baseDockerCall = ['docker', 'run', '--log-driver=none',
                      '-v', os.path.abspath(workDir) + ':/data', '--rm', '--entrypoint=chown']
    stat = os.stat(workDir)
    command = baseDockerCall + [tool] + ['-R', '{}:{}'.format(stat.st_uid, stat.st_gid), '/data']
    for attempt in retry(predicate=dockerPredicate):
        with attempt:
            subprocess.check_call(command)
def setNodeCount(self, numNodes, preemptable=False, force=False):
    """
    Attempt to grow or shrink the number of preemptable or non-preemptable worker nodes in
    the cluster to the given value, or as close a value as possible, and, after performing
    the necessary additions or removals of worker nodes, return the resulting number of
    preemptable or non-preemptable nodes currently in the cluster.

    :param int numNodes: Desired size of the cluster
    :param bool preemptable: whether the added nodes will be preemptable, i.e. whether they
           may be removed spontaneously by the underlying platform at any time.
    :param bool force: If False, the provisioner is allowed to deviate from the given number
           of nodes. For example, when downsizing a cluster, a provisioner might leave nodes
           running if they have active jobs running on them.

    :rtype: int
    :return: the number of worker nodes in the cluster after making the necessary
             adjustments. This value should be, but is not guaranteed to be, close or equal to
             the `numNodes` argument. It represents the closest possible approximation of the
             actual cluster size at the time this method returns.
    """
    for attempt in retry(predicate=self.scaler.provisioner.retryPredicate):
        with attempt:
            workerInstances = self.getNodes(preemptable=preemptable)
            numCurrentNodes = len(workerInstances)
            delta = numNodes - numCurrentNodes
            if delta > 0:
                logger.info('Adding %i %s nodes to get to desired cluster size of %i.',
                            delta,
                            'preemptable' if preemptable else 'non-preemptable',
                            numNodes)
                numNodes = numCurrentNodes + self._addNodes(numNodes=delta,
                                                            preemptable=preemptable)
            elif delta < 0:
                logger.info('Removing %i %s nodes to get to desired cluster size of %i.',
                            -delta,
                            'preemptable' if preemptable else 'non-preemptable',
                            numNodes)
                numNodes = numCurrentNodes - self._removeNodes(workerInstances,
                                                               numNodes=-delta,
                                                               preemptable=preemptable,
                                                               force=force)
            else:
                logger.info('Cluster already at desired size of %i. Nothing to do.', numNodes)
    return numNodes
def __setup_entity_policies( self, entity_name, policies,
                             list_policies, delete_policy, get_policy, put_policy ):
    # Delete superfluous policies
    policy_names = set( list_policies( entity_name ).policy_names )
    for policy_name in policy_names.difference( set( policies.keys( ) ) ):
        delete_policy( entity_name, policy_name )

    # Create expected policies
    for policy_name, policy in policies.iteritems( ):
        current_policy = None
        try:
            current_policy = json.loads( urllib.unquote(
                get_policy( entity_name, policy_name ).policy_document ) )
        except BotoServerError as e:
            if e.status == 404 and e.error_code == 'NoSuchEntity':
                pass
            else:
                raise
        if current_policy != policy:
            for attempt in retry(predicate=throttlePredicate):
                with attempt:
                    put_policy( entity_name, policy_name, json.dumps( policy ) )
def _containerIsRunning(container_name):
    """
    Checks whether the container is running or not.

    :param container_name: Name of the container being checked.
    :returns: True if running, False if not running, None if the container doesn't exist.
    :rtype: bool
    """
    try:
        for attempt in retry(predicate=dockerPredicate):
            with attempt:
                output = subprocess.check_output(['docker', 'inspect', '--format',
                                                  '{{.State.Running}}', container_name]).strip()
    except subprocess.CalledProcessError:
        # This will be raised if the container didn't exist.
        _logger.debug("'docker inspect' failed. Assuming container %s doesn't exist.",
                      container_name, exc_info=True)
        return None
    if output == 'true':
        return True
    elif output == 'false':
        return False
    else:
        raise RuntimeError("Got unexpected value for State.Running (%s)" % output)
def _terminateIDs(cls, instanceIDs, ctx):
    logger.info('Terminating instance(s): %s', instanceIDs)
    for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
        with attempt:
            ctx.ec2.terminate_instances(instance_ids=instanceIDs)
    logger.info('Instance(s) terminated.')
def addNodes(self, nodeType, numNodes, preemptable, spotBid=None):
    assert self._leaderPrivateIP
    if preemptable and not spotBid:
        if self._spotBidsMap and nodeType in self._spotBidsMap:
            spotBid = self._spotBidsMap[nodeType]
        else:
            raise RuntimeError("No spot bid given for a preemptable node request.")
    instanceType = ec2_instance_types[nodeType]
    bdm = self._getBlockDeviceMapping(instanceType, rootVolSize=self._nodeStorage)
    arn = self._getProfileARN()

    keyPath = self._sseKey if self._sseKey else None
    userData = self._getCloudConfigUserData('worker', self._masterPublicKey, keyPath, preemptable)
    sgs = [sg for sg in self._ctx.ec2.get_all_security_groups() if sg.name == self.clusterName]
    kwargs = {'key_name': self._keyName,
              'security_group_ids': [sg.id for sg in sgs],
              'instance_type': instanceType.name,
              'user_data': userData,
              'block_device_map': bdm,
              'instance_profile_arn': arn,
              'placement': self._zone,
              'subnet_id': self._subnetID}

    instancesLaunched = []

    for attempt in retry(predicate=awsRetryPredicate):
        with attempt:
            # after we start launching instances we want to insure the full setup is done
            # the biggest obstacle is AWS request throttling, so we retry on these errors at
            # every request in this method
            if not preemptable:
                logger.info('Launching %s non-preemptable nodes', numNodes)
                instancesLaunched = create_ondemand_instances(self._ctx.ec2,
                                                              image_id=self._discoverAMI(),
                                                              spec=kwargs, num_instances=numNodes)
            else:
                logger.info('Launching %s preemptable nodes', numNodes)
                kwargs['placement'] = getSpotZone(spotBid, instanceType.name, self._ctx)
                # force generator to evaluate
                instancesLaunched = list(create_spot_instances(ec2=self._ctx.ec2,
                                                               price=spotBid,
                                                               image_id=self._discoverAMI(),
                                                               tags={'clusterName': self.clusterName},
                                                               spec=kwargs,
                                                               num_instances=numNodes,
                                                               tentative=True))
                # flatten the list
                instancesLaunched = [item for sublist in instancesLaunched for item in sublist]

    for attempt in retry(predicate=awsRetryPredicate):
        with attempt:
            wait_instances_running(self._ctx.ec2, instancesLaunched)

    AWSProvisioner._addTags(instancesLaunched, self._tags)
    if self._sseKey:
        for i in instancesLaunched:
            self._waitForIP(i)
            node = Node(publicIP=i.ip_address, privateIP=i.private_ip_address, name=i.id,
                        launchTime=i.launch_time, nodeType=i.instance_type,
                        preemptable=preemptable, tags=i.tags)
            node.waitForNode('toil_worker')
            node.coreRsync([self._sseKey, ':' + self._sseKey], applianceName='toil_worker')
    logger.info('Launched %s new instance(s)', numNodes)
    return len(instancesLaunched)
def _docker(job,
            tool,
            parameters=None,
            workDir=None,
            dockerParameters=None,
            outfile=None,
            checkOutput=False,
            defer=None):
    """
    :param toil.Job.job job: The Job instance for the calling function.
    :param str tool: Name of the Docker image to be used (e.g. quay.io/ucsc_cgl/samtools).
    :param list[str] parameters: Command line arguments to be passed to the tool. If list of
           lists: list[list[str]], then treat as successive commands chained with pipe.
    :param str workDir: Directory to mount into the container via `-v`. Destination convention is /data
    :param list[str] dockerParameters: Parameters to pass to Docker. Default parameters are `--rm`,
            `--log-driver none`, and the mountpoint `-v work_dir:/data` where /data is the destination
            convention. These defaults are removed if dockerParameters is passed, so be sure to pass
            them if they are desired.
    :param file outfile: Pipe output of Docker call to file handle
    :param bool checkOutput: When True, this function returns docker's output.
    :param int defer: What action should be taken on the container upon job completion?
           FORGO (0) will leave the container untouched.
           STOP (1) will attempt to stop the container with `docker stop` (useful for debugging).
           RM (2) will stop the container and then forcefully remove it from the system
           using `docker rm -f`. This is the default behavior if defer is set to None.
    """
    if parameters is None:
        parameters = []
    if workDir is None:
        workDir = os.getcwd()

    # Setup the outgoing subprocess call for docker
    baseDockerCall = ['docker', 'run']
    if dockerParameters:
        baseDockerCall += dockerParameters
    else:
        baseDockerCall += ['--rm', '--log-driver', 'none', '-v',
                           os.path.abspath(workDir) + ':/data']

    # Ensure the user has passed a valid value for defer
    require(defer in (None, FORGO, STOP, RM),
            'Please provide a valid value for defer.')

    # Get container name which is needed for _dockerKill
    try:
        if any('--name' in x for x in baseDockerCall):
            if any('--name=' in x for x in baseDockerCall):
                containerName = [x.split('=')[1] for x in baseDockerCall if '--name' in x][0]
            else:
                containerName = baseDockerCall[baseDockerCall.index('--name') + 1]
        else:
            containerName = _getContainerName(job)
    except ValueError:
        containerName = _getContainerName(job)
        baseDockerCall.extend(['--name', containerName])
    except IndexError:
        raise RuntimeError("Couldn't parse Docker's `--name=` option, check parameters: " +
                           str(dockerParameters))

    # Defer the container on-exit action
    if '--rm' in baseDockerCall and defer is None:
        defer = RM
    if '--rm' in baseDockerCall and defer is not RM:
        _logger.warn('--rm being passed to docker call but defer not set to dockerCall.RM, defer set to: ' + str(defer))
    job.defer(_dockerKill, containerName, action=defer)
    # Defer the permission fixing function which will run after this job concludes.
    # We call this explicitly later on in this function, but we defer it as well to handle unexpected job failure.
    job.defer(_fixPermissions, tool, workDir)

    # Make subprocess call

    # If parameters is list of lists, treat each list as separate command and chain with pipes
    if len(parameters) > 0 and type(parameters[0]) is list:
        # When piping, all arguments now get merged into a single string to bash -c.
        # We try to support spaces in paths by wrapping them all in quotes first.
        chain_params = [' '.join(p) for p in [map(pipes.quote, q) for q in parameters]]
        call = baseDockerCall + ['--entrypoint', '/bin/bash', tool, '-c', ' | '.join(chain_params)]
    else:
        call = baseDockerCall + [tool] + parameters
    _logger.info("Calling docker with " + repr(call))

    params = {}
    if outfile:
        params['stdout'] = outfile
    if checkOutput:
        callMethod = subprocess.check_output
    else:
        callMethod = subprocess.check_call

    for attempt in retry(predicate=dockerPredicate):
        with attempt:
            out = callMethod(call, **params)

    _fixPermissions(tool=tool, workDir=workDir)
    return out
def retry_azure(delays=(0, 1, 1, 4, 16, 64), timeout=300, predicate=defaultRetryPredicate):
    return retry(delays=delays, timeout=timeout, predicate=predicate)
def retry_azure(delays=(0, 1, 1, 4, 16, 64), timeout=300, predicate=defaultRetryPredicate):
    return retry(delays=delays, timeout=timeout, predicate=predicate)
def retry_s3(delays=default_delays, timeout=default_timeout, predicate=retryable_s3_errors):
    return retry(delays=delays, timeout=timeout, predicate=predicate)
def retry_ec2( retry_after=a_short_time, retry_for=10 * a_short_time, retry_while=not_found ):
    t = retry_after
    return retry( delays=(t, t, t * 2, t * 4), timeout=retry_for, predicate=retry_while )
def _deleteRootEBS(cls, ebsIDs, ctx):
    for volumeID in ebsIDs:
        for attempt in retry(predicate=AWSProvisioner.throttlePredicate):
            with attempt:
                ctx.ec2.delete_volume(volumeID)
def _docker(job,
            tool,
            parameters=None,
            workDir=None,
            dockerParameters=None,
            outfile=None,
            checkOutput=False,
            defer=None):
    """
    :param toil.Job.job job: The Job instance for the calling function.
    :param str tool: Name of the Docker image to be used (e.g. quay.io/ucsc_cgl/samtools).
    :param list[str] parameters: Command line arguments to be passed to the tool. If list of
           lists: list[list[str]], then treat as successive commands chained with pipe.
    :param str workDir: Directory to mount into the container via `-v`. Destination convention is /data
    :param list[str] dockerParameters: Parameters to pass to Docker. Default parameters are `--rm`,
            `--log-driver none`, and the mountpoint `-v work_dir:/data` where /data is the destination
            convention. These defaults are removed if dockerParameters is passed, so be sure to pass
            them if they are desired.
    :param file outfile: Pipe output of Docker call to file handle
    :param bool checkOutput: When True, this function returns docker's output.
    :param int defer: What action should be taken on the container upon job completion?
           FORGO (0) will leave the container untouched.
           STOP (1) will attempt to stop the container with `docker stop` (useful for debugging).
           RM (2) will stop the container and then forcefully remove it from the system
           using `docker rm -f`. This is the default behavior if defer is set to None.
    """
    if parameters is None:
        parameters = []
    if workDir is None:
        workDir = os.getcwd()

    # Setup the outgoing subprocess call for docker
    baseDockerCall = ['docker', 'run']
    if dockerParameters:
        baseDockerCall += dockerParameters
    else:
        baseDockerCall += ['--rm', '--log-driver', 'none', '-v',
                           os.path.abspath(workDir) + ':/data']

    # Ensure the user has passed a valid value for defer
    require(defer in (None, FORGO, STOP, RM),
            'Please provide a valid value for defer.')

    # Get container name which is needed for _dockerKill
    try:
        if any('--name' in x for x in baseDockerCall):
            if any('--name=' in x for x in baseDockerCall):
                containerName = [x.split('=')[1] for x in baseDockerCall if '--name' in x][0]
            else:
                containerName = baseDockerCall[baseDockerCall.index('--name') + 1]
        else:
            containerName = _getContainerName(job)
            baseDockerCall.extend(['--name', containerName])
    except ValueError:
        containerName = _getContainerName(job)
        baseDockerCall.extend(['--name', containerName])
    except IndexError:
        raise RuntimeError("Couldn't parse Docker's `--name=` option, check parameters: " +
                           str(dockerParameters))

    # Defer the container on-exit action
    if '--rm' in baseDockerCall and defer is None:
        defer = RM
    if '--rm' in baseDockerCall and defer is not RM:
        _logger.warn('--rm being passed to docker call but defer not set to dockerCall.RM, defer set to: ' + str(defer))
    job.defer(_dockerKill, containerName, action=defer)
    # Defer the permission fixing function which will run after this job concludes.
    # We call this explicitly later on in this function, but we defer it as well to handle unexpected job failure.
    job.defer(_fixPermissions, tool, workDir)

    # Make subprocess call

    # If parameters is list of lists, treat each list as separate command and chain with pipes
    if len(parameters) > 0 and type(parameters[0]) is list:
        # When piping, all arguments now get merged into a single string to bash -c.
        # We try to support spaces in paths by wrapping them all in quotes first.
        chain_params = [' '.join(p) for p in [list(map(pipes.quote, q)) for q in parameters]]
        # Use bash's set -eo pipefail to detect and abort on a failure in any command in the chain
        call = baseDockerCall + ['--entrypoint', '/bin/bash', tool, '-c',
                                 'set -eo pipefail && {}'.format(' | '.join(chain_params))]
    else:
        call = baseDockerCall + [tool] + parameters
    _logger.info("Calling docker with " + repr(call))

    params = {}
    if outfile:
        params['stdout'] = outfile
    if checkOutput:
        callMethod = subprocess.check_output
    else:
        callMethod = subprocess.check_call

    for attempt in retry(predicate=dockerPredicate):
        with attempt:
            out = callMethod(call, **params)

    _fixPermissions(tool=tool, workDir=workDir)
    return out
def setNodeCount(self, nodeType, numNodes, preemptable=False, force=False):
    """
    Attempt to grow or shrink the number of preemptable or non-preemptable worker nodes in
    the cluster to the given value, or as close a value as possible, and, after performing
    the necessary additions or removals of worker nodes, return the resulting number of
    preemptable or non-preemptable nodes currently in the cluster.

    :param str nodeType: The node type to add or remove.
    :param int numNodes: Desired size of the cluster
    :param bool preemptable: whether the added nodes will be preemptable, i.e. whether they
           may be removed spontaneously by the underlying platform at any time.
    :param bool force: If False, the provisioner is allowed to deviate from the given number
           of nodes. For example, when downsizing a cluster, a provisioner might leave nodes
           running if they have active jobs running on them.

    :rtype: int
    :return: the number of worker nodes in the cluster after making the necessary
             adjustments. This value should be, but is not guaranteed to be, close or equal to
             the `numNodes` argument. It represents the closest possible approximation of the
             actual cluster size at the time this method returns.
    """
    for attempt in retry(predicate=self.provisioner.retryPredicate):
        with attempt:
            workerInstances = self.getNodes(preemptable=preemptable)
            logger.info("Cluster contains %i instances" % len(workerInstances))
            # Reduce to nodes of the correct type
            workerInstances = {node: workerInstances[node] for node in workerInstances
                               if node.nodeType == nodeType}
            ignoredNodes = [node for node in workerInstances if node.privateIP in self.ignoredNodes]
            numIgnoredNodes = len(ignoredNodes)
            numCurrentNodes = len(workerInstances)
            logger.info("Cluster contains %i instances of type %s (%i ignored and draining jobs until "
                        "they can be safely terminated)" % (numCurrentNodes, nodeType, numIgnoredNodes))
            if not force:
                delta = numNodes - (numCurrentNodes - numIgnoredNodes)
            else:
                delta = numNodes - numCurrentNodes
            if delta > 0:
                if numIgnoredNodes > 0:
                    # We can un-ignore a few nodes to compensate for the additional nodes we want.
                    numNodesToUnignore = min(delta, numIgnoredNodes)
                    logger.info('Unignoring %i nodes because we want to scale back up again.'
                                % numNodesToUnignore)
                    delta -= numNodesToUnignore
                    for node in ignoredNodes[:numNodesToUnignore]:
                        self.ignoredNodes.remove(node.privateIP)
                        self.leader.batchSystem.unignoreNode(node.privateIP)
                logger.info('Adding %i %s nodes to get to desired cluster size of %i.',
                            delta,
                            'preemptable' if preemptable else 'non-preemptable',
                            numNodes)
                numNodes = numCurrentNodes + self._addNodes(nodeType,
                                                            numNodes=delta,
                                                            preemptable=preemptable)
            elif delta < 0:
                logger.info('Removing %i %s nodes to get to desired cluster size of %i.',
                            -delta,
                            'preemptable' if preemptable else 'non-preemptable',
                            numNodes)
                numNodes = numCurrentNodes - self._removeNodes(workerInstances,
                                                               nodeType=nodeType,
                                                               numNodes=-delta,
                                                               preemptable=preemptable,
                                                               force=force)
            else:
                if not force:
                    logger.info('Cluster (minus ignored nodes) already at desired size of %i. '
                                'Nothing to do.', numNodes)
                else:
                    logger.info('Cluster already at desired size of %i. Nothing to do.', numNodes)
    return numNodes
def wrapper(*args, **kwargs):
    for attempt in retry(delays=truncExpBackoff(),
                         timeout=300,
                         predicate=googleRetryPredicate):
        with attempt:
            return f(*args, **kwargs)
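# NOTE: illustrative sketch only, not one of the excerpts in this listing. It shows how a
# retrying wrapper like the one above is typically produced: a decorator closes over the
# backoff generator and predicate and returns the wrapper. The decorator name `googleRetry`
# and the functools import are assumptions made for this example; `retry`, `truncExpBackoff`
# and `googleRetryPredicate` are taken from the snippets above.
from functools import wraps

def googleRetry(f):
    @wraps(f)
    def wrapper(*args, **kwargs):
        # retry the wrapped call on transient errors, using truncated exponential backoff
        for attempt in retry(delays=truncExpBackoff(),
                             timeout=300,
                             predicate=googleRetryPredicate):
            with attempt:
                return f(*args, **kwargs)
    return wrapper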
def _addTags(cls, instances, tags):
    for instance in instances:
        for key, value in iteritems(tags):
            for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
                with attempt:
                    instance.add_tag(key, value)