def getLeader(self, wait=False, returnRawInstance=False):
    assert self._ctx
    instances = self._getNodesInCluster(nodeType=None, both=True)
    instances.sort(key=lambda x: x.launch_time)
    try:
        leader = instances[0]  # assume leader was launched first
    except IndexError:
        raise NoSuchClusterException(self.clusterName)
    if (leader.tags.get(_TOIL_NODE_TYPE_TAG_KEY) or 'leader') != 'leader':
        raise InvalidClusterStateException(
            'Invalid cluster state! The first launched instance appears not to be the leader '
            'as it is missing the "leader" tag. The safest recovery is to destroy the cluster '
            'and restart the job. Incorrect Leader ID: %s' % leader.id)
    leaderNode = Node(publicIP=leader.ip_address, privateIP=leader.private_ip_address,
                      name=leader.id, launchTime=leader.launch_time, nodeType=None,
                      preemptable=False, tags=leader.tags)
    if wait:
        logger.debug("Waiting for toil_leader to enter 'running' state...")
        wait_instances_running(self._ctx.ec2, [leader])
        logger.debug('... toil_leader is running')
        self._waitForIP(leader)
        leaderNode.waitForNode('toil_leader')
    return leader if returnRawInstance else leaderNode
def launchCluster(self, leaderNodeType, leaderStorage, owner, **kwargs):
    """
    In addition to the parameters inherited from the abstractProvisioner,
    the AWS launchCluster takes the following parameters:

    keyName: The key used to communicate with instances
    vpcSubnet: A subnet (optional).
    """
    if 'keyName' not in kwargs:
        raise RuntimeError("A keyPairName is required for the AWS provisioner.")
    self._keyName = kwargs['keyName']
    self._vpcSubnet = kwargs['vpcSubnet'] if 'vpcSubnet' in kwargs else None

    profileARN = self._getProfileARN()
    # the security group name is used as the cluster identifier
    sgs = self._createSecurityGroup()
    bdm = self._getBlockDeviceMapping(E2Instances[leaderNodeType], rootVolSize=leaderStorage)

    self._masterPublicKey = 'AAAAB3NzaC1yc2Enoauthorizedkeyneeded'  # dummy key
    userData = self._getCloudConfigUserData('leader', self._masterPublicKey)
    specKwargs = {'key_name': self._keyName,
                  'security_group_ids': [sg.id for sg in sgs],
                  'instance_type': leaderNodeType,
                  'user_data': userData,
                  'block_device_map': bdm,
                  'instance_profile_arn': profileARN,
                  'placement': self._zone}
    if self._vpcSubnet:
        specKwargs["subnet_id"] = self._vpcSubnet
    instances = create_ondemand_instances(self._ctx.ec2, image_id=self._discoverAMI(),
                                          spec=specKwargs, num_instances=1)

    # wait for the leader to finish setting up
    leader = instances[0]
    wait_instances_running(self._ctx.ec2, [leader])
    self._waitForIP(leader)
    leaderNode = Node(publicIP=leader.ip_address, privateIP=leader.private_ip_address,
                      name=leader.id, launchTime=leader.launch_time,
                      nodeType=leaderNodeType, preemptable=False, tags=leader.tags)
    leaderNode.waitForNode('toil_leader')

    defaultTags = {'Name': self.clusterName, 'Owner': owner}
    if kwargs.get('userTags'):
        defaultTags.update(kwargs['userTags'])

    # if we are running launchCluster, we need to save this data as it won't be
    # generated from the metadata. This data is needed to launch worker nodes.
    self._leaderPrivateIP = leader.private_ip_address
    self._addTags([leader], defaultTags)
    self._tags = leader.tags
    self._subnetID = leader.subnet_id
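# A minimal usage sketch (not part of the provisioner source) of the
# launchCluster/getLeader flow defined above. It assumes `provisioner` is an
# already-constructed AWSProvisioner; the node type, key pair name, owner, and
# tag values below are hypothetical placeholders.
def bring_up_cluster(provisioner):
    provisioner.launchCluster(leaderNodeType='t2.medium',
                              leaderStorage=50,
                              owner='me@example.com',
                              keyName='my-keypair',   # required kwarg (see docstring above)
                              vpcSubnet=None,         # optional kwarg
                              userTags={'Project': 'demo'})
    # Block until the leader appliance reports ready, then return its Node wrapper.
    return provisioner.getLeader(wait=True)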
def launchCluster(self):
    from boto.ec2.blockdevicemapping import BlockDeviceType
    self.createClusterUtil(args=['--leaderStorage', str(self.requestedLeaderStorage),
                                 '--nodeTypes', ",".join(self.instanceTypes),
                                 '-w', ",".join(self.numWorkers),
                                 '--nodeStorage', str(self.requestedNodeStorage)])
    ctx = AWSProvisioner._buildContext(self.clusterName)
    nodes = AWSProvisioner._getNodesInCluster(ctx, self.clusterName, both=True)
    nodes.sort(key=lambda x: x.launch_time)  # assuming that leader is first
    workers = nodes[1:]
    # test that two worker nodes were created
    self.assertEqual(2, len(workers))
    # test that workers have expected storage size
    # just use the first worker
    worker = workers[0]
    worker = next(wait_instances_running(ctx.ec2, [worker]))
    rootBlockDevice = worker.block_device_mapping["/dev/xvda"]
    self.assertTrue(isinstance(rootBlockDevice, BlockDeviceType))
    rootVolume = ctx.ec2.get_all_volumes(volume_ids=[rootBlockDevice.volume_id])[0]
    self.assertGreaterEqual(rootVolume.size, self.requestedNodeStorage)
def launchCluster(self):
    from boto.ec2.blockdevicemapping import BlockDeviceType
    from toil.lib.ec2 import wait_instances_running
    self.createClusterUtil(args=['--leaderStorage', str(self.requestedLeaderStorage),
                                 '--nodeTypes', ",".join(self.instanceTypes),
                                 '-w', ",".join(self.numWorkers),
                                 '--nodeStorage', str(self.requestedNodeStorage)])
    self.cluster = cluster_factory(provisioner='aws', zone=self.zone, clusterName=self.clusterName)
    # We need to wait a little bit here because the workers might not be
    # visible to EC2 read requests immediately after the create returns,
    # which is the last thing that starting the cluster does.
    time.sleep(10)
    nodes = self.cluster._getNodesInCluster(both=True)
    nodes.sort(key=lambda x: x.launch_time)  # assuming that leader is first
    workers = nodes[1:]
    # test that two worker nodes were created
    self.assertEqual(2, len(workers))
    # test that workers have expected storage size
    # just use the first worker
    worker = workers[0]
    worker = next(wait_instances_running(self.boto2_ec2, [worker]))
    rootBlockDevice = worker.block_device_mapping["/dev/xvda"]
    self.assertTrue(isinstance(rootBlockDevice, BlockDeviceType))
    rootVolume = self.boto2_ec2.get_all_volumes(volume_ids=[rootBlockDevice.volume_id])[0]
    self.assertGreaterEqual(rootVolume.size, self.requestedNodeStorage)
@classmethod
def _getLeader(cls, clusterName, wait=False, zone=None):
    ctx = cls._buildContext(clusterName=clusterName, zone=zone)
    instances = cls._getNodesInCluster(ctx, clusterName, nodeType=None, both=True)
    instances.sort(key=lambda x: x.launch_time)
    try:
        leader = instances[0]  # assume leader was launched first
    except IndexError:
        raise NoSuchClusterException(clusterName)
    if wait:
        logger.info("Waiting for toil_leader to enter 'running' state...")
        wait_instances_running(ctx.ec2, [leader])
        logger.info('... toil_leader is running')
        cls._waitForNode(leader, 'toil_leader')
    return leader
def getLeader(self, wait=False):
    assert self._ctx
    instances = self._getNodesInCluster(nodeType=None, both=True)
    instances.sort(key=lambda x: x.launch_time)
    try:
        leader = instances[0]  # assume leader was launched first
    except IndexError:
        raise NoSuchClusterException(self.clusterName)
    leaderNode = Node(publicIP=leader.ip_address, privateIP=leader.private_ip_address,
                      name=leader.id, launchTime=leader.launch_time, nodeType=None,
                      preemptable=False, tags=leader.tags)
    if wait:
        logger.debug("Waiting for toil_leader to enter 'running' state...")
        wait_instances_running(self._ctx.ec2, [leader])
        logger.debug('... toil_leader is running')
        self._waitForIP(leader)
        leaderNode.waitForNode('toil_leader')
    return leaderNode
def addNodes(self, nodeType, numNodes, preemptable, spotBid=None):
    assert self._leaderPrivateIP
    if preemptable and not spotBid:
        if self._spotBidsMap and nodeType in self._spotBidsMap:
            spotBid = self._spotBidsMap[nodeType]
        else:
            raise RuntimeError("No spot bid given for a preemptable node request.")
    instanceType = E2Instances[nodeType]
    bdm = self._getBlockDeviceMapping(instanceType, rootVolSize=self._nodeStorage)

    keyPath = self._sseKey if self._sseKey else None
    userData = self._getCloudConfigUserData('worker', self._masterPublicKey, keyPath, preemptable)
    if isinstance(userData, text_type):
        # Spot-market provisioning requires bytes for user data.
        userData = userData.encode('utf-8')
    sgs = [sg for sg in self._ctx.ec2.get_all_security_groups()
           if sg.name in self._leaderSecurityGroupNames]
    kwargs = {'key_name': self._keyName,
              'security_group_ids': [sg.id for sg in sgs],
              'instance_type': instanceType.name,
              'user_data': userData,
              'block_device_map': bdm,
              'instance_profile_arn': self._leaderProfileArn,
              'placement': self._zone,
              'subnet_id': self._subnetID}

    instancesLaunched = []
    for attempt in retry(predicate=awsRetryPredicate):
        with attempt:
            # after we start launching instances we want to ensure the full setup is done
            # the biggest obstacle is AWS request throttling, so we retry on these errors at
            # every request in this method
            if not preemptable:
                logger.debug('Launching %s non-preemptable nodes', numNodes)
                instancesLaunched = create_ondemand_instances(self._ctx.ec2,
                                                              image_id=self._discoverAMI(),
                                                              spec=kwargs,
                                                              num_instances=numNodes)
            else:
                logger.debug('Launching %s preemptable nodes', numNodes)
                kwargs['placement'] = getSpotZone(spotBid, instanceType.name, self._ctx)
                # force generator to evaluate
                instancesLaunched = list(create_spot_instances(ec2=self._ctx.ec2,
                                                               price=spotBid,
                                                               image_id=self._discoverAMI(),
                                                               tags={'clusterName': self.clusterName},
                                                               spec=kwargs,
                                                               num_instances=numNodes,
                                                               tentative=True))
                # flatten the list
                instancesLaunched = [item for sublist in instancesLaunched for item in sublist]

    for attempt in retry(predicate=awsRetryPredicate):
        with attempt:
            wait_instances_running(self._ctx.ec2, instancesLaunched)

    self._tags[_TOIL_NODE_TYPE_TAG_KEY] = 'worker'
    AWSProvisioner._addTags(instancesLaunched, self._tags)
    if self._sseKey:
        for i in instancesLaunched:
            self._waitForIP(i)
            node = Node(publicIP=i.ip_address, privateIP=i.private_ip_address,
                        name=i.id, launchTime=i.launch_time, nodeType=i.instance_type,
                        preemptable=preemptable, tags=i.tags)
            node.waitForNode('toil_worker')
            node.coreRsync([self._sseKey, ':' + self._sseKey], applianceName='toil_worker')
    logger.debug('Launched %s new instance(s)', numNodes)
    return len(instancesLaunched)
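# A minimal, self-contained sketch (not part of the provisioner source) of the
# retry-on-throttling behaviour that addNodes above leans on: AWS throttles bursts
# of EC2 requests, so each call is repeated with a growing back-off whenever the
# error matches a predicate. `aws_throttled` and `call_with_retry` are illustrative
# stand-ins, not Toil's actual retry helper or awsRetryPredicate.
import time

def aws_throttled(e):
    # Roughly the class of errors the retry predicate above is meant to catch.
    return 'RequestLimitExceeded' in str(e) or 'Throttling' in str(e)

def call_with_retry(fn, *args, delays=(1, 2, 4, 8), **kwargs):
    for i, delay in enumerate(delays):
        try:
            return fn(*args, **kwargs)
        except Exception as e:
            if i == len(delays) - 1 or not aws_throttled(e):
                raise             # out of attempts, or not a retryable error
            time.sleep(delay)     # back off, then try again

# e.g. call_with_retry(create_ondemand_instances, ctx.ec2,
#                      image_id=ami, spec=kwargs, num_instances=1)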
def addNodes(self, nodeType, numNodes, preemptable):
    instanceType = ec2_instance_types[nodeType]
    bdm = self._getBlockDeviceMapping(instanceType, rootVolSize=self.nodeStorage)
    arn = self._getProfileARN(self.ctx)
    keyPath = '' if not self.config or not self.config.sseKey else self.config.sseKey
    entryPoint = 'mesos-slave' if not self.config or not self.config.sseKey else "waitForKey.sh"
    workerData = dict(role='worker',
                      image=applianceSelf(),
                      entrypoint=entryPoint,
                      sshKey=self.masterPublicKey,
                      args=workerArgs.format(ip=self.leaderIP, preemptable=preemptable, keyPath=keyPath))
    userData = awsUserData.format(**workerData)
    sgs = [sg for sg in self.ctx.ec2.get_all_security_groups() if sg.name == self.clusterName]
    kwargs = {'key_name': self.keyName,
              'security_group_ids': [sg.id for sg in sgs],
              'instance_type': instanceType.name,
              'user_data': userData,
              'block_device_map': bdm,
              'instance_profile_arn': arn,
              'placement': getCurrentAWSZone()}
    kwargs["subnet_id"] = self.subnetID if self.subnetID else self._getClusterInstance(self.instanceMetaData).subnet_id

    instancesLaunched = []
    for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
        with attempt:
            # after we start launching instances we want to ensure the full setup is done
            # the biggest obstacle is AWS request throttling, so we retry on these errors at
            # every request in this method
            if not preemptable:
                logger.info('Launching %s non-preemptable nodes', numNodes)
                instancesLaunched = create_ondemand_instances(self.ctx.ec2,
                                                              image_id=self._discoverAMI(self.ctx),
                                                              spec=kwargs,
                                                              num_instances=numNodes)
            else:
                logger.info('Launching %s preemptable nodes', numNodes)
                kwargs['placement'] = getSpotZone(self.spotBids[nodeType], instanceType.name, self.ctx)
                # force generator to evaluate
                instancesLaunched = list(create_spot_instances(ec2=self.ctx.ec2,
                                                               price=self.spotBids[nodeType],
                                                               image_id=self._discoverAMI(self.ctx),
                                                               tags={'clusterName': self.clusterName},
                                                               spec=kwargs,
                                                               num_instances=numNodes,
                                                               tentative=True))
                # flatten the list
                instancesLaunched = [item for sublist in instancesLaunched for item in sublist]

    for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
        with attempt:
            wait_instances_running(self.ctx.ec2, instancesLaunched)

    # request throttling retry happens internally to these two methods to ensure proper granularity
    AWSProvisioner._addTags(instancesLaunched, self.tags)
    self._propagateKey(instancesLaunched)

    logger.info('Launched %s new instance(s)', numNodes)
    return len(instancesLaunched)
def launchCluster(self, leaderNodeType, leaderSpotBid, nodeTypes, preemptableNodeTypes, keyName,
                  clusterName, numWorkers=0, numPreemptableWorkers=0, spotBids=None, userTags=None,
                  zone=None, vpcSubnet=None, leaderStorage=50, nodeStorage=50, **kwargs):
    if self.config is None:
        self.nodeStorage = nodeStorage
    if userTags is None:
        userTags = {}
    ctx = self._buildContext(clusterName=clusterName, zone=zone)
    profileARN = self._getProfileARN(ctx)
    leaderInstanceType = ec2_instance_types[leaderNodeType]
    # the security group name is used as the cluster identifier
    sgs = self._createSecurityGroup(ctx, clusterName, vpcSubnet)
    bdm = self._getBlockDeviceMapping(leaderInstanceType, rootVolSize=leaderStorage)
    self.masterPublicKey = 'AAAAB3NzaC1yc2Enoauthorizedkeyneeded'
    leaderData = dict(role='leader',
                      image=applianceSelf(),
                      entrypoint='mesos-master',
                      sshKey=self.masterPublicKey,
                      args=leaderArgs.format(name=clusterName))
    userData = awsUserData.format(**leaderData)
    kwargs = {'key_name': keyName,
              'security_group_ids': [sg.id for sg in sgs],
              'instance_type': leaderNodeType,
              'user_data': userData,
              'block_device_map': bdm,
              'instance_profile_arn': profileARN,
              'placement': zone}
    if vpcSubnet:
        kwargs["subnet_id"] = vpcSubnet
    if not leaderSpotBid:
        logger.info('Launching non-preemptable leader')
        instances = create_ondemand_instances(ctx.ec2,
                                              image_id=self._discoverAMI(ctx),
                                              spec=kwargs,
                                              num_instances=1)
        leader = instances[0]
    else:
        logger.info('Launching preemptable leader')
        # force generator to evaluate
        instances = list(create_spot_instances(ec2=ctx.ec2,
                                               price=leaderSpotBid,
                                               image_id=self._discoverAMI(ctx),
                                               tags={'clusterName': clusterName},
                                               spec=kwargs,
                                               num_instances=1))[0]
        leader = instances[0]

    wait_instances_running(ctx.ec2, [leader])
    self._waitForNode(leader, 'toil_leader')

    defaultTags = {'Name': clusterName, 'Owner': keyName}
    defaultTags.update(userTags)

    # if we are running launchCluster, we need to save this data as it won't be generated
    # from the metadata. This data is needed to launch worker nodes.
    self.leaderIP = leader.private_ip_address
    self._addTags([leader], defaultTags)
    self.ctx = ctx
    if spotBids:
        self.spotBids = dict(zip(preemptableNodeTypes, spotBids))
    self.clusterName = clusterName
    self.keyName = keyName
    self.tags = leader.tags
    self.subnetID = leader.subnet_id

    # assuming that if the leader was launched without a spot bid then all workers
    # will be non-preemptable
    workersCreated = 0
    for nodeType, workers in zip(nodeTypes, numWorkers):
        workersCreated += self.addNodes(nodeType=nodeType, numNodes=workers, preemptable=False)
    for nodeType, workers in zip(preemptableNodeTypes, numPreemptableWorkers):
        workersCreated += self.addNodes(nodeType=nodeType, numNodes=workers, preemptable=True)
    logger.info('Added %d workers', workersCreated)

    return leader