Example #1
    def getLeader(self, wait=False, returnRawInstance=False):
        assert self._ctx
        instances = self._getNodesInCluster(nodeType=None, both=True)
        instances.sort(key=lambda x: x.launch_time)
        try:
            leader = instances[0]  # assume leader was launched first
        except IndexError:
            raise NoSuchClusterException(self.clusterName)
        if (leader.tags.get(_TOIL_NODE_TYPE_TAG_KEY) or 'leader') != 'leader':
            raise InvalidClusterStateException(
                'Invalid cluster state! The first launched instance appears not to be the leader '
                'as it is missing the "leader" tag. The safest recovery is to destroy the cluster '
                'and restart the job. Incorrect Leader ID: %s' % leader.id)
        leaderNode = Node(publicIP=leader.ip_address,
                          privateIP=leader.private_ip_address,
                          name=leader.id,
                          launchTime=leader.launch_time,
                          nodeType=None,
                          preemptable=False,
                          tags=leader.tags)
        if wait:
            logger.debug("Waiting for toil_leader to enter 'running' state...")
            wait_instances_running(self._ctx.ec2, [leader])
            logger.debug('... toil_leader is running')
            self._waitForIP(leader)
            leaderNode.waitForNode('toil_leader')

        return leader if returnRawInstance else leaderNode
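A minimal usage sketch for this method, assuming an AWSProvisioner instance has already been constructed for an existing cluster; the constructor arguments shown here are illustrative, not taken from the example above:

    # Hypothetical setup; the real constructor signature lives in the
    # surrounding codebase and may differ.
    provisioner = AWSProvisioner(clusterName='my-cluster', zone='us-west-2a')

    # Block until the leader is running and its appliance is ready.
    leaderNode = provisioner.getLeader(wait=True)
    print(leaderNode.publicIP)

    # Ask for the raw boto instance instead of the Node wrapper.
    rawLeader = provisioner.getLeader(returnRawInstance=True)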
Example #2
    def launchCluster(self, leaderNodeType, leaderStorage, owner, **kwargs):
        """
        In addition to the parameters inherited from the abstractProvisioner,
        the AWS launchCluster takes the following parameters:
        keyName: The key used to communicate with instances
        vpcSubnet: A subnet (optional).
        """
        if 'keyName' not in kwargs:
            raise RuntimeError("A keyName is required for the AWS provisioner.")
        self._keyName = kwargs['keyName']
        self._vpcSubnet = kwargs.get('vpcSubnet')

        profileARN = self._getProfileARN()
        # the security group name is used as the cluster identifier
        sgs = self._createSecurityGroup()
        bdm = self._getBlockDeviceMapping(E2Instances[leaderNodeType], rootVolSize=leaderStorage)

        self._masterPublicKey = 'AAAAB3NzaC1yc2Enoauthorizedkeyneeded'  # dummy key
        userData = self._getCloudConfigUserData('leader', self._masterPublicKey)
        specKwargs = {'key_name': self._keyName,
                      'security_group_ids': [sg.id for sg in sgs],
                      'instance_type': leaderNodeType,
                      'user_data': userData,
                      'block_device_map': bdm,
                      'instance_profile_arn': profileARN,
                      'placement': self._zone}
        if self._vpcSubnet:
            specKwargs["subnet_id"] = self._vpcSubnet
        instances = create_ondemand_instances(self._ctx.ec2, image_id=self._discoverAMI(),
                                              spec=specKwargs, num_instances=1)

        # wait for the leader to finish setting up
        leader = instances[0]
        wait_instances_running(self._ctx.ec2, [leader])
        self._waitForIP(leader)
        leaderNode = Node(publicIP=leader.ip_address, privateIP=leader.private_ip_address,
                          name=leader.id, launchTime=leader.launch_time, nodeType=leaderNodeType,
                          preemptable=False, tags=leader.tags)
        leaderNode.waitForNode('toil_leader')

        defaultTags = {'Name': self.clusterName, 'Owner': owner}
        if kwargs.get('userTags'):
            defaultTags.update(kwargs['userTags'])

        # If we are running launchCluster, we need to save this data, since it won't be
        # regenerated from the instance metadata. It is needed later to launch worker nodes.
        self._leaderPrivateIP = leader.private_ip_address
        self._addTags([leader], defaultTags)
        self._tags = leader.tags
        self._subnetID = leader.subnet_id
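A sketch of how this launchCluster might be called; keyName and vpcSubnet follow the docstring above, while the other values are illustrative:

    # Hypothetical invocation; leaderNodeType must be a key of E2Instances.
    provisioner.launchCluster(leaderNodeType='t2.medium',
                              leaderStorage=50,
                              owner='me@example.com',
                              keyName='my-ec2-keypair',     # required
                              vpcSubnet='subnet-0123abcd',  # optional
                              userTags={'project': 'demo'})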
Example #3
    def launchCluster(self):
        from boto.ec2.blockdevicemapping import BlockDeviceType
        self.createClusterUtil(args=[
            '--leaderStorage', str(self.requestedLeaderStorage),
            '--nodeTypes', ",".join(self.instanceTypes),
            '-w', ",".join(self.numWorkers),
            '--nodeStorage', str(self.requestedNodeStorage)
        ])

        ctx = AWSProvisioner._buildContext(self.clusterName)
        nodes = AWSProvisioner._getNodesInCluster(ctx,
                                                  self.clusterName,
                                                  both=True)
        nodes.sort(key=lambda x: x.launch_time)
        # assuming that leader is first
        workers = nodes[1:]
        # test that two worker nodes were created
        self.assertEqual(2, len(workers))
        # test that workers have expected storage size
        # just use the first worker
        worker = workers[0]
        worker = next(wait_instances_running(ctx.ec2, [worker]))
        rootBlockDevice = worker.block_device_mapping["/dev/xvda"]
        self.assertTrue(isinstance(rootBlockDevice, BlockDeviceType))
        rootVolume = ctx.ec2.get_all_volumes(
            volume_ids=[rootBlockDevice.volume_id])[0]
        self.assertGreaterEqual(rootVolume.size, self.requestedNodeStorage)
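The volume-size assertion above follows a general boto2 pattern: an instance's block_device_mapping maps device names to BlockDeviceType entries, whose volume_id can be resolved with get_all_volumes. A self-contained sketch of that pattern, with a hypothetical helper name and connection setup:

    import boto.ec2

    def get_root_volume_size(conn, instance, device='/dev/xvda'):
        """Return the size in GiB of an instance's root EBS volume."""
        blockDevice = instance.block_device_mapping[device]  # a BlockDeviceType
        volume = conn.get_all_volumes(volume_ids=[blockDevice.volume_id])[0]
        return volume.size

    # Hypothetical usage:
    # conn = boto.ec2.connect_to_region('us-west-2')
    # assert get_root_volume_size(conn, worker) >= 50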
Example #4
    def launchCluster(self):
        import time

        from boto.ec2.blockdevicemapping import BlockDeviceType
        from toil.lib.ec2 import wait_instances_running
        self.createClusterUtil(args=[
            '--leaderStorage', str(self.requestedLeaderStorage),
            '--nodeTypes', ",".join(self.instanceTypes),
            '-w', ",".join(self.numWorkers),
            '--nodeStorage', str(self.requestedNodeStorage)
        ])

        self.cluster = cluster_factory(provisioner='aws',
                                       zone=self.zone,
                                       clusterName=self.clusterName)
        # We need to wait a little bit here because the workers might not be
        # visible to EC2 read requests immediately after the create returns,
        # which is the last thing that starting the cluster does.
        time.sleep(10)
        nodes = self.cluster._getNodesInCluster(both=True)
        nodes.sort(key=lambda x: x.launch_time)
        # assuming that leader is first
        workers = nodes[1:]
        # test that two worker nodes were created
        self.assertEqual(2, len(workers))
        # test that workers have expected storage size
        # just use the first worker
        worker = workers[0]
        worker = next(wait_instances_running(self.boto2_ec2, [worker]))
        rootBlockDevice = worker.block_device_mapping["/dev/xvda"]
        self.assertTrue(isinstance(rootBlockDevice, BlockDeviceType))
        rootVolume = self.boto2_ec2.get_all_volumes(
            volume_ids=[rootBlockDevice.volume_id])[0]
        self.assertGreaterEqual(rootVolume.size, self.requestedNodeStorage)
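The fixed time.sleep(10) works around EC2's eventual consistency but is fragile. A hedged alternative is to poll until the expected instance count becomes visible; this helper is illustrative and not part of the test suite:

    import time

    def wait_for_visible_nodes(cluster, expected, timeout=120, interval=5):
        """Poll until `expected` instances are visible to EC2 reads, or time out."""
        deadline = time.time() + timeout
        nodes = []
        while time.time() < deadline:
            nodes = cluster._getNodesInCluster(both=True)
            if len(nodes) >= expected:
                return nodes
            time.sleep(interval)
        raise RuntimeError('only %d of %d instances became visible'
                           % (len(nodes), expected))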
Example #5
    @classmethod
    def _getLeader(cls, clusterName, wait=False, zone=None):
        ctx = cls._buildContext(clusterName=clusterName, zone=zone)
        instances = cls._getNodesInCluster(ctx, clusterName, nodeType=None, both=True)
        instances.sort(key=lambda x: x.launch_time)
        try:
            leader = instances[0]  # assume leader was launched first
        except IndexError:
            raise NoSuchClusterException(clusterName)
        if wait:
            logger.info("Waiting for toil_leader to enter 'running' state...")
            wait_instances_running(ctx.ec2, [leader])
            logger.info('... toil_leader is running')
            cls._waitForNode(leader, 'toil_leader')
        return leader
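A usage sketch for this classmethod variant of the leader lookup; the cluster name and zone are illustrative:

    # Hypothetical call: resolve and wait for the leader of a named cluster.
    leader = AWSProvisioner._getLeader('my-cluster', wait=True, zone='us-west-2a')
    logger.info('Leader %s is at %s', leader.id, leader.ip_address)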
Example #6
    def getLeader(self, wait=False):
        assert self._ctx
        instances = self._getNodesInCluster(nodeType=None, both=True)
        instances.sort(key=lambda x: x.launch_time)
        try:
            leader = instances[0]  # assume leader was launched first
        except IndexError:
            raise NoSuchClusterException(self.clusterName)
        leaderNode = Node(publicIP=leader.ip_address, privateIP=leader.private_ip_address,
                          name=leader.id, launchTime=leader.launch_time, nodeType=None,
                          preemptable=False, tags=leader.tags)
        if wait:
            logger.debug("Waiting for toil_leader to enter 'running' state...")
            wait_instances_running(self._ctx.ec2, [leader])
            logger.debug('... toil_leader is running')
            self._waitForIP(leader)
            leaderNode.waitForNode('toil_leader')

        return leaderNode
Example #7
    def addNodes(self, nodeType, numNodes, preemptable, spotBid=None):
        assert self._leaderPrivateIP
        if preemptable and not spotBid:
            if self._spotBidsMap and nodeType in self._spotBidsMap:
                spotBid = self._spotBidsMap[nodeType]
            else:
                raise RuntimeError(
                    "No spot bid given for a preemptable node request.")
        instanceType = E2Instances[nodeType]
        bdm = self._getBlockDeviceMapping(instanceType,
                                          rootVolSize=self._nodeStorage)

        keyPath = self._sseKey if self._sseKey else None
        userData = self._getCloudConfigUserData('worker',
                                                self._masterPublicKey, keyPath,
                                                preemptable)
        if isinstance(userData, text_type):
            # Spot-market provisioning requires bytes for user data.
            userData = userData.encode('utf-8')
        sgs = [
            sg for sg in self._ctx.ec2.get_all_security_groups()
            if sg.name in self._leaderSecurityGroupNames
        ]
        kwargs = {
            'key_name': self._keyName,
            'security_group_ids': [sg.id for sg in sgs],
            'instance_type': instanceType.name,
            'user_data': userData,
            'block_device_map': bdm,
            'instance_profile_arn': self._leaderProfileArn,
            'placement': self._zone,
            'subnet_id': self._subnetID
        }

        instancesLaunched = []

        for attempt in retry(predicate=awsRetryPredicate):
            with attempt:
                # after we start launching instances we want to ensure the full setup is done
                # the biggest obstacle is AWS request throttling, so we retry on these errors at
                # every request in this method
                if not preemptable:
                    logger.debug('Launching %s non-preemptable nodes',
                                 numNodes)
                    instancesLaunched = create_ondemand_instances(
                        self._ctx.ec2,
                        image_id=self._discoverAMI(),
                        spec=kwargs,
                        num_instances=numNodes)
                else:
                    logger.debug('Launching %s preemptable nodes', numNodes)
                    kwargs['placement'] = getSpotZone(spotBid,
                                                      instanceType.name,
                                                      self._ctx)
                    # force generator to evaluate
                    instancesLaunched = list(
                        create_spot_instances(
                            ec2=self._ctx.ec2,
                            price=spotBid,
                            image_id=self._discoverAMI(),
                            tags={'clusterName': self.clusterName},
                            spec=kwargs,
                            num_instances=numNodes,
                            tentative=True))
                    # flatten the list
                    instancesLaunched = [
                        item for sublist in instancesLaunched
                        for item in sublist
                    ]

        for attempt in retry(predicate=awsRetryPredicate):
            with attempt:
                wait_instances_running(self._ctx.ec2, instancesLaunched)

        self._tags[_TOIL_NODE_TYPE_TAG_KEY] = 'worker'
        AWSProvisioner._addTags(instancesLaunched, self._tags)
        if self._sseKey:
            for i in instancesLaunched:
                self._waitForIP(i)
                node = Node(publicIP=i.ip_address,
                            privateIP=i.private_ip_address,
                            name=i.id,
                            launchTime=i.launch_time,
                            nodeType=i.instance_type,
                            preemptable=preemptable,
                            tags=i.tags)
                node.waitForNode('toil_worker')
                node.coreRsync([self._sseKey, ':' + self._sseKey],
                               applianceName='toil_worker')
        logger.debug('Launched %s new instance(s)', numNodes)
        return len(instancesLaunched)
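A sketch of calling this method once a cluster is up (so _leaderPrivateIP and the other saved launch data are populated); the node type and bid are illustrative values:

    # Two on-demand workers.
    provisioner.addNodes(nodeType='t2.medium', numNodes=2, preemptable=False)

    # Three spot workers at an explicit bid. Alternatively, pre-populate
    # provisioner._spotBidsMap and omit spotBid.
    provisioner.addNodes(nodeType='t2.medium', numNodes=3,
                         preemptable=True, spotBid=0.05)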
Example #8
    def addNodes(self, nodeType, numNodes, preemptable):
        instanceType = ec2_instance_types[nodeType]
        bdm = self._getBlockDeviceMapping(instanceType,
                                          rootVolSize=self.nodeStorage)
        arn = self._getProfileARN(self.ctx)
        keyPath = self.config.sseKey if self.config and self.config.sseKey else ''
        entryPoint = "waitForKey.sh" if self.config and self.config.sseKey else 'mesos-slave'
        workerData = dict(role='worker',
                          image=applianceSelf(),
                          entrypoint=entryPoint,
                          sshKey=self.masterPublicKey,
                          args=workerArgs.format(ip=self.leaderIP,
                                                 preemptable=preemptable,
                                                 keyPath=keyPath))
        userData = awsUserData.format(**workerData)
        sgs = [
            sg for sg in self.ctx.ec2.get_all_security_groups()
            if sg.name == self.clusterName
        ]
        kwargs = {
            'key_name': self.keyName,
            'security_group_ids': [sg.id for sg in sgs],
            'instance_type': instanceType.name,
            'user_data': userData,
            'block_device_map': bdm,
            'instance_profile_arn': arn,
            'placement': getCurrentAWSZone()
        }
        kwargs["subnet_id"] = (self.subnetID if self.subnetID
                               else self._getClusterInstance(self.instanceMetaData).subnet_id)

        instancesLaunched = []

        for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
            with attempt:
                # after we start launching instances we want to ensure the full setup is done
                # the biggest obstacle is AWS request throttling, so we retry on these errors at
                # every request in this method
                if not preemptable:
                    logger.info('Launching %s non-preemptable nodes', numNodes)
                    instancesLaunched = create_ondemand_instances(
                        self.ctx.ec2,
                        image_id=self._discoverAMI(self.ctx),
                        spec=kwargs,
                        num_instances=numNodes)
                else:
                    logger.info('Launching %s preemptable nodes', numNodes)
                    kwargs['placement'] = getSpotZone(self.spotBids[nodeType],
                                                      instanceType.name,
                                                      self.ctx)
                    # force generator to evaluate
                    instancesLaunched = list(
                        create_spot_instances(
                            ec2=self.ctx.ec2,
                            price=self.spotBids[nodeType],
                            image_id=self._discoverAMI(self.ctx),
                            tags={'clusterName': self.clusterName},
                            spec=kwargs,
                            num_instances=numNodes,
                            tentative=True))
                    # flatten the list
                    instancesLaunched = [
                        item for sublist in instancesLaunched
                        for item in sublist
                    ]

        for attempt in retry(predicate=AWSProvisioner._throttlePredicate):
            with attempt:
                wait_instances_running(self.ctx.ec2, instancesLaunched)

        # request throttling retry happens internally to these two methods to ensure proper granularity
        AWSProvisioner._addTags(instancesLaunched, self.tags)
        self._propagateKey(instancesLaunched)

        logger.info('Launched %s new instance(s)', numNodes)
        return len(instancesLaunched)
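Both addNodes variants wrap each AWS call in the same retry-on-throttling idiom. A self-contained sketch of that idea with a hand-rolled backoff loop, in case the retry helper from the surrounding codebase is unavailable (the error-code values checked here are an assumption about boto2's BotoServerError):

    import time

    from boto.exception import BotoServerError

    def retry_on_throttle(fn, attempts=5, base_delay=1.0):
        """Call fn(), retrying with exponential backoff on AWS throttling errors."""
        for attempt in range(attempts - 1):
            try:
                return fn()
            except BotoServerError as e:
                if e.error_code not in ('Throttling', 'RequestLimitExceeded'):
                    raise
                time.sleep(base_delay * 2 ** attempt)
        return fn()  # last attempt; exceptions propagate

    # Hypothetical usage:
    # retry_on_throttle(lambda: wait_instances_running(ctx.ec2, instancesLaunched))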
Example #9
    def launchCluster(self,
                      leaderNodeType,
                      leaderSpotBid,
                      nodeTypes,
                      preemptableNodeTypes,
                      keyName,
                      clusterName,
                      numWorkers=0,
                      numPreemptableWorkers=0,
                      spotBids=None,
                      userTags=None,
                      zone=None,
                      vpcSubnet=None,
                      leaderStorage=50,
                      nodeStorage=50,
                      **kwargs):
        if self.config is None:
            self.nodeStorage = nodeStorage
        if userTags is None:
            userTags = {}
        ctx = self._buildContext(clusterName=clusterName, zone=zone)
        profileARN = self._getProfileARN(ctx)
        leaderInstanceType = ec2_instance_types[leaderNodeType]
        # the security group name is used as the cluster identifier
        sgs = self._createSecurityGroup(ctx, clusterName, vpcSubnet)
        bdm = self._getBlockDeviceMapping(leaderInstanceType,
                                          rootVolSize=leaderStorage)
        self.masterPublicKey = 'AAAAB3NzaC1yc2Enoauthorizedkeyneeded'
        leaderData = dict(role='leader',
                          image=applianceSelf(),
                          entrypoint='mesos-master',
                          sshKey=self.masterPublicKey,
                          args=leaderArgs.format(name=clusterName))
        userData = awsUserData.format(**leaderData)
        kwargs = {
            'key_name': keyName,
            'security_group_ids': [sg.id for sg in sgs],
            'instance_type': leaderNodeType,
            'user_data': userData,
            'block_device_map': bdm,
            'instance_profile_arn': profileARN,
            'placement': zone
        }
        if vpcSubnet:
            kwargs["subnet_id"] = vpcSubnet
        if not leaderSpotBid:
            logger.info('Launching non-preemptable leader')
            instances = create_ondemand_instances(
                ctx.ec2,
                image_id=self._discoverAMI(ctx),
                spec=kwargs,
                num_instances=1)
            leader = instances[0]
        else:
            logger.info('Launching preemptable leader')
            # force generator to evaluate
            instances = list(
                create_spot_instances(ec2=ctx.ec2,
                                      price=leaderSpotBid,
                                      image_id=self._discoverAMI(ctx),
                                      tags={'clusterName': clusterName},
                                      spec=kwargs,
                                      num_instances=1))[0]
            leader = instances[0]

        wait_instances_running(ctx.ec2, [leader])
        self._waitForNode(leader, 'toil_leader')

        defaultTags = {'Name': clusterName, 'Owner': keyName}
        defaultTags.update(userTags)

        # If we are running launchCluster, we need to save this data, since it won't be
        # regenerated from the instance metadata. It is needed later to launch worker nodes.
        self.leaderIP = leader.private_ip_address
        self._addTags([leader], defaultTags)
        self.ctx = ctx
        if spotBids:
            self.spotBids = dict(zip(preemptableNodeTypes, spotBids))
        self.clusterName = clusterName
        self.keyName = keyName
        self.tags = leader.tags
        self.subnetID = leader.subnet_id
        # assuming that if the leader was launched without a spot bid then all workers
        # will be non-preemptable
        workersCreated = 0
        for nodeType, workers in zip(nodeTypes, numWorkers):
            workersCreated += self.addNodes(nodeType=nodeType,
                                            numNodes=workers,
                                            preemptable=False)
        for nodeType, workers in zip(preemptableNodeTypes,
                                     numPreemptableWorkers):
            workersCreated += self.addNodes(nodeType=nodeType,
                                            numNodes=workers,
                                            preemptable=True)
        logger.info('Added %d workers', workersCreated)

        return leader
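A sketch of invoking this older, argument-heavy launchCluster; every value is illustrative, and the worker counts are lists paired entry-for-entry with the node type lists:

    leader = provisioner.launchCluster(
        leaderNodeType='m3.medium',
        leaderSpotBid=None,               # on-demand leader
        nodeTypes=['t2.medium'],
        preemptableNodeTypes=['t2.medium'],
        keyName='my-ec2-keypair',
        clusterName='my-cluster',
        numWorkers=[2],
        numPreemptableWorkers=[2],
        spotBids=['0.05'],
        zone='us-west-2a')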