Example 1
    def _getKubernetesJoiningInfo(self, leader: Node = None) -> Dict[str, str]:
        """
        Get the Kubernetes joining info created when Kubernetes was set up on
        this node, which is the leader, or on a different specified Node.

        Returns a dict of JOIN_TOKEN, JOIN_CERT_HASH, and JOIN_ENDPOINT, which
        can be inserted into our Kubernetes worker setup script and config.

        :param leader: Node to operate on, if not the current machine.
        """

        # Make a parser for the config
        config = configparser.ConfigParser(interpolation=None)
        # Leave case alone
        config.optionxform = str

        if leader is None:
            # This info is always supposed to be set up before the Toil appliance
            # starts, and mounted in at the same path as on the host. So we just go
            # read it.
            with open('/etc/kubernetes/worker.ini') as f:
                config.read_file(f)
        else:
            # Grab from remote file
            with tempfile.TemporaryDirectory() as tmpdir:
                localFile = os.path.join(tmpdir, 'worker.ini')
                leader.extractFile('/etc/kubernetes/worker.ini', localFile,
                                   'toil_leader')

                with open(localFile) as f:
                    config.read_file(f)

        # Grab everything out of the default section where our setup script put
        # it.
        return dict(config['DEFAULT'])
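One detail worth calling out above: ConfigParser lower-cases option names by default, which would mangle keys like JOIN_TOKEN, so optionxform is overridden to preserve case. Below is a minimal, self-contained sketch of that behaviour using hypothetical worker.ini contents (the key names come from the docstring above; the values are made up).

    import configparser

    ini_text = '\n'.join([
        '[DEFAULT]',
        'JOIN_TOKEN = abcdef.0123456789abcdef',
        'JOIN_CERT_HASH = sha256:0123456789abcdef',
        'JOIN_ENDPOINT = 10.0.0.1:6443',
    ])

    config = configparser.ConfigParser(interpolation=None)
    config.optionxform = str      # keep option names exactly as written
    config.read_string(ini_text)  # stands in for read_file() on worker.ini

    info = dict(config['DEFAULT'])
    assert 'JOIN_TOKEN' in info   # would be 'join_token' without the override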
Example 2
    def getLeader(self, wait=False, returnRawInstance=False):
        assert self._ctx
        instances = self._getNodesInCluster(nodeType=None, both=True)
        instances.sort(key=lambda x: x.launch_time)
        try:
            leader = instances[0]  # assume leader was launched first
        except IndexError:
            raise NoSuchClusterException(self.clusterName)
        if (leader.tags.get(_TOIL_NODE_TYPE_TAG_KEY) or 'leader') != 'leader':
            raise InvalidClusterStateException(
                'Invalid cluster state! The first launched instance appears not to be the leader '
                'as it is missing the "leader" tag. The safest recovery is to destroy the cluster '
                'and restart the job. Incorrect Leader ID: %s' % leader.id)
        leaderNode = Node(publicIP=leader.ip_address,
                          privateIP=leader.private_ip_address,
                          name=leader.id,
                          launchTime=leader.launch_time,
                          nodeType=None,
                          preemptable=False,
                          tags=leader.tags)
        if wait:
            logger.debug("Waiting for toil_leader to enter 'running' state...")
            wait_instances_running(self._ctx.ec2, [leader])
            logger.debug('... toil_leader is running')
            self._waitForIP(leader)
            leaderNode.waitForNode('toil_leader')

        return leader if returnRawInstance else leaderNode
Example 3
    def launchCluster(self, leaderNodeType, leaderStorage, owner, **kwargs):
        """
        In addition to the parameters inherited from the abstractProvisioner,
        the Google launchCluster takes the following parameters:
        keyName: The key used to communicate with instances
        botoPath: Boto credentials for reading an AWS jobStore (optional).
        vpcSubnet: A subnet (optional).
        """
        if 'keyName' not in kwargs:
            raise RuntimeError("A keyPairName is required for the GCE provisioner.")
        self._keyName = kwargs['keyName']
        if 'botoPath' in kwargs:
            self._botoPath = kwargs['botoPath']
        self._vpcSubnet = kwargs['vpcSubnet'] if 'vpcSubnet' in kwargs else None

        # Throws an error if cluster exists
        self._instanceGroup = self._gceDriver.ex_create_instancegroup(self.clusterName, self._zone)
        logger.debug('Launching leader')

        # GCE doesn't have a dictionary tags field; its tags field is just a list of
        # strings, so we dump our tags into the description field as JSON.
        tags = {'Owner': self._keyName, 'clusterName': self.clusterName}
        if 'userTags' in kwargs:
            tags.update(kwargs['userTags'])
        self._tags = json.dumps(tags)

        userData = self._getCloudConfigUserData('leader')
        metadata = {'items': [{'key': 'user-data', 'value': userData}]}
        imageType = 'flatcar-stable'
        sa_scopes = [{'scopes': ['compute', 'storage-full']}]
        disk = {
            'initializeParams': {
                'sourceImage': self.SOURCE_IMAGE,
                'diskSizeGb': leaderStorage,
            },
            'boot': True,
            'autoDelete': True,
        }
        name = 'l' + str(uuid.uuid4())
        leader = self._gceDriver.create_node(name, leaderNodeType, imageType,
                                             location=self._zone,
                                             ex_service_accounts=sa_scopes,
                                             ex_metadata=metadata,
                                             ex_subnetwork=self._vpcSubnet,
                                             ex_disks_gce_struct=[disk],
                                             description=self._tags,
                                             ex_preemptible=False)

        self._instanceGroup.add_instances([leader])
        self._leaderPrivateIP = leader.private_ips[0] # needed if adding workers
        #self.subnetID = leader.subnet_id #TODO: get subnetID

        # Wait for the appliance to start and inject credentials.
        leaderNode = Node(publicIP=leader.public_ips[0], privateIP=leader.private_ips[0],
                          name=leader.name, launchTime=leader.created_at, nodeType=leader.size,
                          preemptable=False, tags=self._tags)
        leaderNode.waitForNode('toil_leader', keyName=self._keyName)
        leaderNode.copySshKeys(self._keyName)
        leaderNode.injectFile(self._credentialsPath, GoogleJobStore.nodeServiceAccountJson, 'toil_leader')
        if self._botoPath:
            leaderNode.injectFile(self._botoPath, self.NODE_BOTO_PATH, 'toil_leader')
        logger.debug('Launched leader')
Example 4
    def _setSSH(self, leader: Node = None) -> str:
        """
        Generate a key pair, save it in /root/.ssh/id_rsa.pub on the leader,
        and return the public key. The file /root/.sshSuccess is used to
        prevent this operation from running twice.

        Also starts the ssh agent on the local node, if operating on the local
        node.

        :param leader: Node to operate on, if not the current machine.

        :return: Public key, without the "ssh-rsa" part.
        """

        # To work locally or remotely we need to do all our setup work as one
        # big bash -c
        command = [
            'bash', '-c',
            ('set -e; if [ ! -e /root/.sshSuccess ] ; '
             'then ssh-keygen -f /root/.ssh/id_rsa -t rsa -N ""; '
             'touch /root/.sshSuccess; fi; chmod 700 /root/.ssh;')
        ]

        if leader is None:
            # Run locally
            subprocess.check_call(command)

            # Grab from local file
            with open('/root/.ssh/id_rsa.pub') as f:
                leaderPublicKey = f.read()
        else:
            # Run remotely
            leader.sshInstance(*command, appliance=True)

            # Grab from remote file
            with tempfile.TemporaryDirectory() as tmpdir:
                localFile = os.path.join(tmpdir, 'id_rsa.pub')
                leader.extractFile('/root/.ssh/id_rsa.pub', localFile,
                                   'toil_leader')

                with open(localFile) as f:
                    leaderPublicKey = f.read()

        # Drop the key type and keep just the key data
        leaderPublicKey = leaderPublicKey.split(' ')[1]

        # confirm it really is an RSA public key
        assert leaderPublicKey.startswith('AAAAB3NzaC1yc2E'), leaderPublicKey
        return leaderPublicKey
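As a reference for the parsing step at the end of _setSSH: an OpenSSH public key line has the form "<type> <base64-data> [comment]", and the base64 encoding of the length-prefixed "ssh-rsa" type string always begins with "AAAAB3NzaC1yc2E", which is what the assertion checks. A tiny sketch on a hypothetical id_rsa.pub line (the key body below is made up):

    pub_line = 'ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABmadeupkeydata root@leader'
    key_data = pub_line.split(' ')[1]              # drop the "ssh-rsa" type prefix
    assert key_data.startswith('AAAAB3NzaC1yc2E')  # base64 of the length-prefixed type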
Example 5
    def launchCluster(self, leaderNodeType, leaderStorage, owner, **kwargs):
        """
        In addition to the parameters inherited from the abstractProvisioner,
        the AWS launchCluster takes the following parameters:
        keyName: The key used to communicate with instances
        vpcSubnet: A subnet (optional).
        """
        if 'keyName' not in kwargs:
            raise RuntimeError("A keyPairName is required for the AWS provisioner.")
        self._keyName = kwargs['keyName']
        self._vpcSubnet = kwargs['vpcSubnet'] if 'vpcSubnet' in kwargs else None

        profileARN = self._getProfileARN()
        # the security group name is used as the cluster identifier
        sgs = self._createSecurityGroup()
        bdm = self._getBlockDeviceMapping(E2Instances[leaderNodeType], rootVolSize=leaderStorage)

        self._masterPublicKey = 'AAAAB3NzaC1yc2Enoauthorizedkeyneeded' # dummy key
        userData = self._getCloudConfigUserData('leader', self._masterPublicKey)
        specKwargs = {'key_name': self._keyName,
                      'security_group_ids': [sg.id for sg in sgs],
                      'instance_type': leaderNodeType,
                      'user_data': userData,
                      'block_device_map': bdm,
                      'instance_profile_arn': profileARN,
                      'placement': self._zone}
        if self._vpcSubnet:
            specKwargs["subnet_id"] = self._vpcSubnet
        instances = create_ondemand_instances(self._ctx.ec2, image_id=self._discoverAMI(),
                                              spec=specKwargs, num_instances=1)

        # wait for the leader to finish setting up
        leader = instances[0]
        wait_instances_running(self._ctx.ec2, [leader])
        self._waitForIP(leader)
        leaderNode = Node(publicIP=leader.ip_address, privateIP=leader.private_ip_address,
                          name=leader.id, launchTime=leader.launch_time, nodeType=leaderNodeType,
                          preemptable=False, tags=leader.tags)
        leaderNode.waitForNode('toil_leader')

        defaultTags = {'Name': self.clusterName, 'Owner': owner}
        if kwargs.get('userTags'):
            defaultTags.update(kwargs['userTags'])

        # If we are running launchCluster, we need to save this data since it won't be
        # generated from the instance metadata. It is needed later to launch worker nodes.
        self._leaderPrivateIP = leader.private_ip_address
        self._addTags([leader], defaultTags)
        self._tags = leader.tags
        self._subnetID = leader.subnet_id
Example 6
    def getProvisionedWorkers(self, nodeType, preemptable):
        assert self._leaderPrivateIP
        entireCluster = self._getNodesInCluster(nodeType=nodeType)
        logger.debug('All nodes in cluster: %s', entireCluster)
        workerInstances = []
        for instance in entireCluster:
            scheduling = instance.extra.get('scheduling')
            # If this field is not found in the extra meta-data, assume the node is not preemptable.
            if scheduling and scheduling.get('preemptible',
                                             False) != preemptable:
                continue
            isWorker = True
            for ip in instance.private_ips:
                if ip == self._leaderPrivateIP:
                    isWorker = False
                    break  # don't include the leader
            if isWorker and instance.state == 'running':
                workerInstances.append(instance)

        logger.debug('All workers found in cluster: %s', workerInstances)
        return [
            Node(publicIP=i.public_ips[0],
                 privateIP=i.private_ips[0],
                 name=i.name,
                 launchTime=i.created_at,
                 nodeType=i.size,
                 preemptable=preemptable,
                 tags=None) for i in workerInstances
        ]
Example 7
    def getProvisionedWorkers(self, nodeType, preemptable):
        assert self._leaderPrivateIP
        entireCluster = self._getNodesInCluster(both=True, nodeType=nodeType)
        logger.debug('All nodes in cluster: %s', entireCluster)
        workerInstances = [
            i for i in entireCluster
            if i.private_ip_address != self._leaderPrivateIP
        ]
        logger.debug('All workers found in cluster: %s', workerInstances)
        workerInstances = [
            i for i in workerInstances
            if preemptable != (i.spot_instance_request_id is None)
        ]
        logger.debug('%spreemptable workers found in cluster: %s',
                     'non-' if not preemptable else '', workerInstances)
        workerInstances = awsFilterImpairedNodes(workerInstances,
                                                 self._ctx.ec2)
        return [
            Node(publicIP=i.ip_address,
                 privateIP=i.private_ip_address,
                 name=i.id,
                 launchTime=i.launch_time,
                 nodeType=i.instance_type,
                 preemptable=preemptable,
                 tags=i.tags) for i in workerInstances
        ]
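The second list comprehension above encodes a small piece of logic worth spelling out: an EC2 instance is a spot instance exactly when spot_instance_request_id is set, so preemptable != (spot_instance_request_id is None) keeps spot instances when preemptable workers are requested and on-demand instances otherwise. A minimal sketch of just that predicate (the request ID below is made up):

    def keep(spot_instance_request_id, preemptable):
        # True when the instance's spot/on-demand status matches the request
        return preemptable != (spot_instance_request_id is None)

    assert keep('sir-0123456789', preemptable=True)        # spot instance, wanted
    assert not keep(None, preemptable=True)                # on-demand, filtered out
    assert keep(None, preemptable=False)                   # on-demand, wanted
    assert not keep('sir-0123456789', preemptable=False)   # spot, filtered out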
Example 8
    def testNoLaunchingIfDeltaAlreadyMet(self):
        """
        Check that the scaler doesn't try to launch "0" more instances if
        the delta could already be met by unignoring nodes.
        """
        # We have only one node type for simplicity
        self.provisioner.nodeTypes = ['c4.8xlarge']
        self.provisioner.nodeShapes = [c4_8xlarge]
        scaler = ClusterScaler(self.provisioner, self.leader, self.config)
        # Pretend there is one ignored worker in the cluster
        self.provisioner.getProvisionedWorkers = MagicMock(return_value=[
            Node('127.0.0.1',
                 '127.0.0.1',
                 'testNode',
                 datetime.datetime.now().isoformat(),
                 nodeType='c4.8xlarge',
                 preemptable=True)
        ])
        scaler.ignoredNodes.add('127.0.0.1')
        # Exercise the updateClusterSize logic
        self.provisioner.addNodes = MagicMock()
        scaler.updateClusterSize({c4_8xlarge: 1})
        self.assertFalse(self.provisioner.addNodes.called,
                         "addNodes was called when no new nodes were needed")
        self.assertEqual(
            len(scaler.ignoredNodes), 0,
            "The scaler didn't unignore an ignored node when scaling up")
Example 9
    def _getNodes(self, role=None, nodeType=None):
        """
        Return a list of Node objects representing the instances in the cluster
        with the given role and nodeType.
        :param role: leader, worker, or None for both
        :param nodeType: An instance type or None for all types.
        :return: A list of Node objects.
        """
        allNodes = self._azureComputeClient.virtual_machines.list(self.clusterName)
        rv = []
        allNodeNames = []
        for node in allNodes:
            allNodeNames.append(node.name)
            nodeRole = node.tags.get('role', None)
            if node.provisioning_state != 'Succeeded' or (role is not None and nodeRole != role):
                continue
            if nodeType and node.hardware_profile.vm_size != nodeType:
                continue

            network_interface = self._azureNetworkClient.network_interfaces.get(self.clusterName, node.name)
            if not network_interface.ip_configurations:
                continue # no networks assigned to this node
            publicIP = self._azureNetworkClient.public_ip_addresses.get(self.clusterName, node.name)
            rv.append(Node(
                publicIP=publicIP.ip_address,
                privateIP=network_interface.ip_configurations[0].private_ip_address,
                name=node.name,
                launchTime=None,  # Not used with Azure.
                nodeType=node.hardware_profile.vm_size,
                preemptable=False) # Azure doesn't have preemptable nodes
            )
        logger.debug('All nodes in cluster: ' + ', '.join(allNodeNames))
        return rv
Example 10
    def getLeader(self):
        instances = self._getNodesInCluster()
        instances.sort(key=lambda x: x.created_at)
        try:
            leader = instances[0]  # assume leader was launched first
        except IndexError:
            raise NoSuchClusterException(self.clusterName)
        return Node(publicIP=leader.public_ips[0], privateIP=leader.private_ips[0],
                    name=leader.name, launchTime=leader.created_at, nodeType=leader.size,
                    preemptable=False, tags=None)
Example 11
    def getLeader(self, wait=False):
        assert self._ctx
        instances = self._getNodesInCluster(nodeType=None, both=True)
        instances.sort(key=lambda x: x.launch_time)
        try:
            leader = instances[0]  # assume leader was launched first
        except IndexError:
            raise NoSuchClusterException(self.clusterName)
        leaderNode = Node(publicIP=leader.ip_address, privateIP=leader.private_ip_address,
                          name=leader.id, launchTime=leader.launch_time, nodeType=None,
                          preemptable=False, tags=leader.tags)
        if wait:
            logger.debug("Waiting for toil_leader to enter 'running' state...")
            wait_instances_running(self._ctx.ec2, [leader])
            logger.debug('... toil_leader is running')
            self._waitForIP(leader)
            leaderNode.waitForNode('toil_leader')

        return leaderNode
Example 12
    def _addNodes(self, numNodes, nodeType, preemptable=False):
        nodeShape = self.getNodeShape(nodeType=nodeType,
                                      preemptable=preemptable)

        class Worker(object):
            def __init__(self, jobQueue, updatedJobsQueue, secondsPerJob):
                self.busyEvent = Event()
                self.stopEvent = Event()

                def workerFn():
                    while True:
                        if self.stopEvent.is_set():
                            return
                        try:
                            jobID = jobQueue.get(timeout=1.0)
                        except Empty:
                            continue
                        updatedJobsQueue.put(jobID)
                        self.busyEvent.set()
                        time.sleep(secondsPerJob)
                        self.busyEvent.clear()

                self.startTime = time.time()
                self.worker = Thread(target=workerFn)
                self.worker.start()

            def stop(self):
                self.stopEvent.set()
                self.worker.join()
                return time.time() - self.startTime

        for _ in range(numNodes):
            node = Node('127.0.0.1',
                        uuid.uuid4(),
                        'testNode',
                        datetime.datetime.now().isoformat() + 'Z',
                        nodeType=nodeType,
                        preemptable=preemptable)
            self.nodesToWorker[node] = Worker(self.jobQueue,
                                              self.updatedJobsQueue,
                                              self.secondsPerJob)
            self.workers[nodeShape].append(self.nodesToWorker[node])
        self.maxWorkers[nodeShape] = max(self.maxWorkers[nodeShape],
                                         len(self.workers[nodeShape]))
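The nested Worker class above simulates a node as a thread that polls the job queue with a timeout and flips a busy flag while it "runs" each job. Here is a self-contained sketch of that pattern outside the test harness; the queue names, job ID, and timings are illustrative only.

    import time
    from queue import Queue, Empty
    from threading import Event, Thread

    jobQueue, updatedJobsQueue = Queue(), Queue()
    busyEvent, stopEvent = Event(), Event()

    def workerFn(secondsPerJob=0.1):
        while not stopEvent.is_set():
            try:
                jobID = jobQueue.get(timeout=1.0)  # poll so the stop flag is re-checked
            except Empty:
                continue
            updatedJobsQueue.put(jobID)
            busyEvent.set()                        # mark the fake node as busy
            time.sleep(secondsPerJob)              # pretend to run the job
            busyEvent.clear()

    worker = Thread(target=workerFn)
    worker.start()
    jobQueue.put('job-1')
    time.sleep(0.5)                                # give the worker time to pick it up
    stopEvent.set()
    worker.join()
    assert updatedJobsQueue.get_nowait() == 'job-1'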
Example 13
    def addNodes(self, nodeType, numNodes, preemptable, spotBid=None):
        assert self._leaderPrivateIP

        # If keys are rsynced, then the mesos-slave needs to be started after the keys have been
        # transferred. The waitForKey.sh script loops on the new VM until it finds the keyPath file, then it starts the
        # mesos-slave. If there are multiple keys to be transferred, then the last one to be transferred must be
        # set to keyPath.
        keyPath = None
        botoExists = False
        if self._botoPath is not None and os.path.exists(self._botoPath):
            keyPath = self.NODE_BOTO_PATH
            botoExists = True
        elif self._sseKey:
            keyPath = self._sseKey

        if not preemptable:
            logger.debug('Launching %s non-preemptable nodes', numNodes)
        else:
            logger.debug('Launching %s preemptable nodes', numNodes)

        #kwargs["subnet_id"] = self.subnetID if self.subnetID else self._getClusterInstance(self.instanceMetaData).subnet_id
        userData = self._getCloudConfigUserData('worker',
                                                self._masterPublicKey, keyPath,
                                                preemptable)
        metadata = {'items': [{'key': 'user-data', 'value': userData}]}
        imageType = 'flatcar-stable'
        sa_scopes = [{'scopes': ['compute', 'storage-full']}]
        disk = {
            'initializeParams': {
                'sourceImage': self.SOURCE_IMAGE,
                'diskSizeGb': self._nodeStorageOverrides.get(nodeType, self._nodeStorage),
            },
            'boot': True,
            'autoDelete': True,
        }

        # TODO:
        #  - bug in gce.py for ex_create_multiple_nodes (erroneously, it doesn't allow image and disk to be specified)
        #  - ex_create_multiple_nodes is limited to 1000 nodes
        #    - use a different function
        #    - or write a loop over the rest of this function, with 1K nodes max on each iteration
        #instancesLaunched = driver.ex_create_multiple_nodes(
        retries = 0
        workersCreated = 0
        # Try a few times to create the requested number of workers
        while numNodes - workersCreated > 0 and retries < 3:
            instancesLaunched = self.ex_create_multiple_nodes(
                '',
                nodeType,
                imageType,
                numNodes - workersCreated,
                location=self._zone,
                ex_service_accounts=sa_scopes,
                ex_metadata=metadata,
                ex_disks_gce_struct=[disk],
                description=self._tags,
                ex_preemptible=preemptable)
            failedWorkers = []
            for instance in instancesLaunched:
                if isinstance(instance, GCEFailedNode):
                    logger.error(
                        "Worker failed to launch with code %s. Error message: %s"
                        % (instance.code, instance.error))
                    continue

                node = Node(
                    publicIP=instance.public_ips[0],
                    privateIP=instance.private_ips[0],
                    name=instance.name,
                    launchTime=instance.created_at,
                    nodeType=instance.size,
                    preemptable=False,
                    tags=self._tags)  #FIXME: what should tags be set to?
                try:
                    self._injectWorkerFiles(node, botoExists)
                    logger.debug("Created worker %s" % node.publicIP)
                    self._instanceGroup.add_instances([instance])
                    workersCreated += 1
                except Exception as e:
                    logger.error(
                        "Failed to configure worker %s. Error message: %s" %
                        (node.name, e))
                    failedWorkers.append(instance)
            if failedWorkers:
                logger.error("Terminating %d failed workers" %
                             len(failedWorkers))
                self._terminateInstances(failedWorkers)
            retries += 1

        logger.debug('Launched %d new instance(s)', workersCreated)
        if numNodes != workersCreated:
            logger.error("Failed to launch %d worker(s)",
                         numNodes - workersCreated)
        return workersCreated
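The retry loop above follows a simple "keep asking for the remainder" pattern: each pass requests only the workers still missing and gives up after three attempts. A stripped-down sketch of that control flow with a stand-in batch creator (the helper and its failure behaviour are hypothetical):

    def create_batch(n):
        # stand-in for ex_create_multiple_nodes: pretend one node per batch fails
        return ['ok'] * max(n - 1, 0) + ['failed']

    numNodes, workersCreated, retries = 5, 0, 0
    while numNodes - workersCreated > 0 and retries < 3:
        results = create_batch(numNodes - workersCreated)
        workersCreated += sum(1 for r in results if r == 'ok')
        retries += 1

    assert workersCreated <= numNodes  # may still fall short after three attempts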
Example 14
    def addNodes(self, nodeType, numNodes, preemptable, spotBid=None):
        assert self._leaderPrivateIP
        if preemptable and not spotBid:
            if self._spotBidsMap and nodeType in self._spotBidsMap:
                spotBid = self._spotBidsMap[nodeType]
            else:
                raise RuntimeError(
                    "No spot bid given for a preemptable node request.")
        instanceType = E2Instances[nodeType]
        bdm = self._getBlockDeviceMapping(instanceType,
                                          rootVolSize=self._nodeStorage)

        keyPath = self._sseKey if self._sseKey else None
        userData = self._getCloudConfigUserData('worker',
                                                self._masterPublicKey, keyPath,
                                                preemptable)
        if isinstance(userData, text_type):
            # Spot-market provisioning requires bytes for user data.
            userData = userData.encode('utf-8')
        sgs = [
            sg for sg in self._ctx.ec2.get_all_security_groups()
            if sg.name in self._leaderSecurityGroupNames
        ]
        kwargs = {
            'key_name': self._keyName,
            'security_group_ids': [sg.id for sg in sgs],
            'instance_type': instanceType.name,
            'user_data': userData,
            'block_device_map': bdm,
            'instance_profile_arn': self._leaderProfileArn,
            'placement': self._zone,
            'subnet_id': self._subnetID
        }

        instancesLaunched = []

        for attempt in retry(predicate=awsRetryPredicate):
            with attempt:
                # after we start launching instances we want to ensure the full setup is done
                # the biggest obstacle is AWS request throttling, so we retry on these errors at
                # every request in this method
                if not preemptable:
                    logger.debug('Launching %s non-preemptable nodes',
                                 numNodes)
                    instancesLaunched = create_ondemand_instances(
                        self._ctx.ec2,
                        image_id=self._discoverAMI(),
                        spec=kwargs,
                        num_instances=numNodes)
                else:
                    logger.debug('Launching %s preemptable nodes', numNodes)
                    kwargs['placement'] = getSpotZone(spotBid,
                                                      instanceType.name,
                                                      self._ctx)
                    # force generator to evaluate
                    instancesLaunched = list(
                        create_spot_instances(
                            ec2=self._ctx.ec2,
                            price=spotBid,
                            image_id=self._discoverAMI(),
                            tags={'clusterName': self.clusterName},
                            spec=kwargs,
                            num_instances=numNodes,
                            tentative=True))
                    # flatten the list
                    instancesLaunched = [
                        item for sublist in instancesLaunched
                        for item in sublist
                    ]

        for attempt in retry(predicate=awsRetryPredicate):
            with attempt:
                wait_instances_running(self._ctx.ec2, instancesLaunched)

        self._tags[_TOIL_NODE_TYPE_TAG_KEY] = 'worker'
        AWSProvisioner._addTags(instancesLaunched, self._tags)
        if self._sseKey:
            for i in instancesLaunched:
                self._waitForIP(i)
                node = Node(publicIP=i.ip_address,
                            privateIP=i.private_ip_address,
                            name=i.id,
                            launchTime=i.launch_time,
                            nodeType=i.instance_type,
                            preemptable=preemptable,
                            tags=i.tags)
                node.waitForNode('toil_worker')
                node.coreRsync([self._sseKey, ':' + self._sseKey],
                               applianceName='toil_worker')
        logger.debug('Launched %s new instance(s)', numNodes)
        return len(instancesLaunched)
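One step above worth isolating is the flattening: the spot launch results are treated as a list of per-request batches, which the comprehension collapses into a single flat list before tagging and waiting. A tiny sketch with made-up instance IDs:

    batches = [['i-01', 'i-02'], ['i-03']]  # stand-in for the per-request batches
    instancesLaunched = [item for sublist in batches for item in sublist]
    assert instancesLaunched == ['i-01', 'i-02', 'i-03']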
Example 15
    def launchCluster(self,
                      leaderNodeType: str,
                      leaderStorage: int,
                      owner: str,
                      keyName: str,
                      botoPath: str,
                      userTags: dict,
                      vpcSubnet: str,
                      awsEc2ProfileArn: str,
                      awsEc2ExtraSecurityGroupIds: list):
        """
        Starts a single leader node and populates this class with the leader's metadata.

        :param leaderNodeType: An AWS instance type, like "t2.medium", for example.
        :param leaderStorage: An integer number of gigabytes to provide the leader instance with.
        :param owner: Resources will be tagged with this owner string.
        :param keyName: The ssh key to use to access the leader node.
        :param botoPath: The path to the boto credentials directory.
        :param userTags: Optionally provided user tags to put on the leader.
        :param vpcSubnet: Optionally specify the VPC subnet.
        :param awsEc2ProfileArn: Optionally provide the profile ARN.
        :param awsEc2ExtraSecurityGroupIds: Optionally provide additional security group IDs.
        :return: None
        """
        self._keyName = keyName
        self._vpcSubnet = vpcSubnet

        profileArn = awsEc2ProfileArn or self._getProfileArn()
        # the security group name is used as the cluster identifier
        sgs = self._createSecurityGroup()
        bdm = [
            {
                'DeviceName': '/dev/xvda',
                'Ebs': {
                    'DeleteOnTermination': True,
                    'VolumeSize': leaderStorage,
                    'VolumeType': 'gp2'
                }
            },
        ]

        self._masterPublicKey = 'AAAAB3NzaC1yc2Enoauthorizedkeyneeded' # dummy key
        userData = self._getCloudConfigUserData('leader', self._masterPublicKey)
        if isinstance(userData, text_type):
            # Spot-market provisioning requires bytes for user data.
            # We probably won't have a spot-market leader, but who knows!
            userData = userData.encode('utf-8')
        instances = create_instances(self.ec2,
                                     image_id=self._discoverAMI(),
                                     num_instances=1,
                                     key_name=self._keyName,
                                     security_group_ids=[sg.id for sg in sgs] + awsEc2ExtraSecurityGroupIds,
                                     instance_type=leaderNodeType,
                                     user_data=userData,
                                     block_device_map=bdm,
                                     # instance_profile_arn={'Arn': profileArn},
                                     placement={'AvailabilityZone': self._zone},
                                     subnet_id=self._vpcSubnet)

        # wait for the leader to finish setting up
        leader = instances[0]
        leader.wait_until_running()

        default_tags = {'Name': self.clusterName, 'Owner': owner, _TOIL_NODE_TYPE_TAG_KEY: 'leader'}
        default_tags.update(userTags)

        tags = []
        for user_key, user_value in default_tags.items():
            tags.append({'Key': user_key, 'Value': user_value})
        leader.create_tags(Tags=tags)

        self._tags = leader.tags
        self._leaderPrivateIP = leader.private_ip_address
        self._subnetID = leader.subnet_id

        leaderNode = Node(publicIP=leader.public_ip_address, privateIP=leader.private_ip_address,
                          name=leader.id, launchTime=leader.launch_time, nodeType=leaderNodeType,
                          preemptable=False, tags=leader.tags)
        leaderNode.waitForNode('toil_leader')
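A note on the tag handling above: boto3's create_tags expects a list of {'Key': ..., 'Value': ...} dicts rather than a plain mapping, which is what the loop builds from default_tags. A minimal sketch of the same conversion (the tag values are made up, and the real code also adds the _TOIL_NODE_TYPE_TAG_KEY constant):

    default_tags = {'Name': 'my-cluster', 'Owner': 'me'}  # hypothetical values
    tags = [{'Key': k, 'Value': v} for k, v in default_tags.items()]
    assert tags == [{'Key': 'Name', 'Value': 'my-cluster'},
                    {'Key': 'Owner', 'Value': 'me'}]
    # leader.create_tags(Tags=tags)  # as in the snippet above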