def getProvisionedWorkers(self, nodeType, preemptable):
    assert self._leaderPrivateIP
    entireCluster = self._getNodesInCluster(both=True, nodeType=nodeType)
    logger.debug('All nodes in cluster: %s', entireCluster)
    workerInstances = [i for i in entireCluster
                       if i.private_ip_address != self._leaderPrivateIP]
    logger.debug('All workers found in cluster: %s', workerInstances)
    workerInstances = [i for i in workerInstances
                       if preemptable != (i.spot_instance_request_id is None)]
    logger.debug('%spreemptable workers found in cluster: %s',
                 'non-' if not preemptable else '', workerInstances)
    workerInstances = awsFilterImpairedNodes(workerInstances, self._ctx.ec2)
    return [Node(publicIP=i.ip_address, privateIP=i.private_ip_address,
                 name=i.id, launchTime=i.launch_time, nodeType=i.instance_type,
                 preemptable=preemptable, tags=i.tags)
            for i in workerInstances]
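# Usage sketch (illustrative, not part of the provisioner): how a caller might use
# getProvisionedWorkers() above to enumerate on-demand and spot workers separately.
# The 't2.medium' node type is an assumption for the example; the provisioner is
# presumed to be already initialized against an existing cluster.
def listWorkersExample(provisioner):
    # On-demand vs. spot workers are selected purely by the preemptable flag.
    onDemand = provisioner.getProvisionedWorkers(nodeType='t2.medium', preemptable=False)
    spot = provisioner.getProvisionedWorkers(nodeType='t2.medium', preemptable=True)
    logger.info('%d on-demand and %d spot workers provisioned', len(onDemand), len(spot))
    return onDemand + spot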
def getProvisionedWorkers(self, nodeType, preemptable):
    assert self._leaderPrivateIP
    entireCluster = self._getNodesInCluster(nodeType=nodeType)
    logger.debug('All nodes in cluster: %s', entireCluster)
    workerInstances = []
    for instance in entireCluster:
        scheduling = instance.extra.get('scheduling')
        # If this field is not found in the extra meta-data, assume the node is not preemptable.
        if scheduling and scheduling.get('preemptible', False) != preemptable:
            continue
        isWorker = True
        for ip in instance.private_ips:
            if ip == self._leaderPrivateIP:
                isWorker = False
                break  # don't include the leader
        if isWorker and instance.state == 'running':
            workerInstances.append(instance)
    logger.debug('All workers found in cluster: %s', workerInstances)
    return [Node(publicIP=i.public_ips[0], privateIP=i.private_ips[0],
                 name=i.name, launchTime=i.created_at, nodeType=i.size,
                 preemptable=preemptable, tags=None)
            for i in workerInstances]
def getLeader(self, wait=False, returnRawInstance=False):
    assert self._ctx
    instances = self._getNodesInCluster(nodeType=None, both=True)
    instances.sort(key=lambda x: x.launch_time)
    try:
        leader = instances[0]  # assume leader was launched first
    except IndexError:
        raise NoSuchClusterException(self.clusterName)
    if (leader.tags.get(_TOIL_NODE_TYPE_TAG_KEY) or 'leader') != 'leader':
        raise InvalidClusterStateException(
            'Invalid cluster state! The first launched instance appears not to be the leader '
            'as it is missing the "leader" tag. The safest recovery is to destroy the cluster '
            'and restart the job. Incorrect Leader ID: %s' % leader.id)
    leaderNode = Node(publicIP=leader.ip_address, privateIP=leader.private_ip_address,
                      name=leader.id, launchTime=leader.launch_time, nodeType=None,
                      preemptable=False, tags=leader.tags)
    if wait:
        logger.debug("Waiting for toil_leader to enter 'running' state...")
        wait_instances_running(self._ctx.ec2, [leader])
        logger.debug('... toil_leader is running')
        self._waitForIP(leader)
        leaderNode.waitForNode('toil_leader')
    return leader if returnRawInstance else leaderNode
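# Usage sketch (illustrative): the two flags of getLeader() above. wait=True blocks
# until the leader instance is running and reachable; returnRawInstance=True yields
# the raw boto instance rather than the Node wrapper. The function name here is a
# hypothetical example, not part of the provisioner API.
def findLeaderExample(provisioner):
    leaderNode = provisioner.getLeader(wait=True)                 # Node wrapper, ready to use
    rawInstance = provisioner.getLeader(returnRawInstance=True)   # boto Instance object
    logger.info('Leader %s is at %s', rawInstance.id, leaderNode.publicIP)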
def testNoLaunchingIfDeltaAlreadyMet(self):
    """
    Check that the scaler doesn't try to launch "0" more instances if
    the delta was able to be met by unignoring nodes.
    """
    # We have only one node type for simplicity
    self.provisioner.nodeTypes = ['c4.8xlarge']
    self.provisioner.nodeShapes = [c4_8xlarge]
    scaler = ClusterScaler(self.provisioner, self.leader, self.config)
    # Pretend there is one ignored worker in the cluster
    self.provisioner.getProvisionedWorkers = MagicMock(return_value=[
        Node('127.0.0.1', '127.0.0.1', 'testNode',
             datetime.datetime.now().isoformat(),
             nodeType='c4.8xlarge', preemptable=True)])
    scaler.ignoredNodes.add('127.0.0.1')
    # Exercise the updateClusterSize logic
    self.provisioner.addNodes = MagicMock()
    scaler.updateClusterSize({c4_8xlarge: 1})
    self.assertFalse(self.provisioner.addNodes.called,
                     "addNodes was called when no new nodes were needed")
    self.assertEqual(len(scaler.ignoredNodes), 0,
                     "The scaler didn't unignore an ignored node when scaling up")
def launchCluster(self, leaderNodeType, leaderStorage, owner, **kwargs):
    """
    In addition to the parameters inherited from the abstractProvisioner,
    the Google launchCluster takes the following parameters:

    keyName: The key used to communicate with instances
    botoPath: Boto credentials for reading an AWS jobStore (optional).
    vpcSubnet: A subnet (optional).
    """
    if 'keyName' not in kwargs:
        raise RuntimeError("A keyPairName is required for the GCE provisioner.")
    self._keyName = kwargs['keyName']
    if 'botoPath' in kwargs:
        self._botoPath = kwargs['botoPath']
    self._vpcSubnet = kwargs['vpcSubnet'] if 'vpcSubnet' in kwargs else None

    # Throws an error if cluster exists
    self._instanceGroup = self._gceDriver.ex_create_instancegroup(self.clusterName, self._zone)
    logger.debug('Launching leader')

    # GCE doesn't have a dictionary tags field. The tags field is just a string list.
    # Therefore, dumping tags into the description.
    tags = {'Owner': self._keyName, 'clusterName': self.clusterName}
    if 'userTags' in kwargs:
        tags.update(kwargs['userTags'])
    self._tags = json.dumps(tags)

    userData = self._getCloudConfigUserData('leader')
    metadata = {'items': [{'key': 'user-data', 'value': userData}]}
    imageType = 'flatcar-stable'
    sa_scopes = [{'scopes': ['compute', 'storage-full']}]
    disk = {}
    disk['initializeParams'] = {'sourceImage': self.SOURCE_IMAGE,
                                'diskSizeGb': leaderStorage}
    disk.update({'boot': True, 'autoDelete': True})
    name = 'l' + str(uuid.uuid4())

    leader = self._gceDriver.create_node(name, leaderNodeType, imageType,
                                         location=self._zone,
                                         ex_service_accounts=sa_scopes,
                                         ex_metadata=metadata,
                                         ex_subnetwork=self._vpcSubnet,
                                         ex_disks_gce_struct=[disk],
                                         description=self._tags,
                                         ex_preemptible=False)

    self._instanceGroup.add_instances([leader])
    self._leaderPrivateIP = leader.private_ips[0]  # needed if adding workers
    # self.subnetID = leader.subnet_id  # TODO: get subnetID

    # Wait for the appliance to start and inject credentials.
    leaderNode = Node(publicIP=leader.public_ips[0], privateIP=leader.private_ips[0],
                      name=leader.name, launchTime=leader.created_at,
                      nodeType=leader.size, preemptable=False, tags=self._tags)
    leaderNode.waitForNode('toil_leader', keyName=self._keyName)
    leaderNode.copySshKeys(self._keyName)
    leaderNode.injectFile(self._credentialsPath, GoogleJobStore.nodeServiceAccountJson, 'toil_leader')
    if self._botoPath:
        leaderNode.injectFile(self._botoPath, self.NODE_BOTO_PATH, 'toil_leader')
    logger.debug('Launched leader')
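# Usage sketch (illustrative): launching a GCE cluster with the keyword arguments
# documented in the docstring above. The node type, storage size, tag values, and
# boto path are placeholder assumptions, not provisioner defaults; only keyName is
# mandatory (a RuntimeError is raised without it).
def launchGceClusterExample(provisioner):
    provisioner.launchCluster(leaderNodeType='n1-standard-2',
                              leaderStorage=50,           # gigabytes for the leader's boot disk
                              owner='example-owner',
                              keyName='my-gce-key',       # required
                              botoPath='~/.boto',         # optional, for reading an AWS jobStore
                              vpcSubnet=None,             # optional
                              userTags={'project': 'demo'})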
def _getNodes(self, role=None, nodeType=None):
    """
    Return a list of Node objects representing the instances in the cluster
    with the given role and nodeType.

    :param role: 'leader', 'worker', or None for both.
    :param nodeType: An instance type, or None for all types.
    :return: A list of Node objects.
    """
    allNodes = self._azureComputeClient.virtual_machines.list(self.clusterName)
    rv = []
    allNodeNames = []
    for node in allNodes:
        allNodeNames.append(node.name)
        nodeRole = node.tags.get('role', None)
        if node.provisioning_state != 'Succeeded' or (role is not None and nodeRole != role):
            continue
        if nodeType and node.hardware_profile.vm_size != nodeType:
            continue
        network_interface = self._azureNetworkClient.network_interfaces.get(self.clusterName, node.name)
        if not network_interface.ip_configurations:
            continue  # no networks assigned to this node
        publicIP = self._azureNetworkClient.public_ip_addresses.get(self.clusterName, node.name)
        rv.append(Node(publicIP=publicIP.ip_address,
                       privateIP=network_interface.ip_configurations[0].private_ip_address,
                       name=node.name,
                       launchTime=None,  # Not used with Azure.
                       nodeType=node.hardware_profile.vm_size,
                       preemptable=False))  # Azure doesn't have preemptable nodes
    logger.debug('All nodes in cluster: ' + ', '.join(allNodeNames))
    return rv
def getLeader(self):
    instances = self._getNodesInCluster()
    instances.sort(key=lambda x: x.created_at)
    try:
        leader = instances[0]  # assume leader was launched first
    except IndexError:
        raise NoSuchClusterException(self.clusterName)
    return Node(publicIP=leader.public_ips[0], privateIP=leader.private_ips[0],
                name=leader.name, launchTime=leader.created_at,
                nodeType=leader.size, preemptable=False, tags=None)
def launchCluster(self, leaderNodeType, leaderStorage, owner, **kwargs):
    """
    In addition to the parameters inherited from the abstractProvisioner,
    the AWS launchCluster takes the following parameters:

    keyName: The key used to communicate with instances
    vpcSubnet: A subnet (optional).
    """
    if 'keyName' not in kwargs:
        raise RuntimeError("A keyPairName is required for the AWS provisioner.")
    self._keyName = kwargs['keyName']
    self._vpcSubnet = kwargs['vpcSubnet'] if 'vpcSubnet' in kwargs else None

    profileARN = self._getProfileARN()
    # the security group name is used as the cluster identifier
    sgs = self._createSecurityGroup()
    bdm = self._getBlockDeviceMapping(E2Instances[leaderNodeType], rootVolSize=leaderStorage)
    self._masterPublicKey = 'AAAAB3NzaC1yc2Enoauthorizedkeyneeded'  # dummy key
    userData = self._getCloudConfigUserData('leader', self._masterPublicKey)
    specKwargs = {'key_name': self._keyName,
                  'security_group_ids': [sg.id for sg in sgs],
                  'instance_type': leaderNodeType,
                  'user_data': userData,
                  'block_device_map': bdm,
                  'instance_profile_arn': profileARN,
                  'placement': self._zone}
    if self._vpcSubnet:
        specKwargs["subnet_id"] = self._vpcSubnet
    instances = create_ondemand_instances(self._ctx.ec2, image_id=self._discoverAMI(),
                                          spec=specKwargs, num_instances=1)

    # wait for the leader to finish setting up
    leader = instances[0]
    wait_instances_running(self._ctx.ec2, [leader])
    self._waitForIP(leader)
    leaderNode = Node(publicIP=leader.ip_address, privateIP=leader.private_ip_address,
                      name=leader.id, launchTime=leader.launch_time,
                      nodeType=leaderNodeType, preemptable=False, tags=leader.tags)
    leaderNode.waitForNode('toil_leader')

    defaultTags = {'Name': self.clusterName, 'Owner': owner}
    if kwargs.get('userTags'):  # use .get() so a missing 'userTags' key doesn't raise
        defaultTags.update(kwargs['userTags'])

    # If we are running launchCluster, we need to save this data, as it won't be
    # generated from the metadata. This data is needed to launch worker nodes.
    self._leaderPrivateIP = leader.private_ip_address
    self._addTags([leader], defaultTags)
    self._tags = leader.tags
    self._subnetID = leader.subnet_id
def _addNodes(self, numNodes, nodeType, preemptable=False):
    nodeShape = self.getNodeShape(nodeType=nodeType, preemptable=preemptable)

    class Worker(object):
        def __init__(self, jobQueue, updatedJobsQueue, secondsPerJob):
            self.busyEvent = Event()
            self.stopEvent = Event()

            def workerFn():
                while True:
                    if self.stopEvent.is_set():
                        return
                    try:
                        jobID = jobQueue.get(timeout=1.0)
                    except Empty:
                        continue
                    updatedJobsQueue.put(jobID)
                    self.busyEvent.set()
                    time.sleep(secondsPerJob)
                    self.busyEvent.clear()

            self.startTime = time.time()
            self.worker = Thread(target=workerFn)
            self.worker.start()

        def stop(self):
            self.stopEvent.set()
            self.worker.join()
            return time.time() - self.startTime

    for _ in range(numNodes):
        node = Node('127.0.0.1', uuid.uuid4(), 'testNode',
                    datetime.datetime.now().isoformat() + 'Z',
                    nodeType=nodeType, preemptable=preemptable)
        self.nodesToWorker[node] = Worker(self.jobQueue, self.updatedJobsQueue, self.secondsPerJob)
        self.workers[nodeShape].append(self.nodesToWorker[node])
    self.maxWorkers[nodeShape] = max(self.maxWorkers[nodeShape], len(self.workers[nodeShape]))
def getLeader(self, wait=False):
    assert self._ctx
    instances = self._getNodesInCluster(nodeType=None, both=True)
    instances.sort(key=lambda x: x.launch_time)
    try:
        leader = instances[0]  # assume leader was launched first
    except IndexError:
        raise NoSuchClusterException(self.clusterName)
    leaderNode = Node(publicIP=leader.ip_address, privateIP=leader.private_ip_address,
                      name=leader.id, launchTime=leader.launch_time, nodeType=None,
                      preemptable=False, tags=leader.tags)
    if wait:
        logger.debug("Waiting for toil_leader to enter 'running' state...")
        wait_instances_running(self._ctx.ec2, [leader])
        logger.debug('... toil_leader is running')
        self._waitForIP(leader)
        leaderNode.waitForNode('toil_leader')
    return leaderNode
def addNodes(self, nodeType, numNodes, preemptable, spotBid=None):
    assert self._leaderPrivateIP

    # If keys are rsynced, then the mesos-slave needs to be started after the keys
    # have been transferred. The waitForKey.sh script loops on the new VM until it
    # finds the keyPath file, then it starts the mesos-slave. If there are multiple
    # keys to be transferred, then the last one to be transferred must be set to keyPath.
    keyPath = None
    botoExists = False
    if self._botoPath is not None and os.path.exists(self._botoPath):
        keyPath = self.NODE_BOTO_PATH
        botoExists = True
    elif self._sseKey:
        keyPath = self._sseKey

    if not preemptable:
        logger.debug('Launching %s non-preemptable nodes', numNodes)
    else:
        logger.debug('Launching %s preemptable nodes', numNodes)

    # kwargs["subnet_id"] = self.subnetID if self.subnetID else self._getClusterInstance(self.instanceMetaData).subnet_id
    userData = self._getCloudConfigUserData('worker', self._masterPublicKey, keyPath, preemptable)
    metadata = {'items': [{'key': 'user-data', 'value': userData}]}
    imageType = 'flatcar-stable'
    sa_scopes = [{'scopes': ['compute', 'storage-full']}]
    disk = {}
    disk['initializeParams'] = {'sourceImage': self.SOURCE_IMAGE,
                                'diskSizeGb': self._nodeStorageOverrides.get(nodeType, self._nodeStorage)}
    disk.update({'boot': True, 'autoDelete': True})

    # TODO:
    #  - bug in gce.py for ex_create_multiple_nodes (erroneously, doesn't allow image
    #    and disk to be specified)
    #  - ex_create_multiple_nodes is limited to 1000 nodes
    #    - use a different function
    #    - or write a loop over the rest of this function, with 1K nodes max on each iteration
    # instancesLaunched = driver.ex_create_multiple_nodes(
    retries = 0
    workersCreated = 0
    # Try a few times to create the requested number of workers
    while numNodes - workersCreated > 0 and retries < 3:
        instancesLaunched = self.ex_create_multiple_nodes(
            '', nodeType, imageType, numNodes - workersCreated,
            location=self._zone,
            ex_service_accounts=sa_scopes,
            ex_metadata=metadata,
            ex_disks_gce_struct=[disk],
            description=self._tags,
            ex_preemptible=preemptable)
        failedWorkers = []
        for instance in instancesLaunched:
            if isinstance(instance, GCEFailedNode):
                logger.error("Worker failed to launch with code %s. Error message: %s"
                             % (instance.code, instance.error))
                continue

            node = Node(publicIP=instance.public_ips[0], privateIP=instance.private_ips[0],
                        name=instance.name, launchTime=instance.created_at,
                        nodeType=instance.size, preemptable=False,
                        tags=self._tags)  # FIXME: what should tags be set to?
            try:
                self._injectWorkerFiles(node, botoExists)
                logger.debug("Created worker %s" % node.publicIP)
                self._instanceGroup.add_instances([instance])
                workersCreated += 1
            except Exception as e:
                logger.error("Failed to configure worker %s. Error message: %s" % (node.name, e))
                failedWorkers.append(instance)
        if failedWorkers:
            logger.error("Terminating %d failed workers" % len(failedWorkers))
            self._terminateInstances(failedWorkers)
        retries += 1

    logger.debug('Launched %d new instance(s)', numNodes)
    if numNodes != workersCreated:
        logger.error("Failed to launch %d worker(s)", numNodes - workersCreated)
    return workersCreated
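# Usage sketch (illustrative): requesting GCE workers with the addNodes() above,
# once the leader is up. The return value is the number of workers actually
# configured, which may fall short of the request since each batch is retried at
# most three times. The node type and count are placeholders.
def addGceWorkersExample(provisioner):
    created = provisioner.addNodes(nodeType='n1-standard-4', numNodes=8, preemptable=True)
    if created < 8:
        logger.warning('Only %d of 8 preemptible workers came up', created)
    return created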
def addNodes(self, nodeType, numNodes, preemptable, spotBid=None):
    assert self._leaderPrivateIP
    if preemptable and not spotBid:
        if self._spotBidsMap and nodeType in self._spotBidsMap:
            spotBid = self._spotBidsMap[nodeType]
        else:
            raise RuntimeError("No spot bid given for a preemptable node request.")
    instanceType = E2Instances[nodeType]
    bdm = self._getBlockDeviceMapping(instanceType, rootVolSize=self._nodeStorage)

    keyPath = self._sseKey if self._sseKey else None
    userData = self._getCloudConfigUserData('worker', self._masterPublicKey, keyPath, preemptable)
    if isinstance(userData, text_type):
        # Spot-market provisioning requires bytes for user data.
        userData = userData.encode('utf-8')

    sgs = [sg for sg in self._ctx.ec2.get_all_security_groups()
           if sg.name in self._leaderSecurityGroupNames]
    kwargs = {'key_name': self._keyName,
              'security_group_ids': [sg.id for sg in sgs],
              'instance_type': instanceType.name,
              'user_data': userData,
              'block_device_map': bdm,
              'instance_profile_arn': self._leaderProfileArn,
              'placement': self._zone,
              'subnet_id': self._subnetID}

    instancesLaunched = []
    for attempt in retry(predicate=awsRetryPredicate):
        with attempt:
            # after we start launching instances we want to ensure the full setup is done
            # the biggest obstacle is AWS request throttling, so we retry on these errors at
            # every request in this method
            if not preemptable:
                logger.debug('Launching %s non-preemptable nodes', numNodes)
                instancesLaunched = create_ondemand_instances(self._ctx.ec2,
                                                              image_id=self._discoverAMI(),
                                                              spec=kwargs,
                                                              num_instances=numNodes)
            else:
                logger.debug('Launching %s preemptable nodes', numNodes)
                kwargs['placement'] = getSpotZone(spotBid, instanceType.name, self._ctx)
                # force generator to evaluate
                instancesLaunched = list(create_spot_instances(ec2=self._ctx.ec2,
                                                               price=spotBid,
                                                               image_id=self._discoverAMI(),
                                                               tags={'clusterName': self.clusterName},
                                                               spec=kwargs,
                                                               num_instances=numNodes,
                                                               tentative=True))
                # flatten the list
                instancesLaunched = [item for sublist in instancesLaunched for item in sublist]

    for attempt in retry(predicate=awsRetryPredicate):
        with attempt:
            wait_instances_running(self._ctx.ec2, instancesLaunched)

    self._tags[_TOIL_NODE_TYPE_TAG_KEY] = 'worker'
    AWSProvisioner._addTags(instancesLaunched, self._tags)
    if self._sseKey:
        for i in instancesLaunched:
            self._waitForIP(i)
            node = Node(publicIP=i.ip_address, privateIP=i.private_ip_address,
                        name=i.id, launchTime=i.launch_time, nodeType=i.instance_type,
                        preemptable=preemptable, tags=i.tags)
            node.waitForNode('toil_worker')
            node.coreRsync([self._sseKey, ':' + self._sseKey], applianceName='toil_worker')
    logger.debug('Launched %s new instance(s)', numNodes)
    return len(instancesLaunched)
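# Usage sketch (illustrative): the spot-bid resolution order in the AWS addNodes()
# above. An explicit spotBid wins; otherwise the per-node-type _spotBidsMap is
# consulted; with neither, a preemptable request raises RuntimeError. The node
# type, counts, and bid price are placeholders.
def addAwsWorkersExample(provisioner):
    provisioner.addNodes(nodeType='c4.8xlarge', numNodes=2, preemptable=False)  # on-demand
    provisioner.addNodes(nodeType='c4.8xlarge', numNodes=4, preemptable=True,
                         spotBid=0.40)                                          # explicit bid
    provisioner.addNodes(nodeType='c4.8xlarge', numNodes=4, preemptable=True)   # falls back to _spotBidsMap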
def launchCluster(self,
                  leaderNodeType: str,
                  leaderStorage: int,
                  owner: str,
                  keyName: str,
                  botoPath: str,
                  userTags: dict,
                  vpcSubnet: str,
                  awsEc2ProfileArn: str,
                  awsEc2ExtraSecurityGroupIds: list):
    """
    Starts a single leader node and populates this class with the leader's metadata.

    :param leaderNodeType: An AWS instance type, like "t2.medium", for example.
    :param leaderStorage: An integer number of gigabytes to provide the leader instance with.
    :param owner: Resources will be tagged with this owner string.
    :param keyName: The ssh key to use to access the leader node.
    :param botoPath: The path to the boto credentials directory.
    :param userTags: Optionally provided user tags to put on the leader.
    :param vpcSubnet: Optionally specify the VPC subnet.
    :param awsEc2ProfileArn: Optionally provide the profile ARN.
    :param awsEc2ExtraSecurityGroupIds: Optionally provide additional security group IDs.
    :return: None
    """
    self._keyName = keyName
    self._vpcSubnet = vpcSubnet

    profileArn = awsEc2ProfileArn or self._getProfileArn()
    # the security group name is used as the cluster identifier
    sgs = self._createSecurityGroup()
    bdm = [{'DeviceName': '/dev/xvda',
            'Ebs': {'DeleteOnTermination': True,
                    'VolumeSize': leaderStorage,
                    'VolumeType': 'gp2'}}]

    self._masterPublicKey = 'AAAAB3NzaC1yc2Enoauthorizedkeyneeded'  # dummy key
    userData = self._getCloudConfigUserData('leader', self._masterPublicKey)
    if isinstance(userData, text_type):
        # Spot-market provisioning requires bytes for user data.
        # We probably won't have a spot-market leader, but who knows!
        userData = userData.encode('utf-8')

    instances = create_instances(self.ec2,
                                 image_id=self._discoverAMI(),
                                 num_instances=1,
                                 key_name=self._keyName,
                                 security_group_ids=[sg.id for sg in sgs] + awsEc2ExtraSecurityGroupIds,
                                 instance_type=leaderNodeType,
                                 user_data=userData,
                                 block_device_map=bdm,
                                 instance_profile_arn={'Arn': profileArn},  # profileArn was computed above; pass it through
                                 placement={'AvailabilityZone': self._zone},
                                 subnet_id=self._vpcSubnet)

    # wait for the leader to finish setting up
    leader = instances[0]
    leader.wait_until_running()

    default_tags = {'Name': self.clusterName, 'Owner': owner, _TOIL_NODE_TYPE_TAG_KEY: 'leader'}
    default_tags.update(userTags)

    tags = []
    for user_key, user_value in default_tags.items():
        tags.append({'Key': user_key, 'Value': user_value})
    leader.create_tags(Tags=tags)

    self._tags = leader.tags
    self._leaderPrivateIP = leader.private_ip_address
    self._subnetID = leader.subnet_id
    leaderNode = Node(publicIP=leader.public_ip_address, privateIP=leader.private_ip_address,
                      name=leader.id, launchTime=leader.launch_time, nodeType=leaderNodeType,
                      preemptable=False, tags=leader.tags)
    leaderNode.waitForNode('toil_leader')
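# Usage sketch (illustrative): the boto3-based launchCluster() above takes every
# parameter explicitly rather than via **kwargs. All argument values here are
# placeholders; passing awsEc2ProfileArn=None falls back to self._getProfileArn().
def launchAwsClusterExample(provisioner):
    provisioner.launchCluster(leaderNodeType='t2.medium',
                              leaderStorage=50,
                              owner='example-owner',
                              keyName='my-aws-key',
                              botoPath='~/.boto',
                              userTags={'project': 'demo'},
                              vpcSubnet='subnet-0123456789abcdef0',
                              awsEc2ProfileArn=None,
                              awsEc2ExtraSecurityGroupIds=[])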