def testAMIFinding(self):
    for zone in ['us-west-2a', 'eu-central-1a', 'sa-east-1b']:
        provisioner = AWSProvisioner('fakename', 'mesos', zone, 10000, None, None)
        ami = provisioner._discoverAMI()
        # Make sure we got an AMI and it looks plausible.
        assert ami.startswith('ami-')
def _test(self, spotInstances=False):
    from toil.provisioners.aws.awsProvisioner import AWSProvisioner
    self.createClusterUtil()
    # Get the leader so we know the IP address. We don't need to wait since
    # createClusterUtil already ensures the leader is running.
    leader = AWSProvisioner._getLeader(wait=False, clusterName=self.clusterName)

    assert len(self.getMatchingRoles(self.clusterName)) == 1
    # --never-download prevents silent upgrades to pip, wheel and setuptools
    venv_command = ['virtualenv', '--system-site-packages', '--never-download', '/home/venv']
    self.sshUtil(venv_command)
    upgrade_command = ['/home/venv/bin/pip', 'install', 'setuptools==28.7.1']
    self.sshUtil(upgrade_command)
    yaml_command = ['/home/venv/bin/pip', 'install', 'pyyaml==3.12']
    self.sshUtil(yaml_command)

    # Install toil-scripts.
    install_command = ['/home/venv/bin/pip', 'install', 'toil-scripts==%s' % self.toilScripts]
    self.sshUtil(install_command)

    toilOptions = ['--batchSystem=mesos',
                   '--workDir=/var/lib/toil',
                   '--mesosMaster=%s:5050' % leader.private_ip_address,
                   '--clean=always',
                   '--retryCount=2']
    toilOptions.extend(['--provisioner=aws',
                        '--nodeType=' + self.instanceType,
                        '--maxNodes=%s' % self.numWorkers,
                        '--logDebug'])
    if spotInstances:
        toilOptions.extend([
            '--preemptableNodeType=%s:%s' % (self.instanceType, self.spotBid),
            # The RNASeq pipeline does not specify a preemptability requirement so we
            # need to specify a default, otherwise jobs would never get scheduled.
            '--defaultPreemptable',
            '--maxPreemptableNodes=%s' % self.numWorkers])
    toilOptions = ' '.join(toilOptions)

    # TOIL_AWS_NODE_DEBUG prevents the provisioner from killing nodes that
    # fail a status check. This allows for easier debugging of
    # https://github.com/BD2KGenomics/toil/issues/1141
    runCommand = ['bash', '-c',
                  'PATH=/home/venv/bin/:$PATH '
                  'TOIL_AWS_NODE_DEBUG=True '
                  'TOIL_SCRIPTS_TEST_NUM_SAMPLES=' + str(self.numSamples) +
                  ' TOIL_SCRIPTS_TEST_TOIL_OPTIONS=' + pipes.quote(toilOptions) +
                  ' TOIL_SCRIPTS_TEST_JOBSTORE=' + self.jobStore +
                  ' /home/venv/bin/python -m unittest -v' +
                  ' toil_scripts.rnaseq_cgl.test.test_rnaseq_cgl.RNASeqCGLTest.test_manifest']
    self.sshUtil(runCommand)

    assert len(self.getMatchingRoles(self.clusterName)) == 1
    AWSProvisioner.destroyCluster(self.clusterName)
    assert len(self.getMatchingRoles(self.clusterName)) == 0
def launchCluster(self):
    from boto.ec2.blockdevicemapping import BlockDeviceType
    self.createClusterUtil(args=['--leaderStorage', str(self.requestedLeaderStorage),
                                 '--nodeTypes', ",".join(self.instanceTypes),
                                 '-w', ",".join(self.numWorkers),
                                 '--nodeStorage', str(self.requestedNodeStorage)])
    ctx = AWSProvisioner._buildContext(self.clusterName)
    nodes = AWSProvisioner._getNodesInCluster(ctx, self.clusterName, both=True)
    nodes.sort(key=lambda x: x.launch_time)  # assuming that the leader is first
    workers = nodes[1:]
    # Test that two worker nodes were created.
    self.assertEqual(2, len(workers))
    # Test that the workers have the expected storage size; just use the first worker.
    worker = workers[0]
    worker = next(wait_instances_running(ctx.ec2, [worker]))
    rootBlockDevice = worker.block_device_mapping["/dev/xvda"]
    self.assertTrue(isinstance(rootBlockDevice, BlockDeviceType))
    rootVolume = ctx.ec2.get_all_volumes(volume_ids=[rootBlockDevice.volume_id])[0]
    self.assertGreaterEqual(rootVolume.size, self.requestedNodeStorage)
def _getScript(self):
    def restartScript():
        from toil.job import Job
        import argparse
        import os

        def f0(job):
            if 'FAIL' in os.environ:
                raise RuntimeError('failed on purpose')

        if __name__ == '__main__':
            parser = argparse.ArgumentParser()
            Job.Runner.addToilOptions(parser)
            options = parser.parse_args()
            rootJob = Job.wrapJobFn(f0, cores=0.5, memory='50 M', disk='50 M')
            Job.Runner.startToil(rootJob, options)

    script = dedent('\n'.join(getsource(restartScript).split('\n')[1:]))
    # Use the appliance ssh method instead of sshUtil so we can specify the input parameter.
    AWSProvisioner._sshAppliance(self.leader.ip_address, 'tee', self.scriptName, input=script)
def _getScript(self):
    def userScript():
        from toil.job import Job
        from toil.common import Toil

        # Because this is the only job in the pipeline and because it is preemptable,
        # there will be no non-preemptable jobs. The non-preemptable scaler will therefore
        # not request any nodes initially. And since we made it impossible for the
        # preemptable scaler to allocate any nodes (using an abnormally low spot bid),
        # we will observe a deficit of preemptable nodes that the non-preemptable scaler
        # will compensate for by spinning up non-preemptable nodes instead.
        def job(job, disk='10M', cores=1, memory='10M', preemptable=True):
            pass

        if __name__ == '__main__':
            options = Job.Runner.getDefaultArgumentParser().parse_args()
            with Toil(options) as toil:
                if toil.config.restart:
                    toil.restart()
                else:
                    toil.start(Job.wrapJobFn(job))

    script = dedent('\n'.join(getsource(userScript).split('\n')[1:]))
    # Use the appliance ssh method instead of sshUtil so we can specify the input parameter.
    AWSProvisioner._sshAppliance(self.leader.ip_address, 'tee', '/home/userScript.py', input=script)
def launchCluster(self):
    self.createClusterUtil(args=['-w', '2'])
    ctx = AWSProvisioner._buildContext(self.clusterName)
    # Test that two worker nodes were created, plus one leader.
    self.assertEqual(2 + 1, len(AWSProvisioner._getNodesInCluster(ctx, self.clusterName, both=True)))
def main(): parser = getBasicOptionParser() parser = addBasicProvisionerOptions(parser) parser.add_argument("--nodeType", dest='nodeType', required=True, help="Node type for {non-|}preemptable nodes. The syntax depends on the " "provisioner used. For the aws provisioner this is the name of an " "EC2 instance type followed by a colon and the price in dollar to " "bid for a spot instance, for example 'c3.8xlarge:0.42'.") parser.add_argument("--keyPairName", dest='keyPairName', required=True, help="The name of the AWS key pair to include on the instance") parser.add_argument("-t", "--tag", metavar='NAME=VALUE', dest='tags', default=[], action='append', help="Tags are added to the AWS cluster for this node and all of its" "children. Tags are of the form: " " -t key1=value1 --tag key2=value2 " "Multiple tags are allowed and each tag needs its own flag. By " "default the cluster is tagged with " " {" " \"Name\": clusterName," " \"Owner\": IAM username" " }. ") parser.add_argument("--vpcSubnet", help="VPC subnet ID to launch cluster in. Uses default subnet if not specified." "This subnet needs to have auto assign IPs turned on.") parser.add_argument("-w", "--workers", dest='workers', default=0, type=int, help="Specify a number of workers to launch alongside the leader when the " "cluster is created. This can be useful if running toil without " "auto-scaling but with need of more hardware support") config = parseBasicOptions(parser) setLoggingFromOptions(config) tagsDict = None if config.tags is None else createTagsDict(config.tags) spotBid = None if config.provisioner == 'aws': logger.info('Using aws provisioner.') try: from toil.provisioners.aws.awsProvisioner import AWSProvisioner except ImportError: raise RuntimeError('The aws extra must be installed to use this provisioner') provisioner = AWSProvisioner() parsedBid = config.nodeType.split(':', 1) if len(config.nodeType) != len(parsedBid[0]): # there is a bid spotBid = float(parsedBid[1]) config.nodeType = parsedBid[0] else: assert False provisioner.launchCluster(instanceType=config.nodeType, keyName=config.keyPairName, clusterName=config.clusterName, workers=config.workers, spotBid=spotBid, userTags=tagsDict, zone=config.zone, vpcSubnet=config.vpcSubnet)
def testAWSProvisionerUtils(self):
    clusterName = 'cluster-utils-test' + str(uuid.uuid4())
    try:
        system([self.toilMain, 'launch-cluster', '--nodeType=t2.micro',
                '--keyPairName=jenkins@jenkins-master', clusterName,
                '--provisioner=aws'])
    finally:
        system([self.toilMain, 'destroy-cluster', '--provisioner=aws', clusterName])
    try:
        from toil.provisioners.aws.awsProvisioner import AWSProvisioner
        # Launch a preemptable master with the same name.
        system([self.toilMain, 'launch-cluster', '--nodeType=m3.medium:0.2',
                '--keyPairName=jenkins@jenkins-master', clusterName,
                '--provisioner=aws', '--logLevel=DEBUG'])
        system([self.toilMain, 'ssh-cluster', '--provisioner=aws', clusterName])

        testStrings = ["'foo'", '"foo"', ' foo', '$PATH', '"', "'", '\\',
                       '| cat', '&& cat', '; cat']
        for test in testStrings:
            logger.info('Testing SSH with special string: %s', test)
            compareTo = "import sys; assert sys.argv[1]==%r" % test
            AWSProvisioner.sshLeader(clusterName=clusterName,
                                     args=['python', '-', test],
                                     input=compareTo)

        try:
            AWSProvisioner.sshLeader(clusterName=clusterName, args=['nonsenseShouldFail'])
        except RuntimeError:
            pass
        else:
            self.fail('The remote command failed silently where it should have '
                      'raised an error')
    finally:
        system([self.toilMain, 'destroy-cluster', '--provisioner=aws', clusterName])
def cluster_factory(provisioner, clusterName=None, clusterType='mesos', zone=None,
                    nodeStorage=50, nodeStorageOverrides=None, sseKey=None):
    """
    Find and instantiate the appropriate provisioner instance to make clusters in the given cloud.

    Raises ClusterTypeNotSupportedException if the given provisioner does not implement clusters
    of the given type.

    :param provisioner: The cloud type of the cluster.
    :param clusterName: The name of the cluster.
    :param clusterType: The type of cluster: 'mesos' or 'kubernetes'.
    :param zone: The cloud zone.

    :return: A cluster object for the cloud type.
    """
    if provisioner == 'aws':
        try:
            from toil.provisioners.aws.awsProvisioner import AWSProvisioner
        except ImportError:
            logger.error('The aws extra must be installed to use this provisioner')
            raise
        return AWSProvisioner(clusterName, clusterType, zone, nodeStorage,
                              nodeStorageOverrides, sseKey)
    elif provisioner == 'gce':
        try:
            from toil.provisioners.gceProvisioner import GCEProvisioner
        except ImportError:
            logger.error('The google extra must be installed to use this provisioner')
            raise
        return GCEProvisioner(clusterName, clusterType, zone, nodeStorage,
                              nodeStorageOverrides, sseKey)
    else:
        raise RuntimeError("Invalid provisioner '%s'" % provisioner)
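# A minimal usage sketch for cluster_factory above. The cluster name and zone are
# illustrative assumptions, not values from this codebase; a real call also needs
# AWS credentials configured in the environment.
def _example_cluster_factory_usage():
    provisioner = cluster_factory(provisioner='aws',
                                  clusterName='toil-example-cluster',
                                  clusterType='mesos',
                                  zone='us-west-2a',
                                  nodeStorage=50)
    # The returned object is an AWSProvisioner; an unrecognized provisioner
    # name would instead raise RuntimeError.
    return provisioner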
def cluster_factory(provisioner, clusterName=None, zone=None, nodeStorage=50,
                    nodeStorageOverrides=None, sseKey=None):
    """
    :param provisioner: The cloud type of the cluster.
    :param clusterName: The name of the cluster.
    :param zone: The cloud zone.

    :return: A cluster object for the cloud type.
    """
    if provisioner == 'aws':
        try:
            from toil.provisioners.aws.awsProvisioner import AWSProvisioner
        except ImportError:
            logger.error('The aws extra must be installed to use this provisioner')
            raise
        return AWSProvisioner(clusterName, zone, nodeStorage, nodeStorageOverrides, sseKey)
    elif provisioner == 'gce':
        try:
            from toil.provisioners.gceProvisioner import GCEProvisioner
        except ImportError:
            logger.error('The google extra must be installed to use this provisioner')
            raise
        return GCEProvisioner(clusterName, zone, nodeStorage, nodeStorageOverrides, sseKey)
    else:
        raise RuntimeError("Invalid provisioner '%s'" % provisioner)
def launchCluster(self):
    self.createClusterUtil(args=['--leaderStorage', str(self.requestedLeaderStorage),
                                 '-w', '2',
                                 '--nodeStorage', str(self.requestedNodeStorage)])
    ctx = AWSProvisioner._buildContext(self.clusterName)
    nodes = AWSProvisioner._getNodesInCluster(ctx, self.clusterName, both=True)
    nodes.sort(key=lambda x: x.launch_time)  # assuming that the leader is first
    workers = nodes[1:]
    # Test that two worker nodes were created.
    self.assertEqual(2, len(workers))
    # Test that the workers have the expected storage size; just use the first worker.
    worker = workers[0]
    worker = next(wait_instances_running(ctx.ec2, [worker]))
    rootBlockDevice = worker.block_device_mapping["/dev/xvda"]
    self.assertTrue(isinstance(rootBlockDevice, BlockDeviceType))
    rootVolume = ctx.ec2.get_all_volumes(volume_ids=[rootBlockDevice.volume_id])[0]
    self.assertGreaterEqual(rootVolume.size, self.requestedNodeStorage)
def getRootVolID(self):
    """
    Check that the leader's EBS root volume was built with adequate size.
    Otherwise functionally equivalent to the parent implementation.

    :return: volumeID
    """
    volumeID = super(AWSAutoscaleTest, self).getRootVolID()
    ctx = AWSProvisioner._buildContext(self.clusterName)
    rootVolume = ctx.ec2.get_all_volumes(volume_ids=[volumeID])[0]
    # Test that the leader is given adequate storage.
    self.assertGreaterEqual(rootVolume.size, self.requestedLeaderStorage)
    return volumeID
def _setProvisioner(self):
    if self.config.provisioner is None:
        self._provisioner = None
    elif self.config.provisioner == 'cgcloud':
        logger.info('Using cgcloud provisioner.')
        from toil.provisioners.cgcloud.provisioner import CGCloudProvisioner
        self._provisioner = CGCloudProvisioner(self.config, self._batchSystem)
    elif self.config.provisioner == 'aws':
        logger.info('Using AWS provisioner.')
        from toil.provisioners.aws.awsProvisioner import AWSProvisioner
        self._provisioner = AWSProvisioner(self.config, self._batchSystem)
    else:
        # The command line parser should have checked argument validity already.
        assert False, self.config.provisioner
def clusterFactory(provisioner, clusterName=None, zone=None, nodeStorage=50, sseKey=None):
    """
    :param provisioner: The cloud type of the cluster.
    :param clusterName: The name of the cluster.
    :param zone: The cloud zone.

    :return: A cluster object for the cloud type.
    """
    if provisioner == 'aws':
        try:
            from toil.provisioners.aws.awsProvisioner import AWSProvisioner
            from toil.lib.ec2Credentials import enable_metadata_credential_caching
        except ImportError:
            logger.error('The aws extra must be installed to use this provisioner')
            raise
        enable_metadata_credential_caching()  # monkey patch for AWS
        return AWSProvisioner(clusterName, zone, nodeStorage, sseKey)
    elif provisioner == 'gce':
        try:
            from toil.provisioners.gceProvisioner import GCEProvisioner
        except ImportError:
            logger.error('The google extra must be installed to use this provisioner')
            raise
        return GCEProvisioner(clusterName, zone, nodeStorage, sseKey)
    elif provisioner == 'azure':
        try:
            from toil.provisioners.azure.azureProvisioner import AzureProvisioner
        except ImportError:
            logger.error('The azure extra must be installed to use this provisioner')
            raise
        return AzureProvisioner(clusterName, zone, nodeStorage)
    else:
        raise RuntimeError("Invalid provisioner '%s'" % provisioner)
def test_read_write_global_files(self):
    """
    Make sure the `_write_file_to_cloud()` and `_read_file_from_cloud()`
    functions of the AWS provisioner work as intended.
    """
    provisioner = AWSProvisioner(f'aws-provisioner-test-{uuid4()}', 'mesos',
                                 'us-west-2a', 50, None, None)
    key = 'config/test.txt'
    contents = b"Hello, this is a test."

    try:
        url = provisioner._write_file_to_cloud(key, contents=contents)
        self.assertTrue(url.startswith("s3://"))
        self.assertEqual(contents, provisioner._read_file_from_cloud(key))
    finally:
        # The cluster was never launched, but we still need to clean up the S3 bucket.
        provisioner.destroyCluster()
def testAWSProvisionerUtils(self):
    clusterName = 'cluster-utils-test' + str(uuid.uuid4())
    keyName = os.getenv('TOIL_AWS_KEYNAME')
    try:
        # The --provisioner flag should default to aws, so we're not explicitly
        # specifying it here.
        system([self.toilMain, 'launch-cluster', '--nodeType=t2.micro',
                '--keyPairName=' + keyName, clusterName])
    finally:
        system([self.toilMain, 'destroy-cluster', '--provisioner=aws', clusterName])
    try:
        from toil.provisioners.aws.awsProvisioner import AWSProvisioner
        userTags = {'key1': 'value1', 'key2': 'value2', 'key3': 'value3'}
        tags = {'Name': clusterName, 'Owner': keyName}
        tags.update(userTags)

        # Launch a preemptable master with the same name.
        system([self.toilMain, 'launch-cluster', '-t', 'key1=value1', '-t', 'key2=value2',
                '--tag', 'key3=value3', '--nodeType=m3.medium:0.2',
                '--keyPairName=' + keyName, clusterName,
                '--provisioner=aws', '--logLevel=DEBUG'])

        # Test leader tags.
        leaderTags = AWSProvisioner._getLeader(clusterName).tags
        self.assertEqual(tags, leaderTags)

        # Test strict host key checking. This doesn't work when run locally.
        if keyName == 'jenkins@jenkins-master':
            try:
                AWSProvisioner.sshLeader(clusterName=clusterName, strict=True)
            except RuntimeError:
                pass
            else:
                self.fail("Host key verification passed where it should have failed")

        # Add the host key to known_hosts so that the rest of the tests can
        # pass without choking on the verification prompt.
        AWSProvisioner.sshLeader(clusterName=clusterName, strict=True,
                                 sshOptions=['-oStrictHostKeyChecking=no'])

        system([self.toilMain, 'ssh-cluster', '--provisioner=aws', clusterName])

        testStrings = ["'foo'", '"foo"', ' foo', '$PATH', '"', "'", '\\',
                       '| cat', '&& cat', '; cat']
        for test in testStrings:
            logger.info('Testing SSH with special string: %s', test)
            compareTo = "import sys; assert sys.argv[1]==%r" % test
            AWSProvisioner.sshLeader(clusterName=clusterName,
                                     args=['python', '-', test],
                                     input=compareTo)

        try:
            AWSProvisioner.sshLeader(clusterName=clusterName, args=['nonsenseShouldFail'])
        except RuntimeError:
            pass
        else:
            self.fail('The remote command failed silently where it should have '
                      'raised an error')

        AWSProvisioner.sshLeader(clusterName=clusterName,
                                 args=['python', '-c',
                                       "import os; assert os.environ['TOIL_WORKDIR']=='/var/lib/toil'"])

        # `toil rsync-cluster`
        # Test special characters (string.punctuation) in the file name.
        fname = '!"#$%&\'()*+,-.;<=>:\ ?@[\\\\]^_`{|}~'
        testData = os.urandom(3 * (10**6))
        with tempfile.NamedTemporaryFile(suffix=fname) as tmpFile:
            relpath = os.path.basename(tmpFile.name)
            tmpFile.write(testData)
            tmpFile.flush()
            # Upload the file to the leader.
            AWSProvisioner.rsyncLeader(clusterName=clusterName, args=[tmpFile.name, ":"])
            # Ensure the file exists.
            AWSProvisioner.sshLeader(clusterName=clusterName, args=["test", "-e", relpath])
        tmpDir = tempfile.mkdtemp()
        # Download the file again and make sure it's the same file.
        # `--protect-args` is needed because remote bash chokes on special characters.
        AWSProvisioner.rsyncLeader(clusterName=clusterName,
                                   args=["--protect-args", ":" + relpath, tmpDir])
        # Open in binary mode since testData is bytes.
        with open(os.path.join(tmpDir, relpath), "rb") as f:
            self.assertEqual(f.read(), testData,
                             "Downloaded file does not match original file")
    finally:
        system([self.toilMain, 'destroy-cluster', '--provisioner=aws', clusterName])
        try:
            shutil.rmtree(tmpDir)
        except NameError:
            # tmpDir was never created because the test failed earlier.
            pass
def _test(self, spotInstances=False, fulfillableBid=True):
    """
    Does the work of the testing. Many features' tests are thrown in here in
    no particular order.

    :param spotInstances: Specify if you want to use spot instances.
    :param fulfillableBid: If False, the bid will never succeed. Used to test bid failure.
    """
    if not fulfillableBid:
        self.spotBid = '0.01'
    from boto.exception import EC2ResponseError
    from toil.provisioners.aws.awsProvisioner import AWSProvisioner
    self.launchCluster()
    # Get the leader so we know the IP address. We don't need to wait since
    # the cluster creation step already ensures the leader is running.
    self.leader = AWSProvisioner._getLeader(wait=False, clusterName=self.clusterName)
    ctx = AWSProvisioner._buildContext(self.clusterName)

    assert len(self.getMatchingRoles(self.clusterName)) == 1
    # --never-download prevents silent upgrades to pip, wheel and setuptools
    venv_command = ['virtualenv', '--system-site-packages', '--never-download', '/home/venv']
    self.sshUtil(venv_command)
    upgrade_command = ['/home/venv/bin/pip', 'install', 'setuptools==28.7.1']
    self.sshUtil(upgrade_command)
    yaml_command = ['/home/venv/bin/pip', 'install', 'pyyaml==3.12']
    self.sshUtil(yaml_command)

    self._getScript()

    toilOptions = [self.jobStore,
                   '--batchSystem=mesos',
                   '--workDir=/var/lib/toil',
                   '--clean=always',
                   '--retryCount=2',
                   '--clusterStats=/home/',
                   '--logDebug',
                   '--logFile=/home/sort.log',
                   '--provisioner=aws']

    if spotInstances:
        toilOptions.extend([
            '--preemptableNodeType=%s:%s' % (self.instanceType, self.spotBid),
            # The RNASeq pipeline does not specify a preemptability requirement so we
            # need to specify a default, otherwise jobs would never get scheduled.
            '--defaultPreemptable',
            '--maxPreemptableNodes=%s' % self.numWorkers])
    else:
        toilOptions.extend(['--nodeType=' + self.instanceType,
                            '--maxNodes=%s' % self.numWorkers])

    self._runScript(toilOptions)

    assert len(self.getMatchingRoles(self.clusterName)) == 1

    checkStatsCommand = ['/home/venv/bin/python', '-c',
                         'import json; import os; '
                         'json.load(open("/home/" + [f for f in os.listdir("/home/") '
                         'if f.endswith(".json")].pop()))']
    self.sshUtil(checkStatsCommand)

    volumeID = self.getRootVolID()
    ctx = AWSProvisioner._buildContext(self.clusterName)
    AWSProvisioner.destroyCluster(self.clusterName)
    self.leader.update()
    for attempt in range(6):
        # https://github.com/BD2KGenomics/toil/issues/1567
        # Retry for up to 1 minute until the volume disappears.
        try:
            ctx.ec2.get_all_volumes(volume_ids=[volumeID])
            time.sleep(10)
        except EC2ResponseError as e:
            if e.status == 400 and 'InvalidVolume.NotFound' in e.code:
                break
            else:
                raise
    else:
        self.fail('Volume with ID %s was not cleaned up properly' % volumeID)

    assert len(self.getMatchingRoles(self.clusterName)) == 0
def getMatchingRoles(self, clusterName):
    from toil.provisioners.aws.awsProvisioner import AWSProvisioner
    ctx = AWSProvisioner._buildContext(clusterName)
    roles = list(ctx.local_roles())
    return roles
def _test(self, spotInstances=False):
    from toil.provisioners.aws.awsProvisioner import AWSProvisioner
    leader = AWSProvisioner.launchCluster(instanceType=self.instanceType,
                                          keyName=self.keyName,
                                          clusterName=self.clusterName)
    # --never-download prevents silent upgrades to pip, wheel and setuptools
    venv_command = 'virtualenv --system-site-packages --never-download /home/venv'
    AWSProvisioner._sshAppliance(leader.ip_address, command=venv_command)
    upgrade_command = '/home/venv/bin/pip install setuptools==28.7.1'
    AWSProvisioner._sshAppliance(leader.ip_address, command=upgrade_command)
    yaml_command = '/home/venv/bin/pip install pyyaml==3.12'
    AWSProvisioner._sshAppliance(leader.ip_address, command=yaml_command)
    # Install toil-scripts.
    install_command = '/home/venv/bin/pip install toil-scripts==%s' % self.toilScripts
    AWSProvisioner._sshAppliance(leader.ip_address, command=install_command)
    # Install curl.
    install_command = 'sudo apt-get -y install curl'
    AWSProvisioner._sshAppliance(leader.ip_address, command=install_command)

    toilOptions = ['--batchSystem=mesos',
                   '--workDir=/var/lib/toil',
                   '--mesosMaster=%s:5050' % leader.private_ip_address,
                   '--clean=always',
                   '--retryCount=0']
    toilOptions.extend(['--provisioner=aws',
                        '--nodeType=' + self.instanceType,
                        '--maxNodes=%s' % self.numWorkers,
                        '--logDebug'])
    if spotInstances:
        toilOptions.extend([
            '--preemptableNodeType=%s:%s' % (self.instanceType, self.spotBid),
            # The RNASeq pipeline does not specify a preemptability requirement so we
            # need to specify a default, otherwise jobs would never get scheduled.
            '--defaultPreemptable',
            '--maxPreemptableNodes=%s' % self.numWorkers])
    toilOptions = ' '.join(toilOptions)

    runCommand = 'bash -c \\"export PATH=/home/venv/bin/:$PATH;' \
                 'export TOIL_SCRIPTS_TEST_NUM_SAMPLES=%i;' \
                 ' export TOIL_SCRIPTS_TEST_TOIL_OPTIONS=' + pipes.quote(toilOptions) + \
                 '; export TOIL_SCRIPTS_TEST_JOBSTORE=' + self.jobStore + \
                 '; /home/venv/bin/python -m unittest -v' + \
                 ' toil_scripts.rnaseq_cgl.test.test_rnaseq_cgl.RNASeqCGLTest.test_manifest\\"'
    runCommand %= self.numSamples
    AWSProvisioner._sshAppliance(leader.ip_address, runCommand)
def main(): parser = getBasicOptionParser() parser = addBasicProvisionerOptions(parser) parser.add_argument( "--leaderNodeType", dest="leaderNodeType", required=True, help="Non-preemptable node type to use for the cluster leader.") parser.add_argument( "--keyPairName", dest='keyPairName', required=True, help="On AWS, the name of the AWS key pair to include on the instance." " On Google/GCE, this is the ssh key pair." " On Azure, this will be used as the owner tag.") parser.add_argument( "--publicKeyFile", dest='publicKeyFile', default="~/.ssh/id_rsa.pub", help="On Azure, the file" " containing the key pairs (the first key pair will be used).") parser.add_argument( "--boto", dest='botoPath', help="The path to the boto credentials directory. This is transferred " "to all nodes in order to access the AWS jobStore from non-AWS instances." ) parser.add_argument( "-t", "--tag", metavar='NAME=VALUE', dest='tags', default=[], action='append', help="Tags are added to the AWS cluster for this node and all of its " "children. Tags are of the form:\n" " -t key1=value1 --tag key2=value2\n" "Multiple tags are allowed and each tag needs its own flag. By " "default the cluster is tagged with " " {\n" " \"Name\": clusterName,\n" " \"Owner\": IAM username\n" " }. ") parser.add_argument( "--vpcSubnet", help="VPC subnet ID to launch cluster in. Uses default subnet if not " "specified. This subnet needs to have auto assign IPs turned on.") parser.add_argument( "--nodeTypes", dest='nodeTypes', default=None, type=str, help="Comma-separated list of node types to create while launching the " "leader. The syntax for each node type depends on the provisioner " "used. For the aws provisioner this is the name of an EC2 instance " "type followed by a colon and the price in dollar to bid for a spot " "instance, for example 'c3.8xlarge:0.42'. Must also provide the " "--workers argument to specify how many workers of each node type " "to create.") parser.add_argument( "-w", "--workers", dest='workers', default=None, type=str, help= "Comma-separated list of the number of workers of each node type to " "launch alongside the leader when the cluster is created. This can be " "useful if running toil without auto-scaling but with need of more " "hardware support") parser.add_argument( "--leaderStorage", dest='leaderStorage', type=int, default=50, help="Specify the size (in gigabytes) of the root volume for the leader " "instance. This is an EBS volume.") parser.add_argument( "--nodeStorage", dest='nodeStorage', type=int, default=50, help="Specify the size (in gigabytes) of the root volume for any worker " "instances created when using the -w flag. This is an EBS volume.") parser.add_argument( '--forceDockerAppliance', dest='forceDockerAppliance', action='store_true', default=False, help= "Disables sanity checking the existence of the docker image specified " "by TOIL_APPLIANCE_SELF, which Toil uses to provision mesos for " "autoscaling.") parser.add_argument( "--azureStorageCredentials", dest='azureStorageCredentials', type=str, default=credential_file_path, help= "The location of the file containing the Azure storage credentials. If not specified," " the default file is used with Azure provisioning. 
Use 'None' to disable" " the transfer of credentials.") config = parseBasicOptions(parser) tagsDict = None if config.tags is None else createTagsDict(config.tags) # checks the validity of TOIL_APPLIANCE_SELF before proceeding checkToilApplianceSelf = applianceSelf( forceDockerAppliance=config.forceDockerAppliance) spotBids = [] nodeTypes = [] preemptableNodeTypes = [] numNodes = [] numPreemptableNodes = [] leaderSpotBid = None if config.provisioner == 'aws': logger.info('Using aws provisioner.') try: from toil.provisioners.aws.awsProvisioner import AWSProvisioner except ImportError: logger.error( 'The aws extra must be installed to use this provisioner') raise provisioner = AWSProvisioner() elif config.provisioner == 'azure': try: from toil.provisioners.azure.azureProvisioner import AzureProvisioner except ImportError: raise RuntimeError( 'The aws extra must be installed to use this provisioner') provisioner = AzureProvisioner() elif config.provisioner == 'gce': logger.info('Using a gce provisioner.') try: from toil.provisioners.gceProvisioner import GCEProvisioner except ImportError: logger.error( 'The google extra must be installed to use this provisioner') raise provisioner = GCEProvisioner() else: assert False #Parse leader node type and spot bid parsedBid = config.leaderNodeType.split(':', 1) if len(config.leaderNodeType) != len(parsedBid[0]): leaderSpotBid = float(parsedBid[1]) config.leaderNodeType = parsedBid[0] if (config.nodeTypes or config.workers) and not (config.nodeTypes and config.workers): raise RuntimeError( "The --nodeTypes and --workers options must be specified together," ) if config.nodeTypes: nodeTypesList = config.nodeTypes.split(",") numWorkersList = config.workers.split(",") if not len(nodeTypesList) == len(numWorkersList): raise RuntimeError( "List of node types must be the same length as the list of workers." ) for nodeTypeStr, num in zip(nodeTypesList, numWorkersList): parsedBid = nodeTypeStr.split(':', 1) if len(nodeTypeStr) != len(parsedBid[0]): #Is a preemptable node preemptableNodeTypes.append(parsedBid[0]) spotBids.append(float(parsedBid[1])) numPreemptableNodes.append(int(num)) else: nodeTypes.append(nodeTypeStr) numNodes.append(int(num)) provisioner.launchCluster( leaderNodeType=config.leaderNodeType, leaderSpotBid=leaderSpotBid, nodeTypes=nodeTypes, preemptableNodeTypes=preemptableNodeTypes, numWorkers=numNodes, numPreemptableWorkers=numPreemptableNodes, keyName=config.keyPairName, botoPath=config.botoPath, clusterName=config.clusterName, spotBids=spotBids, userTags=tagsDict, zone=config.zone, leaderStorage=config.leaderStorage, nodeStorage=config.nodeStorage, vpcSubnet=config.vpcSubnet, publicKeyFile=config.publicKeyFile, azureStorageCredentials=config.azureStorageCredentials)
def _test(self, preemptableJobs=False):
    """
    Does the work of the testing. Many features' tests are thrown in here in
    no particular order.
    """
    from toil.provisioners.aws.awsProvisioner import AWSProvisioner
    self.launchCluster()
    # Get the leader so we know the IP address. We don't need to wait since
    # the cluster creation step already ensures the leader is running.
    self.leader = AWSProvisioner._getLeader(wait=False, clusterName=self.clusterName)
    ctx = AWSProvisioner._buildContext(self.clusterName)

    assert len(self.getMatchingRoles(self.clusterName)) == 1
    # --never-download prevents silent upgrades to pip, wheel and setuptools
    venv_command = ['virtualenv', '--system-site-packages', '--never-download', '/home/venv']
    self.sshUtil(venv_command)
    upgrade_command = ['/home/venv/bin/pip', 'install', 'setuptools==28.7.1']
    self.sshUtil(upgrade_command)
    yaml_command = ['/home/venv/bin/pip', 'install', 'pyyaml==3.12']
    self.sshUtil(yaml_command)

    self._getScript()

    toilOptions = [self.jobStore,
                   '--batchSystem=mesos',
                   '--workDir=/var/lib/toil',
                   '--clean=always',
                   '--retryCount=2',
                   '--clusterStats=/home/',
                   '--logDebug',
                   '--logFile=/home/sort.log',
                   '--provisioner=aws']
    toilOptions.extend(['--nodeTypes=' + ",".join(self.instanceTypes),
                        '--maxNodes=%s' % ",".join(self.numWorkers)])
    if preemptableJobs:
        toilOptions.extend(['--defaultPreemptable'])

    self._runScript(toilOptions)

    assert len(self.getMatchingRoles(self.clusterName)) == 1

    checkStatsCommand = ['/home/venv/bin/python', '-c',
                         'import json; import os; '
                         'json.load(open("/home/" + [f for f in os.listdir("/home/") '
                         'if f.endswith(".json")].pop()))']
    self.sshUtil(checkStatsCommand)

    from boto.exception import EC2ResponseError
    volumeID = self.getRootVolID()
    ctx = AWSProvisioner._buildContext(self.clusterName)
    AWSProvisioner.destroyCluster(self.clusterName)
    self.leader.update()
    for attempt in range(6):
        # https://github.com/BD2KGenomics/toil/issues/1567
        # Retry for up to 1 minute until the volume disappears.
        try:
            ctx.ec2.get_all_volumes(volume_ids=[volumeID])
            time.sleep(10)
        except EC2ResponseError as e:
            if e.status == 400 and 'InvalidVolume.NotFound' in e.code:
                break
            else:
                raise
    else:
        self.fail('Volume with ID %s was not cleaned up properly' % volumeID)

    assert len(self.getMatchingRoles(self.clusterName)) == 0
def main(): parser = getBasicOptionParser() parser = addBasicProvisionerOptions(parser) parser.add_argument("--nodeType", dest='nodeType', required=True, help="Node type for {non-|}preemptable nodes. The syntax depends on the " "provisioner used. For the aws provisioner this is the name of an " "EC2 instance type followed by a colon and the price in dollar to " "bid for a spot instance, for example 'c3.8xlarge:0.42'.") parser.add_argument("--keyPairName", dest='keyPairName', required=True, help="The name of the AWS key pair to include on the instance") parser.add_argument("-t", "--tag", metavar='NAME=VALUE', dest='tags', default=[], action='append', help="Tags are added to the AWS cluster for this node and all of its " "children. Tags are of the form:\n" " --t key1=value1 --tag key2=value2\n" "Multiple tags are allowed and each tag needs its own flag. By " "default the cluster is tagged with " " {\n" " \"Name\": clusterName,\n" " \"Owner\": IAM username\n" " }. ") parser.add_argument("--vpcSubnet", help="VPC subnet ID to launch cluster in. Uses default subnet if not specified. " "This subnet needs to have auto assign IPs turned on.") parser.add_argument("-w", "--workers", dest='workers', default=0, type=int, help="Specify a number of workers to launch alongside the leader when the " "cluster is created. This can be useful if running toil without " "auto-scaling but with need of more hardware support") parser.add_argument("--leaderStorage", dest='leaderStorage', type=int, default=50, help="Specify the size (in gigabytes) of the root volume for the leader instance. " "This is an EBS volume.") parser.add_argument("--nodeStorage", dest='nodeStorage', type=int, default=50, help="Specify the size (in gigabytes) of the root volume for any worker instances " "created when using the -w flag. This is an EBS volume.") config = parseBasicOptions(parser) tagsDict = None if config.tags is None else createTagsDict(config.tags) spotBid = None if config.provisioner == 'aws': logger.info('Using aws provisioner.') try: from toil.provisioners.aws.awsProvisioner import AWSProvisioner except ImportError: raise RuntimeError('The aws extra must be installed to use this provisioner') provisioner = AWSProvisioner() parsedBid = config.nodeType.split(':', 1) if len(config.nodeType) != len(parsedBid[0]): # there is a bid spotBid = float(parsedBid[1]) config.nodeType = parsedBid[0] else: assert False provisioner.launchCluster(instanceType=config.nodeType, keyName=config.keyPairName, clusterName=config.clusterName, workers=config.workers, spotBid=spotBid, userTags=tagsDict, zone=config.zone, leaderStorage=config.leaderStorage, nodeStorage=config.nodeStorage, vpcSubnet=config.vpcSubnet)
def tearDown(self):
    from toil.provisioners.aws.awsProvisioner import AWSProvisioner
    AWSProvisioner.destroyCluster(self.clusterName)
def main(): parser = getBasicOptionParser() parser = addBasicProvisionerOptions(parser) parser.add_argument("--leaderNodeType", dest="leaderNodeType", required=True, help="Non-preemptable node type to use for the cluster leader.") parser.add_argument("--keyPairName", dest='keyPairName', required=True, help="The name of the AWS or ssh key pair to include on the instance") parser.add_argument("--boto", dest='botoPath', help="The path to the boto credentials directory. This is transferred to all " " nodes in order to access the AWS jobStore from non-AWS instances.") parser.add_argument("-t", "--tag", metavar='NAME=VALUE', dest='tags', default=[], action='append', help="Tags are added to the AWS cluster for this node and all of its " "children. Tags are of the form:\n" " -t key1=value1 --tag key2=value2\n" "Multiple tags are allowed and each tag needs its own flag. By " "default the cluster is tagged with " " {\n" " \"Name\": clusterName,\n" " \"Owner\": IAM username\n" " }. ") parser.add_argument("--vpcSubnet", help="VPC subnet ID to launch cluster in. Uses default subnet if not specified. " "This subnet needs to have auto assign IPs turned on.") parser.add_argument("--nodeTypes", dest='nodeTypes', default=None, type=str, help="Comma-separated list of node types to create while launching the leader. The " "syntax for each node type depends on the " "provisioner used. For the aws provisioner this is the name of an " "EC2 instance type followed by a colon and the price in dollar to " "bid for a spot instance, for example 'c3.8xlarge:0.42'. Must also provide " "the --workers argument to specify how many workers of each node type to create") parser.add_argument("-w", "--workers", dest='workers', default=None, type=str, help="Comma-separated list of the number of workers of each node type to launch " "alongside the leader when the " "cluster is created. This can be useful if running toil without " "auto-scaling but with need of more hardware support") parser.add_argument("--leaderStorage", dest='leaderStorage', type=int, default=50, help="Specify the size (in gigabytes) of the root volume for the leader instance. " "This is an EBS volume.") parser.add_argument("--nodeStorage", dest='nodeStorage', type=int, default=50, help="Specify the size (in gigabytes) of the root volume for any worker instances " "created when using the -w flag. 
This is an EBS volume.") config = parseBasicOptions(parser) tagsDict = None if config.tags is None else createTagsDict(config.tags) spotBids = [] nodeTypes = [] preemptableNodeTypes = [] numNodes = [] numPreemptableNodes = [] leaderSpotBid = None if config.provisioner == 'aws': logger.info('Using aws provisioner.') try: from toil.provisioners.aws.awsProvisioner import AWSProvisioner except ImportError: logger.error('The aws extra must be installed to use this provisioner') raise provisioner = AWSProvisioner() elif config.provisioner == 'gce': logger.info('Using a gce provisioner.') try: from toil.provisioners.gceProvisioner import GCEProvisioner except ImportError: logger.error('The google extra must be installed to use this provisioner') raise provisioner = GCEProvisioner() else: assert False #Parse leader node type and spot bid parsedBid = config.leaderNodeType.split(':', 1) if len(config.leaderNodeType) != len(parsedBid[0]): leaderSpotBid = float(parsedBid[1]) config.leaderNodeType = parsedBid[0] if (config.nodeTypes or config.workers) and not (config.nodeTypes and config.workers): raise RuntimeError("The --nodeTypes and --workers options must be specified together,") if config.nodeTypes: nodeTypesList = config.nodeTypes.split(",") numWorkersList = config.workers.split(",") if not len(nodeTypesList) == len(numWorkersList): raise RuntimeError("List of node types must be same length as list of numbers of workers.") for nodeTypeStr, num in zip(nodeTypesList, numWorkersList): parsedBid = nodeTypeStr.split(':', 1) if len(nodeTypeStr) != len(parsedBid[0]): #Is a preemptable node preemptableNodeTypes.append(parsedBid[0]) spotBids.append(float(parsedBid[1])) numPreemptableNodes.append(int(num)) else: nodeTypes.append(nodeTypeStr) numNodes.append(int(num)) provisioner.launchCluster(leaderNodeType=config.leaderNodeType, leaderSpotBid=leaderSpotBid, nodeTypes=nodeTypes, preemptableNodeTypes=preemptableNodeTypes, numWorkers=numNodes, numPreemptableWorkers = numPreemptableNodes, keyName=config.keyPairName, botoPath=config.botoPath, clusterName=config.clusterName, spotBids=spotBids, userTags=tagsDict, zone=config.zone, leaderStorage=config.leaderStorage, nodeStorage=config.nodeStorage, vpcSubnet=config.vpcSubnet)
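# A short sketch (hypothetical helper, not part of Toil) of how main() above
# pairs the comma-separated --nodeTypes and --workers values. For example,
# nodeTypes='c3.8xlarge:0.42,t2.medium' with workers='2,3' yields
# ([('c3.8xlarge', 0.42, 2)], [('t2.medium', 3)]).
def _split_node_types(nodeTypes, workers):
    preemptable, onDemand = [], []
    for nodeTypeStr, num in zip(nodeTypes.split(','), workers.split(',')):
        instanceType, _, bid = nodeTypeStr.partition(':')
        if bid:
            # A spot bid is appended, so this is a preemptable node type.
            preemptable.append((instanceType, float(bid), int(num)))
        else:
            onDemand.append((instanceType, int(num)))
    return preemptable, onDemand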
def testAWSProvisionerUtils(self):
    clusterName = 'cluster-utils-test' + str(uuid.uuid4())
    keyName = os.getenv('TOIL_AWS_KEYNAME')
    try:
        # The --provisioner flag should default to aws, so we're not explicitly
        # specifying it here.
        system([self.toilMain, 'launch-cluster', '--leaderNodeType=t2.micro',
                '--keyPairName=' + keyName, clusterName])
    finally:
        system([self.toilMain, 'destroy-cluster', '--provisioner=aws', clusterName])
    try:
        from toil.provisioners.aws.awsProvisioner import AWSProvisioner
        userTags = {'key1': 'value1', 'key2': 'value2', 'key3': 'value3'}
        tags = {'Name': clusterName, 'Owner': keyName}
        tags.update(userTags)

        # Launch a preemptable master with the same name.
        system([self.toilMain, 'launch-cluster', '-t', 'key1=value1', '-t', 'key2=value2',
                '--tag', 'key3=value3', '--leaderNodeType=m3.medium:0.2',
                '--keyPairName=' + keyName, clusterName,
                '--provisioner=aws', '--logLevel=DEBUG'])

        # Test leader tags.
        leaderTags = AWSProvisioner._getLeader(clusterName).tags
        self.assertEqual(tags, leaderTags)

        # Test strict host key checking. This doesn't work when run locally.
        if keyName == 'jenkins@jenkins-master':
            try:
                AWSProvisioner.sshLeader(clusterName=clusterName, strict=True)
            except RuntimeError:
                pass
            else:
                self.fail("Host key verification passed where it should have failed")

        # Add the host key to known_hosts so that the rest of the tests can
        # pass without choking on the verification prompt.
        AWSProvisioner.sshLeader(clusterName=clusterName, strict=True,
                                 sshOptions=['-oStrictHostKeyChecking=no'])

        system([self.toilMain, 'ssh-cluster', '--provisioner=aws', clusterName])

        testStrings = ["'foo'", '"foo"', ' foo', '$PATH', '"', "'", '\\',
                       '| cat', '&& cat', '; cat']
        for test in testStrings:
            logger.info('Testing SSH with special string: %s', test)
            compareTo = "import sys; assert sys.argv[1]==%r" % test
            AWSProvisioner.sshLeader(clusterName=clusterName,
                                     args=['python', '-', test],
                                     input=compareTo)

        try:
            AWSProvisioner.sshLeader(clusterName=clusterName, args=['nonsenseShouldFail'])
        except RuntimeError:
            pass
        else:
            self.fail('The remote command failed silently where it should have '
                      'raised an error')

        AWSProvisioner.sshLeader(clusterName=clusterName,
                                 args=['python', '-c',
                                       "import os; assert os.environ['TOIL_WORKDIR']=='/var/lib/toil'"])

        # `toil rsync-cluster`
        # Test special characters (string.punctuation) in the file name.
        fname = '!"#$%&\'()*+,-.;<=>:\ ?@[\\\\]^_`{|}~'
        testData = os.urandom(3 * (10**6))
        with tempfile.NamedTemporaryFile(suffix=fname) as tmpFile:
            relpath = os.path.basename(tmpFile.name)
            tmpFile.write(testData)
            tmpFile.flush()
            # Upload the file to the leader.
            AWSProvisioner.rsyncLeader(clusterName=clusterName, args=[tmpFile.name, ":"])
            # Ensure the file exists.
            AWSProvisioner.sshLeader(clusterName=clusterName, args=["test", "-e", relpath])
        tmpDir = tempfile.mkdtemp()
        # Download the file again and make sure it's the same file.
        # `--protect-args` is needed because remote bash chokes on special characters.
        AWSProvisioner.rsyncLeader(clusterName=clusterName,
                                   args=["--protect-args", ":" + relpath, tmpDir])
        # Open in binary mode since testData is bytes.
        with open(os.path.join(tmpDir, relpath), "rb") as f:
            self.assertEqual(f.read(), testData,
                             "Downloaded file does not match original file")
    finally:
        system([self.toilMain, 'destroy-cluster', '--provisioner=aws', clusterName])
        try:
            shutil.rmtree(tmpDir)
        except NameError:
            # tmpDir was never created because the test failed earlier.
            pass