def tearDownClass( cls ):
    ctx = Context( os.environ[ 'CGCLOUD_ZONE' ], os.environ[ 'CGCLOUD_NAMESPACE' ] )
    # Only cleanup if the context is using the default test namespace. If another namespace
    # is configured, we can't assume that all resources were created by the test and that
    # they can therefore be removed.
    if cls.cleanup and ctx.namespace == cls.namespace:
        ctx.cleanup( )
    super( CgcloudTestCase, cls ).tearDownClass( )
def launchCluster(cls, instanceType, keyName, clusterName, spotBid=None):
    ctx = Context(availability_zone='us-west-2a', namespace=cls._toNameSpace(clusterName))
    profileARN = cls._getProfileARN(ctx)
    # the security group name is used as the cluster identifier
    cls._createSecurityGroup(ctx, clusterName)
    bdm = cls._getBlockDeviceMapping(ec2_instance_types[instanceType])
    dockerLeaderData = cls.dockerInfo().rsplit(':', 1)
    leaderRepo = dockerLeaderData[0]
    leaderTag = dockerLeaderData[1]
    leaderData = {'role': 'leader', 'tag': leaderTag,
                  'args': leaderArgs.format(name=clusterName),
                  'repo': leaderRepo}
    userData = awsUserData.format(**leaderData)
    kwargs = {'key_name': keyName,
              'security_groups': [clusterName],
              'instance_type': instanceType,
              'user_data': userData,
              'block_device_map': bdm,
              'instance_profile_arn': profileARN}
    if not spotBid:
        logger.info('Launching non-preemptable leader')
        create_ondemand_instances(ctx.ec2, image_id=coreOSAMI,
                                  spec=kwargs, num_instances=1)
    else:
        logger.info('Launching preemptable leader')
        # force generator to evaluate
        list(create_spot_instances(ec2=ctx.ec2, price=spotBid, image_id=coreOSAMI,
                                   clusterName=clusterName, spec=kwargs, num_instances=1))
    return cls._getLeader(clusterName=clusterName, wait=True)
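# A minimal usage sketch of launchCluster, not taken from the source: the owning class name
# and import path below are assumptions, and the key pair, cluster names, instance type and
# spot bid are illustrative placeholder values only. Omitting spotBid launches an on-demand
# leader; supplying a bid selects the spot-instance code path.
from toil.provisioners.aws.awsProvisioner import AWSProvisioner  # assumed location

# On-demand leader: no spotBid given.
leader = AWSProvisioner.launchCluster(instanceType='m3.large',
                                      keyName='my-keypair',
                                      clusterName='test-cluster')

# Preemptable leader: a spot bid (USD per instance-hour, as a string) is supplied.
spotLeader = AWSProvisioner.launchCluster(instanceType='m3.large',
                                          keyName='my-keypair',
                                          clusterName='spot-cluster',
                                          spotBid='0.10')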
def run( self, options ):
    zone = options.availability_zone
    namespace = options.namespace
    ctx = None
    try:
        ctx = Context( availability_zone=zone, namespace=namespace )
    except ValueError as e:
        raise UserError( cause=e )
    except:
        # print the namespace without __me__ substituted
        log.error( "An error occurred. Using zone '%s' and namespace '%s'", zone, namespace )
        raise
    else:
        # print the namespace with __me__ substituted
        log.info( "Using zone '%s' and namespace '%s'", ctx.availability_zone, ctx.namespace )
        return self.run_in_ctx( options, ctx )
    finally:
        if ctx is not None:
            ctx.close( )
def _buildContext(cls, clusterName, zone=None):
    if zone is None:
        zone = getCurrentAWSZone()
        if zone is None:
            raise RuntimeError(
                'Could not determine availability zone. Ensure that one of the following '
                'is true: the --zone flag is set, the TOIL_AWS_ZONE environment variable '
                'is set, ec2_region_name is set in the .boto file, or that '
                'you are running on EC2.')
    return Context(availability_zone=zone, namespace=cls._toNameSpace(clusterName))
def __init__(self, config, batchSystem):
    self.instanceMetaData = get_instance_metadata()
    # the security group name doubles as the cluster identifier
    self.clusterName = self.instanceMetaData['security-groups']
    self.ctx = Context(availability_zone='us-west-2a',
                       namespace=self._toNameSpace(self.clusterName))
    self.spotBid = None
    assert config.preemptableNodeType or config.nodeType
    if config.preemptableNodeType is not None:
        # preemptable node types have the form '<instance type>:<spot bid>'
        nodeBidTuple = config.preemptableNodeType.split(':', 1)
        self.spotBid = nodeBidTuple[1]
        self.instanceType = ec2_instance_types[nodeBidTuple[0]]
    else:
        self.instanceType = ec2_instance_types[config.nodeType]
    self.batchSystem = batchSystem
    self.leaderIP = self.instanceMetaData['local-ipv4']
    self.keyName = self.instanceMetaData['public-keys'].keys()[0]
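# Stand-alone sketch of the preemptable node type parsing above, assuming the configured
# value has the form '<instance type>:<spot bid>'; the example string is illustrative.
nodeType, spotBid = 'm3.large:0.42'.split(':', 1)
assert nodeType == 'm3.large'
assert spotBid == '0.42'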
def run():
    log.info("Entering main loop.")
    ctx = Context(availability_zone=options.availability_zone, namespace=options.namespace)
    throttle = LocalThrottle(min_interval=options.interval)
    for i in itertools.count():
        throttle.throttle()
        try:
            log.info("Starting run %i.", i)
            Agent(ctx, options).run()
            log.info("Completed run %i.", i)
        except (SystemExit, KeyboardInterrupt):
            log.info('Terminating.')
            break
        except:
            log.exception('Abandoning run due to exception')
def setUpClass(cls):
    super(CgcloudTestCase, cls).setUpClass()
    if running_on_ec2():
        os.environ.setdefault('CGCLOUD_ZONE',
                              get_instance_metadata()['placement']['availability-zone'])
    # Using the d32 of a binary string that starts with a 4-byte, big-endian time stamp
    # yields compact names whose lexicographical sorting is consistent with the historical
    # order. We add the process ID so we can run tests concurrently in child processes using
    # the pytest-xdist plugin.
    suffix = aws_d32.encode(pack('>II', int(time.time()), os.getpid()))
    assert len(suffix) == test_namespace_suffix_length
    cls.__namespace = '/test/%s/' % suffix
    os.environ.setdefault('CGCLOUD_NAMESPACE', cls.__namespace)
    cls.ctx = Context(availability_zone=os.environ['CGCLOUD_ZONE'],
                      namespace=os.environ['CGCLOUD_NAMESPACE'])
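# Stand-alone sketch of the namespace suffix construction above, using only the standard
# library. aws_d32 is assumed to be a base-32 style codec from cgcloud; base64.b32encode is
# used here merely as a stand-in to show the packed timestamp/PID layout, so the resulting
# suffix length differs from test_namespace_suffix_length.
import base64
import os
import time
from struct import pack

packed = pack('>II', int(time.time()), os.getpid())  # 4-byte big-endian timestamp, then PID
suffix = base64.b32encode(packed).decode().lower().rstrip('=')
namespace = '/test/%s/' % suffix
print(namespace)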
def _getLeader(cls, clusterName, wait=False):
    ctx = Context(availability_zone='us-west-2a', namespace=cls._toNameSpace(clusterName))
    instances = cls.__getNodesInCluster(ctx, clusterName, both=True)
    instances.sort(key=lambda x: x.launch_time)
    leader = instances[0]  # assume leader was launched first
    if wait:
        logger.info("Waiting for leader to enter 'running' state...")
        wait_transition(leader, {'pending'}, 'running')
        logger.info('... leader is running')
        cls._waitForIP(leader)
        leaderIP = leader.ip_address
        cls._waitForSSHPort(leaderIP)
        # wait here so docker commands can be used reliably afterwards
        cls._waitForDockerDaemon(leaderIP)
        cls._waitForAppliance(leaderIP)
    return leader
def destroyCluster(cls, clusterName):
    def expectedShutdownErrors(e):
        return e.status == 400 and 'dependent object' in e.body

    ctx = Context(availability_zone='us-west-2a', namespace=cls._toNameSpace(clusterName))
    instances = cls.__getNodesInCluster(ctx, clusterName, both=True)
    spotIDs = cls._getSpotRequestIDs(ctx, clusterName)
    if spotIDs:
        ctx.ec2.cancel_spot_instance_requests(request_ids=spotIDs)
    if instances:
        cls._deleteIAMProfiles(instances=instances, ctx=ctx)
        cls._terminateInstance(instances=instances, ctx=ctx)
    logger.info('Deleting security group...')
    for attempt in retry_ec2(retry_after=30, retry_for=300, retry_while=expectedShutdownErrors):
        with attempt:
            ctx.ec2.delete_security_group(name=clusterName)
    logger.info('... Successfully deleted security group')
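# A minimal teardown sketch complementing the launch example further up; the class name and
# import path are assumptions, and 'test-cluster' is a placeholder. destroyCluster cancels
# outstanding spot requests, terminates instances, removes IAM profiles, and finally deletes
# the security group whose name identifies the cluster.
from toil.provisioners.aws.awsProvisioner import AWSProvisioner  # assumed location

AWSProvisioner.destroyCluster(clusterName='test-cluster')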