def create_spot_instances(ec2, price, image_id, spec, clusterName, num_instances=1, timeout=None, tentative=False):
    """
    Request spot instances, tag every request with the cluster name and yield the instances
    belonging to the requests that become active.

    Adapted from cgcloud.lib.ec2.create_spot_instances to tag spot requests with the cluster name
    so they can be discovered and cleaned up at a later time.

    This function is a generator: no request is placed until iteration starts, and the
    summary checks at the end only run once the generator has been exhausted.

    :param ec2: connection object used to place, tag and resolve the spot requests
    :param price: forwarded as the first argument of ``ec2.request_spot_instances``
    :param image_id: forwarded as the second argument of ``ec2.request_spot_instances``
    :param dict spec: additional keyword arguments for ``ec2.request_spot_instances``
    :param clusterName: value stored under the ``clusterName`` tag of each spot request
    :param num_instances: forwarded as ``count`` to ``ec2.request_spot_instances``
    :param timeout: forwarded to ``wait_spot_requests_active``
    :param tentative: forwarded to ``wait_spot_requests_active``; when true, the case in which
           no request became active is logged as a warning instead of raising

    :raises RuntimeError: if no request entered the active state and ``tentative`` is false

    :rtype: Iterator[list[Instance]]
    """
    def spotRequestNotFound(e):
        return e.error_code == "InvalidSpotInstanceRequestID.NotFound"

    for attempt in retry_ec2(retry_for=a_long_time,
                             retry_while=inconsistencies_detected):
        with attempt:
            requests = ec2.request_spot_instances(price, image_id, count=num_instances, **spec)

    # Tagging is retried while EC2 reports the freshly created request as not found.
    for requestID in (request.id for request in requests):
        for attempt in retry_ec2(retry_while=spotRequestNotFound):
            with attempt:
                ec2.create_tags([requestID], {'clusterName': clusterName})

    num_active, num_other = 0, 0
    # noinspection PyUnboundLocalVariable,PyTypeChecker
    # request_spot_instances's type annotation is wrong
    for batch in wait_spot_requests_active(ec2,
                                           requests,
                                           timeout=timeout,
                                           tentative=tentative):
        instance_ids = []
        for request in batch:
            if request.state == 'active':
                instance_ids.append(request.instance_id)
                num_active += 1
            else:
                logger.info('Request %s in unexpected state %s.', request.id, request.state)
                num_other += 1
        if instance_ids:
            # This next line is the reason we batch. It's so we can get multiple instances in
            # a single request.
            yield ec2.get_only_instances(instance_ids)

    if not num_active:
        message = 'None of the spot requests entered the active state'
        if tentative:
            # Logger.warn is a deprecated alias; Logger.warning is the supported spelling.
            logger.warning(message + '.')
        else:
            raise RuntimeError(message)
    if num_other:
        logger.warning('%i request(s) entered a state other than active.', num_other)
def _tag_object_persistently(self, tagged_ec2_object, tags_dict):
    """
    Add ``tags_dict`` to ``tagged_ec2_object``, going through ``retry_ec2`` with its default
    settings because tagging occasionally fails with "NotFound" types of errors.

    :type tagged_ec2_object: boto.ec2.TaggedEC2Object
    :param tags_dict: passed unchanged to ``tagged_ec2_object.add_tags``
    """
    for tagging_attempt in retry_ec2():
        with tagging_attempt:
            tagged_ec2_object.add_tags(tags_dict)
def __delete_image_snapshot(self, image, wait=True):
    """
    Delete the snapshot referenced by the root device entry of ``image``.

    The entries of ``self.possible_root_devices`` are tried in order; the first one with a
    truthy entry in ``image.block_device_mapping`` determines the snapshot to delete.

    :param image: object whose ``block_device_mapping`` maps device names to entries that
           carry a ``snapshot_id``
    :param wait: if true, deletion is retried for ``a_long_time`` while EC2 answers with
           'InvalidSnapshot.InUse'; otherwise ``retry_for=0`` is passed to ``retry_ec2``
    :raises RuntimeError: if none of the possible root devices has a mapping entry
    """
    def snapshot_in_use(e):
        return e.error_code == 'InvalidSnapshot.InUse'

    for device_name in self.possible_root_devices:
        if not image.block_device_mapping.get(device_name):
            continue
        snapshot_id = image.block_device_mapping[device_name].snapshot_id
        log.info("Deleting snapshot %s.", snapshot_id)
        # It is safe to retry this indefinitely because a snapshot can only be
        # referenced by one AMI. See also https://github.com/boto/boto/issues/3019.
        retry_period = a_long_time if wait else 0
        for deletion_attempt in retry_ec2(retry_for=retry_period,
                                          retry_while=snapshot_in_use):
            with deletion_attempt:
                self.ctx.ec2.delete_snapshot(snapshot_id)
        # Only the first matching root device is handled.
        return
    raise RuntimeError('Could not determine root device in AMI')
def destroyCluster(cls, clusterName, zone='us-west-2a'):
    """
    Tear down the resources associated with the cluster named ``clusterName``.

    In order: cancels the cluster's spot requests (if any), deletes the IAM profiles of and
    terminates the cluster's instances (if any), then deletes the security group that is
    named after the cluster.

    :param clusterName: name of the cluster; also used as the security group name and, via
           ``cls._toNameSpace``, to derive the namespace of the context
    :param zone: availability zone passed to ``Context``. Defaults to 'us-west-2a', the value
           that used to be hard-coded here, so existing callers are unaffected.
    """
    def expectedShutdownErrors(e):
        # Status 400 with 'dependent object' in the body is the error that is retried below.
        return e.status == 400 and 'dependent object' in e.body

    ctx = Context(availability_zone=zone, namespace=cls._toNameSpace(clusterName))
    instances = cls.__getNodesInCluster(ctx, clusterName, both=True)
    spotIDs = cls._getSpotRequestIDs(ctx, clusterName)
    if spotIDs:
        ctx.ec2.cancel_spot_instance_requests(request_ids=spotIDs)
    if instances:
        cls._deleteIAMProfiles(instances=instances, ctx=ctx)
        cls._terminateInstance(instances=instances, ctx=ctx)
    logger.info('Deleting security group...')
    # NOTE(review): retry_after/retry_for are presumably seconds - confirm against retry_ec2.
    for attempt in retry_ec2(retry_after=30, retry_for=300, retry_while=expectedShutdownErrors):
        with attempt:
            ctx.ec2.delete_security_group(name=clusterName)
    logger.info('... Succesfully deleted security group')
def _create(self):
    """
    Issue the RunInstances EC2 API call, retrying while the failure looks like the race
    between recently created instance profiles, IAM roles and an instance creation that
    refers to them.

    :rtype: boto.ec2.instance.Reservation
    """
    def inconsistencies_detected(e):
        # A security group that is reported as not found is retried as well.
        if e.code == 'InvalidGroup.NotFound':
            return True
        message = e.error_message.lower()
        return any(fragment in message
                   for fragment in ('invalid iam instance profile',
                                    'no associated iam roles'))

    log.info('Creating %s instance(s) ... ', self.instance_creation_args['instance_type'])
    for launch_attempt in retry_ec2(retry_for=a_long_time,
                                    retry_while=inconsistencies_detected):
        with launch_attempt:
            return self.ctx.ec2.run_instances(self.image_id, **self.instance_creation_args)