def terminate_pool_instances(self, pool, instances, config, terminateByPool=False):
    """
    Terminate pool instances in EC2.

    @param pool: The pool the instances belong to (used for logging and,
                 with terminateByPool, for the pool-ID tag filter).
    @param instances: Instance objects whose EC2 instances should be terminated.
    @param config: Pool configuration providing the AWS credentials.
    @param terminateByPool: If True, terminate every EC2 instance tagged with
                            this pool's ID instead of only the given instances.
    @return: None on a connection failure, 1 on a boto API failure, implicit
             None on success.
    """
    instance_ids_by_region = self.get_instance_ids_by_region(instances)
    for region in instance_ids_by_region:
        cluster = Laniakea(None)
        try:
            cluster.connect(region=region, aws_access_key_id=config.aws_access_key_id, aws_secret_access_key=config.aws_secret_access_key)
        except Exception as msg:
            logger.exception("[Pool %d] %s: laniakea failure: %s" % (pool.id, "terminate_pool_instances", msg))
            return None
        try:
            if terminateByPool:
                boto_instances = cluster.find(filters={"tag:SpotManager-PoolId" : str(pool.pk)})
                # Data consistency checks
                for boto_instance in boto_instances:
                    # state_code is a 16-bit value where the high byte is
                    # an opaque internal value and should be ignored.
                    state_code = boto_instance.state_code & 255
                    # Every instance tagged with our pool ID should either be
                    # known to us or already going down. Report inconsistencies
                    # with a log message instead of an assert: asserts are
                    # stripped under "python -O" and an inconsistency here
                    # should not abort the termination run anyway.
                    if not ((boto_instance.id in instance_ids_by_region[region])
                            or (state_code == INSTANCE_STATE['shutting-down']
                                or state_code == INSTANCE_STATE['terminated'])):
                        logger.error("[Pool %d] Instance with EC2 ID %s (status %d) is not in region list for region %s"
                                     % (pool.id, boto_instance.id, state_code, region))
                cluster.terminate(boto_instances)
            else:
                logger.info("[Pool %d] Terminating %s instances in region %s" % (pool.id, len(instance_ids_by_region[region]), region))
                cluster.terminate(cluster.find(instance_ids=instance_ids_by_region[region]))
        except boto.exception.EC2ResponseError as msg:
            logger.exception("[Pool %d] %s: boto failure: %s" % (pool.id, "terminate_pool_instances", msg))
            return 1
def _terminate_pool_instances(pool, instances, config, terminateByPool=False):
    """Terminate EC2 instances belonging to a pool.

    Connects to every region that holds any of the given instances and either
    terminates exactly those instances or, when terminateByPool is set, every
    EC2 instance tagged with this pool's ID.

    Returns None on a connection failure, 1 on a boto API failure, implicit
    None on success.
    """
    from .models import INSTANCE_STATE, PoolStatusEntry, POOL_STATUS_ENTRY_TYPE

    ids_by_region = _get_instance_ids_by_region(instances)
    for region_name, region_ids in ids_by_region.items():
        cluster = Laniakea(None)
        try:
            cluster.connect(region=region_name,
                            aws_access_key_id=config.aws_access_key_id,
                            aws_secret_access_key=config.aws_secret_access_key)
        except Exception as msg:
            # Record the failure on the pool so it becomes visible as a
            # critical pool status message.
            failure = PoolStatusEntry()
            failure.type = POOL_STATUS_ENTRY_TYPE['unclassified']
            failure.pool = pool
            failure.msg = str(msg)
            failure.isCritical = True
            failure.save()
            logger.exception(
                "[Pool %d] terminate_pool_instances: laniakea failure: %s",
                pool.id, msg)
            return None

        try:
            if not terminateByPool:
                # Terminate only the instances we were given for this region.
                logger.info("[Pool %d] Terminating %s instances in region %s",
                            pool.id, len(region_ids), region_name)
                cluster.terminate(cluster.find(instance_ids=region_ids))
            else:
                boto_instances = cluster.find(
                    filters={"tag:" + SPOTMGR_TAG + "-PoolId": str(pool.pk)})

                # Data consistency check: everything tagged with our pool ID
                # should either be known to us or already on its way down.
                for boto_instance in boto_instances:
                    # state_code is a 16-bit value; the high byte is an opaque
                    # internal value and must be masked off before comparing.
                    state_code = boto_instance.state_code & 255
                    known = boto_instance.id in region_ids
                    going_down = state_code in (INSTANCE_STATE['shutting-down'],
                                                INSTANCE_STATE['terminated'])
                    if not known and not going_down:
                        logger.error(
                            "[Pool %d] Instance with EC2 ID %s (status %d) "
                            "is not in region list for region %s",
                            pool.id, boto_instance.id, state_code, region_name)

                cluster.terminate(boto_instances)
        except (boto.exception.EC2ResponseError, boto.exception.BotoServerError,
                ssl.SSLError, socket.error) as msg:
            logger.exception(
                "[Pool %d] terminate_pool_instances: boto failure: %s",
                pool.id, msg)
            return 1
def terminate_pool_instances(self, pool, instances, config, terminateByPool=False):
    """
    Terminate pool instances in EC2.

    @param pool: The pool the instances belong to.
    @param instances: Instance objects whose EC2 instances should be terminated.
    @param config: Pool configuration providing the AWS credentials.
    @param terminateByPool: If True, terminate every EC2 instance tagged with
                            this pool's ID instead of only the given instances.
    @return: None on a connection failure, 1 on a boto API failure, implicit
             None on success.
    """
    instance_ids_by_region = self.get_instance_ids_by_region(instances)
    for region in instance_ids_by_region:
        cluster = Laniakea(None)
        try:
            cluster.connect(
                region=region,
                aws_access_key_id=config.aws_access_key_id,
                aws_secret_access_key=config.aws_secret_access_key)
        except Exception as msg:
            # Log this error to the pool status messages
            entry = PoolStatusEntry()
            # NOTE(review): magic number — presumably the "unclassified"
            # status entry type; confirm against POOL_STATUS_ENTRY_TYPE.
            entry.type = 0
            entry.pool = pool
            entry.msg = str(msg)
            entry.isCritical = True
            entry.save()
            logger.exception("[Pool %d] %s: laniakea failure: %s" %
                             (pool.id, "terminate_pool_instances", msg))
            return None
        try:
            if terminateByPool:
                boto_instances = cluster.find(
                    filters={"tag:SpotManager-PoolId": str(pool.pk)})

                # Data consistency checks
                for boto_instance in boto_instances:
                    # state_code is a 16-bit value where the high byte is
                    # an opaque internal value and should be ignored. The
                    # raw value must be masked before comparing against
                    # INSTANCE_STATE, otherwise running comparisons can
                    # spuriously fail.
                    state_code = boto_instance.state_code & 255
                    if not ((boto_instance.id in instance_ids_by_region[region])
                            or (state_code == INSTANCE_STATE['shutting-down']
                                or state_code == INSTANCE_STATE['terminated'])):
                        logger.error(
                            "[Pool %d] Instance with EC2 ID %s (status %d) is not in region list for region %s"
                            % (pool.id, boto_instance.id, state_code, region))

                cluster.terminate(boto_instances)
            else:
                logger.info(
                    "[Pool %d] Terminating %s instances in region %s" %
                    (pool.id, len(instance_ids_by_region[region]), region))
                cluster.terminate(
                    cluster.find(
                        instance_ids=instance_ids_by_region[region]))
        except boto.exception.EC2ResponseError as msg:
            logger.exception("[Pool %d] %s: boto failure: %s" %
                             (pool.id, "terminate_pool_instances", msg))
            return 1
def terminate_pool_instances(self, pool, instances, config, terminateByPool=False):
    """
    Terminate pool instances in EC2.

    @param pool: The pool the instances belong to.
    @param instances: Instance objects whose EC2 instances should be terminated.
    @param config: Pool configuration providing the AWS credentials.
    @param terminateByPool: If True, terminate every EC2 instance tagged with
                            this pool's ID instead of only the given instances.
    @return: None on a connection failure, 1 on a boto API failure, implicit
             None on success.
    """
    instance_ids_by_region = self.get_instance_ids_by_region(instances)
    for region in instance_ids_by_region:
        cluster = Laniakea(None)
        try:
            cluster.connect(
                region=region,
                aws_access_key_id=config.aws_access_key_id,
                aws_secret_access_key=config.aws_secret_access_key)
        except Exception as msg:
            logger.exception("[Pool %d] %s: laniakea failure: %s" %
                             (pool.id, "terminate_pool_instances", msg))
            return None
        try:
            if terminateByPool:
                boto_instances = cluster.find(
                    filters={"tag:SpotManager-PoolId": str(pool.pk)})

                # Data consistency checks
                for boto_instance in boto_instances:
                    # state_code is a 16-bit value where the high byte is
                    # an opaque internal value and should be ignored.
                    state_code = boto_instance.state_code & 255
                    # Every instance tagged with our pool ID should either be
                    # known to us or already going down. Report any mismatch
                    # with a log message instead of an assert: asserts are
                    # stripped under "python -O" and an inconsistency should
                    # not abort the termination run anyway.
                    if not ((boto_instance.id in instance_ids_by_region[region])
                            or (state_code == INSTANCE_STATE['shutting-down']
                                or state_code == INSTANCE_STATE['terminated'])):
                        logger.error(
                            "[Pool %d] Instance with EC2 ID %s (status %d) is not in region list for region %s"
                            % (pool.id, boto_instance.id, state_code, region))

                cluster.terminate(boto_instances)
            else:
                logger.info(
                    "[Pool %d] Terminating %s instances in region %s" %
                    (pool.id, len(instance_ids_by_region[region]), region))
                cluster.terminate(
                    cluster.find(
                        instance_ids=instance_ids_by_region[region]))
        except boto.exception.EC2ResponseError as msg:
            logger.exception("[Pool %d] %s: boto failure: %s" %
                             (pool.id, "terminate_pool_instances", msg))
            return 1
def terminate_pool_instances(self, pool, instances, config, terminateByPool=False):
    """
    Terminate pool instances in EC2.

    @param pool: The pool the instances belong to.
    @param instances: Instance objects whose EC2 instances should be terminated.
    @param config: Pool configuration providing the AWS credentials.
    @param terminateByPool: If True, terminate every EC2 instance tagged with
                            this pool's ID instead of only the given instances.
    @return: None on a connection failure, 1 on a boto API failure, implicit
             None on success.
    """
    instance_ids_by_region = self.get_instance_ids_by_region(instances)
    for region in instance_ids_by_region:
        cluster = Laniakea(None)
        try:
            cluster.connect(region=region, aws_access_key_id=config.aws_access_key_id, aws_secret_access_key=config.aws_secret_access_key)
        except Exception as msg:
            # Log this error to the pool status messages
            entry = PoolStatusEntry()
            # NOTE(review): magic number — presumably the "unclassified"
            # status entry type; confirm against POOL_STATUS_ENTRY_TYPE.
            entry.type = 0
            entry.pool = pool
            entry.msg = str(msg)
            entry.isCritical = True
            entry.save()
            logger.exception("[Pool %d] %s: laniakea failure: %s" % (pool.id, "terminate_pool_instances", msg))
            return None
        try:
            if terminateByPool:
                boto_instances = cluster.find(filters={"tag:SpotManager-PoolId" : str(pool.pk)})

                # Data consistency checks: everything tagged with our pool ID
                # should either be known to us or already on its way down.
                for boto_instance in boto_instances:
                    # state_code is a 16-bit value where the high byte is
                    # an opaque internal value and should be ignored.
                    state_code = boto_instance.state_code & 255
                    if not ((boto_instance.id in instance_ids_by_region[region])
                            or (state_code == INSTANCE_STATE['shutting-down']
                                or state_code == INSTANCE_STATE['terminated'])):
                        logger.error("[Pool %d] Instance with EC2 ID %s (status %d) is not in region list for region %s"
                                     % (pool.id, boto_instance.id, state_code, region))

                cluster.terminate(boto_instances)
            else:
                # Terminate only the instances we were given for this region.
                logger.info("[Pool %d] Terminating %s instances in region %s"
                            % (pool.id, len(instance_ids_by_region[region]), region))
                cluster.terminate(cluster.find(instance_ids=instance_ids_by_region[region]))
        except (boto.exception.EC2ResponseError, boto.exception.BotoServerError, ssl.SSLError, socket.error) as msg:
            logger.exception("[Pool %d] %s: boto failure: %s" % (pool.id, "terminate_pool_instances", msg))
            return 1
def update_pool_instances(self, pool, config):
    """
    Check the state of the instances in a pool and update it in the database.

    Walks every region holding instances of the pool, compares the EC2 view
    (instances tagged with our pool ID) against our database records, updates
    status codes and hostnames, and finally deletes database entries that no
    longer have a counterpart on EC2.

    @param pool: The pool whose instances should be synchronized.
    @param config: Pool configuration providing the AWS credentials.
    @return: None on a connection failure, 1 on a boto API failure, implicit
             None on success.
    """
    instances = Instance.objects.filter(pool=pool)
    instance_ids_by_region = self.get_instance_ids_by_region(instances)
    instances_by_ids = self.get_instances_by_ids(instances)
    # Database instances we have not (yet) matched to an EC2 instance;
    # whatever remains at the end is deleted from the database.
    instances_left = []

    # Bookkeeping used only to enrich the deletion log messages at the end.
    debug_boto_instance_ids_seen = set()
    debug_not_updatable_continue = set()
    debug_not_in_region = {}

    for instance_id in instances_by_ids:
        if instance_id:
            instances_left.append(instances_by_ids[instance_id])

    for region in instance_ids_by_region:
        cluster = Laniakea(None)
        try:
            cluster.connect(region=region, aws_access_key_id=config.aws_access_key_id, aws_secret_access_key=config.aws_secret_access_key)
        except Exception as msg:
            # Log this error to the pool status messages
            entry = PoolStatusEntry()
            # NOTE(review): magic number — presumably the "unclassified"
            # status entry type; confirm against POOL_STATUS_ENTRY_TYPE.
            entry.type = 0
            entry.pool = pool
            entry.msg = str(msg)
            entry.isCritical = True
            entry.save()
            logger.exception("[Pool %d] %s: laniakea failure: %s" % (pool.id, "update_pool_instances", msg))
            return None
        try:
            boto_instances = cluster.find(filters={"tag:SpotManager-PoolId" : str(pool.pk)})
            for boto_instance in boto_instances:
                # Store ID seen for debugging purposes
                debug_boto_instance_ids_seen.add(boto_instance.id)

                # state_code is a 16-bit value where the high byte is
                # an opaque internal value and should be ignored.
                state_code = boto_instance.state_code & 255

                if "SpotManager-Updatable" not in boto_instance.tags or int(boto_instance.tags["SpotManager-Updatable"]) <= 0:
                    # The instance is not marked as updatable. We must not touch it because
                    # a spawning thread is still managing this instance. However, we must also
                    # remove this instance from the instances_left list if it's already in our
                    # database, because otherwise our code here would delete it from the database.
                    if boto_instance.id in instance_ids_by_region[region]:
                        instances_left.remove(instances_by_ids[boto_instance.id])
                    else:
                        debug_not_updatable_continue.add(boto_instance.id)
                    continue

                instance = None

                # Whenever we see an instance that is not in our instance list for that region,
                # make sure it's a terminated instance because we should never have a running
                # instance that matches the search above but is not in our database.
                if not boto_instance.id in instance_ids_by_region[region]:
                    if not ((state_code == INSTANCE_STATE['shutting-down']
                             or state_code == INSTANCE_STATE['terminated'])):
                        # As a last resort, try to find the instance in our database.
                        # If the instance was saved to our database between the entrance
                        # to this function and the search query sent to EC2, then the instance
                        # will not be in our instances list but returned by EC2. In this
                        # case, we try to load it directly from the database.
                        q = Instance.objects.filter(ec2_instance_id=boto_instance.id)
                        if q:
                            instance = q[0]
                            logger.error("[Pool %d] Instance with EC2 ID %s was reloaded from database." % (pool.id, boto_instance.id))
                        else:
                            logger.error("[Pool %d] Instance with EC2 ID %s is not in our database." % (pool.id, boto_instance.id))

                            # Terminate at this point, we run in an inconsistent state
                            # NOTE(review): assert is stripped under "python -O";
                            # under -O execution falls through to the bookkeeping below.
                            assert(False)
                    debug_not_in_region[boto_instance.id] = state_code
                    continue

                instance = instances_by_ids[boto_instance.id]
                instances_left.remove(instance)

                # Check the status code and update if necessary
                if instance.status_code != state_code:
                    instance.status_code = state_code
                    instance.save()

                # If for some reason we don't have a hostname yet,
                # update it accordingly.
                if not instance.hostname:
                    instance.hostname = boto_instance.public_dns_name
                    instance.save()
        except (boto.exception.EC2ResponseError, boto.exception.BotoServerError, ssl.SSLError, socket.error) as msg:
            logger.exception("%s: boto failure: %s" % ("update_pool_instances", msg))
            return 1

    # Anything left in instances_left has no (updatable) counterpart on EC2
    # anymore: purge it from the database, logging the likely reason first.
    if instances_left:
        for instance in instances_left:
            if not instance.ec2_instance_id in debug_boto_instance_ids_seen:
                logger.info("[Pool %d] Deleting instance with EC2 ID %s from our database, has no corresponding machine on EC2." % (pool.id, instance.ec2_instance_id))

            if instance.ec2_instance_id in debug_not_updatable_continue:
                logger.error("[Pool %d] Deleting instance with EC2 ID %s from our database because it is not updatable but not in our region." % (pool.id, instance.ec2_instance_id))

            if instance.ec2_instance_id in debug_not_in_region:
                logger.info("[Pool %d] Deleting instance with EC2 ID %s from our database, has state code %s on EC2" % (pool.id, instance.ec2_instance_id, debug_not_in_region[instance.ec2_instance_id]))

            logger.info("[Pool %d] Deleting instance with EC2 ID %s from our database." % (pool.id, instance.ec2_instance_id))
            instance.delete()
def update_pool_instances(self, pool, config):
    """
    Check the state of the instances in a pool and update it in the database.

    Only EC2 instances tagged SpotManager-Updatable=1 are considered; status
    codes and hostnames are refreshed, and database entries without a
    counterpart on EC2 are deleted at the end.

    @param pool: The pool whose instances should be synchronized.
    @param config: Pool configuration providing the AWS credentials.
    @return: None on a connection failure, 1 on a boto API failure, implicit
             None on success.
    """
    instances = Instance.objects.filter(pool=pool)
    instance_ids_by_region = self.get_instance_ids_by_region(instances)
    instances_by_ids = self.get_instances_by_ids(instances)
    # Database instances not yet matched to an EC2 instance; whatever
    # remains at the end is deleted from the database.
    instances_left = []

    for instance_id in instances_by_ids:
        if instance_id:
            instances_left.append(instances_by_ids[instance_id])

    for region in instance_ids_by_region:
        cluster = Laniakea(None)
        try:
            cluster.connect(region=region, aws_access_key_id=config.aws_access_key_id, aws_secret_access_key=config.aws_secret_access_key)
        except Exception as msg:
            # Log this error to the pool status messages
            entry = PoolStatusEntry()
            # NOTE(review): magic number — presumably the "unclassified"
            # status entry type; confirm against POOL_STATUS_ENTRY_TYPE.
            entry.type = 0
            entry.pool = pool
            entry.msg = str(msg)
            entry.isCritical = True
            entry.save()
            logger.exception("[Pool %d] %s: laniakea failure: %s" % (pool.id, "update_pool_instances", msg))
            return None
        try:
            boto_instances = cluster.find(filters={"tag:SpotManager-PoolId" : str(pool.pk), "tag:SpotManager-Updatable" : "1"})
            for boto_instance in boto_instances:
                instance = None

                # Whenever we see an instance that is not in our instance list for that region,
                # make sure it's a terminated instance because we should never have a running
                # instance that matches the search above but is not in our database.
                if not boto_instance.id in instance_ids_by_region[region]:
                    if not ((boto_instance.state_code == INSTANCE_STATE['shutting-down']
                             or boto_instance.state_code == INSTANCE_STATE['terminated'])):
                        # As a last resort, try to find the instance in our database.
                        # If the instance was saved to our database between the entrance
                        # to this function and the search query sent to EC2, then the instance
                        # will not be in our instances list but returned by EC2. In this
                        # case, we try to load it directly from the database.
                        q = Instance.objects.filter(ec2_instance_id = boto_instance.id)
                        if q:
                            instance = q[0]
                        else:
                            logger.error("[Pool %d] Instance with EC2 ID %s is not in our database."
                                         % (pool.id, boto_instance.id))

                            # Terminate at this point, we run in an inconsistent state
                            # NOTE(review): assert is stripped under "python -O".
                            assert(False)
                    # NOTE(review): with this continue, an instance reloaded
                    # from the database above is still skipped here (making the
                    # "if not instance" guard below always true) — confirm this
                    # matches the intended behavior.
                    continue

                if not instance:
                    instance = instances_by_ids[boto_instance.id]
                instances_left.remove(instance)

                # Check the status code and update if necessary
                if instance.status_code != boto_instance.state_code:
                    instance.status_code = boto_instance.state_code
                    instance.save()

                # If for some reason we don't have a hostname yet,
                # update it accordingly.
                if not instance.hostname:
                    instance.hostname = boto_instance.public_dns_name
                    instance.save()
        except boto.exception.EC2ResponseError as msg:
            logger.exception("%s: boto failure: %s" % ("update_pool_instances", msg))
            return 1

    # Anything left has no corresponding machine on EC2 anymore: purge it.
    if instances_left:
        for instance in instances_left:
            logger.info("[Pool %d] Deleting instance with EC2 ID %s from our database, has no corresponding machine on EC2." % (pool.id, instance.ec2_instance_id))
            instance.delete()
def update_pool_instances(self, pool, config):
    """
    Check the state of the instances in a pool and update it in the database.

    Walks every region holding instances of the pool, compares the EC2 view
    (instances tagged with our pool ID) against our database records, updates
    status codes and hostnames, and finally deletes database entries that no
    longer have a counterpart on EC2.

    @param pool: The pool whose instances should be synchronized.
    @param config: Pool configuration providing the AWS credentials.
    @return: None on a connection failure, 1 on a boto API failure, implicit
             None on success.
    """
    instances = Instance.objects.filter(pool=pool)
    instance_ids_by_region = self.get_instance_ids_by_region(instances)
    instances_by_ids = self.get_instances_by_ids(instances)
    # Database instances not yet matched to an EC2 instance; whatever
    # remains at the end is deleted from the database.
    instances_left = []

    # Bookkeeping used only to enrich the deletion log messages at the end.
    debug_boto_instance_ids_seen = set()
    debug_not_updatable_continue = set()
    debug_not_in_region = {}

    for instance_id in instances_by_ids:
        if instance_id:
            instances_left.append(instances_by_ids[instance_id])

    for region in instance_ids_by_region:
        cluster = Laniakea(None)
        try:
            cluster.connect(
                region=region,
                aws_access_key_id=config.aws_access_key_id,
                aws_secret_access_key=config.aws_secret_access_key)
        except Exception as msg:
            # Log this error to the pool status messages
            entry = PoolStatusEntry()
            # NOTE(review): magic number — presumably the "unclassified"
            # status entry type; confirm against POOL_STATUS_ENTRY_TYPE.
            entry.type = 0
            entry.pool = pool
            entry.msg = str(msg)
            entry.isCritical = True
            entry.save()
            logger.exception("[Pool %d] %s: laniakea failure: %s" %
                             (pool.id, "update_pool_instances", msg))
            return None
        try:
            boto_instances = cluster.find(
                filters={"tag:SpotManager-PoolId": str(pool.pk)})

            for boto_instance in boto_instances:
                # Store ID seen for debugging purposes
                debug_boto_instance_ids_seen.add(boto_instance.id)

                # state_code is a 16-bit value where the high byte is
                # an opaque internal value and should be ignored.
                state_code = boto_instance.state_code & 255

                if "SpotManager-Updatable" not in boto_instance.tags or int(
                        boto_instance.tags["SpotManager-Updatable"]) <= 0:
                    # The instance is not marked as updatable. We must not touch it because
                    # a spawning thread is still managing this instance. However, we must also
                    # remove this instance from the instances_left list if it's already in our
                    # database, because otherwise our code here would delete it from the database.
                    if boto_instance.id in instance_ids_by_region[region]:
                        instances_left.remove(
                            instances_by_ids[boto_instance.id])
                    else:
                        debug_not_updatable_continue.add(boto_instance.id)
                    continue

                instance = None

                # Whenever we see an instance that is not in our instance list for that region,
                # make sure it's a terminated instance because we should never have a running
                # instance that matches the search above but is not in our database.
                if not boto_instance.id in instance_ids_by_region[region]:
                    if not (
                            (state_code == INSTANCE_STATE['shutting-down']
                             or state_code == INSTANCE_STATE['terminated'])):
                        # As a last resort, try to find the instance in our database.
                        # If the instance was saved to our database between the entrance
                        # to this function and the search query sent to EC2, then the instance
                        # will not be in our instances list but returned by EC2. In this
                        # case, we try to load it directly from the database.
                        q = Instance.objects.filter(
                            ec2_instance_id=boto_instance.id)
                        if q:
                            instance = q[0]
                            logger.error(
                                "[Pool %d] Instance with EC2 ID %s was reloaded from database."
                                % (pool.id, boto_instance.id))
                        else:
                            logger.error(
                                "[Pool %d] Instance with EC2 ID %s is not in our database."
                                % (pool.id, boto_instance.id))

                            # Terminate at this point, we run in an inconsistent state
                            # NOTE(review): assert is stripped under "python -O";
                            # under -O execution falls through to the bookkeeping below.
                            assert (False)
                    debug_not_in_region[boto_instance.id] = state_code
                    continue

                instance = instances_by_ids[boto_instance.id]
                instances_left.remove(instance)

                # Check the status code and update if necessary
                if instance.status_code != state_code:
                    instance.status_code = state_code
                    instance.save()

                # If for some reason we don't have a hostname yet,
                # update it accordingly.
                if not instance.hostname:
                    instance.hostname = boto_instance.public_dns_name
                    instance.save()
        except (boto.exception.EC2ResponseError, boto.exception.BotoServerError,
                ssl.SSLError, socket.error) as msg:
            logger.exception("%s: boto failure: %s" %
                             ("update_pool_instances", msg))
            return 1

    # Anything left in instances_left has no (updatable) counterpart on EC2
    # anymore: purge it from the database, logging the likely reason first.
    if instances_left:
        for instance in instances_left:
            if not instance.ec2_instance_id in debug_boto_instance_ids_seen:
                logger.info(
                    "[Pool %d] Deleting instance with EC2 ID %s from our database, has no corresponding machine on EC2."
                    % (pool.id, instance.ec2_instance_id))

            if instance.ec2_instance_id in debug_not_updatable_continue:
                logger.error(
                    "[Pool %d] Deleting instance with EC2 ID %s from our database because it is not updatable but not in our region."
                    % (pool.id, instance.ec2_instance_id))

            if instance.ec2_instance_id in debug_not_in_region:
                logger.info(
                    "[Pool %d] Deleting instance with EC2 ID %s from our database, has state code %s on EC2"
                    % (pool.id, instance.ec2_instance_id,
                       debug_not_in_region[instance.ec2_instance_id]))

            logger.info(
                "[Pool %d] Deleting instance with EC2 ID %s from our database."
                % (pool.id, instance.ec2_instance_id))
            instance.delete()
def _update_pool_instances(pool, config):
    """Check the state of the instances in a pool and update it in the database.

    Per region this (1) polls pending spot requests and converts fulfilled
    ones into running instances (tagging them as updatable), (2) reconciles
    the EC2 view of pool-tagged instances against the database, updating
    status codes and hostnames, and (3) finally deletes database entries
    without an EC2 counterpart. Boto failures are classified into pool
    status entries (max-spot-instance-count-exceeded, temporary-failure,
    unclassified).
    """
    from .models import Instance, INSTANCE_STATE, PoolStatusEntry, POOL_STATUS_ENTRY_TYPE

    instances = Instance.objects.filter(pool=pool)
    instance_ids_by_region = _get_instance_ids_by_region(instances)
    instances_by_ids = _get_instances_by_ids(instances)
    # Database instances not yet matched to an EC2 instance; whatever
    # remains at the end is deleted from the database.
    instances_left = []
    instances_created = False

    # Bookkeeping used only to enrich the deletion log messages at the end.
    debug_boto_instance_ids_seen = set()
    debug_not_updatable_continue = set()
    debug_not_in_region = {}

    for instance in instances_by_ids.values():
        # Instances still in "requested" state are handled by the spot-request
        # polling below, not by the EC2 reconciliation.
        if instance.status_code != INSTANCE_STATE['requested']:
            instances_left.append(instance)

    # set config to this pool for now in case we set tags on fulfilled spot requests
    config.ec2_tags[SPOTMGR_TAG + '-PoolId'] = str(pool.pk)

    for region in instance_ids_by_region:
        cluster = Laniakea(None)
        try:
            cluster.connect(region=region, aws_access_key_id=config.aws_access_key_id, aws_secret_access_key=config.aws_secret_access_key)
        except Exception as msg:
            # Log this error to the pool status messages
            entry = PoolStatusEntry()
            entry.type = POOL_STATUS_ENTRY_TYPE['unclassified']
            entry.pool = pool
            entry.msg = str(msg)
            entry.isCritical = True
            entry.save()
            logger.exception(
                "[Pool %d] update_pool_instances: laniakea failure: %s",
                pool.id, msg)
            return
        try:
            # first check status of pending spot requests
            requested = []
            for instance_id in instance_ids_by_region[region]:
                if instances_by_ids[instance_id].status_code == INSTANCE_STATE['requested']:
                    requested.append(instance_id)

            if requested:
                boto_results = cluster.check_spot_requests(requested, config.ec2_tags)

                for req_id, result in zip(requested, boto_results):
                    instance = instances_by_ids[req_id]

                    if isinstance(result, boto.ec2.instance.Instance):
                        logger.info("[Pool %d] spot request fulfilled %s -> %s",
                                    pool.id, req_id, result.id)

                        # spot request has been fulfilled
                        instance.hostname = result.public_dns_name
                        instance.ec2_instance_id = result.id
                        # state_code is a 16-bit value where the high byte is
                        # an opaque internal value and should be ignored.
                        instance.status_code = result.state_code & 255
                        instance.save()

                        # update local data structures to use the new instances instead
                        del instances_by_ids[req_id]
                        instances_by_ids[result.id] = instance
                        instance_ids_by_region[region].append(result.id)
                        # don't add it to instances_left yet to avoid race with adding tags

                        # Now that we saved the object into our database, mark the instance as updatable
                        # so our update code can pick it up and update it accordingly when it changes states
                        result.add_tag(SPOTMGR_TAG + "-Updatable", "1")

                        instances_created = True

                    # request object is returned in case request is closed/cancelled/failed
                    elif isinstance(result, boto.ec2.spotinstancerequest.SpotInstanceRequest):
                        if result.state in {"cancelled", "closed"}:
                            # this is normal, remove from DB and move on
                            logger.info("[Pool %d] spot request %s is %s",
                                        pool.id, req_id, result.state)
                            instances_by_ids[req_id].delete()
                        elif result.state in {"open", "active"}:
                            # this should not happen! warn and leave in DB in case it's fulfilled later
                            logger.warning("[Pool %d] Request %s is %s and %s.",
                                           pool.id, req_id, result.status.code, result.state)
                        else:  # state=failed
                            msg = "Request %s is %s and %s." % (req_id, result.status.code, result.state)
                            entry = PoolStatusEntry()
                            entry.type = POOL_STATUS_ENTRY_TYPE['unclassified']
                            entry.pool = pool
                            entry.msg = str(msg)
                            entry.isCritical = True
                            entry.save()
                            logger.error("[Pool %d] %s", pool.id, msg)
                            instances_by_ids[req_id].delete()

                    elif result is None:
                        # Spot request not resolved yet; keep waiting.
                        logger.info("[Pool %d] spot request %s is still open",
                                    pool.pk, req_id)

                    else:
                        logger.warning("[Pool %d] spot request %s returned %s",
                                       pool.pk, req_id, type(result).__name__)

            boto_instances = cluster.find(filters={"tag:" + SPOTMGR_TAG + "-PoolId": str(pool.pk)})

            for boto_instance in boto_instances:
                # Store ID seen for debugging purposes
                debug_boto_instance_ids_seen.add(boto_instance.id)

                # state_code is a 16-bit value where the high byte is
                # an opaque internal value and should be ignored.
                state_code = boto_instance.state_code & 255

                if (SPOTMGR_TAG + "-Updatable" not in boto_instance.tags
                        or int(boto_instance.tags[SPOTMGR_TAG + "-Updatable"]) <= 0):
                    # The instance is not marked as updatable. We must not touch it because
                    # a spawning thread is still managing this instance. However, we must also
                    # remove this instance from the instances_left list if it's already in our
                    # database, because otherwise our code here would delete it from the database.
                    if boto_instance.id in instance_ids_by_region[region]:
                        instances_left.remove(instances_by_ids[boto_instance.id])
                    else:
                        debug_not_updatable_continue.add(boto_instance.id)
                    continue

                instance = None

                # Whenever we see an instance that is not in our instance list for that region,
                # make sure it's a terminated instance because we should never have a running
                # instance that matches the search above but is not in our database.
                if boto_instance.id not in instance_ids_by_region[region]:
                    if state_code not in [INSTANCE_STATE['shutting-down'],
                                          INSTANCE_STATE['terminated']]:
                        # As a last resort, try to find the instance in our database.
                        # If the instance was saved to our database between the entrance
                        # to this function and the search query sent to EC2, then the instance
                        # will not be in our instances list but returned by EC2. In this
                        # case, we try to load it directly from the database.
                        q = Instance.objects.filter(ec2_instance_id=boto_instance.id)
                        if q:
                            instance = q[0]
                            logger.error("[Pool %d] Instance with EC2 ID %s was reloaded from database.",
                                         pool.id, boto_instance.id)
                        else:
                            logger.error("[Pool %d] Instance with EC2 ID %s is not in our database.",
                                         pool.id, boto_instance.id)

                            # Terminate at this point, we run in an inconsistent state
                            # NOTE(review): assert is stripped under "python -O";
                            # under -O execution falls through to the bookkeeping below.
                            assert (False)
                    debug_not_in_region[boto_instance.id] = state_code
                    continue

                instance = instances_by_ids[boto_instance.id]
                instances_left.remove(instance)

                # Check the status code and update if necessary
                if instance.status_code != state_code:
                    instance.status_code = state_code
                    instance.save()

                # If for some reason we don't have a hostname yet,
                # update it accordingly.
                if not instance.hostname:
                    instance.hostname = boto_instance.public_dns_name
                    instance.save()
        except (boto.exception.EC2ResponseError, boto.exception.BotoServerError,
                ssl.SSLError, socket.error) as msg:
            # Classify the failure so the UI can distinguish quota problems
            # and transient outages from genuine errors.
            if "MaxSpotInstanceCountExceeded" in str(msg):
                logger.warning("[Pool %d] update_pool_instances: Maximum instance count exceeded for region %s",
                               pool.id, region)
                if not PoolStatusEntry.objects.filter(
                        pool=pool,
                        type=POOL_STATUS_ENTRY_TYPE['max-spot-instance-count-exceeded']):
                    entry = PoolStatusEntry()
                    entry.pool = pool
                    entry.type = POOL_STATUS_ENTRY_TYPE['max-spot-instance-count-exceeded']
                    entry.msg = "Auto-selected region exceeded its maximum spot instance count."
                    entry.save()
            elif "Service Unavailable" in str(msg):
                logger.warning("[Pool %d] update_pool_instances: Temporary failure in region %s: %s",
                               pool.id, region, msg)
                entry = PoolStatusEntry()
                entry.pool = pool
                entry.type = POOL_STATUS_ENTRY_TYPE['temporary-failure']
                entry.msg = "Temporary failure occurred: %s" % msg
                entry.save()
            else:
                logger.exception("[Pool %d] update_pool_instances: boto failure: %s",
                                 pool.id, msg)
                entry = PoolStatusEntry()
                entry.type = POOL_STATUS_ENTRY_TYPE['unclassified']
                entry.pool = pool
                entry.isCritical = True
                entry.msg = "Unclassified error occurred: %s" % msg
                entry.save()
            return

    # Anything left in instances_left has no (updatable) counterpart on EC2
    # anymore: purge it from the database, logging the collected reasons.
    for instance in instances_left:
        reasons = []

        if instance.ec2_instance_id not in debug_boto_instance_ids_seen:
            reasons.append("no corresponding machine on EC2")

        if instance.ec2_instance_id in debug_not_updatable_continue:
            reasons.append("not updatable")

        if instance.ec2_instance_id in debug_not_in_region:
            reasons.append("has state code %s on EC2 but not in our region"
                           % debug_not_in_region[instance.ec2_instance_id])

        if not reasons:
            reasons.append("?")

        logger.info("[Pool %d] Deleting instance with EC2 ID %s from our database: %s",
                    pool.id, instance.ec2_instance_id, ", ".join(reasons))
        instance.delete()

    if instances_created:
        # Delete certain warnings we might have created earlier that no longer apply

        # If we ever exceeded the maximum spot instance count, we can clear
        # the warning now because we obviously succeeded in starting some instances.
        PoolStatusEntry.objects.filter(
            pool=pool,
            type=POOL_STATUS_ENTRY_TYPE['max-spot-instance-count-exceeded']).delete()

        # The same holds for temporary failures of any sort
        PoolStatusEntry.objects.filter(
            pool=pool,
            type=POOL_STATUS_ENTRY_TYPE['temporary-failure']).delete()
def update_pool_instances(self, pool, config):
    """
    Check the state of the instances in a pool and update it in the database.

    Only EC2 instances tagged SpotManager-Updatable=1 are considered; status
    codes and hostnames are refreshed, and database entries without a
    counterpart on EC2 are deleted at the end.

    @param pool: The pool whose instances should be synchronized.
    @param config: Pool configuration providing the AWS credentials.
    @return: None on a connection failure, 1 on a boto API failure, implicit
             None on success.
    """
    instances = Instance.objects.filter(pool=pool)
    instance_ids_by_region = self.get_instance_ids_by_region(instances)
    instances_by_ids = self.get_instances_by_ids(instances)
    # Database instances not yet matched to an EC2 instance; whatever
    # remains at the end is deleted from the database.
    instances_left = []

    for instance_id in instances_by_ids:
        if instance_id:
            instances_left.append(instances_by_ids[instance_id])

    for region in instance_ids_by_region:
        cluster = Laniakea(None)
        try:
            cluster.connect(
                region=region,
                aws_access_key_id=config.aws_access_key_id,
                aws_secret_access_key=config.aws_secret_access_key)
        except Exception as msg:
            logger.exception("[Pool %d] %s: laniakea failure: %s" %
                             (pool.id, "update_pool_instances", msg))
            return None
        try:
            boto_instances = cluster.find(
                filters={
                    "tag:SpotManager-PoolId": str(pool.pk),
                    "tag:SpotManager-Updatable": "1"
                })

            for boto_instance in boto_instances:
                instance = None

                # Whenever we see an instance that is not in our instance list for that region,
                # make sure it's a terminated instance because we should never have a running
                # instance that matches the search above but is not in our database.
                if not boto_instance.id in instance_ids_by_region[region]:
                    if not ((boto_instance.state_code == INSTANCE_STATE['shutting-down']
                             or boto_instance.state_code == INSTANCE_STATE['terminated'])):
                        # As a last resort, try to find the instance in our database.
                        # If the instance was saved to our database between the entrance
                        # to this function and the search query sent to EC2, then the instance
                        # will not be in our instances list but returned by EC2. In this
                        # case, we try to load it directly from the database.
                        q = Instance.objects.filter(
                            ec2_instance_id=boto_instance.id)
                        if q:
                            instance = q[0]
                        else:
                            logger.error(
                                "[Pool %d] Instance with EC2 ID %s is not in our database."
                                % (pool.id, boto_instance.id))

                            # Terminate at this point, we run in an inconsistent state
                            # NOTE(review): assert is stripped under "python -O".
                            assert (False)
                    # NOTE(review): with this continue, an instance reloaded
                    # from the database above is still skipped here (making the
                    # "if not instance" guard below always true) — confirm this
                    # matches the intended behavior.
                    continue

                if not instance:
                    instance = instances_by_ids[boto_instance.id]
                instances_left.remove(instance)

                # Check the status code and update if necessary
                if instance.status_code != boto_instance.state_code:
                    instance.status_code = boto_instance.state_code
                    instance.save()

                # If for some reason we don't have a hostname yet,
                # update it accordingly.
                if not instance.hostname:
                    instance.hostname = boto_instance.public_dns_name
                    instance.save()
        except boto.exception.EC2ResponseError as msg:
            logger.exception("%s: boto failure: %s" %
                             ("update_pool_instances", msg))
            return 1

    # Anything left has no corresponding machine on EC2 anymore: purge it.
    if instances_left:
        for instance in instances_left:
            logger.info(
                "[Pool %d] Deleting instance with EC2 ID %s from our database, has no corresponding machine on EC2."
                % (pool.id, instance.ec2_instance_id))
            instance.delete()