def terminate_pool_instances(self,
                                 pool,
                                 instances,
                                 config,
                                 terminateByPool=False):
        """ Terminate an instance with the given configuration """
        instance_ids_by_region = self.get_instance_ids_by_region(instances)

        for region in instance_ids_by_region:
            cluster = Laniakea(None)
            try:
                cluster.connect(
                    region=region,
                    aws_access_key_id=config.aws_access_key_id,
                    aws_secret_access_key=config.aws_secret_access_key)
            except Exception as msg:
                # Log this error to the pool status messages
                entry = PoolStatusEntry()
                entry.type = 0
                entry.pool = pool
                entry.msg = str(msg)
                entry.isCritical = True
                entry.save()

                logger.exception("[Pool %d] %s: laniakea failure: %s" %
                                 (pool.id, "terminate_pool_instances", msg))
                return None

            try:
                if terminateByPool:
                    boto_instances = cluster.find(
                        filters={"tag:SpotManager-PoolId": str(pool.pk)})

                    # Data consistency checks
                    for boto_instance in boto_instances:
                        if not (boto_instance.id in instance_ids_by_region[region]
                                or boto_instance.state_code == INSTANCE_STATE['shutting-down']
                                or boto_instance.state_code == INSTANCE_STATE['terminated']):
                            logger.error(
                                "[Pool %d] Instance with EC2 ID %s (status %d) is not in region list for region %s"
                                % (pool.id, boto_instance.id,
                                   boto_instance.state_code, region))

                    cluster.terminate(boto_instances)
                else:
                    logger.info(
                        "[Pool %d] Terminating %s instances in region %s" %
                        (pool.id, len(instance_ids_by_region[region]), region))
                    cluster.terminate(
                        cluster.find(
                            instance_ids=instance_ids_by_region[region]))
            except boto.exception.EC2ResponseError as msg:
                logger.exception("[Pool %d] %s: boto failure: %s" %
                                 (pool.id, "terminate_pool_instances", msg))
                return 1
    def terminate_pool_instances(self, pool, instances, config, terminateByPool=False):
        """ Terminate an instance with the given configuration """
        instance_ids_by_region = self.get_instance_ids_by_region(instances)

        for region in instance_ids_by_region:
            cluster = Laniakea(None)
            try:
                cluster.connect(region=region, aws_access_key_id=config.aws_access_key_id, aws_secret_access_key=config.aws_secret_access_key)
            except Exception as msg:
                # Log this error to the pool status messages
                entry = PoolStatusEntry()
                entry.type = 0
                entry.pool = pool
                entry.msg = str(msg)
                entry.isCritical = True
                entry.save()

                logger.exception("[Pool %d] %s: laniakea failure: %s" % (pool.id, "terminate_pool_instances", msg))
                return None

            try:
                if terminateByPool:
                    boto_instances = cluster.find(filters={"tag:SpotManager-PoolId" : str(pool.pk)})

                    # Data consistency checks
                    for boto_instance in boto_instances:
                        # state_code is a 16-bit value where the high byte is
                        # an opaque internal value and should be ignored.
                        state_code = boto_instance.state_code & 255
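                        # (e.g. EC2 may report a raw value such as 272, i.e. 0x110;
                        # masking with 255 yields 16, the documented 'running' code)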
                        if not (boto_instance.id in instance_ids_by_region[region]
                                or state_code in (INSTANCE_STATE['shutting-down'], INSTANCE_STATE['terminated'])):
                            logger.error("[Pool %d] Instance with EC2 ID %s (status %d) is not in region list for region %s" % (pool.id, boto_instance.id, state_code, region))

                    cluster.terminate(boto_instances)
                else:
                    logger.info("[Pool %d] Terminating %s instances in region %s" % (pool.id, len(instance_ids_by_region[region]), region))
                    cluster.terminate(cluster.find(instance_ids=instance_ids_by_region[region]))
            except (boto.exception.EC2ResponseError, boto.exception.BotoServerError, ssl.SSLError, socket.error) as msg:
                logger.exception("[Pool %d] %s: boto failure: %s" % (pool.id, "terminate_pool_instances", msg))
                return 1
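
    # Neither helper used above is defined in these snippets. A minimal sketch of
    # plausible implementations, inferred from how their results are consumed here:
    # a dict mapping region name -> list of EC2 instance IDs, and a dict mapping
    # EC2 instance ID -> Instance. The ec2_region and ec2_instance_id fields are
    # the ones set on Instance objects elsewhere in these examples.
    def get_instance_ids_by_region(self, instances):
        instance_ids_by_region = {}
        for instance in instances:
            instance_ids_by_region.setdefault(instance.ec2_region, []).append(instance.ec2_instance_id)
        return instance_ids_by_region

    def get_instances_by_ids(self, instances):
        instances_by_ids = {}
        for instance in instances:
            # Instances that have no EC2 ID yet (still in the 'requested' state)
            # end up keyed under a falsy value, which the callers above skip.
            instances_by_ids[instance.ec2_instance_id] = instance
        return instances_by_ids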
    def start_pool_instances(self, pool, config, count=1):
        """ Start an instance with the given configuration """

        images = self.create_laniakea_images(config)

        # Figure out where to put our instances
        try:
            (region, zone, rejected) = self.get_best_region_zone(config)
        except (boto.exception.EC2ResponseError, boto.exception.BotoServerError, ssl.SSLError, socket.error) as msg:
            # In case of temporary failures here, we will retry again in the next cycle
            logger.warn("[Pool %d] Failed to aquire spot instance prices: %s." % (pool.id, msg))
            return
        except (RuntimeError) as msg:
            logger.error("[Pool %d] Failed to compile userdata." % pool.id)
            entry = PoolStatusEntry()
            entry.type = POOL_STATUS_ENTRY_TYPE['config-error']
            entry.pool = pool
            entry.isCritical = True
            entry.msg = "Configuration error: %s" % msg
            entry.save()
            return

        priceLowEntries = PoolStatusEntry.objects.filter(pool=pool, type=POOL_STATUS_ENTRY_TYPE['price-too-low'])

        if not region:
            logger.warn("[Pool %d] No allowed region was cheap enough to spawn instances." % pool.id)

            if not priceLowEntries:
                entry = PoolStatusEntry()
                entry.pool = pool
                entry.type = POOL_STATUS_ENTRY_TYPE['price-too-low']
                entry.msg = "No allowed region was cheap enough to spawn instances."
                for zone in rejected:
                    entry.msg += "\n%s at %s" % (zone, rejected[zone])
                entry.save()
            return
        else:
            if priceLowEntries:
                priceLowEntries.delete()

        logger.debug("[Pool %d] Using region %s with availability zone %s." % (pool.id, region, zone))

        instances = []

        # Create all our instances as pending, the async thread will update them once
        # they have been spawned.
        for i in range(0, count):
            instance = Instance()
            instance.ec2_region = region
            instance.ec2_zone = zone
            instance.status_code = INSTANCE_STATE["requested"]
            instance.pool = pool
            instance.save()
            instances.append(instance)

        # This method will run async to spawn our machines
        def start_instances_async(pool, config, count, images, region, zone, instances):
            userdata = LaniakeaCommandLine.handle_import_tags(config.ec2_userdata)
            userdata = LaniakeaCommandLine.handle_tags(userdata, config.ec2_userdata_macros)
            if not userdata:
                logger.error("[Pool %d] Failed to compile userdata." % pool.id)

                entry = PoolStatusEntry()
                entry.type = POOL_STATUS_ENTRY_TYPE['config-error']
                entry.pool = pool
                entry.isCritical = True
                entry.msg = "Configuration error: Failed to compile userdata"
                entry.save()

                for instance in instances:
                    instance.delete()

                raise RuntimeError("start_instances_async: Failed to compile userdata")

            images["default"]['user_data'] = userdata
            images["default"]['placement'] = zone
            images["default"]['count'] = count

            cluster = Laniakea(images)
            try:
                cluster.connect(region=region, aws_access_key_id=config.aws_access_key_id, aws_secret_access_key=config.aws_secret_access_key)
            except Exception as msg:
                logger.exception("[Pool %d] %s: laniakea failure: %s" % (pool.id, "start_instances_async", msg))

                # Log this error to the pool status messages
                entry = PoolStatusEntry()
                entry.type = 0
                entry.pool = pool
                entry.msg = str(msg)
                entry.isCritical = True
                entry.save()

                # Delete all pending instances as we failed to create them
                for instance in instances:
                    instance.delete()

                return

            config.ec2_tags['SpotManager-PoolId'] = str(pool.pk)

            try:
                logger.info("[Pool %d] Creating %s instances..." % (pool.id, count))
                boto_instances = cluster.create_spot(config.ec2_max_price, tags=config.ec2_tags, delete_on_termination=True, timeout=20 * 60)
                canceled_requests = count - len(boto_instances)

                logger.info("[Pool %d] Successfully created %s instances, %s requests timed out and were canceled" % (pool.id, len(boto_instances), canceled_requests))

                for i in range(0, len(boto_instances)):
                    instances[i].hostname = boto_instances[i].public_dns_name
                    instances[i].ec2_instance_id = boto_instances[i].id
                    # state_code is a 16-bit value where the high byte is
                    # an opaque internal value and should be ignored.
                    instances[i].status_code = boto_instances[i].state_code & 255
                    instances[i].save()

                    assert instances[i].ec2_instance_id is not None

                    # Now that we saved the object into our database, mark the instance as updatable
                    # so our update code can pick it up and update it accordingly when it changes states
                    boto_instances[i].add_tag("SpotManager-Updatable", "1")

                if canceled_requests > 0:
                    for i in range(len(boto_instances), count):
                        # Delete instances belonging to canceled spot requests
                        logger.info("[Pool %d] Deleting instance with id %s (belongs to canceled request)" % (pool.id, instances[i].pk))
                        instances[i].delete()

                # Delete certain warnings we might have created earlier that no longer apply

                # If we ever exceeded the maximum spot instance count, we can clear
                # the warning now because we obviously succeeded in starting some instances.
                PoolStatusEntry.objects.filter(pool=pool, type=POOL_STATUS_ENTRY_TYPE['max-spot-instance-count-exceeded']).delete()

                # The same holds for temporary failures of any sort
                PoolStatusEntry.objects.filter(pool=pool, type=POOL_STATUS_ENTRY_TYPE['temporary-failure']).delete()

                # Do not delete unclassified errors here for now, so the user can see them.

            except (boto.exception.EC2ResponseError, boto.exception.BotoServerError, ssl.SSLError, socket.error) as msg:
                if "MaxSpotInstanceCountExceeded" in str(msg):
                    logger.warning("[Pool %d] Maximum instance count exceeded for region %s" % (pool.id, region))
                    if not PoolStatusEntry.objects.filter(pool=pool, type=POOL_STATUS_ENTRY_TYPE['max-spot-instance-count-exceeded']):
                        entry = PoolStatusEntry()
                        entry.pool = pool
                        entry.type = POOL_STATUS_ENTRY_TYPE['max-spot-instance-count-exceeded']
                        entry.msg = "Auto-selected region exceeded its maximum spot instance count."
                        entry.save()
                elif "Service Unavailable" in str(msg):
                    logger.warning("[Pool %d] Temporary failure in region %s: %s" % (pool.id, region, msg))
                    entry = PoolStatusEntry()
                    entry.pool = pool
                    entry.type = POOL_STATUS_ENTRY_TYPE['temporary-failure']
                    entry.msg = "Temporary failure occurred: %s" % msg
                    entry.save()
                else:
                    logger.exception("[Pool %d] %s: boto failure: %s" % (pool.id, "start_instances_async", msg))
                    entry = PoolStatusEntry()
                    entry.type = 0
                    entry.pool = pool
                    entry.isCritical = True
                    entry.msg = "Unclassified error occurred: %s" % msg
                    entry.save()

                # Delete all pending instances, assuming that an exception from laniakea
                # means that all instance requests failed.
                for instance in instances:
                    instance.delete()

                return

        # TODO: We don't get any information back from the async method call here, but should handle failures!
        t = threading.Thread(target=start_instances_async, args=(pool, config, count, images, region, zone, instances))
        async_start_threads_by_poolid[pool.id] = t
        t.start()
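
# INSTANCE_STATE and POOL_STATUS_ENTRY_TYPE are referenced throughout these
# snippets but never defined in them. A plausible sketch of INSTANCE_STATE,
# using the EC2-documented low-byte instance state codes; the extra 'requested'
# entry (-1 here) is an assumption for instances that exist only in the local
# database and have no EC2 counterpart yet.
INSTANCE_STATE = {
    "requested": -1,  # assumed local-only marker, not an EC2 code
    "pending": 0,
    "running": 16,
    "shutting-down": 32,
    "terminated": 48,
    "stopping": 64,
    "stopped": 80,
}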
    def update_pool_instances(self, pool, config):
        """ Check the state of the instances in a pool and update it in the database """
        instances = Instance.objects.filter(pool=pool)
        instance_ids_by_region = self.get_instance_ids_by_region(instances)
        instances_by_ids = self.get_instances_by_ids(instances)
        instances_left = []

        for instance_id in instances_by_ids:
            if instance_id:
                instances_left.append(instances_by_ids[instance_id])

        for region in instance_ids_by_region:
            cluster = Laniakea(None)
            try:
                cluster.connect(region=region, aws_access_key_id=config.aws_access_key_id, aws_secret_access_key=config.aws_secret_access_key)
            except Exception as msg:
                # Log this error to the pool status messages
                entry = PoolStatusEntry()
                entry.type = 0
                entry.pool = pool
                entry.msg = str(msg)
                entry.isCritical = True
                entry.save()

                logger.exception("[Pool %d] %s: laniakea failure: %s" % (pool.id, "update_pool_instances", msg))
                return None

            try:
                boto_instances = cluster.find(filters={"tag:SpotManager-PoolId": str(pool.pk), "tag:SpotManager-Updatable": "1"})

                for boto_instance in boto_instances:
                    instance = None

                    # Whenever we see an instance that is not in our instance list for that region,
                    # make sure it's a terminated instance because we should never have a running
                    # instance that matches the search above but is not in our database.
                    if boto_instance.id not in instance_ids_by_region[region]:
                        if not (boto_instance.state_code == INSTANCE_STATE['shutting-down']
                                or boto_instance.state_code == INSTANCE_STATE['terminated']):

                            # As a last resort, try to find the instance in our database.
                            # If the instance was saved to our database between the entrance
                            # to this function and the search query sent to EC2, then the instance
                            # will not be in our instances list but returned by EC2. In this
                            # case, we try to load it directly from the database.
                            q = Instance.objects.filter(ec2_instance_id=boto_instance.id)
                            if q:
                                instance = q[0]
                            else:
                                logger.error("[Pool %d] Instance with EC2 ID %s is not in our database." % (pool.id, boto_instance.id))

                                # Terminate at this point, we are running in an inconsistent state
                                assert False

                        continue

                    if not instance:
                        instance = instances_by_ids[boto_instance.id]
                        instances_left.remove(instance)

                    # Check the status code and update if necessary
                    if instance.status_code != boto_instance.state_code:
                        instance.status_code = boto_instance.state_code
                        instance.save()

                    # If for some reason we don't have a hostname yet,
                    # update it accordingly.
                    if not instance.hostname:
                        instance.hostname = boto_instance.public_dns_name
                        instance.save()

            except boto.exception.EC2ResponseError as msg:
                logger.exception("%s: boto failure: %s" % ("update_pool_instances", msg))
                return 1

        if instances_left:
            for instance in instances_left:
                logger.info("[Pool %d] Deleting instance with EC2 ID %s from our database, has no corresponding machine on EC2." % (pool.id, instance.ec2_instance_id))
                instance.delete()
Example #6
    def check_instance_pools(self, initialCheck=False):
        # Check all start threads
        finished_start_thread_poolids = [
            id for id in async_start_threads_by_poolid
            if not async_start_threads_by_poolid[id].is_alive()
        ]
        for id in finished_start_thread_poolids:
            del async_start_threads_by_poolid[id]

        # Process all instance pools
        instance_pools = InstancePool.objects.all()
        for instance_pool in instance_pools:
            criticalPoolStatusEntries = PoolStatusEntry.objects.filter(
                pool=instance_pool, isCritical=True)

            if criticalPoolStatusEntries:
                continue

            if instance_pool.config.isCyclic() or instance_pool.config.getMissingParameters():
                entry = PoolStatusEntry()
                entry.pool = instance_pool
                entry.isCritical = True
                entry.type = POOL_STATUS_ENTRY_TYPE['config-error']
                entry.msg = "Configuration error."
                entry.save()
                continue

            config = instance_pool.config.flatten()

            instances_missing = config.size
            running_instances = []

            self.update_pool_instances(instance_pool, config)

            # On our initial check, we only do everything up to the pool update
            # to ensure that for every pool, the pool update can run successfully.
            if initialCheck:
                continue

            instances = Instance.objects.filter(pool=instance_pool)

            for instance in instances:
                instance_status_code_fixed = False
                if instance.status_code >= 256:
                    logger.warning(
                        "[Pool %d] Instance with EC2 ID %s has weird state code %d, attempting to fix..."
                        % (instance_pool.id, instance.ec2_instance_id,
                           instance.status_code))
                    instance.status_code -= 256
                    instance_status_code_fixed = True

                if instance.status_code in [
                        INSTANCE_STATE['running'], INSTANCE_STATE['pending'],
                        INSTANCE_STATE['requested']
                ]:
                    instances_missing -= 1
                    running_instances.append(instance)
                elif instance.status_code in [
                        INSTANCE_STATE['shutting-down'],
                        INSTANCE_STATE['terminated']
                ]:
                    # The instance is no longer running, delete it from our database
                    logger.info(
                        "[Pool %d] Deleting terminated instance with EC2 ID %s from our database."
                        % (instance_pool.id, instance.ec2_instance_id))
                    instance.delete()
                else:
                    if instance_status_code_fixed:
                        # Restore original status code for error reporting
                        instance.status_code += 256

                    logger.error(
                        "[Pool %d] Instance with EC2 ID %s has unexpected state code %d"
                        % (instance_pool.id, instance.ec2_instance_id,
                           instance.status_code))
                    # In some cases, EC2 sends undocumented status codes and we don't know why
                    # For now, reset the status code to 0, consider the instance still present
                    # and hope that with the next update iteration, the problem will be gone.
                    instance.status_code = 0
                    instance.save()
                    instances_missing -= 1
                    running_instances.append(instance)

            # Continue working with the instances we have running
            instances = running_instances

            if not instance_pool.isEnabled:
                if running_instances:
                    self.terminate_pool_instances(instance_pool,
                                                  running_instances,
                                                  config,
                                                  terminateByPool=True)

                    # Try to update our terminated instances as soon as possible. If EC2 needs longer than
                    # the here specified sleep time, the instances will be updated with the next iteration
                    # of this pool, allowing other actions to be processed in-between.
                    # time.sleep(2)
                    # self.update_pool_instances(instance_pool, config)
                continue

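            # pending_shutdown is not defined in this snippet; presumably it is a
            # module-level flag set (e.g. by a signal handler) once the daemon has
            # been asked to shut down.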
            if (not instance_pool.last_cycled or instance_pool.last_cycled <
                    timezone.now() - timezone.timedelta(seconds=config.cycle_interval)):
                if pending_shutdown:
                    logger.info(
                        "[Pool %d] Shutdown pending, skipping pool cycle..." %
                        instance_pool.id)
                else:
                    logger.info(
                        "[Pool %d] Needs to be cycled, terminating all instances..."
                        % instance_pool.id)
                    instance_pool.last_cycled = timezone.now()
                    self.terminate_pool_instances(instance_pool,
                                                  instances,
                                                  config,
                                                  terminateByPool=True)
                    instance_pool.save()

                    logger.info("[Pool %d] Termination complete." %
                                instance_pool.id)

            if instances_missing > 0:
                if pending_shutdown:
                    logger.info(
                        "[Pool %d] Shutdown pending, not starting further instances..."
                        % instance_pool.id)
                elif instance_pool.id in async_start_threads_by_poolid:
                    logger.debug(
                        "[Pool %d] Already has a start thread running, not starting further instances..."
                        % instance_pool.id)
                else:
                    logger.info(
                        "[Pool %d] Needs %s more instances, starting..." %
                        (instance_pool.id, instances_missing))
                    self.start_pool_instances(instance_pool,
                                              config,
                                              count=instances_missing)
            elif instances_missing < 0:
                # Select the oldest instances we have running and terminate
                # them so we meet the size limitation again.
                logger.info(
                    "[Pool %d] Has %s instances over limit, terminating..." %
                    (instance_pool.id, -instances_missing))
                instances = Instance.objects.filter(pool=instance_pool).order_by('created')[:-instances_missing]
                self.terminate_pool_instances(instance_pool, instances, config)
            else:
                logger.debug("[Pool %d] Size is ok." % instance_pool.id)
Example #7
    def update_pool_instances(self, pool, config):
        """ Check the state of the instances in a pool and update it in the database """
        instances = Instance.objects.filter(pool=pool)
        instance_ids_by_region = self.get_instance_ids_by_region(instances)
        instances_by_ids = self.get_instances_by_ids(instances)
        instances_left = []

        debug_boto_instance_ids_seen = set()
        debug_not_updatable_continue = set()
        debug_not_in_region = {}

        for instance_id in instances_by_ids:
            if instance_id:
                instances_left.append(instances_by_ids[instance_id])

        for region in instance_ids_by_region:
            cluster = Laniakea(None)
            try:
                cluster.connect(
                    region=region,
                    aws_access_key_id=config.aws_access_key_id,
                    aws_secret_access_key=config.aws_secret_access_key)
            except Exception as msg:
                # Log this error to the pool status messages
                entry = PoolStatusEntry()
                entry.type = 0
                entry.pool = pool
                entry.msg = str(msg)
                entry.isCritical = True
                entry.save()

                logger.exception("[Pool %d] %s: laniakea failure: %s" %
                                 (pool.id, "update_pool_instances", msg))
                return None

            try:
                boto_instances = cluster.find(
                    filters={"tag:SpotManager-PoolId": str(pool.pk)})

                for boto_instance in boto_instances:
                    # Store ID seen for debugging purposes
                    debug_boto_instance_ids_seen.add(boto_instance.id)

                    # state_code is a 16-bit value where the high byte is
                    # an opaque internal value and should be ignored.
                    state_code = boto_instance.state_code & 255

                    if "SpotManager-Updatable" not in boto_instance.tags or int(
                            boto_instance.tags["SpotManager-Updatable"]) <= 0:
                        # The instance is not marked as updatable. We must not touch it because
                        # a spawning thread is still managing this instance. However, we must also
                        # remove this instance from the instances_left list if it's already in our
                        # database, because otherwise our code here would delete it from the database.
                        if boto_instance.id in instance_ids_by_region[region]:
                            instances_left.remove(
                                instances_by_ids[boto_instance.id])
                        else:
                            debug_not_updatable_continue.add(boto_instance.id)
                        continue

                    instance = None

                    # Whenever we see an instance that is not in our instance list for that region,
                    # make sure it's a terminated instance because we should never have a running
                    # instance that matches the search above but is not in our database.
                    if boto_instance.id not in instance_ids_by_region[region]:
                        if state_code not in (INSTANCE_STATE['shutting-down'],
                                              INSTANCE_STATE['terminated']):

                            # As a last resort, try to find the instance in our database.
                            # If the instance was saved to our database between the entrance
                            # to this function and the search query sent to EC2, then the instance
                            # will not be in our instances list but returned by EC2. In this
                            # case, we try to load it directly from the database.
                            q = Instance.objects.filter(
                                ec2_instance_id=boto_instance.id)
                            if q:
                                instance = q[0]
                                logger.error(
                                    "[Pool %d] Instance with EC2 ID %s was reloaded from database."
                                    % (pool.id, boto_instance.id))
                            else:
                                logger.error(
                                    "[Pool %d] Instance with EC2 ID %s is not in our database."
                                    % (pool.id, boto_instance.id))

                                # Terminate at this point, we are running in an inconsistent state
                                assert False
                        debug_not_in_region[boto_instance.id] = state_code
                        continue

                    instance = instances_by_ids[boto_instance.id]
                    instances_left.remove(instance)

                    # Check the status code and update if necessary
                    if instance.status_code != state_code:
                        instance.status_code = state_code
                        instance.save()

                    # If for some reason we don't have a hostname yet,
                    # update it accordingly.
                    if not instance.hostname:
                        instance.hostname = boto_instance.public_dns_name
                        instance.save()

            except (boto.exception.EC2ResponseError,
                    boto.exception.BotoServerError, ssl.SSLError,
                    socket.error) as msg:
                logger.exception("%s: boto failure: %s" %
                                 ("update_pool_instances", msg))
                return 1

        if instances_left:
            for instance in instances_left:
                if instance.ec2_instance_id not in debug_boto_instance_ids_seen:
                    logger.info(
                        "[Pool %d] Deleting instance with EC2 ID %s from our database, has no corresponding machine on EC2."
                        % (pool.id, instance.ec2_instance_id))

                if instance.ec2_instance_id in debug_not_updatable_continue:
                    logger.error(
                        "[Pool %d] Deleting instance with EC2 ID %s from our database because it is marked not updatable and is not in our region list."
                        % (pool.id, instance.ec2_instance_id))

                if instance.ec2_instance_id in debug_not_in_region:
                    logger.info(
                        "[Pool %d] Deleting instance with EC2 ID %s from our database, has state code %s on EC2"
                        % (pool.id, instance.ec2_instance_id,
                           debug_not_in_region[instance.ec2_instance_id]))

                logger.info(
                    "[Pool %d] Deleting instance with EC2 ID %s from our database."
                    % (pool.id, instance.ec2_instance_id))
                instance.delete()
        def start_instances_async(pool, config, count, images, region, zone,
                                  instances):
            userdata = LaniakeaCommandLine.handle_import_tags(
                config.ec2_userdata)
            userdata = LaniakeaCommandLine.handle_tags(
                userdata, config.ec2_userdata_macros)
            if not userdata:
                logger.error("[Pool %d] Failed to compile userdata." % pool.id)
                raise RuntimeError(
                    "start_instances_async: Failed to compile userdata")

            images["default"]['user_data'] = userdata
            images["default"]['placement'] = zone
            images["default"]['count'] = count

            cluster = Laniakea(images)
            try:
                cluster.connect(
                    region=region,
                    aws_access_key_id=config.aws_access_key_id,
                    aws_secret_access_key=config.aws_secret_access_key)
            except Exception as msg:
                logger.exception("[Pool %d] %s: laniakea failure: %s" %
                                 (pool.id, "start_instances_async", msg))

                # Log this error to the pool status messages
                entry = PoolStatusEntry()
                entry.pool = pool
                entry.msg = str(msg)
                entry.isCritical = True
                entry.save()

                # Delete all pending instances as we failed to create them
                for instance in instances:
                    instance.delete()

                return

            config.ec2_tags['SpotManager-PoolId'] = str(pool.pk)

            try:
                logger.info("[Pool %d] Creating %s instances..." %
                            (pool.id, count))
                (boto_instances, boto_pending) = cluster.create_spot(
                    config.ec2_max_price,
                    tags=config.ec2_tags,
                    delete_on_termination=True,
                    timeout=20 * 60)

                logger.info(
                    "[Pool %d] Successfully created %s instances, %s requests timed out and were canceled"
                    % (pool.id, len(boto_instances), len(boto_pending)))

                assert (len(boto_instances) +
                        len(boto_pending)) == len(instances) == count

                for i in range(0, len(boto_instances)):
                    instances[i].hostname = boto_instances[i].public_dns_name
                    instances[i].ec2_instance_id = boto_instances[i].id
                    instances[i].status_code = boto_instances[i].state_code
                    instances[i].save()

                    assert instances[i].ec2_instance_id is not None

                    # Now that we saved the object into our database, mark the instance as updatable
                    # so our update code can pick it up and update it accordingly when it changes states
                    boto_instances[i].add_tag("SpotManager-Updatable", "1")

                if boto_pending:
                    for i in range(len(boto_instances), count):
                        # Delete instances belonging to canceled spot requests
                        logger.info(
                            "[Pool %d] Deleting instance with id %s (belongs to canceled request)"
                            % (pool.id, instances[i].pk))
                        instances[i].delete()

            except boto.exception.EC2ResponseError as msg:
                logger.exception("[Pool %d] %s: boto failure: %s" %
                                 (pool.id, "start_instances_async", msg))
                return
Example #9
    def start_pool_instances(self, pool, config, count=1):
        """ Start an instance with the given configuration """

        images = self.create_laniakea_images(config)

        # Figure out where to put our instances
        try:
            (region, zone, rejected) = self.get_best_region_zone(config)
        except (boto.exception.EC2ResponseError,
                boto.exception.BotoServerError, ssl.SSLError,
                socket.error) as msg:
            # In case of temporary failures here, we will retry again in the next cycle
            logger.warning(
                "[Pool %d] Failed to acquire spot instance prices: %s." %
                (pool.id, msg))
            return
        except RuntimeError as msg:
            logger.error("[Pool %d] Failed to compile userdata." % pool.id)
            entry = PoolStatusEntry()
            entry.type = POOL_STATUS_ENTRY_TYPE['config-error']
            entry.pool = pool
            entry.isCritical = True
            entry.msg = "Configuration error: %s" % msg
            entry.save()
            return

        priceLowEntries = PoolStatusEntry.objects.filter(
            pool=pool, type=POOL_STATUS_ENTRY_TYPE['price-too-low'])

        if not region:
            logger.warning(
                "[Pool %d] No allowed region was cheap enough to spawn instances."
                % pool.id)

            if not priceLowEntries:
                entry = PoolStatusEntry()
                entry.pool = pool
                entry.type = POOL_STATUS_ENTRY_TYPE['price-too-low']
                entry.msg = "No allowed region was cheap enough to spawn instances."
                for zone in rejected:
                    entry.msg += "\n%s at %s" % (zone, rejected[zone])
                entry.save()
            return
        else:
            if priceLowEntries:
                priceLowEntries.delete()

        logger.debug("[Pool %d] Using region %s with availability zone %s." %
                     (pool.id, region, zone))

        instances = []

        # Create all our instances as pending, the async thread will update them once
        # they have been spawned.
        for i in range(0, count):
            instance = Instance()
            instance.ec2_region = region
            instance.ec2_zone = zone
            instance.status_code = INSTANCE_STATE["requested"]
            instance.pool = pool
            instance.save()
            instances.append(instance)

        # This method will run async to spawn our machines
        def start_instances_async(pool, config, count, images, region, zone,
                                  instances):
            userdata = LaniakeaCommandLine.handle_import_tags(
                config.ec2_userdata)
            userdata = LaniakeaCommandLine.handle_tags(
                userdata, config.ec2_userdata_macros)
            if not userdata:
                logger.error("[Pool %d] Failed to compile userdata." % pool.id)

                entry = PoolStatusEntry()
                entry.type = POOL_STATUS_ENTRY_TYPE['config-error']
                entry.pool = pool
                entry.isCritical = True
                entry.msg = "Configuration error: Failed to compile userdata"
                entry.save()

                for instance in instances:
                    instance.delete()

                raise RuntimeError(
                    "start_instances_async: Failed to compile userdata")

            images["default"]['user_data'] = userdata
            images["default"]['placement'] = zone
            images["default"]['count'] = count

            cluster = Laniakea(images)
            try:
                cluster.connect(
                    region=region,
                    aws_access_key_id=config.aws_access_key_id,
                    aws_secret_access_key=config.aws_secret_access_key)
            except Exception as msg:
                logger.exception("[Pool %d] %s: laniakea failure: %s" %
                                 (pool.id, "start_instances_async", msg))

                # Log this error to the pool status messages
                entry = PoolStatusEntry()
                entry.type = 0
                entry.pool = pool
                entry.msg = str(msg)
                entry.isCritical = True
                entry.save()

                # Delete all pending instances as we failed to create them
                for instance in instances:
                    instance.delete()

                return

            config.ec2_tags['SpotManager-PoolId'] = str(pool.pk)

            try:
                logger.info("[Pool %d] Creating %s instances..." %
                            (pool.id, count))
                boto_instances = cluster.create_spot(
                    config.ec2_max_price,
                    tags=config.ec2_tags,
                    delete_on_termination=True,
                    timeout=20 * 60)
                canceled_requests = count - len(boto_instances)

                logger.info(
                    "[Pool %d] Successfully created %s instances, %s requests timed out and were canceled"
                    % (pool.id, len(boto_instances), canceled_requests))

                for i in range(0, len(boto_instances)):
                    instances[i].hostname = boto_instances[i].public_dns_name
                    instances[i].ec2_instance_id = boto_instances[i].id
                    # state_code is a 16-bit value where the high byte is
                    # an opaque internal value and should be ignored.
                    instances[i].status_code = boto_instances[i].state_code & 255
                    instances[i].save()

                    assert instances[i].ec2_instance_id is not None

                    # Now that we saved the object into our database, mark the instance as updatable
                    # so our update code can pick it up and update it accordingly when it changes states
                    boto_instances[i].add_tag("SpotManager-Updatable", "1")

                if canceled_requests > 0:
                    for i in range(len(boto_instances), count):
                        # Delete instances belonging to canceled spot requests
                        logger.info(
                            "[Pool %d] Deleting instance with id %s (belongs to canceled request)"
                            % (pool.id, instances[i].pk))
                        instances[i].delete()

                # Delete certain warnings we might have created earlier that no longer apply

                # If we ever exceeded the maximum spot instance count, we can clear
                # the warning now because we obviously succeeded in starting some instances.
                PoolStatusEntry.objects.filter(
                    pool=pool,
                    type=POOL_STATUS_ENTRY_TYPE[
                        'max-spot-instance-count-exceeded']).delete()

                # The same holds for temporary failures of any sort
                PoolStatusEntry.objects.filter(
                    pool=pool,
                    type=POOL_STATUS_ENTRY_TYPE['temporary-failure']).delete()

                # Do not delete unclassified errors here for now, so the user can see them.

            except (boto.exception.EC2ResponseError,
                    boto.exception.BotoServerError, ssl.SSLError,
                    socket.error) as msg:
                if "MaxSpotInstanceCountExceeded" in str(msg):
                    logger.warning(
                        "[Pool %d] Maximum instance count exceeded for region %s"
                        % (pool.id, region))
                    if not PoolStatusEntry.objects.filter(
                            pool=pool,
                            type=POOL_STATUS_ENTRY_TYPE[
                                'max-spot-instance-count-exceeded']):
                        entry = PoolStatusEntry()
                        entry.pool = pool
                        entry.type = POOL_STATUS_ENTRY_TYPE[
                            'max-spot-instance-count-exceeded']
                        entry.msg = "Auto-selected region exceeded its maximum spot instance count."
                        entry.save()
                elif "Service Unavailable" in str(msg):
                    logger.warning(
                        "[Pool %d] Temporary failure in region %s: %s" %
                        (pool.id, region, msg))
                    entry = PoolStatusEntry()
                    entry.pool = pool
                    entry.type = POOL_STATUS_ENTRY_TYPE['temporary-failure']
                    entry.msg = "Temporary failure occurred: %s" % msg
                    entry.save()
                else:
                    logger.exception("[Pool %d] %s: boto failure: %s" %
                                     (pool.id, "start_instances_async", msg))
                    entry = PoolStatusEntry()
                    entry.type = 0
                    entry.pool = pool
                    entry.isCritical = True
                    entry.msg = "Unclassified error occurred: %s" % msg
                    entry.save()

                # Delete all pending instances, assuming that an exception from laniakea
                # means that all instance requests failed.
                for instance in instances:
                    instance.delete()

                return

        # TODO: We don't get any information back from the async method call here, but should handle failures!
        t = threading.Thread(target=start_instances_async,
                             args=(pool, config, count, images, region, zone,
                                   instances))
        async_start_threads_by_poolid[pool.id] = t
        t.start()
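
    # One possible way to address the TODO above (an assumption, not part of the
    # original code): route the thread target through a wrapper that logs any
    # exception escaping the worker, so failures are at least visible in the logs:
    #   threading.Thread(target=self._run_and_log_failures,
    #                    args=(start_instances_async, pool, config, count, images, region, zone, instances))
    @staticmethod
    def _run_and_log_failures(target, pool, *args):
        try:
            target(pool, *args)
        except Exception:
            logger.exception("[Pool %d] async start thread failed" % pool.id)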
    def check_instance_pools(self):
        instance_pools = InstancePool.objects.all()

        # Process all instance pools
        for instance_pool in instance_pools:
            criticalPoolStatusEntries = PoolStatusEntry.objects.filter(pool=instance_pool, isCritical=True)
            
            if criticalPoolStatusEntries:
                continue
            
            if instance_pool.config.isCyclic() or instance_pool.config.getMissingParameters():
                entry = PoolStatusEntry()
                entry.pool = instance_pool
                entry.isCritical = True
                entry.type = POOL_STATUS_ENTRY_TYPE['config-error']
                entry.msg = "Configuration error."
                entry.save()
                continue
                
            config = instance_pool.config.flatten()
            
            instances_missing = config.size
            running_instances = []
            
            self.update_pool_instances(instance_pool, config)
            
            instances = Instance.objects.filter(pool=instance_pool)
            
            for instance in instances:
                if instance.status_code in [INSTANCE_STATE['running'], INSTANCE_STATE['pending'], INSTANCE_STATE['requested']]:
                    instances_missing -= 1
                    running_instances.append(instance)
                elif instance.status_code in [INSTANCE_STATE['shutting-down'], INSTANCE_STATE['terminated']]: 
                    # The instance is no longer running, delete it from our database
                    logger.info("[Pool %d] Deleting terminated instance with EC2 ID %s from our database." % (instance_pool.id, instance.ec2_instance_id))
                    instance.delete()
                else:
                    logger.error("[Pool %d] Instance with EC2 ID %s has unexpected state code %d" % (instance_pool.id, instance.ec2_instance_id, instance.status_code))
                    # Terminate here for now so we can see which status code we are not handling properly
                    assert False
            
            # Continue working with the instances we have running
            instances = running_instances
                    
            if not instance_pool.isEnabled:
                if running_instances:
                    self.terminate_pool_instances(instance_pool, running_instances, config, terminateByPool=True)
                    
                    # Try to update our terminated instances as soon as possible. If EC2 needs longer than
                    # the here specified sleep time, the instances will be updated with the next iteration
                    # of this pool, allowing other actions to be processed in-between.
                    #time.sleep(2)
                    #self.update_pool_instances(instance_pool, config)
                continue
            
            if (not instance_pool.last_cycled) or instance_pool.last_cycled < timezone.now() - timezone.timedelta(seconds=config.cycle_interval):
                logger.info("[Pool %d] Needs to be cycled, terminating all instances..." % instance_pool.id)
                instance_pool.last_cycled = timezone.now()
                self.terminate_pool_instances(instance_pool, instances, config, terminateByPool=True)
                instance_pool.save()
                
                # Try to update our terminated instances as soon as possible. If EC2 needs longer than
                # the here specified sleep time, the instances will be updated with the next iteration
                # of this pool, allowing other actions to be processed in-between.
                #time.sleep(2)
                #self.update_pool_instances(instance_pool, config)
                logger.info("[Pool %d] Termination complete." % instance_pool.id)
            
            # Determine which instances need to be cycled
            #outdated_instances = instances.filter(created__lt = timezone.now() - timezone.timedelta(seconds=config.cycle_interval))
            
            # Terminate all instances that need cycling
            #for instance in outdated_instances:
            #    self.terminate_instance(instance, config)
            #    instances_missing += 1
            
            if instances_missing > 0:
                logger.info("[Pool %d] Needs %s more instances, starting..." % (instance_pool.id, instances_missing))
                self.start_pool_instances(instance_pool, config, count=instances_missing)
            elif instances_missing < 0:
                # Select the oldest instances we have running and terminate
                # them so we meet the size limitation again.
                logger.info("[Pool %d] Has %s instances over limit, terminating..." % (instance_pool.id, -instances_missing))
                instances = Instance.objects.filter(pool=instance_pool).order_by('created')[:-instances_missing]
                self.terminate_pool_instances(instance_pool, instances, config)
            else:
                logger.debug("[Pool %d] Size is ok." % instance_pool.id)
        def start_instances_async(pool, config, count, images, region, zone, instances):
            userdata = LaniakeaCommandLine.handle_import_tags(config.ec2_userdata)
            userdata = LaniakeaCommandLine.handle_tags(userdata, config.ec2_userdata_macros)
            if not userdata:
                logger.error("[Pool %d] Failed to compile userdata." % pool.id)
                raise RuntimeError("start_instances_async: Failed to compile userdata")

            images["default"]['user_data'] = userdata
            images["default"]['placement'] = zone
            images["default"]['count'] = count

            cluster = Laniakea(images)
            try:
                cluster.connect(region=region, aws_access_key_id=config.aws_access_key_id, aws_secret_access_key=config.aws_secret_access_key)
            except Exception as msg:
                logger.exception("[Pool %d] %s: laniakea failure: %s" % (pool.id, "start_instances_async", msg))

                # Log this error to the pool status messages
                entry = PoolStatusEntry()
                entry.pool = pool
                entry.msg = str(msg)
                entry.isCritical = True
                entry.save()

                # Delete all pending instances as we failed to create them
                for instance in instances:
                    instance.delete()

                return

            config.ec2_tags['SpotManager-PoolId'] = str(pool.pk)

            try:
                logger.info("[Pool %d] Creating %s instances..." % (pool.id, count))
                (boto_instances, boto_pending) = cluster.create_spot(config.ec2_max_price, tags=config.ec2_tags, delete_on_termination=True, timeout=20 * 60)

                logger.info("[Pool %d] Successfully created %s instances, %s requests timed out and were canceled" % (pool.id, len(boto_instances), len(boto_pending)))

                assert (len(boto_instances) + len(boto_pending)) == len(instances) == count

                for i in range(len(boto_instances)):
                    instances[i].hostname = boto_instances[i].public_dns_name
                    instances[i].ec2_instance_id = boto_instances[i].id
                    instances[i].status_code = boto_instances[i].state_code
                    instances[i].save()

                    assert instances[i].ec2_instance_id is not None

                    # Now that we saved the object into our database, mark the instance as updatable
                    # so our update code can pick it up and update it accordingly when it changes states
                    boto_instances[i].add_tag("SpotManager-Updatable", "1")

                if boto_pending:
                    for i in range(len(boto_instances), count):
                        # Delete instances belonging to canceled spot requests
                        logger.info("[Pool %d] Deleting instance with id %s (belongs to canceled request)" % (pool.id, instances[i].pk))
                        instances[i].delete()

            except boto.exception.EC2ResponseError as msg:
                logger.exception("[Pool %d] %s: boto failure: %s" % (pool.id, "start_instances_async", msg))
                return
 def start_pool_instances(self, pool, config, count=1):
     """ Start an instance with the given configuration """
     
     images = self.create_laniakea_images(config)
     
     # Figure out where to put our instances
     (region, zone, rejected) = self.get_best_region_zone(config)
     
     priceLowEntries = PoolStatusEntry.objects.filter(pool = pool, type = POOL_STATUS_ENTRY_TYPE['price-too-low'])
     
     if not region:
        logger.warning("[Pool %d] No allowed region was cheap enough to spawn instances." % pool.id)
         
         if not priceLowEntries:
             entry = PoolStatusEntry()
             entry.pool = pool
             entry.type = POOL_STATUS_ENTRY_TYPE['price-too-low']
             entry.msg = "No allowed region was cheap enough to spawn instances."
            for rejected_zone in rejected:
                entry.msg += "\n%s at %s" % (rejected_zone, rejected[rejected_zone])
             entry.save()
         return
     else:
         if priceLowEntries:
             priceLowEntries.delete()
     
     logger.debug("[Pool %d] Using region %s with availability zone %s." % (pool.id, region, zone))
     
     instances = []
     
     # Create all our instances as pending, the async thread will update them once
     # they have been spawned.
     for i in range(0,count):
         instance = Instance()
         instance.ec2_region = region
         instance.ec2_zone = zone
         instance.status_code = INSTANCE_STATE["requested"]
         instance.pool = pool
         instance.save()
         instances.append(instance)
     
     # This method will run async to spawn our machines
     def start_instances_async(pool, config, count, images, region, zone, instances):
         userdata = LaniakeaCommandLine.handle_import_tags(config.ec2_userdata)
         userdata = LaniakeaCommandLine.handle_tags(userdata, config.ec2_userdata_macros)
         if not userdata:
             logger.error("[Pool %d] Failed to compile userdata." % pool.id)
             raise RuntimeError("start_instances_async: Failed to compile userdata")
         
         images["default"]['user_data'] = userdata
         images["default"]['placement'] = zone
         images["default"]['count'] = count
 
         cluster = Laniakea(images)
         try:
             cluster.connect(region=region, aws_access_key_id=config.aws_access_key_id, aws_secret_access_key=config.aws_secret_access_key)
         except Exception as msg:
             logger.exception("[Pool %d] %s: laniakea failure: %s" % (pool.id, "start_instances_async", msg))
             
             # Log this error to the pool status messages
             entry = PoolStatusEntry()
             entry.pool = pool
             entry.msg = str(msg)
             entry.isCritical = True
             entry.save()
             
             # Delete all pending instances as we failed to create them
             for instance in instances:
                 instance.delete()
                 
             return
         
         config.ec2_tags['SpotManager-PoolId'] = str(pool.pk)
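        # Every spawned instance carries its pool id in this tag; update_pool_instances and
        # terminate_pool_instances later locate pool members via the corresponding tag filter.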
 
         try:
             logger.info("[Pool %d] Creating %s instances..." % (pool.id, count))
             (boto_instances, boto_pending) = cluster.create_spot(config.ec2_max_price, tags=config.ec2_tags, delete_on_termination=True, timeout=20*60)
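            # boto_pending presumably holds the spot requests that were not fulfilled within
            # the 20-minute timeout and were therefore canceled (see the log message below).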
             
             logger.info("[Pool %d] Successfully created %s instances, %s requests timed out and were canceled" % (pool.id, len(boto_instances), len(boto_pending)))
             
             assert (len(boto_instances) + len(boto_pending)) == len(instances) == count
             
             for i in range(0,len(boto_instances)):
                 instances[i].hostname = boto_instances[i].public_dns_name
                 instances[i].ec2_instance_id = boto_instances[i].id
                 instances[i].status_code = boto_instances[i].state_code
                 instances[i].save()
                 
                assert instances[i].ec2_instance_id is not None
                 
                 # Now that we saved the object into our database, mark the instance as updatable
                 # so our update code can pick it up and update it accordingly when it changes states 
                 boto_instances[i].add_tag("SpotManager-Updatable", "1")
                 
             if boto_pending:
                 for i in range(len(boto_instances),count):
                    # Delete instances belonging to canceled spot requests
                     logger.info("[Pool %d] Deleting instance with id %s (belongs to canceled request)" % (pool.id, instances[i].pk))
                     instances[i].delete()
             
         except boto.exception.EC2ResponseError as msg:
             logger.exception("[Pool %d] %s: boto failure: %s" % (pool.id, "start_instances_async", msg))
             return
     
     # TODO: We don't get any information back from the async method call here, but should handle failures!
     t = threading.Thread(target=start_instances_async, args = (pool, config, count, images, region, zone, instances))
     t.start()
    def update_pool_instances(self, pool, config):
        """ Check the state of the instances in a pool and update it in the database """
        instances = Instance.objects.filter(pool=pool)
        instance_ids_by_region = self.get_instance_ids_by_region(instances)
        instances_by_ids = self.get_instances_by_ids(instances)
        instances_left = []

        debug_boto_instance_ids_seen = set()
        debug_not_updatable_continue = set()
        debug_not_in_region = {}


        for instance_id in instances_by_ids:
            if instance_id:
                instances_left.append(instances_by_ids[instance_id])
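
        # instances_left starts out containing every database instance that has an EC2
        # instance ID; entries are removed below as they are matched against EC2, so any
        # instance still listed afterwards has no live counterpart and is purged at the end.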

        for region in instance_ids_by_region:
            cluster = Laniakea(None)
            try:
                cluster.connect(region=region, aws_access_key_id=config.aws_access_key_id, aws_secret_access_key=config.aws_secret_access_key)
            except Exception as msg:
                # Log this error to the pool status messages
                entry = PoolStatusEntry()
                entry.type = 0
                entry.pool = pool
                entry.msg = str(msg)
                entry.isCritical = True
                entry.save()

                logger.exception("[Pool %d] %s: laniakea failure: %s" % (pool.id, "update_pool_instances", msg))
                return None

            try:
                boto_instances = cluster.find(filters={"tag:SpotManager-PoolId" : str(pool.pk)})

                for boto_instance in boto_instances:
                    # Store ID seen for debugging purposes
                    debug_boto_instance_ids_seen.add(boto_instance.id)

                    # state_code is a 16-bit value where the high byte is
                    # an opaque internal value and should be ignored.
                    state_code = boto_instance.state_code & 255
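                    # For example, EC2 may report 0x0110 (272) for a running instance;
                    # masking with 0xFF yields 16, the documented code for 'running'.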

                    if "SpotManager-Updatable" not in boto_instance.tags or int(boto_instance.tags["SpotManager-Updatable"]) <= 0:
                        # The instance is not marked as updatable. We must not touch it because
                        # a spawning thread is still managing this instance. However, we must also
                        # remove this instance from the instances_left list if it's already in our
                        # database, because otherwise our code here would delete it from the database.
                        if boto_instance.id in instance_ids_by_region[region]:
                            instances_left.remove(instances_by_ids[boto_instance.id])
                        else:
                            debug_not_updatable_continue.add(boto_instance.id)
                        continue

                    instance = None

                    # Whenever we see an instance that is not in our instance list for that region,
                    # make sure it's a terminated instance because we should never have a running
                    # instance that matches the search above but is not in our database.
                    if boto_instance.id not in instance_ids_by_region[region]:
                        if state_code not in (INSTANCE_STATE['shutting-down'], INSTANCE_STATE['terminated']):

                            # As a last resort, try to find the instance in our database.
                            # If the instance was saved to our database between the entrance
                            # to this function and the search query sent to EC2, then the instance
                            # will not be in our instances list but returned by EC2. In this
                            # case, we try to load it directly from the database.
                            q = Instance.objects.filter(ec2_instance_id=boto_instance.id)
                            if q:
                                instance = q[0]
                                logger.error("[Pool %d] Instance with EC2 ID %s was reloaded from database." % (pool.id, boto_instance.id))
                            else:
                                logger.error("[Pool %d] Instance with EC2 ID %s is not in our database." % (pool.id, boto_instance.id))

                                # Terminate at this point, we run in an inconsistent state
                                assert(False)
                        debug_not_in_region[boto_instance.id] = state_code
                        continue

                    instance = instances_by_ids[boto_instance.id]
                    instances_left.remove(instance)

                    # Check the status code and update if necessary
                    if instance.status_code != state_code:
                        instance.status_code = state_code
                        instance.save()

                    # If for some reason we don't have a hostname yet,
                    # update it accordingly.
                    if not instance.hostname:
                        instance.hostname = boto_instance.public_dns_name
                        instance.save()

            except (boto.exception.EC2ResponseError, boto.exception.BotoServerError, ssl.SSLError, socket.error) as msg:
                logger.exception("%s: boto failure: %s" % ("update_pool_instances", msg))
                return 1

        if instances_left:
            for instance in instances_left:
                if instance.ec2_instance_id not in debug_boto_instance_ids_seen:
                    logger.info("[Pool %d] Deleting instance with EC2 ID %s from our database, has no corresponding machine on EC2." % (pool.id, instance.ec2_instance_id))

                if instance.ec2_instance_id in debug_not_updatable_continue:
                    logger.error("[Pool %d] Deleting instance with EC2 ID %s from our database because it is not updatable but not in our region." % (pool.id, instance.ec2_instance_id))

                if instance.ec2_instance_id in debug_not_in_region:
                    logger.info("[Pool %d] Deleting instance with EC2 ID %s from our database, has state code %s on EC2" % (pool.id, instance.ec2_instance_id, debug_not_in_region[instance.ec2_instance_id]))

                logger.info("[Pool %d] Deleting instance with EC2 ID %s from our database." % (pool.id, instance.ec2_instance_id))
                instance.delete()
Example #15
        def start_instances_async(pool, config, count, images, region, zone,
                                  instances):
            userdata = LaniakeaCommandLine.handle_import_tags(
                config.ec2_userdata)
            userdata = LaniakeaCommandLine.handle_tags(
                userdata, config.ec2_userdata_macros)
            if not userdata:
                logger.error("[Pool %d] Failed to compile userdata." % pool.id)

                entry = PoolStatusEntry()
                entry.type = POOL_STATUS_ENTRY_TYPE['config-error']
                entry.pool = pool
                entry.isCritical = True
                entry.msg = "Configuration error: Failed to compile userdata"
                entry.save()

                for instance in instances:
                    instance.delete()

                raise RuntimeError(
                    "start_instances_async: Failed to compile userdata")

            images["default"]['user_data'] = userdata
            images["default"]['placement'] = zone
            images["default"]['count'] = count

            cluster = Laniakea(images)
            try:
                cluster.connect(
                    region=region,
                    aws_access_key_id=config.aws_access_key_id,
                    aws_secret_access_key=config.aws_secret_access_key)
            except Exception as msg:
                logger.exception("[Pool %d] %s: laniakea failure: %s" %
                                 (pool.id, "start_instances_async", msg))

                # Log this error to the pool status messages
                entry = PoolStatusEntry()
                entry.type = 0
                entry.pool = pool
                entry.msg = str(msg)
                entry.isCritical = True
                entry.save()

                # Delete all pending instances as we failed to create them
                for instance in instances:
                    instance.delete()

                return

            config.ec2_tags['SpotManager-PoolId'] = str(pool.pk)

            try:
                logger.info("[Pool %d] Creating %s instances..." %
                            (pool.id, count))
                boto_instances = cluster.create_spot(
                    config.ec2_max_price,
                    tags=config.ec2_tags,
                    delete_on_termination=True,
                    timeout=20 * 60)
                canceled_requests = count - len(boto_instances)
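                # In this revision, create_spot is assumed to return only the fulfilled
                # instances, so the shortfall against count equals the timed-out requests.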

                logger.info(
                    "[Pool %d] Successfully created %s instances, %s requests timed out and were canceled"
                    % (pool.id, len(boto_instances), canceled_requests))

                for i in range(0, len(boto_instances)):
                    instances[i].hostname = boto_instances[i].public_dns_name
                    instances[i].ec2_instance_id = boto_instances[i].id
                    # state_code is a 16-bit value where the high byte is
                    # an opaque internal value and should be ignored.
                    instances[i].status_code = boto_instances[i].state_code & 255
                    instances[i].save()

                    assert instances[i].ec2_instance_id is not None

                    # Now that we saved the object into our database, mark the instance as updatable
                    # so our update code can pick it up and update it accordingly when it changes states
                    boto_instances[i].add_tag("SpotManager-Updatable", "1")

                if canceled_requests > 0:
                    for i in range(len(boto_instances), count):
                        # Delete instances belonging to canceled spot requests
                        logger.info(
                            "[Pool %d] Deleting instance with id %s (belongs to canceled request)"
                            % (pool.id, instances[i].pk))
                        instances[i].delete()

                # Delete certain warnings we might have created earlier that no longer apply

                # If we ever exceeded the maximum spot instance count, we can clear
                # the warning now because we obviously succeeded in starting some instances.
                PoolStatusEntry.objects.filter(
                    pool=pool,
                    type=POOL_STATUS_ENTRY_TYPE[
                        'max-spot-instance-count-exceeded']).delete()

                # The same holds for temporary failures of any sort
                PoolStatusEntry.objects.filter(
                    pool=pool,
                    type=POOL_STATUS_ENTRY_TYPE['temporary-failure']).delete()

                # Do not delete unclassified errors here for now, so the user can see them.

            except (boto.exception.EC2ResponseError,
                    boto.exception.BotoServerError, ssl.SSLError,
                    socket.error) as msg:
                if "MaxSpotInstanceCountExceeded" in str(msg):
                    logger.warning(
                        "[Pool %d] Maximum instance count exceeded for region %s"
                        % (pool.id, region))
                    if not PoolStatusEntry.objects.filter(
                            pool=pool,
                            type=POOL_STATUS_ENTRY_TYPE[
                                'max-spot-instance-count-exceeded']):
                        entry = PoolStatusEntry()
                        entry.pool = pool
                        entry.type = POOL_STATUS_ENTRY_TYPE[
                            'max-spot-instance-count-exceeded']
                        entry.msg = "Auto-selected region exceeded its maximum spot instance count."
                        entry.save()
                elif "Service Unavailable" in str(msg):
                    logger.warning(
                        "[Pool %d] Temporary failure in region %s: %s" %
                        (pool.id, region, msg))
                    entry = PoolStatusEntry()
                    entry.pool = pool
                    entry.type = POOL_STATUS_ENTRY_TYPE['temporary-failure']
                    entry.msg = "Temporary failure occurred: %s" % msg
                    entry.save()
                else:
                    logger.exception("[Pool %d] %s: boto failure: %s" %
                                     (pool.id, "start_instances_async", msg))
                    entry = PoolStatusEntry()
                    entry.type = 0
                    entry.pool = pool
                    entry.isCritical = True
                    entry.msg = "Unclassified error occurred: %s" % msg
                    entry.save()

                # Delete all pending instances, assuming that an exception from laniakea
                # means that all instance requests failed.
                for instance in instances:
                    instance.delete()

                return
    def check_instance_pools(self, initialCheck=False):
        # Check all start threads and reap those that have finished
        finished_start_thread_poolids = [pool_id for pool_id in async_start_threads_by_poolid
                                         if not async_start_threads_by_poolid[pool_id].is_alive()]
        for pool_id in finished_start_thread_poolids:
            del async_start_threads_by_poolid[pool_id]
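
        # Keeping at most one live start thread per pool id lets the sizing logic below
        # skip pools that already have a spawn in progress (see the check further down).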

        # Process all instance pools
        instance_pools = InstancePool.objects.all()
        for instance_pool in instance_pools:
            criticalPoolStatusEntries = PoolStatusEntry.objects.filter(pool=instance_pool, isCritical=True)

            if criticalPoolStatusEntries:
                continue

            if instance_pool.config.isCyclic() or instance_pool.config.getMissingParameters():
                entry = PoolStatusEntry()
                entry.pool = instance_pool
                entry.isCritical = True
                entry.type = POOL_STATUS_ENTRY_TYPE['config-error']
                entry.msg = "Configuration error."
                entry.save()
                continue

            config = instance_pool.config.flatten()

            instances_missing = config.size
            running_instances = []
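
            # instances_missing starts at the configured pool size and is decremented for
            # every instance we consider alive below; a positive remainder means we must
            # start instances, a negative one means we are over capacity and must terminate.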

            self.update_pool_instances(instance_pool, config)

            # On our initial check, we only do everything up to the pool update
            # to ensure that for every pool, the pool update can run successfully.
            if initialCheck:
                continue

            instances = Instance.objects.filter(pool=instance_pool)

            for instance in instances:
                instance_status_code_fixed = False
                if instance.status_code >= 256:
                    logger.warning("[Pool %d] Instance with EC2 ID %s has weird state code %d, attempting to fix..." % (instance_pool.id, instance.ec2_instance_id, instance.status_code))
                    instance.status_code -= 256
                    instance_status_code_fixed = True
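                    # For example, a stored code of 272 (0x0110) becomes 16 ('running'); this
                    # handles the extra high byte that other code paths mask off with & 255.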

                if instance.status_code in [INSTANCE_STATE['running'], INSTANCE_STATE['pending'], INSTANCE_STATE['requested']]:
                    instances_missing -= 1
                    running_instances.append(instance)
                elif instance.status_code in [INSTANCE_STATE['shutting-down'], INSTANCE_STATE['terminated']]:
                    # The instance is no longer running, delete it from our database
                    logger.info("[Pool %d] Deleting terminated instance with EC2 ID %s from our database." % (instance_pool.id, instance.ec2_instance_id))
                    instance.delete()
                else:
                    if instance_status_code_fixed:
                        # Restore original status code for error reporting
                        instance.status_code += 256

                    logger.error("[Pool %d] Instance with EC2 ID %s has unexpected state code %d" % (instance_pool.id, instance.ec2_instance_id, instance.status_code))
                    # In some cases, EC2 sends undocumented status codes and we don't know why.
                    # For now, reset the status code to 0, consider the instance still present,
                    # and hope that the problem will be gone by the next update iteration.
                    instance.status_code = 0
                    instance.save()
                    instances_missing -= 1
                    running_instances.append(instance)

            # Continue working with the instances we have running
            instances = running_instances

            if not instance_pool.isEnabled:
                if running_instances:
                    self.terminate_pool_instances(instance_pool, running_instances, config, terminateByPool=True)

                    # Try to update our terminated instances as soon as possible. If EC2 needs
                    # longer than the sleep time specified here, the instances will be updated in
                    # the next iteration of this pool, allowing other actions to be processed in between.
                    # time.sleep(2)
                    # self.update_pool_instances(instance_pool, config)
                continue

            if (not instance_pool.last_cycled) or instance_pool.last_cycled < timezone.now() - timezone.timedelta(seconds=config.cycle_interval):
                if pending_shutdown:
                    logger.info("[Pool %d] Shutdown pending, skipping pool cycle..." % instance_pool.id)
                else:
                    logger.info("[Pool %d] Needs to be cycled, terminating all instances..." % instance_pool.id)
                    instance_pool.last_cycled = timezone.now()
                    self.terminate_pool_instances(instance_pool, instances, config, terminateByPool=True)
                    instance_pool.save()

                    logger.info("[Pool %d] Termination complete." % instance_pool.id)

            if instances_missing > 0:
                if pending_shutdown:
                    logger.info("[Pool %d] Shutdown pending, not starting further instances..." % instance_pool.id)
                elif instance_pool.id in async_start_threads_by_poolid:
                    logger.debug("[Pool %d] Already has a start thread running, not starting further instances..." % instance_pool.id)
                else:
                    logger.info("[Pool %d] Needs %s more instances, starting..." % (instance_pool.id, instances_missing))
                    self.start_pool_instances(instance_pool, config, count=instances_missing)
            elif instances_missing < 0:
                # Select the oldest instances we have running and terminate
                # them so we meet the size limitation again.
                logger.info("[Pool %d] Has %s instances over limit, terminating..." % (instance_pool.id, -instances_missing))
                instances = Instance.objects.filter(pool=instance_pool).order_by('created')[:-instances_missing]
                self.terminate_pool_instances(instance_pool, instances, config)
            else:
                logger.debug("[Pool %d] Size is ok." % instance_pool.id)