def start_instances_async(pool, config, count, images, region, zone, instances):
    """Request spot instances for ``pool`` and reconcile the pending records.

    The configured userdata is run through
    ``LaniakeaCommandLine.handle_import_tags`` and ``handle_tags``, stored in
    ``images["default"]`` together with the placement and count, and handed
    to a ``Laniakea`` cluster whose ``create_spot`` is then called.  Each
    returned instance is matched by position with an entry of ``instances``;
    entries without a counterpart are deleted.  Failures are recorded as
    ``PoolStatusEntry`` rows and all entries of ``instances`` are deleted.

    ``images["default"]`` and ``config.ec2_tags`` are modified in place.

    Args:
        pool: Pool the instances belong to (``id`` and ``pk`` are read).
        config: Configuration object; userdata, userdata macros, AWS
            credentials, tags and the maximum price are read from it.
        count: Number of instances to request.
        images: Image definitions passed to ``Laniakea``; must contain a
            ``"default"`` entry.
        region: Region passed to ``cluster.connect``.
        zone: Stored as the ``placement`` of the default image.
        instances: Indexable collection of pending instance records.

    Raises:
        RuntimeError: If the processed userdata is empty.
    """
    userdata = LaniakeaCommandLine.handle_tags(
        LaniakeaCommandLine.handle_import_tags(config.ec2_userdata),
        config.ec2_userdata_macros)

    if not userdata:
        logger.error("[Pool %d] Failed to compile userdata." % pool.id)

        status = PoolStatusEntry()
        status.type = POOL_STATUS_ENTRY_TYPE['config-error']
        status.pool = pool
        status.isCritical = True
        status.msg = "Configuration error: Failed to compile userdata"
        status.save()

        # Nothing can be started without userdata, drop the pending records
        for pending in instances:
            pending.delete()

        raise RuntimeError("start_instances_async: Failed to compile userdata")

    default_image = images["default"]
    default_image['user_data'] = userdata
    default_image['placement'] = zone
    default_image['count'] = count

    cluster = Laniakea(images)
    try:
        cluster.connect(
            region=region,
            aws_access_key_id=config.aws_access_key_id,
            aws_secret_access_key=config.aws_secret_access_key)
    except Exception as err:
        logger.exception("[Pool %d] %s: laniakea failure: %s" % (pool.id, "start_instances_async", err))

        # Surface the connection problem in the pool status messages
        status = PoolStatusEntry()
        status.type = 0
        status.pool = pool
        status.msg = str(err)
        status.isCritical = True
        status.save()

        # We never got as far as creating anything, drop the pending records
        for pending in instances:
            pending.delete()

        return

    config.ec2_tags['SpotManager-PoolId'] = str(pool.pk)

    try:
        logger.info("[Pool %d] Creating %s instances..." % (pool.id, count))
        boto_instances = cluster.create_spot(
            config.ec2_max_price,
            tags=config.ec2_tags,
            delete_on_termination=True,
            timeout=20 * 60)
        started = len(boto_instances)
        canceled_requests = count - started

        logger.info("[Pool %d] Successfully created %s instances, %s requests timed out and were canceled" % (pool.id, started, canceled_requests))

        for idx, boto_instance in enumerate(boto_instances):
            record = instances[idx]
            record.hostname = boto_instance.public_dns_name
            record.ec2_instance_id = boto_instance.id
            # state_code is a 16-bit value where the high byte is
            # an opaque internal value and should be ignored.
            record.status_code = boto_instance.state_code & 255
            record.save()

            assert record.ec2_instance_id != None

            # The record is in our database now, so flag the EC2 instance as
            # updatable to let the update code track its state changes.
            boto_instance.add_tag("SpotManager-Updatable", "1")

        # Records beyond the started instances belong to canceled spot
        # requests; the range is empty when every request was fulfilled.
        for idx in range(started, count):
            logger.info("[Pool %d] Deleting instance with id %s (belongs to canceled request)" % (pool.id, instances[idx].pk))
            instances[idx].delete()

        # Earlier "maximum spot instance count exceeded" and "temporary
        # failure" warnings no longer apply once a request went through.
        # Unclassified errors are kept so the user can still see them.
        for stale_kind in ('max-spot-instance-count-exceeded', 'temporary-failure'):
            PoolStatusEntry.objects.filter(pool=pool, type=POOL_STATUS_ENTRY_TYPE[stale_kind]).delete()

    except (boto.exception.EC2ResponseError, boto.exception.BotoServerError, ssl.SSLError, socket.error) as err:
        error_text = str(err)

        if "MaxSpotInstanceCountExceeded" in error_text:
            logger.warning("[Pool %d] Maximum instance count exceeded for region %s" % (pool.id, region))
            max_count_type = POOL_STATUS_ENTRY_TYPE['max-spot-instance-count-exceeded']
            already_reported = PoolStatusEntry.objects.filter(pool=pool, type=max_count_type)
            if not already_reported:
                status = PoolStatusEntry()
                status.pool = pool
                status.type = max_count_type
                status.msg = "Auto-selected region exceeded its maximum spot instance count."
                status.save()
        elif "Service Unavailable" in error_text:
            logger.warning("[Pool %d] Temporary failure in region %s: %s" % (pool.id, region, err))
            status = PoolStatusEntry()
            status.pool = pool
            status.type = POOL_STATUS_ENTRY_TYPE['temporary-failure']
            status.msg = "Temporary failure occurred: %s" % err
            status.save()
        else:
            logger.exception("[Pool %d] %s: boto failure: %s" % (pool.id, "start_instances_async", err))
            status = PoolStatusEntry()
            status.type = 0
            status.pool = pool
            status.isCritical = True
            status.msg = "Unclassified error occurred: %s" % err
            status.save()

        # An exception from laniakea is taken to mean that every instance
        # request failed, so all pending records are removed.
        for pending in instances:
            pending.delete()
     def start_instances_async(pool, config, count, images, region, zone, instances):
         """Request spot instances for ``pool`` and reconcile the pending records.

         The configured userdata is run through
         ``LaniakeaCommandLine.handle_import_tags`` and ``handle_tags``,
         stored in ``images["default"]`` together with the placement and
         count, and handed to a ``Laniakea`` cluster whose ``create_spot`` is
         then called.  Each started instance is matched by position with an
         entry of ``instances``; the remaining entries (canceled requests)
         are deleted.  On every failure path the pending ``instances`` are
         deleted so that no placeholder records are left behind.

         ``images["default"]`` and ``config.ec2_tags`` are modified in place.

         Args:
             pool: Pool the instances belong to (``id`` and ``pk`` are read).
             config: Configuration object; userdata, userdata macros, AWS
                 credentials, tags and the maximum price are read from it.
             count: Number of instances to request; must equal
                 ``len(instances)``.
             images: Image definitions passed to ``Laniakea``; must contain
                 a ``"default"`` entry.
             region: Region passed to ``cluster.connect``.
             zone: Stored as the ``placement`` of the default image.
             instances: Indexable collection of pending instance records.

         Raises:
             RuntimeError: If the processed userdata is empty.
         """
         userdata = LaniakeaCommandLine.handle_import_tags(config.ec2_userdata)
         userdata = LaniakeaCommandLine.handle_tags(userdata, config.ec2_userdata_macros)
         if not userdata:
             logger.error("[Pool %d] Failed to compile userdata." % pool.id)

             # Nothing will be started for these records, so remove them
             # instead of leaving them pending forever.
             for instance in instances:
                 instance.delete()

             raise RuntimeError("start_instances_async: Failed to compile userdata")

         images["default"]['user_data'] = userdata
         images["default"]['placement'] = zone
         images["default"]['count'] = count

         cluster = Laniakea(images)
         try:
             cluster.connect(region=region, aws_access_key_id=config.aws_access_key_id, aws_secret_access_key=config.aws_secret_access_key)
         except Exception as msg:
             logger.exception("[Pool %d] %s: laniakea failure: %s" % (pool.id, "start_instances_async", msg))

             # Log this error to the pool status messages
             entry = PoolStatusEntry()
             entry.pool = pool
             entry.msg = str(msg)
             entry.isCritical = True
             entry.save()

             # Delete all pending instances as we failed to create them
             for instance in instances:
                 instance.delete()

             return

         config.ec2_tags['SpotManager-PoolId'] = str(pool.pk)

         try:
             logger.info("[Pool %d] Creating %s instances..." % (pool.id, count))
             (boto_instances, boto_pending) = cluster.create_spot(config.ec2_max_price, tags=config.ec2_tags, delete_on_termination=True, timeout=20*60)

             logger.info("[Pool %d] Successfully created %s instances, %s requests timed out and were canceled" % (pool.id, len(boto_instances), len(boto_pending)))

             assert (len(boto_instances) + len(boto_pending)) == len(instances) == count

             # The assertion above guarantees there are at least as many
             # records as started instances, so pairing by position is safe.
             for instance, boto_instance in zip(instances, boto_instances):
                 instance.hostname = boto_instance.public_dns_name
                 instance.ec2_instance_id = boto_instance.id
                 # state_code is a 16-bit value where the high byte is
                 # an opaque internal value and should be ignored.
                 instance.status_code = boto_instance.state_code & 255
                 instance.save()

                 assert instance.ec2_instance_id is not None

                 # Now that we saved the object into our database, mark the instance as updatable
                 # so our update code can pick it up and update it accordingly when it changes states
                 boto_instance.add_tag("SpotManager-Updatable", "1")

             if boto_pending:
                 for i in range(len(boto_instances), count):
                     # Delete instances belonging to canceled spot requests
                     logger.info("[Pool %d] Deleting instance with id %s (belongs to canceled request)" % (pool.id, instances[i].pk))
                     instances[i].delete()

         except boto.exception.EC2ResponseError as msg:
             logger.exception("[Pool %d] %s: boto failure: %s" % (pool.id, "start_instances_async", msg))

             # Delete all pending instances, assuming that an exception from
             # laniakea means that all instance requests failed (same cleanup
             # as on a connection failure).
             for instance in instances:
                 instance.delete()

             return
Ejemplo n.º 3
0
        def start_instances_async(pool, config, count, images, region, zone,
                                  instances):
            """Start ``count`` spot instances for ``pool`` through Laniakea.

            Steps, in order:

            1. Process ``config.ec2_userdata`` with
               ``LaniakeaCommandLine.handle_import_tags`` / ``handle_tags``.
            2. Store userdata, placement and count in ``images["default"]``
               and connect a ``Laniakea`` cluster to ``region``.
            3. Call ``create_spot`` and copy hostname, instance id and state
               of every returned instance onto the entry of ``instances`` at
               the same position; delete the entries that are left over.
            4. Remove pool warnings that a successful request makes obsolete.

            Any failure is recorded as a ``PoolStatusEntry`` and all entries
            of ``instances`` are deleted.  ``images["default"]`` and
            ``config.ec2_tags`` are modified in place.

            Args:
                pool: Pool the instances belong to (``id``/``pk`` are read).
                config: Configuration object providing userdata, macros, AWS
                    credentials, tags and the maximum price.
                count: Number of instances to request.
                images: Image definitions passed to ``Laniakea``; needs a
                    ``"default"`` entry.
                region: Region passed to ``cluster.connect``.
                zone: Stored as the ``placement`` of the default image.
                instances: Indexable collection of pending instance records.

            Raises:
                RuntimeError: If the processed userdata is empty.
            """

            def discard_pending():
                # Delete every pending instance record that was handed to us.
                for pending in instances:
                    pending.delete()

            imported = LaniakeaCommandLine.handle_import_tags(
                config.ec2_userdata)
            userdata = LaniakeaCommandLine.handle_tags(
                imported, config.ec2_userdata_macros)

            if not userdata:
                logger.error("[Pool %d] Failed to compile userdata." % pool.id)

                report = PoolStatusEntry()
                report.type = POOL_STATUS_ENTRY_TYPE['config-error']
                report.pool = pool
                report.isCritical = True
                report.msg = "Configuration error: Failed to compile userdata"
                report.save()

                discard_pending()

                raise RuntimeError(
                    "start_instances_async: Failed to compile userdata")

            default_image = images["default"]
            default_image['user_data'] = userdata
            default_image['placement'] = zone
            default_image['count'] = count

            cluster = Laniakea(images)
            try:
                cluster.connect(
                    region=region,
                    aws_access_key_id=config.aws_access_key_id,
                    aws_secret_access_key=config.aws_secret_access_key)
            except Exception as error:
                logger.exception("[Pool %d] %s: laniakea failure: %s" %
                                 (pool.id, "start_instances_async", error))

                # Make the failure visible in the pool status messages
                report = PoolStatusEntry()
                report.type = 0
                report.pool = pool
                report.msg = str(error)
                report.isCritical = True
                report.save()

                # Creation never started, so the pending records are useless
                discard_pending()
                return

            config.ec2_tags['SpotManager-PoolId'] = str(pool.pk)

            try:
                logger.info("[Pool %d] Creating %s instances..." %
                            (pool.id, count))
                boto_instances = cluster.create_spot(
                    config.ec2_max_price,
                    tags=config.ec2_tags,
                    delete_on_termination=True,
                    timeout=20 * 60)
                created = len(boto_instances)
                canceled_requests = count - created

                logger.info(
                    "[Pool %d] Successfully created %s instances, %s requests timed out and were canceled"
                    % (pool.id, created, canceled_requests))

                for position, started in enumerate(boto_instances):
                    record = instances[position]
                    record.hostname = started.public_dns_name
                    record.ec2_instance_id = started.id
                    # state_code is a 16-bit value where the high byte is
                    # an opaque internal value and should be ignored.
                    record.status_code = started.state_code & 255
                    record.save()

                    assert (record.ec2_instance_id != None)

                    # With the record stored in our database, tag the EC2
                    # instance as updatable so the update code tracks it.
                    started.add_tag("SpotManager-Updatable", "1")

                # Whatever lies past the created instances belongs to
                # canceled spot requests (empty range if none were canceled).
                for position in range(created, count):
                    leftover = instances[position]
                    logger.info(
                        "[Pool %d] Deleting instance with id %s (belongs to canceled request)"
                        % (pool.id, leftover.pk))
                    leftover.delete()

                # A successful request invalidates an earlier "maximum spot
                # instance count exceeded" warning ...
                max_count_warnings = PoolStatusEntry.objects.filter(
                    pool=pool,
                    type=POOL_STATUS_ENTRY_TYPE[
                        'max-spot-instance-count-exceeded'])
                max_count_warnings.delete()

                # ... and any temporary failure reported before.
                temporary_failures = PoolStatusEntry.objects.filter(
                    pool=pool,
                    type=POOL_STATUS_ENTRY_TYPE['temporary-failure'])
                temporary_failures.delete()

                # Unclassified errors stay, so the user can still see them.

            except (boto.exception.EC2ResponseError,
                    boto.exception.BotoServerError, ssl.SSLError,
                    socket.error) as error:
                reason = str(error)

                if "MaxSpotInstanceCountExceeded" in reason:
                    logger.warning(
                        "[Pool %d] Maximum instance count exceeded for region %s"
                        % (pool.id, region))
                    max_count_type = POOL_STATUS_ENTRY_TYPE[
                        'max-spot-instance-count-exceeded']
                    known = PoolStatusEntry.objects.filter(
                        pool=pool, type=max_count_type)
                    if not known:
                        report = PoolStatusEntry()
                        report.pool = pool
                        report.type = max_count_type
                        report.msg = "Auto-selected region exceeded its maximum spot instance count."
                        report.save()
                elif "Service Unavailable" in reason:
                    logger.warning(
                        "[Pool %d] Temporary failure in region %s: %s" %
                        (pool.id, region, error))
                    report = PoolStatusEntry()
                    report.pool = pool
                    report.type = POOL_STATUS_ENTRY_TYPE['temporary-failure']
                    report.msg = "Temporary failure occurred: %s" % error
                    report.save()
                else:
                    logger.exception("[Pool %d] %s: boto failure: %s" %
                                     (pool.id, "start_instances_async", error))
                    report = PoolStatusEntry()
                    report.type = 0
                    report.pool = pool
                    report.isCritical = True
                    report.msg = "Unclassified error occurred: %s" % error
                    report.save()

                # An exception from laniakea is taken to mean that all
                # instance requests failed, so drop every pending record.
                discard_pending()
        def start_instances_async(pool, config, count, images, region, zone,
                                  instances):
            """Start ``count`` spot instances for ``pool`` through Laniakea.

            The configured userdata is run through
            ``LaniakeaCommandLine.handle_import_tags`` and ``handle_tags``,
            stored in ``images["default"]`` together with the placement and
            count, and handed to a ``Laniakea`` cluster whose ``create_spot``
            is then called.  Each started instance is matched by position
            with an entry of ``instances``; the remaining entries (canceled
            requests) are deleted.  On every failure path the pending
            ``instances`` are deleted so that no placeholder records are
            left behind.

            ``images["default"]`` and ``config.ec2_tags`` are modified in
            place.

            Args:
                pool: Pool the instances belong to (``id``/``pk`` are read).
                config: Configuration object providing userdata, macros, AWS
                    credentials, tags and the maximum price.
                count: Number of instances to request; must equal
                    ``len(instances)``.
                images: Image definitions passed to ``Laniakea``; needs a
                    ``"default"`` entry.
                region: Region passed to ``cluster.connect``.
                zone: Stored as the ``placement`` of the default image.
                instances: Indexable collection of pending instance records.

            Raises:
                RuntimeError: If the processed userdata is empty.
            """
            userdata = LaniakeaCommandLine.handle_import_tags(
                config.ec2_userdata)
            userdata = LaniakeaCommandLine.handle_tags(
                userdata, config.ec2_userdata_macros)
            if not userdata:
                logger.error("[Pool %d] Failed to compile userdata." % pool.id)

                # Nothing will be started for these records, so remove them
                # instead of leaving them pending forever.
                for instance in instances:
                    instance.delete()

                raise RuntimeError(
                    "start_instances_async: Failed to compile userdata")

            images["default"]['user_data'] = userdata
            images["default"]['placement'] = zone
            images["default"]['count'] = count

            cluster = Laniakea(images)
            try:
                cluster.connect(
                    region=region,
                    aws_access_key_id=config.aws_access_key_id,
                    aws_secret_access_key=config.aws_secret_access_key)
            except Exception as msg:
                logger.exception("[Pool %d] %s: laniakea failure: %s" %
                                 (pool.id, "start_instances_async", msg))

                # Log this error to the pool status messages
                entry = PoolStatusEntry()
                entry.pool = pool
                entry.msg = str(msg)
                entry.isCritical = True
                entry.save()

                # Delete all pending instances as we failed to create them
                for instance in instances:
                    instance.delete()

                return

            config.ec2_tags['SpotManager-PoolId'] = str(pool.pk)

            try:
                logger.info("[Pool %d] Creating %s instances..." %
                            (pool.id, count))
                (boto_instances, boto_pending) = cluster.create_spot(
                    config.ec2_max_price,
                    tags=config.ec2_tags,
                    delete_on_termination=True,
                    timeout=20 * 60)

                logger.info(
                    "[Pool %d] Successfully created %s instances, %s requests timed out and were canceled"
                    % (pool.id, len(boto_instances), len(boto_pending)))

                assert (len(boto_instances) +
                        len(boto_pending)) == len(instances) == count

                # The assertion above guarantees there are at least as many
                # records as started instances, so pairing by position is
                # safe.
                for instance, boto_instance in zip(instances, boto_instances):
                    instance.hostname = boto_instance.public_dns_name
                    instance.ec2_instance_id = boto_instance.id
                    # state_code is a 16-bit value where the high byte is
                    # an opaque internal value and should be ignored.
                    instance.status_code = boto_instance.state_code & 255
                    instance.save()

                    assert instance.ec2_instance_id is not None

                    # Now that we saved the object into our database, mark the instance as updatable
                    # so our update code can pick it up and update it accordingly when it changes states
                    boto_instance.add_tag("SpotManager-Updatable", "1")

                if boto_pending:
                    for i in range(len(boto_instances), count):
                        # Delete instances belonging to canceled spot requests
                        logger.info(
                            "[Pool %d] Deleting instance with id %s (belongs to canceled request)"
                            % (pool.id, instances[i].pk))
                        instances[i].delete()

            except boto.exception.EC2ResponseError as msg:
                logger.exception("[Pool %d] %s: boto failure: %s" %
                                 (pool.id, "start_instances_async", msg))

                # Delete all pending instances, assuming that an exception
                # from laniakea means that all instance requests failed
                # (same cleanup as on a connection failure).
                for instance in instances:
                    instance.delete()

                return