Example #1
0
def _start_pool_instances(pool, config, count=1):
    """Request ``count`` spot instances for ``pool`` using ``config``.

    The function picks a region/zone/instance type via
    ``_get_best_region_zone``, compiles the EC2 userdata, connects through
    Laniakea and creates the spot requests. Every spot request returned is
    stored as an ``Instance`` row in state ``INSTANCE_STATE["requested"]``.
    Failures along the way are logged and, for most of them, recorded as a
    ``PoolStatusEntry`` attached to ``pool``.

    Args:
        pool: Pool object; ``pool.id`` is used in log messages and the object
            itself is assigned to the ``pool`` field of created rows.
        config: Configuration object. This function reads ``ec2_userdata``,
            ``ec2_userdata_macros``, ``aws_access_key_id``,
            ``aws_secret_access_key`` and ``ec2_max_price`` from it, and
            passes it on to ``_create_laniakea_images`` and
            ``_get_best_region_zone``.
        count: Number of instances to request (default 1).

    Returns:
        None. The function returns early (without raising) when spot prices
        cannot be fetched, when region selection raises ``RuntimeError``,
        when no region is selected, or when connecting to the region fails.

    Raises:
        RuntimeError: If the userdata compiles to an empty value.
        Exception: Any other error not handled by the inner handlers is
            logged and re-raised unchanged.
    """
    from .models import Instance, INSTANCE_STATE, PoolStatusEntry, POOL_STATUS_ENTRY_TYPE

    def _record_status(type_key, msg, critical=False):
        """Save a PoolStatusEntry of type ``type_key`` for this pool."""
        entry = PoolStatusEntry()
        entry.pool = pool
        entry.type = POOL_STATUS_ENTRY_TYPE[type_key]
        entry.msg = msg
        if critical:
            # Only touch the flag when asked, so non-critical entries keep
            # whatever default the model defines.
            entry.isCritical = True
        entry.save()

    images = _create_laniakea_images(config)

    # Figure out where to put our instances
    try:
        (region, zone, instance_type, rejected) = _get_best_region_zone(config)
    except (boto.exception.EC2ResponseError, boto.exception.BotoServerError,
            ssl.SSLError, socket.error):
        # In case of temporary failures here, we will retry again in the next cycle
        logger.warning("[Pool %d] Failed to acquire spot instance prices: %s.",
                       pool.id, traceback.format_exc())
        return
    except RuntimeError:
        # NOTE(review): the log text mentions userdata although the failing
        # call is the region/zone selection -- confirm what raises here.
        logger.error("[Pool %d] Failed to compile userdata.", pool.id)
        _record_status('config-error',
                       "Configuration error: %s" % traceback.format_exc(),
                       critical=True)
        return

    # Lazy queryset: nothing is fetched until .exists() / .delete() below.
    price_low_entries = PoolStatusEntry.objects.filter(
        pool=pool, type=POOL_STATUS_ENTRY_TYPE['price-too-low'])

    if not region:
        logger.warning(
            "[Pool %d] No allowed region was cheap enough to spawn instances.",
            pool.id)

        # Report the condition only once; an existing entry is left as-is.
        if not price_low_entries.exists():
            lines = ["No allowed region was cheap enough to spawn instances."]
            # Use a dedicated loop name so the selected `zone` is not rebound.
            lines.extend("%s at %s" % (rejected_zone, rejected[rejected_zone])
                         for rejected_zone in rejected)
            _record_status('price-too-low', "\n".join(lines))
        return

    # A region was found, so any stale "price too low" entries are obsolete.
    # Deleting an empty queryset is a no-op, so no existence check is needed.
    price_low_entries.delete()

    logger.debug(
        "[Pool %d] Using instance type %s in region %s with availability zone %s.",
        pool.id, instance_type, region, zone)

    try:
        userdata = LaniakeaCommandLine.handle_import_tags(
            config.ec2_userdata.decode('utf-8'))

        # Copy the userdata_macros and populate with internal variables
        ec2_userdata_macros = dict(config.ec2_userdata_macros)
        ec2_userdata_macros["EC2SPOTMANAGER_POOLID"] = str(pool.id)

        userdata = LaniakeaCommandLine.handle_tags(userdata,
                                                   ec2_userdata_macros)
        if not userdata:
            logger.error("[Pool %d] Failed to compile userdata.", pool.id)
            _record_status('config-error',
                           "Configuration error: Failed to compile userdata",
                           critical=True)
            # Caught by the outer handler below, logged, and re-raised.
            raise RuntimeError(
                "start_pool_instances: Failed to compile userdata")

        default_image = images["default"]
        default_image['user_data'] = userdata.encode("utf-8")
        default_image['placement'] = zone
        default_image['count'] = count
        default_image['instance_type'] = instance_type

        cluster = Laniakea(images)
        try:
            cluster.connect(region=region,
                            aws_access_key_id=config.aws_access_key_id,
                            aws_secret_access_key=config.aws_secret_access_key)
        except ssl.SSLError as msg:
            logger.warning(
                "[Pool %d] start_pool_instances: Temporary failure in region %s: %s",
                pool.id, region, msg)
            _record_status('temporary-failure',
                           "Temporary failure occurred: %s" % msg)
            return
        except Exception as msg:
            logger.exception(
                "[Pool %d] start_pool_instances: laniakea failure: %s",
                pool.id, msg)

            # Log this error to the pool status messages
            _record_status('unclassified', str(msg), critical=True)
            return

        try:
            logger.info("[Pool %d] Creating %d instances...", pool.id, count)
            for ec2_request in cluster.create_spot_requests(
                    config.ec2_max_price,
                    delete_on_termination=True,
                    timeout=10 * 60):
                # Track each spot request as an instance in "requested" state.
                instance = Instance()
                instance.ec2_instance_id = ec2_request
                instance.ec2_region = region
                instance.ec2_zone = zone
                instance.status_code = INSTANCE_STATE["requested"]
                instance.pool = pool
                instance.save()

        except (boto.exception.EC2ResponseError,
                boto.exception.BotoServerError, ssl.SSLError,
                socket.error) as msg:
            # Classify the failure by its text; compute the text once.
            error_text = str(msg)
            if "MaxSpotInstanceCountExceeded" in error_text:
                logger.warning(
                    "[Pool %d] start_pool_instances: Maximum instance count exceeded for region %s",
                    pool.id, region)
                # Only record this condition once per pool.
                already_reported = PoolStatusEntry.objects.filter(
                    pool=pool,
                    type=POOL_STATUS_ENTRY_TYPE[
                        'max-spot-instance-count-exceeded']).exists()
                if not already_reported:
                    _record_status(
                        'max-spot-instance-count-exceeded',
                        "Auto-selected region exceeded its maximum spot instance count.")
            elif "Service Unavailable" in error_text:
                logger.warning(
                    "[Pool %d] start_pool_instances: Temporary failure in region %s: %s",
                    pool.id, region, msg)
                _record_status('temporary-failure',
                               "Temporary failure occurred: %s" % msg)
            else:
                logger.exception(
                    "[Pool %d] start_pool_instances: boto failure: %s",
                    pool.id, msg)
                _record_status('unclassified',
                               "Unclassified error occurred: %s" % msg,
                               critical=True)

    except Exception as msg:
        # Last-resort logging; the exception still propagates to the caller.
        logger.exception(
            "[Pool %d] start_pool_instances: unhandled failure: %s", pool.id,
            msg)
        raise